From bf59180337ffb86bbfef47bdc6084e3250266c54 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 23 Apr 2016 22:38:49 +0900 Subject: [PATCH 01/14] starting working on positions --- src/core/index.rs | 4 +- src/core/postings.rs | 256 ++++++++++++++++++++++++++++--------------- src/core/reader.rs | 1 - src/core/writer.rs | 6 +- 4 files changed, 174 insertions(+), 93 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index a3d7abe78..19c268fef 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -238,7 +238,7 @@ pub struct SegmentInfo { pub enum SegmentComponent { INFO, POSTINGS, - // POSITIONS, + POSITIONS, FASTFIELDS, TERMS, STORE, @@ -264,7 +264,7 @@ impl Segment { fn path_suffix(component: &SegmentComponent)-> &'static str { match *component { - // SegmentComponent::POSITIONS => ".pos", + SegmentComponent::POSITIONS => ".pos", SegmentComponent::INFO => ".info", SegmentComponent::POSTINGS => ".idx", SegmentComponent::TERMS => ".term", diff --git a/src/core/postings.rs b/src/core/postings.rs index 7401d5076..e11b68117 100644 --- a/src/core/postings.rs +++ b/src/core/postings.rs @@ -35,8 +35,85 @@ impl BinarySerializable for TermInfo { } +pub trait U32sRecorder { + fn new() -> Self; + fn record(&mut self, val: u32); +} + +pub struct VecRecorder(Vec); + +impl U32sRecorder for VecRecorder { + fn new() -> VecRecorder { + VecRecorder(Vec::new()) + } + fn record(&mut self, val: u32) { + self.0.push(val); + } +} + +pub struct ObliviousRecorder; + +impl U32sRecorder for ObliviousRecorder { + fn new() -> ObliviousRecorder { + ObliviousRecorder + } + fn record(&mut self, val: u32) { + } +} + +struct TermPostingsWriter { + doc_ids: Vec, + term_freqs: TermFreqsRec, + positions: PositionsRec, + current_freq: u32, +} + +impl TermPostingsWriter { + pub fn new() -> TermPostingsWriter { + TermPostingsWriter { + doc_ids: Vec::new(), + term_freqs: TermFreqsRec::new(), + positions: PositionsRec::new(), + current_freq: 0, + } + } + + fn close_doc(&mut self,) { + self.term_freqs.record(self.current_freq); + self.current_freq = 0; + } + + fn close(&mut self,) { + if self.current_freq > 0 { + self.close_doc(); + } + } + + fn is_new_doc(&self, doc: &DocId) -> bool { + match self.doc_ids.last() { + Some(&last_doc) => last_doc != *doc, + None => true, + } + } + + pub fn doc_freq(&self) -> u32 { + self.doc_ids.len() as u32 + } + + pub fn suscribe(&mut self, doc: DocId, pos: u32) { + if self.is_new_doc(&doc) { + // this is the first time we meet this term for this document + // first close the previous document, and write its doc_freq. + self.close_doc(); + self.doc_ids.push(doc); + } + self.current_freq += 1; + self.positions.record(pos); + } +} + pub struct PostingsWriter { - postings: Vec>, + postings: Vec>, term_index: BTreeMap, } @@ -49,14 +126,12 @@ impl PostingsWriter { } } - pub fn suscribe(&mut self, doc: DocId, term: Term) { - let doc_ids: &mut Vec = self.get_term_postings(term); - if doc_ids.len() == 0 || doc_ids[doc_ids.len() - 1] < doc { - doc_ids.push(doc); - } + pub fn suscribe(&mut self, doc: DocId, pos: u32, term: Term) { + let doc_ids: &mut TermPostingsWriter = self.get_term_postings(term); + doc_ids.suscribe(doc, pos); } - fn get_term_postings(&mut self, term: Term) -> &mut Vec { + fn get_term_postings(&mut self, term: Term) -> &mut TermPostingsWriter { match self.term_index.get(&term) { Some(unord_id) => { return &mut self.postings[*unord_id]; @@ -64,17 +139,17 @@ impl PostingsWriter { None => {} } let unord_id = self.term_index.len(); - self.postings.push(Vec::new()); + self.postings.push(TermPostingsWriter::new()); self.term_index.insert(term, unord_id.clone()); &mut self.postings[unord_id] } pub fn serialize(&self, serializer: &mut PostingsSerializer) -> io::Result<()> { for (term, postings_id) in self.term_index.iter() { - let doc_ids = &self.postings[postings_id.clone()]; - let term_docfreq = doc_ids.len() as u32; + let term_postings_writer = &self.postings[postings_id.clone()]; + let term_docfreq = term_postings_writer.doc_freq(); try!(serializer.new_term(&term, term_docfreq)); - try!(serializer.write_docs(&doc_ids)); + try!(serializer.write_docs(&term_postings_writer.doc_ids)); } Ok(()) } @@ -93,60 +168,62 @@ pub trait Postings: Iterator { fn skip_next(&mut self, target: DocId) -> Option; } -pub struct IntersectionPostings { - postings: Vec, -} - -impl IntersectionPostings { - pub fn from_postings(postings: Vec) -> IntersectionPostings { - IntersectionPostings { - postings: postings, - } - } -} - -impl Iterator for IntersectionPostings { - type Item = DocId; - fn next(&mut self,) -> Option { - let mut candidate; - match self.postings[0].next() { - Some(val) => { - candidate = val; - }, - None => { - return None; - } - } - 'outer: loop { - for i in 1..self.postings.len() { - let skip_result = self.postings[i].skip_next(candidate); - match skip_result { - None => { - return None; - }, - Some(x) if x == candidate => { - }, - Some(greater) => { - unsafe { - let pa: *mut T = &mut self.postings[i]; - let pb: *mut T = &mut self.postings[0]; - ptr::swap(pa, pb); - } - candidate = greater; - continue 'outer; - }, - } - } - return Some(candidate); - } - - } -} +// pub struct IntersectionPostings { +// postings: Vec, +// } +// +// impl IntersectionPostings { +// pub fn from_postings(postings: Vec) -> IntersectionPostings { +// IntersectionPostings { +// postings: postings, +// } +// } +// } +// +// impl Iterator for IntersectionPostings { +// type Item = DocId; +// fn next(&mut self,) -> Option { +// let mut candidate; +// match self.postings[0].next() { +// Some(val) => { +// candidate = val; +// }, +// None => { +// return None; +// } +// } +// 'outer: loop { +// for i in 1..self.postings.len() { +// let skip_result = self.postings[i].skip_next(candidate); +// match skip_result { +// None => { +// return None; +// }, +// Some(x) if x == candidate => { +// }, +// Some(greater) => { +// unsafe { +// let pa: *mut T = &mut self.postings[i]; +// let pb: *mut T = &mut self.postings[0]; +// ptr::swap(pa, pb); +// } +// candidate = greater; +// continue 'outer; +// }, +// } +// } +// return Some(candidate); +// } +// +// } +// } pub struct PostingsSerializer { terms_fst_builder: FstMapBuilder, // TODO find an alternative to work around the "move" postings_write: WritePtr, + positions_write: WritePtr, written_bytes_postings: usize, + written_bytes_positions: usize, encoder: simdcompression::Encoder, } @@ -156,10 +233,13 @@ impl PostingsSerializer { let terms_write = try!(segment.open_write(SegmentComponent::TERMS)); let terms_fst_builder = try!(FstMapBuilder::new(terms_write)); let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS)); + let positions_write = try!(segment.open_write(SegmentComponent::POSITIONS)); Ok(PostingsSerializer { terms_fst_builder: terms_fst_builder, postings_write: postings_write, + positions_write: positions_write, written_bytes_postings: 0, + written_bytes_positions: 0, encoder: simdcompression::Encoder::new(), }) } @@ -246,32 +326,32 @@ mod tests { } } } - - #[test] - fn test_intersection() { - { - let left = VecPostings::new(vec!(1, 3, 9)); - let right = VecPostings::new(vec!(3, 4, 9, 18)); - let inter = IntersectionPostings::from_postings(vec!(left, right)); - let vals: Vec = inter.collect(); - assert_eq!(vals, vec!(3, 9)); - } - { - let a = VecPostings::new(vec!(1, 3, 9)); - let b = VecPostings::new(vec!(3, 4, 9, 18)); - let c = VecPostings::new(vec!(1, 5, 9, 111)); - let inter = IntersectionPostings::from_postings(vec!(a, b, c)); - let vals: Vec = inter.collect(); - assert_eq!(vals, vec!(9)); - } - } - - #[bench] - fn bench_single_intersection(b: &mut Bencher) { - b.iter(|| { - let docs = VecPostings::new((0..1_000_000).collect()); - let intersection = IntersectionPostings::from_postings(vec!(docs)); - intersection.count() - }); - } + // + // #[test] + // fn test_intersection() { + // { + // let left = VecPostings::new(vec!(1, 3, 9)); + // let right = VecPostings::new(vec!(3, 4, 9, 18)); + // let inter = IntersectionPostings::from_postings(vec!(left, right)); + // let vals: Vec = inter.collect(); + // assert_eq!(vals, vec!(3, 9)); + // } + // { + // let a = VecPostings::new(vec!(1, 3, 9)); + // let b = VecPostings::new(vec!(3, 4, 9, 18)); + // let c = VecPostings::new(vec!(1, 5, 9, 111)); + // let inter = IntersectionPostings::from_postings(vec!(a, b, c)); + // let vals: Vec = inter.collect(); + // assert_eq!(vals, vec!(9)); + // } + // } + // + // #[bench] + // fn bench_single_intersection(b: &mut Bencher) { + // b.iter(|| { + // let docs = VecPostings::new((0..1_000_000).collect()); + // let intersection = IntersectionPostings::from_postings(vec!(docs)); + // intersection.count() + // }); + // } } diff --git a/src/core/reader.rs b/src/core/reader.rs index a9d7a15a5..57f1c405f 100644 --- a/src/core/reader.rs +++ b/src/core/reader.rs @@ -2,7 +2,6 @@ use core::index::{Segment, SegmentId}; use core::schema::Term; use core::store::StoreReader; use core::schema::Document; -use core::postings::IntersectionPostings; use core::directory::ReadOnlySource; use std::io::Cursor; use core::schema::DocId; diff --git a/src/core/writer.rs b/src/core/writer.rs index 446dabf65..6fd2333e6 100644 --- a/src/core/writer.rs +++ b/src/core/writer.rs @@ -159,11 +159,13 @@ impl SegmentWriter { let field_options = schema.text_field_options(&field_value.field); if field_options.is_tokenized_indexed() { let mut tokens = self.tokenizer.tokenize(&field_value.text); + let mut pos = 0u32; loop { match tokens.next() { Some(token) => { let term = Term::from_field_text(&field_value.field, token); - self.postings_writer.suscribe(doc_id, term); + self.postings_writer.suscribe(doc_id, pos.clone(), term); + pos += 1; }, None => { break; } } @@ -174,7 +176,7 @@ impl SegmentWriter { let field_options = schema.u32_field_options(&field_value.field); if field_options.is_indexed() { let term = Term::from_field_u32(&field_value.field, field_value.value); - self.postings_writer.suscribe(doc_id, term); + self.postings_writer.suscribe(doc_id, 0.clone(), term); } } self.fast_field_writers.add_document(&doc); From e239b1c989096c550726eb52c10775dedc2dd890 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 24 Apr 2016 09:52:43 +0900 Subject: [PATCH 02/14] added position in the write_doc interface. --- src/core/merger.rs | 4 +++- src/core/postings.rs | 36 ++++++++++++++++++++++++++++++------ 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/src/core/merger.rs b/src/core/merger.rs index 96655f7fd..882493755 100644 --- a/src/core/merger.rs +++ b/src/core/merger.rs @@ -181,7 +181,9 @@ impl IndexMerger { match postings_merger.next() { Some((term, doc_ids)) => { try!(postings_serializer.new_term(&Term::from(&term), doc_ids.len() as DocId)); - try!(postings_serializer.write_docs(doc_ids)); + for doc_id in doc_ids.iter() { + try!(postings_serializer.write_doc(doc_id.clone(), None)); + } } None => { break; } } diff --git a/src/core/postings.rs b/src/core/postings.rs index e11b68117..f309db8f8 100644 --- a/src/core/postings.rs +++ b/src/core/postings.rs @@ -149,7 +149,10 @@ impl PostingsWriter { let term_postings_writer = &self.postings[postings_id.clone()]; let term_docfreq = term_postings_writer.doc_freq(); try!(serializer.new_term(&term, term_docfreq)); - try!(serializer.write_docs(&term_postings_writer.doc_ids)); + for doc in term_postings_writer.doc_ids.iter() { + try!(serializer.write_doc(doc.clone(), None)); + } + } Ok(()) } @@ -225,6 +228,7 @@ pub struct PostingsSerializer { written_bytes_postings: usize, written_bytes_positions: usize, encoder: simdcompression::Encoder, + doc_ids: Vec, } impl PostingsSerializer { @@ -241,10 +245,13 @@ impl PostingsSerializer { written_bytes_postings: 0, written_bytes_positions: 0, encoder: simdcompression::Encoder::new(), + doc_ids: Vec::new(), }) } pub fn new_term(&mut self, term: &Term, doc_freq: DocId) -> io::Result<()> { + try!(self.close_term()); + self.doc_ids.clear(); let term_info = TermInfo { doc_freq: doc_freq, postings_offset: self.written_bytes_postings as u32, @@ -253,16 +260,33 @@ impl PostingsSerializer { .insert(term.as_slice(), &term_info) } - pub fn write_docs(&mut self, doc_ids: &[DocId]) -> io::Result<()> { - let docs_data = self.encoder.encode_sorted(doc_ids); - self.written_bytes_postings += try!((docs_data.len() as u32).serialize(&mut self.postings_write)); - for num in docs_data { - self.written_bytes_postings += try!(num.serialize(&mut self.postings_write)); + pub fn close_term(&mut self,) -> io::Result<()> { + if !self.doc_ids.is_empty() { + let docs_data = self.encoder.encode_sorted(&self.doc_ids); + self.written_bytes_postings += try!((docs_data.len() as u32).serialize(&mut self.postings_write)); + for num in docs_data { + self.written_bytes_postings += try!(num.serialize(&mut self.postings_write)); + } } Ok(()) } + pub fn write_doc(&mut self, doc_id: DocId, positions: Option<&[u32]>) -> io::Result<()> { + self.doc_ids.push(doc_id); + Ok(()) + } + + // pub fn add_doc(&mut self, doc_ids: &[DocId]) -> io::Result<()> { + // let docs_data = self.encoder.encode_sorted(doc_ids); + // self.written_bytes_postings += try!((docs_data.len() as u32).serialize(&mut self.postings_write)); + // for num in docs_data { + // self.written_bytes_postings += try!(num.serialize(&mut self.postings_write)); + // } + // Ok(()) + // } + pub fn close(mut self,) -> io::Result<()> { + try!(self.close_term()); try!(self.terms_fst_builder.finish()); try!(self.postings_write.flush()); Ok(()) From acf7312af956e3849255287588569a15223280ee Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 24 Apr 2016 19:09:35 +0900 Subject: [PATCH 03/14] Added independant method for block compression, vint compression --- Cargo.toml | 1 - cpp/encode.cpp | 55 +++++++++- src/core/simdcompression.rs | 200 ++++++++++++++++++++++++++++++++---- src/lib.rs | 1 - 4 files changed, 232 insertions(+), 25 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 837e63965..5748dbd3e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,6 @@ log = "0.3.5" combine = "1.2.0" tempdir = "0.3.4" bincode = "0.4.0" -serde = "0.6.11" libc = "0.2.6" argparse = "*" num_cpus = "0.2" diff --git a/cpp/encode.cpp b/cpp/encode.cpp index 47da680cd..fa538b0f1 100644 --- a/cpp/encode.cpp +++ b/cpp/encode.cpp @@ -4,19 +4,72 @@ #include "codecfactory.h" #include "intersection.h" #include "variablebyte.h" +#include "util.h" using namespace SIMDCompressionLib; + // sorted static shared_ptr codec_sorted = CODECFactory::getFromName("s4-bp128-dm"); + // variable byte static VariableByte codec_unsorted = VariableByte(); -static SIMDBinaryPacking> codec_packed_sorted = SIMDBinaryPacking>(); +static SIMDBinaryPacking> simd_pack_sorted = SIMDBinaryPacking>(); + +static VariableByte vint_codec = VariableByte(); +// SIMDBinaryPacking extern "C" { + // encode 128 u32 at a time. + size_t encode_sorted_block128_native( + uint32_t* begin, + uint32_t* output, + const size_t output_capacity) { + size_t output_length = output_capacity; + simd_pack_sorted.encodeArray(begin, + 128, + output, + output_length); + return output_length; + } + + size_t decode_sorted_block128_native( + const uint32_t* compressed_data, + const size_t compressed_size, + uint32_t* uncompressed, + const size_t uncompressed_capacity) { + size_t num_ints = uncompressed_capacity; + simd_pack_sorted.decodeArray(compressed_data, compressed_size, uncompressed, num_ints); + return num_ints; + } + + size_t encode_sorted_vint_native( + uint32_t* begin, + const size_t num_els, + uint32_t* output, + const size_t output_capacity) { + size_t output_length = output_capacity; + vint_codec.encodeArray(begin, + num_els, + output, + output_length); + return output_length; + } + + size_t decode_sorted_vint_native( + const uint32_t* compressed_data, + const size_t compressed_size, + uint32_t* uncompressed, + const size_t uncompressed_capacity) { + size_t num_ints = uncompressed_capacity; + vint_codec.decodeArray(compressed_data, compressed_size, uncompressed, num_ints); + return num_ints; + } + + size_t encode_sorted_native( uint32_t* begin, const size_t num_els, diff --git a/src/core/simdcompression.rs b/src/core/simdcompression.rs index 4434c071c..9e816ca64 100644 --- a/src/core/simdcompression.rs +++ b/src/core/simdcompression.rs @@ -1,12 +1,25 @@ use libc::size_t; use std::ptr; +use std::iter; extern { // fn encode_unsorted_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; // fn decode_unsorted_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; + fn intersection_native(left_data: *const u32, left_size: size_t, right_data: *const u32, right_size: size_t, output: *mut u32) -> size_t; + + // complete s4-bp128-dm fn encode_sorted_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; fn decode_sorted_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; + + // bp128, only encodes group of 128 u32 at a time + fn encode_sorted_block128_native(data: *mut u32, output: *mut u32, output_capacity: size_t) -> size_t; + fn decode_sorted_block128_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; + + // vints, used as the left over codec for the <128 remaining values + fn encode_sorted_vint_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; + fn decode_sorted_vint_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; + } pub fn intersection(left: &[u32], right: &[u32], output: &mut [u32]) -> usize { @@ -18,6 +31,127 @@ pub fn intersection(left: &[u32], right: &[u32], output: &mut [u32]) -> usize { } } + + +//------------------------- +// Vint + + +pub struct VIntEncoder { + input_buffer: Vec, + output_buffer: Vec, +} + +impl VIntEncoder { + + pub fn new() -> VIntEncoder { + VIntEncoder { + input_buffer: Vec::with_capacity(128), + output_buffer: iter::repeat(0u32).take(256).collect(), + } + } + + pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] { + assert!(input.len() < 128); + let input_len = input.len(); + let written_size: usize; + // TODO use clone_from when available + unsafe { + ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len); + written_size = encode_sorted_vint_native( + self.input_buffer.as_mut_ptr(), + input_len as size_t, + self.output_buffer.as_mut_ptr(), + 256, + ); + } + return &self.output_buffer[0..written_size]; + } +} + + + +pub struct VIntDecoder; + +impl VIntDecoder { + + pub fn new() -> VIntDecoder { + VIntDecoder + } + + pub fn decode_sorted(&self, + compressed_data: &[u32], + uncompressed_values: &mut [u32]) -> size_t { + unsafe { + return decode_sorted_vint_native( + compressed_data.as_ptr(), + compressed_data.len() as size_t, + uncompressed_values.as_mut_ptr(), + uncompressed_values.len() as size_t); + } + } +} + +//------------------------- +// Block128 + +pub struct Block128Encoder { + input_buffer: Vec, + output_buffer: Vec, +} + +impl Block128Encoder { + + pub fn new() -> Block128Encoder { + Block128Encoder { + input_buffer: Vec::with_capacity(128), + output_buffer: iter::repeat(0u32).take(256).collect(), + } + } + + pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] { + assert_eq!(input.len(), 128); + // TODO use clone_from when available + let written_size: usize; + unsafe { + ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), 128); + written_size = encode_sorted_native( + self.input_buffer.as_mut_ptr(), + 128, + self.output_buffer.as_mut_ptr(), + 256, + ); + } + return &self.output_buffer[0..written_size]; + } +} + +pub struct Block128Decoder; + +impl Block128Decoder { + + pub fn new() -> Block128Decoder { + Block128Decoder + } + + pub fn decode_sorted( + &self, + compressed_data: &[u32], + uncompressed_values: &mut [u32]) -> size_t { + unsafe { + return decode_sorted_native( + compressed_data.as_ptr(), + compressed_data.len() as size_t, + uncompressed_values.as_mut_ptr(), + uncompressed_values.len() as size_t); + } + } +} + +//------------------------- +// s4-bp128-dm + + pub struct Encoder { input_buffer: Vec, output_buffer: Vec, @@ -52,28 +186,6 @@ impl Encoder { return &self.output_buffer[0..written_size]; } } - - - // pub fn encode_unsorted(&mut self, input: &[u32]) -> &[u32] { - // self.input_buffer.clear(); - // let input_len = input.len(); - // if input_len + 10000 >= self.input_buffer.len() { - // let target_length = input_len + 1024; - // self.input_buffer.resize(target_length, 0); - // self.output_buffer.resize(target_length, 0); - // } - // // TODO use clone_from when available - // unsafe { - // ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len); - // let written_size = encode_unsorted_native( - // self.input_buffer.as_mut_ptr(), - // input_len as size_t, - // self.output_buffer.as_mut_ptr(), - // self.output_buffer.len() as size_t, - // ); - // return &self.output_buffer[0..written_size]; - // } - // } } @@ -149,6 +261,7 @@ mod tests { use super::*; use test::Bencher; + use std::iter; use rand::Rng; use rand::SeedableRng; use rand::XorShiftRng; @@ -182,6 +295,49 @@ mod tests { assert_eq!(decoded_data, input); } + #[test] + fn test_encode_block() { + let mut encoder = Block128Encoder::new(); + let expected_length = 21; + let input: Vec = (0u32..128u32) + .map(|i| i * 7 / 2) + .into_iter() + .collect(); + let encoded_data = encoder.encode_sorted(&input); + assert_eq!(encoded_data.len(), expected_length); + let decoder = Block128Decoder::new(); + let mut decoded_data: Vec = iter::repeat(0u32).take(128).collect(); + assert_eq!(128, decoder.decode_sorted(&encoded_data[..], &mut decoded_data)); + assert_eq!(decoded_data, input); + } + + + + #[test] + fn test_encode_vint() { + { + let mut encoder = VIntEncoder::new(); + let expected_length = 31; + let input: Vec = (0u32..123u32) + .map(|i| i * 7 / 2) + .into_iter() + .collect(); + let encoded_data = encoder.encode_sorted(&input); + assert_eq!(encoded_data.len(), expected_length); + let decoder = VIntDecoder::new(); + let mut decoded_data: Vec = iter::repeat(0u32).take(128).collect(); + assert_eq!(123, decoder.decode_sorted(&encoded_data[..], &mut decoded_data)); + assert_eq!(&decoded_data[0..123], &input[..]); + } + { + let mut encoder = VIntEncoder::new(); + let input = vec!(3, 17u32, 187); + let encoded_data = encoder.encode_sorted(&input); + assert_eq!(encoded_data.len(), 1); + assert_eq!(encoded_data[0], 2167049859u32); + } + } + // #[test] // fn test_encode_unsorted() { // let mut encoder = Encoder::new(); diff --git a/src/lib.rs b/src/lib.rs index c22fbce4f..93735e573 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -19,7 +19,6 @@ extern crate atomicwrites; extern crate tempdir; extern crate bincode; extern crate time; -extern crate serde; extern crate libc; extern crate lz4; extern crate uuid; From 093ed025e29bae0b10c0110450056aa8af873c6d Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 28 Apr 2016 10:37:30 +0900 Subject: [PATCH 04/14] Added vint binary serializationb --- src/core/fastfield.rs | 4 +- src/core/serialize.rs | 166 +++++++++++++++++++++++------------------- 2 files changed, 95 insertions(+), 75 deletions(-) diff --git a/src/core/fastfield.rs b/src/core/fastfield.rs index 269f7fd83..8678a53fb 100644 --- a/src/core/fastfield.rs +++ b/src/core/fastfield.rs @@ -330,7 +330,7 @@ mod tests { } let source = directory.open_read(&path).unwrap(); { - assert_eq!(source.len(), 29 as usize); + assert_eq!(source.len(), 26 as usize); } { let fast_field_readers = U32FastFieldsReader::open(source).unwrap(); @@ -365,7 +365,7 @@ mod tests { } let source = directory.open_read(&path).unwrap(); { - assert_eq!(source.len(), 61 as usize); + assert_eq!(source.len(), 58 as usize); } { let fast_field_readers = U32FastFieldsReader::open(source).unwrap(); diff --git a/src/core/serialize.rs b/src/core/serialize.rs index 652ec2b77..48705f7a0 100644 --- a/src/core/serialize.rs +++ b/src/core/serialize.rs @@ -6,6 +6,64 @@ use std::io::Read; use std::io; use byteorder; +pub trait BinarySerializable : fmt::Debug + Sized { + fn serialize(&self, writer: &mut Write) -> io::Result; + fn deserialize(reader: &mut Read) -> io::Result; +} + +#[derive(Debug, Eq, PartialEq)] +pub struct VInt(pub u64); + +impl VInt { + pub fn val(&self,) -> u64 { + self.0.clone() + } +} + +impl BinarySerializable for VInt { + fn serialize(&self, writer: &mut Write) -> io::Result { + let mut remaining = self.0.clone(); + let mut written: usize = 0; + let mut buffer = [0u8; 10]; + loop { + let mut next_byte: u8 = (remaining % 128u64) as u8; + remaining /= 128u64; + if remaining == 0u64 { + buffer[written] = next_byte; + written += 1; + break; + } + else { + next_byte |= 128u8; + buffer[written] = next_byte; + written += 1; + } + } + try!(writer.write_all(&buffer[0..written])); + Ok(written) + } + + fn deserialize(reader: &mut Read) -> io::Result { + let mut bytes = reader.bytes(); + let mut result = 0u64; + let mut shift = 0u64; + loop { + match bytes.next() { + Some(Ok(b)) => { + result += ((b % 128u8) as u64) << shift; + if b & 128 == 0u8 { + break; + } + shift += 7; + } + _ => { + return Err(io::Error::new(io::ErrorKind::InvalidData, "Reach end of buffer")) + } + } + } + Ok(VInt(result)) + } +} fn convert_byte_order_error(byteorder_error: byteorder::Error) -> io::Error { @@ -15,11 +73,6 @@ fn convert_byte_order_error(byteorder_error: byteorder::Error) -> io::Error { } } -pub trait BinarySerializable : fmt::Debug + Sized { - fn serialize(&self, writer: &mut Write) -> io::Result; - fn deserialize(reader: &mut Read) -> io::Result; -} - impl BinarySerializable for () { fn serialize(&self, _: &mut Write) -> io::Result { Ok(0) @@ -31,14 +84,14 @@ impl BinarySerializable for () { impl BinarySerializable for Vec { fn serialize(&self, writer: &mut Write) -> io::Result { - let mut total_size = try!((self.len() as u32).serialize(writer)); + let mut total_size = try!(VInt(self.len() as u64).serialize(writer)); for it in self.iter() { total_size += try!(it.serialize(writer)); } Ok(total_size) } fn deserialize(reader: &mut Read) -> io::Result> { - let num_items = try!(u32::deserialize(reader)); + let num_items = try!(VInt::deserialize(reader)).val(); let mut items: Vec = Vec::with_capacity(num_items as usize); for _ in 0..num_items { let item = try!(T::deserialize(reader)); @@ -99,17 +152,15 @@ impl BinarySerializable for u8 { impl BinarySerializable for String { fn serialize(&self, writer: &mut Write) -> io::Result { - // TODO error let data: &[u8] = self.as_bytes(); - let mut size = try!((data.len() as u32).serialize(writer)); + let mut size = try!(VInt(data.len() as u64).serialize(writer)); size += data.len(); try!(writer.write_all(data)); Ok(size) } fn deserialize(reader: &mut Read) -> io::Result { - // TODO error - let string_length = try!(u32::deserialize(reader)) as usize; + let string_length = try!(VInt::deserialize(reader)).val() as usize; let mut result = String::with_capacity(string_length); try!(reader.take(string_length as u64).read_to_string(&mut result)); Ok(result) @@ -120,84 +171,53 @@ impl BinarySerializable for String { #[cfg(test)] mod test { - use core::serialize::BinarySerializable; + fn serialize_test(v: T, num_bytes: usize) { + let mut buffer: Vec = Vec::new(); + assert_eq!(v.serialize(&mut buffer).unwrap(), num_bytes); + assert_eq!(buffer.len(), num_bytes); + let mut cursor = Cursor::new(&buffer[..]); + let deser = T::deserialize(&mut cursor).unwrap(); + assert_eq!(deser, v); + } + use std::io::Cursor; + use super::*; #[test] fn test_serialize_u8() { - let mut buffer: Vec = Vec::new(); - { - let x: u8 = 3; - x.serialize(&mut buffer).unwrap(); - assert_eq!(buffer.len(), 1); - } - { - let x: u8 = 5; - x.serialize(&mut buffer).unwrap(); - assert_eq!(buffer.len(), 2); - } - let mut cursor = Cursor::new(&buffer[..]); - assert_eq!(3, u8::deserialize(&mut cursor).unwrap()); - assert_eq!(5, u8::deserialize(&mut cursor).unwrap()); - assert!(u8::deserialize(&mut cursor).is_err()); + serialize_test(3u8, 1); + serialize_test(5u8, 1); } - #[test] fn test_serialize_u32() { - let mut buffer: Vec = Vec::new(); - { - let x: u32 = 3; - x.serialize(&mut buffer).unwrap(); - assert_eq!(buffer.len(), 4); - } - { - let x: u32 = 5; - x.serialize(&mut buffer).unwrap(); - assert_eq!(buffer.len(), 8); - } - let mut cursor = Cursor::new(&buffer[..]); - assert_eq!(3, u32::deserialize(&mut cursor).unwrap()); - assert_eq!(5, u32::deserialize(&mut cursor).unwrap()); - assert!(u32::deserialize(&mut cursor).is_err()); + serialize_test(3u32, 4); + serialize_test(5u32, 4); + serialize_test(u32::max_value(), 4); } #[test] fn test_serialize_string() { - let mut buffer: Vec = Vec::new(); - let first_length = 4 + 3 * 4; - let second_length = 4 + 3 * 8; - { - let x: String = String::from("ぽよぽよ"); - assert_eq!(x.serialize(&mut buffer).unwrap(), first_length); - assert_eq!(buffer.len(), first_length); - } - { - let x: String = String::from("富士さん見える。"); - assert_eq!(x.serialize(&mut buffer).unwrap(), second_length); - assert_eq!(buffer.len(), first_length + second_length); - } - let mut cursor = Cursor::new(&buffer[..]); - assert_eq!("ぽよぽよ", String::deserialize(&mut cursor).unwrap()); - assert_eq!("富士さん見える。", String::deserialize(&mut cursor).unwrap()); - assert!(u32::deserialize(&mut cursor).is_err()); + serialize_test(String::from(""), 1); + serialize_test(String::from("ぽよぽよ"), 1 + 3*4); + serialize_test(String::from("富士さん見える。"), 1 + 3*8); } #[test] fn test_serialize_vec() { - let mut buffer: Vec = Vec::new(); - let first_length = 4 + 3 * 4; - let second_length = 4 + 3 * 8; - let vec = vec!(String::from("ぽよぽよ"), String::from("富士さん見える。")); - assert_eq!(vec.serialize(&mut buffer).unwrap(), first_length + second_length + 4); - let mut cursor = Cursor::new(&buffer[..]); - { - let deser: Vec = Vec::deserialize(&mut cursor).unwrap(); - assert_eq!(deser.len(), 2); - assert_eq!("ぽよぽよ", deser[0]); - assert_eq!("富士さん見える。", deser[1]); - } + let v: Vec = Vec::new(); + serialize_test(v, 1); + serialize_test(vec!(1u32, 3u32), 1 + 4*2); } - + #[test] + fn test_serialize_vint() { + serialize_test(VInt(7u64), 1); + serialize_test(VInt(127u64), 1); + serialize_test(VInt(128u64), 2); + serialize_test(VInt(1234u64), 2); + serialize_test(VInt(16_383), 2); + serialize_test(VInt(16_384), 3); + serialize_test(VInt(u64::max_value()), 10); + } } From 2c2a3845bdd29822eb2930e964f148b142c7f791 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 30 Apr 2016 17:03:55 +0900 Subject: [PATCH 05/14] blop --- Cargo.toml | 8 ++++++ src/core/postings.rs | 66 ++++---------------------------------------- 2 files changed, 14 insertions(+), 60 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5748dbd3e..f9ecbebb0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,3 +34,11 @@ gcc = "0.3.24" [[bin]] name = "tantivy-merge" path = "src/cli/merge.rs" + +# [profile.release] +# opt-level = 3 +# debug = true +# rpath = false +# lto = false +# debug-assertions = false +# codegen-units = 1 diff --git a/src/core/postings.rs b/src/core/postings.rs index f309db8f8..357a738a1 100644 --- a/src/core/postings.rs +++ b/src/core/postings.rs @@ -65,6 +65,7 @@ struct TermPostingsWriter, term_freqs: TermFreqsRec, positions: PositionsRec, + current_position: u32, current_freq: u32, } @@ -74,13 +75,15 @@ impl TermPostingsWriter< doc_ids: Vec::new(), term_freqs: TermFreqsRec::new(), positions: PositionsRec::new(), - current_freq: 0, + current_position: 0u32, + current_freq: 0u32, } } fn close_doc(&mut self,) { self.term_freqs.record(self.current_freq); self.current_freq = 0; + self.current_position = 0; } fn close(&mut self,) { @@ -108,7 +111,8 @@ impl TermPostingsWriter< self.doc_ids.push(doc); } self.current_freq += 1; - self.positions.record(pos); + self.positions.record(pos - self.current_position); + self.current_position = pos; } } @@ -152,7 +156,6 @@ impl PostingsWriter { for doc in term_postings_writer.doc_ids.iter() { try!(serializer.write_doc(doc.clone(), None)); } - } Ok(()) } @@ -171,55 +174,6 @@ pub trait Postings: Iterator { fn skip_next(&mut self, target: DocId) -> Option; } -// pub struct IntersectionPostings { -// postings: Vec, -// } -// -// impl IntersectionPostings { -// pub fn from_postings(postings: Vec) -> IntersectionPostings { -// IntersectionPostings { -// postings: postings, -// } -// } -// } -// -// impl Iterator for IntersectionPostings { -// type Item = DocId; -// fn next(&mut self,) -> Option { -// let mut candidate; -// match self.postings[0].next() { -// Some(val) => { -// candidate = val; -// }, -// None => { -// return None; -// } -// } -// 'outer: loop { -// for i in 1..self.postings.len() { -// let skip_result = self.postings[i].skip_next(candidate); -// match skip_result { -// None => { -// return None; -// }, -// Some(x) if x == candidate => { -// }, -// Some(greater) => { -// unsafe { -// let pa: *mut T = &mut self.postings[i]; -// let pb: *mut T = &mut self.postings[0]; -// ptr::swap(pa, pb); -// } -// candidate = greater; -// continue 'outer; -// }, -// } -// } -// return Some(candidate); -// } -// -// } -// } pub struct PostingsSerializer { terms_fst_builder: FstMapBuilder, // TODO find an alternative to work around the "move" @@ -276,14 +230,6 @@ impl PostingsSerializer { Ok(()) } - // pub fn add_doc(&mut self, doc_ids: &[DocId]) -> io::Result<()> { - // let docs_data = self.encoder.encode_sorted(doc_ids); - // self.written_bytes_postings += try!((docs_data.len() as u32).serialize(&mut self.postings_write)); - // for num in docs_data { - // self.written_bytes_postings += try!(num.serialize(&mut self.postings_write)); - // } - // Ok(()) - // } pub fn close(mut self,) -> io::Result<()> { try!(self.close_term()); From 26826ac4ea8f4c0ed5b8043f5214cd9db0619ca3 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 30 Apr 2016 18:45:44 +0900 Subject: [PATCH 06/14] Moved postings and directory to a different module --- src/cli/merge.rs | 6 +- .../simdcompression.rs => compression/mod.rs} | 0 src/core/codec.rs | 3 +- src/core/collector.rs | 2 +- src/core/directory.rs | 305 ---------------- src/core/fastfield.rs | 13 +- src/core/index.rs | 4 +- src/core/merger.rs | 10 +- src/core/mod.rs | 4 - src/core/postings.rs | 327 ------------------ src/core/reader.rs | 24 +- src/core/schema.rs | 5 +- src/core/searcher.rs | 6 +- src/core/store.rs | 8 +- src/core/timer.rs | 4 +- src/core/writer.rs | 3 +- src/{core => datastruct}/fstmap.rs | 6 +- src/datastruct/mod.rs | 4 + src/directory/directory.rs | 23 ++ src/directory/mmap_directory.rs | 99 ++++++ src/directory/mod.rs | 113 ++++++ src/directory/ram_directory.rs | 87 +++++ src/lib.rs | 11 +- src/postings/mod.rs | 132 +++++++ src/postings/serializer.rs | 74 ++++ src/postings/term_info.rs | 26 ++ src/postings/writer.rs | 141 ++++++++ 27 files changed, 751 insertions(+), 689 deletions(-) rename src/{core/simdcompression.rs => compression/mod.rs} (100%) delete mode 100644 src/core/directory.rs delete mode 100644 src/core/postings.rs rename src/{core => datastruct}/fstmap.rs (98%) create mode 100644 src/datastruct/mod.rs create mode 100644 src/directory/directory.rs create mode 100644 src/directory/mmap_directory.rs create mode 100644 src/directory/mod.rs create mode 100644 src/directory/ram_directory.rs create mode 100644 src/postings/mod.rs create mode 100644 src/postings/serializer.rs create mode 100644 src/postings/term_info.rs create mode 100644 src/postings/writer.rs diff --git a/src/cli/merge.rs b/src/cli/merge.rs index 5ef43b825..d419967c0 100644 --- a/src/cli/merge.rs +++ b/src/cli/merge.rs @@ -1,13 +1,11 @@ extern crate argparse; extern crate tantivy; -use argparse::{ArgumentParser, StoreTrue, Store}; +use argparse::{ArgumentParser, Store}; use tantivy::Index; use std::path::Path; fn main() { - - let mut verbose = false; let mut directory = String::from("."); { let mut ap = ArgumentParser::new(); @@ -22,5 +20,5 @@ fn main() { let mut index_writer = index.writer().unwrap(); let segments = index.segments(); println!("Merging {} segments", segments.len()); - index_writer.merge(&segments); + index_writer.merge(&segments).unwrap(); } diff --git a/src/core/simdcompression.rs b/src/compression/mod.rs similarity index 100% rename from src/core/simdcompression.rs rename to src/compression/mod.rs diff --git a/src/core/codec.rs b/src/core/codec.rs index 12a2cf4ca..a65cf2e0b 100644 --- a/src/core/codec.rs +++ b/src/core/codec.rs @@ -6,9 +6,10 @@ use core::index::SegmentInfo; use core::index::SegmentComponent; use core::fastfield::FastFieldSerializer; use core::store::StoreWriter; -use core::postings::PostingsSerializer; use core::convert_to_ioerror; +use postings::PostingsSerializer; + pub struct SegmentSerializer { segment: Segment, store_writer: StoreWriter, diff --git a/src/core/collector.rs b/src/core/collector.rs index 63f376919..def40c60b 100644 --- a/src/core/collector.rs +++ b/src/core/collector.rs @@ -1,4 +1,4 @@ -use core::schema::DocId; +use DocId; use core::reader::SegmentReader; use core::searcher::SegmentLocalId; use core::searcher::DocAddress; diff --git a/src/core/directory.rs b/src/core/directory.rs deleted file mode 100644 index d6711f9d6..000000000 --- a/src/core/directory.rs +++ /dev/null @@ -1,305 +0,0 @@ -use std::io::BufWriter; -use std::marker::Send; -use std::marker::Sync; -use std::io; -use std::io::Cursor; -use std::io::Write; -use std::io::Seek; -use std::io::SeekFrom; -use std::fs::File; -use std::fmt; -use std::collections::HashMap; -use std::collections::hash_map::Entry as HashMapEntry; -use fst::raw::MmapReadOnly; -use atomicwrites; -use std::sync::Arc; -use std::sync::RwLock; -use tempdir::TempDir; -use std::ops::Deref; -use std::path::{Path, PathBuf}; - -/////////////////////////////////////////////////////////////// - -pub enum ReadOnlySource { - Mmap(MmapReadOnly), - Anonymous(Vec), -} - -impl Deref for ReadOnlySource { - type Target = [u8]; - - fn deref(&self) -> &[u8] { - self.as_slice() - } -} - - -impl ReadOnlySource { - - pub fn len(&self,) -> usize { - self.as_slice().len() - } - - pub fn as_slice(&self,) -> &[u8] { - match *self { - ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { mmap_read_only.as_slice() }, - ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(), - } - } - - pub fn cursor<'a>(&'a self) -> Cursor<&'a [u8]> { - Cursor::new(&self.deref()) - } - - pub fn slice(&self, from_offset:usize, to_offset:usize) -> ReadOnlySource { - match *self { - ReadOnlySource::Mmap(ref mmap_read_only) => { - let sliced_mmap = mmap_read_only.range(from_offset, to_offset - from_offset); - ReadOnlySource::Mmap(sliced_mmap) - } - ReadOnlySource::Anonymous(ref shared_vec) => { - let sliced_data: Vec = Vec::from(&shared_vec[from_offset..to_offset]); - ReadOnlySource::Anonymous(sliced_data) - }, - } - } -} - - -impl Clone for ReadOnlySource { - fn clone(&self) -> Self { - self.slice(0, self.len()) - } -} - - -pub trait SeekableWrite: Seek + Write {} -impl SeekableWrite for T {} -pub type WritePtr = Box; - -// -// #[derive(Debug)] -// pub enum CreateError { -// RootDirectoryDoesNotExist, -// DirectoryAlreadyExists, -// CannotCreateTempDirectory(io::Error), -// } - -pub trait Directory: fmt::Debug + Send + Sync { - fn open_read(&self, path: &Path) -> io::Result; - fn open_write(&mut self, path: &Path) -> io::Result; - fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>; - fn sync(&self, path: &Path) -> io::Result<()>; - fn sync_directory(&self,) -> io::Result<()>; -} - - -//////////////////////////////////////////////////////////////// -// MmapDirectory - -pub struct MmapDirectory { - root_path: PathBuf, - mmap_cache: RwLock>, - _temp_directory: Option, -} - -impl fmt::Debug for MmapDirectory { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "MmapDirectory({:?})", self.root_path) - } -} - - -impl MmapDirectory { - - pub fn create_from_tempdir() -> io::Result { - // TODO error management - let tempdir = try!(TempDir::new("index")); - let tempdir_path = PathBuf::from(tempdir.path()); - let directory = MmapDirectory { - root_path: PathBuf::from(tempdir_path), - mmap_cache: RwLock::new(HashMap::new()), - _temp_directory: Some(tempdir) - }; - Ok(directory) - } - - pub fn create(filepath: &Path) -> io::Result { - Ok(MmapDirectory { - root_path: PathBuf::from(filepath), - mmap_cache: RwLock::new(HashMap::new()), - _temp_directory: None - }) - } - - fn resolve_path(&self, relative_path: &Path) -> PathBuf { - self.root_path.join(relative_path) - } - - -} - -impl Directory for MmapDirectory { - fn open_read(&self, path: &Path) -> io::Result { - let full_path = self.resolve_path(path); - let mut mmap_cache = self.mmap_cache.write().unwrap(); - let mmap = match mmap_cache.entry(full_path.clone()) { - HashMapEntry::Occupied(e) => e.get().clone(), - HashMapEntry::Vacant(vacant_entry) => { - let new_mmap = try!(MmapReadOnly::open_path(full_path.clone())); - vacant_entry.insert(new_mmap.clone()); - new_mmap - } - }; - Ok(ReadOnlySource::Mmap(mmap)) - } - fn open_write(&mut self, path: &Path) -> io::Result { - let full_path = self.resolve_path(path); - let file = try!(File::create(full_path)); - let buf_writer = BufWriter::new(file); - Ok(Box::new(buf_writer)) - } - - fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> { - let full_path = self.resolve_path(path); - let meta_file = atomicwrites::AtomicFile::new(full_path, atomicwrites::AllowOverwrite); - meta_file.write(|f| { - f.write_all(data) - }) - } - - fn sync(&self, path: &Path) -> io::Result<()> { - let full_path = self.resolve_path(path); - File::open(&full_path).and_then(|fd| fd.sync_all()) - } - - fn sync_directory(&self,) -> io::Result<()> { - File::open(&self.root_path).and_then(|fd| fd.sync_all()) - } -} - - - - -//////////////////////////////////////////////////////////////// -// RAMDirectory - - -#[derive(Clone)] -struct SharedVec(Arc>>>); - - -pub struct RAMDirectory { - fs: HashMap, -} - -impl SharedVec { - fn new() -> SharedVec { - SharedVec(Arc::new( RwLock::new(Cursor::new(Vec::new())) )) - } -} - -impl Write for SharedVec { - fn write(&mut self, buf: &[u8]) -> io::Result { - try!(self.0.write().unwrap().write(buf)); - Ok(buf.len()) - } - fn flush(&mut self) -> io::Result<()> { - Ok(()) - } -} - -impl Seek for SharedVec { - fn seek(&mut self, pos: SeekFrom) -> io::Result { - self.0.write().unwrap().seek(pos) - } -} - -impl fmt::Debug for RAMDirectory { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "RAMDirectory") - } -} - -impl RAMDirectory { - pub fn create() -> RAMDirectory { - RAMDirectory { - fs: HashMap::new() - } - } -} - -impl Directory for RAMDirectory { - fn open_read(&self, path: &Path) -> io::Result { - match self.fs.get(path) { - Some(ref data) => { - let data_copy = (*data).0.read().unwrap().clone(); - Ok(ReadOnlySource::Anonymous(data_copy.into_inner())) - }, - None => - Err(io::Error::new(io::ErrorKind::NotFound, format!("File has never been created. {:?}", path))) - } - } - fn open_write(&mut self, path: &Path) -> io::Result { - let full_path = PathBuf::from(&path); - let data = SharedVec::new(); - self.fs.insert(full_path, data.clone()); - Ok(Box::new(data)) - } - - fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> { - let meta_file = atomicwrites::AtomicFile::new(PathBuf::from(path), atomicwrites::AllowOverwrite); - meta_file.write(|f| { - f.write_all(data) - }) - } - - fn sync(&self, _: &Path) -> io::Result<()> { - Ok(()) - } - - fn sync_directory(&self,) -> io::Result<()> { - Ok(()) - } -} - - -#[cfg(test)] -mod tests { - - use super::*; - use std::path::Path; - - #[test] - fn test_ram_directory() { - let mut ram_directory = RAMDirectory::create(); - test_directory(&mut ram_directory); - } - - #[test] - fn test_mmap_directory() { - let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap(); - test_directory(&mut mmap_directory); - } - - fn test_directory(directory: &mut Directory) { - { - let mut write_file = directory.open_write(Path::new("toto")).unwrap(); - write_file.write_all(&[4]).unwrap(); - write_file.write_all(&[3]).unwrap(); - write_file.write_all(&[7,3,5]).unwrap(); - } - let read_file = directory.open_read(Path::new("toto")).unwrap(); - let data: &[u8] = &*read_file; - assert_eq!(data.len(), 5); - assert_eq!(data[0], 4); - assert_eq!(data[1], 3); - assert_eq!(data[2], 7); - assert_eq!(data[3], 3); - assert_eq!(data[4], 5); - } - - - - -} diff --git a/src/core/fastfield.rs b/src/core/fastfield.rs index 8678a53fb..8daed2604 100644 --- a/src/core/fastfield.rs +++ b/src/core/fastfield.rs @@ -1,12 +1,9 @@ -use std::io::Write; use std::io; -use std::io::SeekFrom; -use std::io::Seek; -use core::directory::WritePtr; +use std::io::{SeekFrom, Seek, Write}; +use directory::{WritePtr, ReadOnlySource}; use core::serialize::BinarySerializable; -use core::directory::ReadOnlySource; use std::collections::HashMap; -use core::schema::DocId; +use DocId; use core::schema::Schema; use core::schema::Document; use std::ops::Deref; @@ -281,10 +278,8 @@ mod tests { use super::U32FastFieldsWriter; use core::schema::U32Field; use std::path::Path; - use core::directory::WritePtr; - use core::directory::Directory; + use directory::{Directory, WritePtr, RAMDirectory}; use core::schema::Document; - use core::directory::RAMDirectory; use core::schema::Schema; use core::schema::FAST_U32; use core::fastfield::FastFieldSerializer; diff --git a/src/core/index.rs b/src/core/index.rs index 19c268fef..6be29a140 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -1,14 +1,14 @@ use std::path::{PathBuf, Path}; use std::io; use core::schema::Schema; -use core::schema::DocId; +use DocId; use std::io::Write; use std::sync::{Arc, RwLock, RwLockWriteGuard, RwLockReadGuard}; use std::fmt; use rustc_serialize::json; use std::io::Read; use std::io::ErrorKind as IOErrorKind; -use core::directory::{Directory, MmapDirectory, RAMDirectory, ReadOnlySource, WritePtr}; +use directory::{Directory, MmapDirectory, RAMDirectory, ReadOnlySource, WritePtr}; use core::writer::IndexWriter; use core::searcher::Searcher; use uuid::Uuid; diff --git a/src/core/merger.rs b/src/core/merger.rs index 882493755..8f7437640 100644 --- a/src/core/merger.rs +++ b/src/core/merger.rs @@ -1,13 +1,15 @@ use std::io; use core::reader::SegmentReader; use core::index::Segment; -use core::schema::DocId; +use DocId; use core::index::SerializableSegment; use core::codec::SegmentSerializer; -use core::postings::PostingsSerializer; -use core::postings::TermInfo; + +use postings::PostingsSerializer; +use postings::TermInfo; + use std::collections::BinaryHeap; -use core::fstmap::FstMapIter; +use datastruct::FstMapIter; use core::schema::Term; use core::schema::Schema; use core::fastfield::FastFieldSerializer; diff --git a/src/core/mod.rs b/src/core/mod.rs index cc1be2737..a374e2b4b 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -1,6 +1,4 @@ -pub mod postings; pub mod schema; -pub mod directory; pub mod writer; pub mod analyzer; pub mod reader; @@ -9,8 +7,6 @@ pub mod searcher; pub mod collector; pub mod serialize; pub mod store; -pub mod simdcompression; -pub mod fstmap; pub mod index; pub mod fastfield; pub mod fastdivide; diff --git a/src/core/postings.rs b/src/core/postings.rs deleted file mode 100644 index 357a738a1..000000000 --- a/src/core/postings.rs +++ /dev/null @@ -1,327 +0,0 @@ -use core::schema::DocId; -use std::ptr; -use std::collections::BTreeMap; -use core::schema::Term; -use core::fstmap::FstMapBuilder; -use core::index::Segment; -use core::directory::WritePtr; -use core::index::SegmentComponent; -use core::simdcompression; -use core::serialize::BinarySerializable; -use std::io::{Read, Write}; -use std::io; - -#[derive(Debug,Ord,PartialOrd,Eq,PartialEq,Clone)] -pub struct TermInfo { - pub doc_freq: u32, - pub postings_offset: u32, -} - -impl BinarySerializable for TermInfo { - fn serialize(&self, writer: &mut Write) -> io::Result { - Ok( - try!(self.doc_freq.serialize(writer)) + - try!(self.postings_offset.serialize(writer)) - ) - } - fn deserialize(reader: &mut Read) -> io::Result { - let doc_freq = try!(u32::deserialize(reader)); - let offset = try!(u32::deserialize(reader)); - Ok(TermInfo { - doc_freq: doc_freq, - postings_offset: offset, - }) - } -} - - -pub trait U32sRecorder { - fn new() -> Self; - fn record(&mut self, val: u32); -} - -pub struct VecRecorder(Vec); - -impl U32sRecorder for VecRecorder { - fn new() -> VecRecorder { - VecRecorder(Vec::new()) - } - fn record(&mut self, val: u32) { - self.0.push(val); - } -} - -pub struct ObliviousRecorder; - -impl U32sRecorder for ObliviousRecorder { - fn new() -> ObliviousRecorder { - ObliviousRecorder - } - fn record(&mut self, val: u32) { - } -} - -struct TermPostingsWriter { - doc_ids: Vec, - term_freqs: TermFreqsRec, - positions: PositionsRec, - current_position: u32, - current_freq: u32, -} - -impl TermPostingsWriter { - pub fn new() -> TermPostingsWriter { - TermPostingsWriter { - doc_ids: Vec::new(), - term_freqs: TermFreqsRec::new(), - positions: PositionsRec::new(), - current_position: 0u32, - current_freq: 0u32, - } - } - - fn close_doc(&mut self,) { - self.term_freqs.record(self.current_freq); - self.current_freq = 0; - self.current_position = 0; - } - - fn close(&mut self,) { - if self.current_freq > 0 { - self.close_doc(); - } - } - - fn is_new_doc(&self, doc: &DocId) -> bool { - match self.doc_ids.last() { - Some(&last_doc) => last_doc != *doc, - None => true, - } - } - - pub fn doc_freq(&self) -> u32 { - self.doc_ids.len() as u32 - } - - pub fn suscribe(&mut self, doc: DocId, pos: u32) { - if self.is_new_doc(&doc) { - // this is the first time we meet this term for this document - // first close the previous document, and write its doc_freq. - self.close_doc(); - self.doc_ids.push(doc); - } - self.current_freq += 1; - self.positions.record(pos - self.current_position); - self.current_position = pos; - } -} - -pub struct PostingsWriter { - postings: Vec>, - term_index: BTreeMap, -} - -impl PostingsWriter { - - pub fn new() -> PostingsWriter { - PostingsWriter { - postings: Vec::new(), - term_index: BTreeMap::new(), - } - } - - pub fn suscribe(&mut self, doc: DocId, pos: u32, term: Term) { - let doc_ids: &mut TermPostingsWriter = self.get_term_postings(term); - doc_ids.suscribe(doc, pos); - } - - fn get_term_postings(&mut self, term: Term) -> &mut TermPostingsWriter { - match self.term_index.get(&term) { - Some(unord_id) => { - return &mut self.postings[*unord_id]; - }, - None => {} - } - let unord_id = self.term_index.len(); - self.postings.push(TermPostingsWriter::new()); - self.term_index.insert(term, unord_id.clone()); - &mut self.postings[unord_id] - } - - pub fn serialize(&self, serializer: &mut PostingsSerializer) -> io::Result<()> { - for (term, postings_id) in self.term_index.iter() { - let term_postings_writer = &self.postings[postings_id.clone()]; - let term_docfreq = term_postings_writer.doc_freq(); - try!(serializer.new_term(&term, term_docfreq)); - for doc in term_postings_writer.doc_ids.iter() { - try!(serializer.write_doc(doc.clone(), None)); - } - } - Ok(()) - } - - -} - - -////////////////////////////////// - -pub trait Postings: Iterator { - // after skipping position - // the iterator in such a way that the - // next call to next() will return a - // value greater or equal to target. - fn skip_next(&mut self, target: DocId) -> Option; -} - - -pub struct PostingsSerializer { - terms_fst_builder: FstMapBuilder, // TODO find an alternative to work around the "move" - postings_write: WritePtr, - positions_write: WritePtr, - written_bytes_postings: usize, - written_bytes_positions: usize, - encoder: simdcompression::Encoder, - doc_ids: Vec, -} - -impl PostingsSerializer { - - pub fn open(segment: &Segment) -> io::Result { - let terms_write = try!(segment.open_write(SegmentComponent::TERMS)); - let terms_fst_builder = try!(FstMapBuilder::new(terms_write)); - let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS)); - let positions_write = try!(segment.open_write(SegmentComponent::POSITIONS)); - Ok(PostingsSerializer { - terms_fst_builder: terms_fst_builder, - postings_write: postings_write, - positions_write: positions_write, - written_bytes_postings: 0, - written_bytes_positions: 0, - encoder: simdcompression::Encoder::new(), - doc_ids: Vec::new(), - }) - } - - pub fn new_term(&mut self, term: &Term, doc_freq: DocId) -> io::Result<()> { - try!(self.close_term()); - self.doc_ids.clear(); - let term_info = TermInfo { - doc_freq: doc_freq, - postings_offset: self.written_bytes_postings as u32, - }; - self.terms_fst_builder - .insert(term.as_slice(), &term_info) - } - - pub fn close_term(&mut self,) -> io::Result<()> { - if !self.doc_ids.is_empty() { - let docs_data = self.encoder.encode_sorted(&self.doc_ids); - self.written_bytes_postings += try!((docs_data.len() as u32).serialize(&mut self.postings_write)); - for num in docs_data { - self.written_bytes_postings += try!(num.serialize(&mut self.postings_write)); - } - } - Ok(()) - } - - pub fn write_doc(&mut self, doc_id: DocId, positions: Option<&[u32]>) -> io::Result<()> { - self.doc_ids.push(doc_id); - Ok(()) - } - - - pub fn close(mut self,) -> io::Result<()> { - try!(self.close_term()); - try!(self.terms_fst_builder.finish()); - try!(self.postings_write.flush()); - Ok(()) - } -} - - - -#[cfg(test)] -mod tests { - - use super::*; - use test::Bencher; - use core::schema::DocId; - - - #[derive(Debug)] - pub struct VecPostings { - doc_ids: Vec, - cursor: usize, - } - - impl VecPostings { - pub fn new(vals: Vec) -> VecPostings { - VecPostings { - doc_ids: vals, - cursor: 0, - } - } - } - - impl Postings for VecPostings { - // after skipping position - // the iterator in such a way that the - // next call to next() will return a - // value greater or equal to target. - fn skip_next(&mut self, target: DocId) -> Option { - loop { - match Iterator::next(self) { - Some(val) if val >= target => { - return Some(val); - }, - None => { - return None; - }, - _ => {} - } - } - } - } - - impl Iterator for VecPostings { - type Item = DocId; - fn next(&mut self,) -> Option { - if self.cursor >= self.doc_ids.len() { - None - } - else { - self.cursor += 1; - Some(self.doc_ids[self.cursor - 1]) - } - } - } - // - // #[test] - // fn test_intersection() { - // { - // let left = VecPostings::new(vec!(1, 3, 9)); - // let right = VecPostings::new(vec!(3, 4, 9, 18)); - // let inter = IntersectionPostings::from_postings(vec!(left, right)); - // let vals: Vec = inter.collect(); - // assert_eq!(vals, vec!(3, 9)); - // } - // { - // let a = VecPostings::new(vec!(1, 3, 9)); - // let b = VecPostings::new(vec!(3, 4, 9, 18)); - // let c = VecPostings::new(vec!(1, 5, 9, 111)); - // let inter = IntersectionPostings::from_postings(vec!(a, b, c)); - // let vals: Vec = inter.collect(); - // assert_eq!(vals, vec!(9)); - // } - // } - // - // #[bench] - // fn bench_single_intersection(b: &mut Bencher) { - // b.iter(|| { - // let docs = VecPostings::new((0..1_000_000).collect()); - // let intersection = IntersectionPostings::from_postings(vec!(docs)); - // intersection.count() - // }); - // } -} diff --git a/src/core/reader.rs b/src/core/reader.rs index 57f1c405f..ef1aedbe1 100644 --- a/src/core/reader.rs +++ b/src/core/reader.rs @@ -2,16 +2,14 @@ use core::index::{Segment, SegmentId}; use core::schema::Term; use core::store::StoreReader; use core::schema::Document; -use core::directory::ReadOnlySource; +use directory::ReadOnlySource; use std::io::Cursor; -use core::schema::DocId; +use DocId; use core::index::SegmentComponent; -use core::simdcompression::Decoder; use std::io; -use std::iter; use std::str; -use core::postings::TermInfo; -use core::fstmap::FstMap; +use postings::TermInfo; +use datastruct::FstMap; use std::fmt; use rustc_serialize::json; use core::index::SegmentInfo; @@ -21,7 +19,7 @@ use core::convert_to_ioerror; use core::serialize::BinarySerializable; use core::fastfield::U32FastFieldsReader; use core::fastfield::U32FastFieldReader; -use core::simdcompression; +use compression; use std::mem; impl fmt::Debug for SegmentReader { @@ -37,13 +35,13 @@ pub fn intersection(mut postings: Vec) -> SegmentPostings { .map(|v| v.len()) .min() .unwrap(); - let mut buffer: Vec = postings.pop().unwrap().0; + let buffer: Vec = postings.pop().unwrap().0; let mut output: Vec = Vec::with_capacity(min_len); unsafe { output.set_len(min_len); } let mut pair = (output, buffer); for posting in postings.iter() { pair = (pair.1, pair.0); - let output_len = simdcompression::intersection(posting.0.as_slice(), pair.0.as_slice(), pair.1.as_mut_slice()); + let output_len = compression::intersection(posting.0.as_slice(), pair.0.as_slice(), pair.1.as_mut_slice()); unsafe { pair.1.set_len(output_len); } } SegmentPostings(pair.1) @@ -77,8 +75,8 @@ impl SegmentPostings { let mut doc_ids: Vec = Vec::with_capacity(doc_freq as usize); unsafe { doc_ids.set_len(doc_freq as usize); } { - let decoder = Decoder::new(); - let num_doc_ids = decoder.decode_sorted(&data_u32[1..(num_u32s+1) as usize], &mut doc_ids); + let decoder = compression::Decoder::new(); + decoder.decode_sorted(&data_u32[1..(num_u32s+1) as usize], &mut doc_ids); SegmentPostings(doc_ids) } } @@ -212,7 +210,7 @@ impl SegmentReader { for term in terms.iter() { match self.get_term(term) { Some(term_info) => { - let decode_one_timer = decode_timer.open("decode_one"); + let _decode_one_timer = decode_timer.open("decode_one"); let segment_posting = self.read_postings(&term_info); segment_postings.push(segment_posting); } @@ -224,7 +222,7 @@ impl SegmentReader { } } { - let mut intersection_time = timer.open("intersection"); + let _intersection_time = timer.open("intersection"); intersection(segment_postings) } } diff --git a/src/core/schema.rs b/src/core/schema.rs index 1a1536a9c..e1457dee8 100644 --- a/src/core/schema.rs +++ b/src/core/schema.rs @@ -3,6 +3,7 @@ use std::collections::HashMap; use std::slice; use std::fmt; use std::io; + use std::io::Read; use core::serialize::BinarySerializable; use rustc_serialize::Decodable; @@ -12,10 +13,6 @@ use rustc_serialize::Encoder; use std::ops::BitOr; use std::borrow::Borrow; -/// u32 identifying a document within a segment. -/// Document gets their doc id assigned incrementally, -/// as they are added in the segment. -pub type DocId = u32; #[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)] pub struct TextOptions { diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 9be6e10f8..a475016a4 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -1,7 +1,7 @@ use core::reader::SegmentReader; use core::index::Index; use core::index::Segment; -use core::schema::DocId; +use DocId; use core::schema::Document; use core::collector::Collector; use std::io; @@ -56,12 +56,12 @@ impl Searcher { for (segment_ord, segment) in self.segments.iter().enumerate() { let mut segment_search_timer = search_timer.open("segment_search"); { - let set_segment_timer = segment_search_timer.open("set_segment"); + let _ = segment_search_timer.open("set_segment"); try!(collector.set_segment(segment_ord as SegmentLocalId, &segment)); } let postings = segment.search(terms, segment_search_timer.open("get_postings")); { - let collection_timer = segment_search_timer.open("collection"); + let _collection_timer = segment_search_timer.open("collection"); for doc_id in postings { collector.collect(doc_id); } diff --git a/src/core/store.rs b/src/core/store.rs index f2ddc1255..71872f240 100644 --- a/src/core/store.rs +++ b/src/core/store.rs @@ -1,10 +1,10 @@ -use core::directory::WritePtr; +use directory::{WritePtr, ReadOnlySource}; use std::cell::RefCell; -use core::schema::DocId; +use DocId; use core::schema::Document; use core::schema::TextFieldValue; use core::serialize::BinarySerializable; -use core::directory::ReadOnlySource; + use std::io::Write; use std::io::Read; use std::io::Cursor; @@ -212,7 +212,7 @@ mod tests { use core::schema::Schema; use core::schema::TextOptions; use core::schema::TextFieldValue; - use core::directory::{RAMDirectory, Directory, MmapDirectory, WritePtr}; + use directory::{RAMDirectory, Directory, MmapDirectory, WritePtr}; fn write_lorem_ipsum_store(writer: WritePtr) -> Schema { let mut schema = Schema::new(); diff --git a/src/core/timer.rs b/src/core/timer.rs index f873ad43c..8e5277dcc 100644 --- a/src/core/timer.rs +++ b/src/core/timer.rs @@ -72,10 +72,10 @@ mod tests { { let mut ab = a.open("b"); { - let abc = ab.open("c"); + let _abc = ab.open("c"); } { - let abd = ab.open("d"); + let _abd = ab.open("d"); } } } diff --git a/src/core/writer.rs b/src/core/writer.rs index 6fd2333e6..3c21979b6 100644 --- a/src/core/writer.rs +++ b/src/core/writer.rs @@ -1,3 +1,4 @@ +use DocId; use core::schema::*; use core::codec::*; use core::index::Index; @@ -6,7 +7,7 @@ use core::index::SerializableSegment; use core::analyzer::StreamingIterator; use core::index::Segment; use core::index::SegmentInfo; -use core::postings::PostingsWriter; +use postings::PostingsWriter; use core::fastfield::U32FastFieldsWriter; use std::clone::Clone; use std::sync::mpsc; diff --git a/src/core/fstmap.rs b/src/datastruct/fstmap.rs similarity index 98% rename from src/core/fstmap.rs rename to src/datastruct/fstmap.rs index f7b9c1ed3..424bda684 100644 --- a/src/core/fstmap.rs +++ b/src/datastruct/fstmap.rs @@ -5,7 +5,8 @@ use std::io::Cursor; use fst; use fst::raw::Fst; use fst::Streamer; -use core::directory::ReadOnlySource; + +use directory::ReadOnlySource; use core::serialize::BinarySerializable; use std::marker::PhantomData; @@ -125,9 +126,8 @@ impl FstMap { #[cfg(test)] mod tests { use super::*; - use core::directory::{RAMDirectory, Directory}; + use directory::{RAMDirectory, Directory}; use std::path::PathBuf; - use fst::Streamer; #[test] diff --git a/src/datastruct/mod.rs b/src/datastruct/mod.rs new file mode 100644 index 000000000..091533637 --- /dev/null +++ b/src/datastruct/mod.rs @@ -0,0 +1,4 @@ +mod fstmap; +pub use self::fstmap::FstMapBuilder; +pub use self::fstmap::FstMap; +pub use self::fstmap::FstMapIter; diff --git a/src/directory/directory.rs b/src/directory/directory.rs new file mode 100644 index 000000000..eca3692ec --- /dev/null +++ b/src/directory/directory.rs @@ -0,0 +1,23 @@ +use std::marker::Send; +use std::marker::Sync; +use std::io; +use std::fmt; +use std::path::Path; +use directory::{ReadOnlySource, WritePtr}; + +/////////////////////////////////////////////////////////////// +// +// #[derive(Debug)] +// pub enum CreateError { +// RootDirectoryDoesNotExist, +// DirectoryAlreadyExists, +// CannotCreateTempDirectory(io::Error), +// } + +pub trait Directory: fmt::Debug + Send + Sync { +fn open_read(&self, path: &Path) -> io::Result; + fn open_write(&mut self, path: &Path) -> io::Result; + fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>; + fn sync(&self, path: &Path) -> io::Result<()>; + fn sync_directory(&self,) -> io::Result<()>; +} diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs new file mode 100644 index 000000000..1187ce31a --- /dev/null +++ b/src/directory/mmap_directory.rs @@ -0,0 +1,99 @@ +use std::path::{Path, PathBuf}; +use tempdir::TempDir; +use std::collections::HashMap; +use std::collections::hash_map::Entry as HashMapEntry; +use fst::raw::MmapReadOnly; +use std::fs::File; +use atomicwrites; +use std::sync::RwLock; +use std::fmt; +use std::io::Write; +use std::io; +use directory::Directory; +use directory::ReadOnlySource; +use directory::WritePtr; +use std::io::BufWriter; + +//////////////////////////////////////////////////////////////// +// MmapDirectory + +pub struct MmapDirectory { + root_path: PathBuf, + mmap_cache: RwLock>, + _temp_directory: Option, +} + +impl fmt::Debug for MmapDirectory { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "MmapDirectory({:?})", self.root_path) + } +} + + +impl MmapDirectory { + + pub fn create_from_tempdir() -> io::Result { + // TODO error management + let tempdir = try!(TempDir::new("index")); + let tempdir_path = PathBuf::from(tempdir.path()); + let directory = MmapDirectory { + root_path: PathBuf::from(tempdir_path), + mmap_cache: RwLock::new(HashMap::new()), + _temp_directory: Some(tempdir) + }; + Ok(directory) + } + + pub fn create(filepath: &Path) -> io::Result { + Ok(MmapDirectory { + root_path: PathBuf::from(filepath), + mmap_cache: RwLock::new(HashMap::new()), + _temp_directory: None + }) + } + + fn resolve_path(&self, relative_path: &Path) -> PathBuf { + self.root_path.join(relative_path) + } + + +} + +impl Directory for MmapDirectory { + fn open_read(&self, path: &Path) -> io::Result { + let full_path = self.resolve_path(path); + let mut mmap_cache = self.mmap_cache.write().unwrap(); + let mmap = match mmap_cache.entry(full_path.clone()) { + HashMapEntry::Occupied(e) => e.get().clone(), + HashMapEntry::Vacant(vacant_entry) => { + let new_mmap = try!(MmapReadOnly::open_path(full_path.clone())); + vacant_entry.insert(new_mmap.clone()); + new_mmap + } + }; + Ok(ReadOnlySource::Mmap(mmap)) + } + fn open_write(&mut self, path: &Path) -> io::Result { + let full_path = self.resolve_path(path); + let file = try!(File::create(full_path)); + let buf_writer = BufWriter::new(file); + Ok(Box::new(buf_writer)) + } + + fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> { + let full_path = self.resolve_path(path); + let meta_file = atomicwrites::AtomicFile::new(full_path, atomicwrites::AllowOverwrite); + meta_file.write(|f| { + f.write_all(data) + }) + } + + fn sync(&self, path: &Path) -> io::Result<()> { + let full_path = self.resolve_path(path); + File::open(&full_path).and_then(|fd| fd.sync_all()) + } + + fn sync_directory(&self,) -> io::Result<()> { + File::open(&self.root_path).and_then(|fd| fd.sync_all()) + } +} diff --git a/src/directory/mod.rs b/src/directory/mod.rs new file mode 100644 index 000000000..80be116fe --- /dev/null +++ b/src/directory/mod.rs @@ -0,0 +1,113 @@ +mod mmap_directory; +mod ram_directory; +mod directory; + +use std::ops::Deref; +use std::io::{Seek, Write, Cursor}; +use fst::raw::MmapReadOnly; + +pub use self::directory::Directory; +pub use self::ram_directory::RAMDirectory; +pub use self::mmap_directory::MmapDirectory; + + +//////////////////////////////////////// +// WritePtr + + +pub trait SeekableWrite: Seek + Write {} +impl SeekableWrite for T {} +pub type WritePtr = Box; + + +//////////////////////////////////////// +// Read only source. + + +pub enum ReadOnlySource { + Mmap(MmapReadOnly), + Anonymous(Vec), +} + +impl Deref for ReadOnlySource { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + self.as_slice() + } +} + +impl ReadOnlySource { + + pub fn len(&self,) -> usize { + self.as_slice().len() + } + + pub fn as_slice(&self,) -> &[u8] { + match *self { + ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { mmap_read_only.as_slice() }, + ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(), + } + } + + pub fn cursor<'a>(&'a self) -> Cursor<&'a [u8]> { + Cursor::new(&self.deref()) + } + + pub fn slice(&self, from_offset:usize, to_offset:usize) -> ReadOnlySource { + match *self { + ReadOnlySource::Mmap(ref mmap_read_only) => { + let sliced_mmap = mmap_read_only.range(from_offset, to_offset - from_offset); + ReadOnlySource::Mmap(sliced_mmap) + } + ReadOnlySource::Anonymous(ref shared_vec) => { + let sliced_data: Vec = Vec::from(&shared_vec[from_offset..to_offset]); + ReadOnlySource::Anonymous(sliced_data) + }, + } + } +} + +impl Clone for ReadOnlySource { + fn clone(&self) -> Self { + self.slice(0, self.len()) + } +} + + +#[cfg(test)] +mod tests { + + use super::*; + use std::path::Path; + + #[test] + fn test_ram_directory() { + let mut ram_directory = RAMDirectory::create(); + test_directory(&mut ram_directory); + } + + #[test] + fn test_mmap_directory() { + let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap(); + test_directory(&mut mmap_directory); + } + + fn test_directory(directory: &mut Directory) { + { + let mut write_file = directory.open_write(Path::new("toto")).unwrap(); + write_file.write_all(&[4]).unwrap(); + write_file.write_all(&[3]).unwrap(); + write_file.write_all(&[7,3,5]).unwrap(); + } + let read_file = directory.open_read(Path::new("toto")).unwrap(); + let data: &[u8] = &*read_file; + assert_eq!(data.len(), 5); + assert_eq!(data[0], 4); + assert_eq!(data[1], 3); + assert_eq!(data[2], 7); + assert_eq!(data[3], 3); + assert_eq!(data[4], 5); + } + +} diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs new file mode 100644 index 000000000..a467f27ce --- /dev/null +++ b/src/directory/ram_directory.rs @@ -0,0 +1,87 @@ +use directory::{Directory, ReadOnlySource}; +use std::io::{Cursor, Write, Seek, SeekFrom}; +use std::io; +use atomicwrites; +use std::fmt; +use std::sync::{Arc, RwLock}; +use std::collections::HashMap; +use std::path::{Path, PathBuf}; +use directory::WritePtr; + +#[derive(Clone)] +struct SharedVec(Arc>>>); + + +pub struct RAMDirectory { + fs: HashMap, +} + +impl SharedVec { + fn new() -> SharedVec { + SharedVec(Arc::new( RwLock::new(Cursor::new(Vec::new())) )) + } +} + +impl Write for SharedVec { + fn write(&mut self, buf: &[u8]) -> io::Result { + try!(self.0.write().unwrap().write(buf)); + Ok(buf.len()) + } + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } +} + +impl Seek for SharedVec { + fn seek(&mut self, pos: SeekFrom) -> io::Result { + self.0.write().unwrap().seek(pos) + } +} + +impl fmt::Debug for RAMDirectory { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "RAMDirectory") + } +} + +impl RAMDirectory { + pub fn create() -> RAMDirectory { + RAMDirectory { + fs: HashMap::new() + } + } +} + +impl Directory for RAMDirectory { + fn open_read(&self, path: &Path) -> io::Result { + match self.fs.get(path) { + Some(ref data) => { + let data_copy = (*data).0.read().unwrap().clone(); + Ok(ReadOnlySource::Anonymous(data_copy.into_inner())) + }, + None => + Err(io::Error::new(io::ErrorKind::NotFound, format!("File has never been created. {:?}", path))) + } + } + fn open_write(&mut self, path: &Path) -> io::Result { + let full_path = PathBuf::from(&path); + let data = SharedVec::new(); + self.fs.insert(full_path, data.clone()); + Ok(Box::new(data)) + } + + fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> { + let meta_file = atomicwrites::AtomicFile::new(PathBuf::from(path), atomicwrites::AllowOverwrite); + meta_file.write(|f| { + f.write_all(data) + }) + } + + fn sync(&self, _: &Path) -> io::Result<()> { + Ok(()) + } + + fn sync_directory(&self,) -> io::Result<()> { + Ok(()) + } +} diff --git a/src/lib.rs b/src/lib.rs index 93735e573..8b7eb3e58 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,19 +28,26 @@ extern crate num_cpus; #[cfg(test)] extern crate rand; mod core; +mod datastruct; +mod postings; +mod directory; +mod compression; +pub use directory::Directory; pub use core::analyzer; -pub use core::directory::Directory; pub use core::searcher::Searcher; pub use core::index::Index; pub use core::schema; pub use core::schema::Term; pub use core::schema::Document; pub use core::collector; -pub use core::schema::DocId; pub use core::reader::SegmentReader; pub use core::searcher::SegmentLocalId; +/// u32 identifying a document within a segment. +/// Document gets their doc id assigned incrementally, +/// as they are added in the segment. +pub type DocId = u32; #[cfg(test)] mod tests { diff --git a/src/postings/mod.rs b/src/postings/mod.rs new file mode 100644 index 000000000..2cfafb5c6 --- /dev/null +++ b/src/postings/mod.rs @@ -0,0 +1,132 @@ +// pub mod postings; +// pub mod schema; +// pub mod directory; +// pub mod writer; +// pub mod analyzer; +// pub mod reader; +// pub mod codec; +// pub mod searcher; +// pub mod collector; +// pub mod serialize; +// pub mod store; +// pub mod simdcompression; +// pub mod fstmap; +// pub mod index; +// pub mod fastfield; +// pub mod fastdivide; +// pub mod merger; +// pub mod timer; + +// use std::error; +// use std::io; + +// pub fn convert_to_ioerror(err: E) -> io::Error { +// io::Error::new( +// io::ErrorKind::InvalidData, +// err +// ) +// } + +mod serializer; +mod writer; +mod term_info; + +use DocId; +pub use self::serializer::PostingsSerializer; +pub use self::writer::PostingsWriter; +pub use self::term_info::TermInfo; + +pub trait Postings: Iterator { + // after skipping position + // the iterator in such a way that the + // next call to next() will return a + // value greater or equal to target. + fn skip_next(&mut self, target: DocId) -> Option; +} + + +#[cfg(test)] +mod tests { + + use super::*; + use DocId; + + + #[derive(Debug)] + pub struct VecPostings { + doc_ids: Vec, + cursor: usize, + } + + impl VecPostings { + pub fn new(vals: Vec) -> VecPostings { + VecPostings { + doc_ids: vals, + cursor: 0, + } + } + } + + impl Postings for VecPostings { + // after skipping position + // the iterator in such a way that the + // next call to next() will return a + // value greater or equal to target. + fn skip_next(&mut self, target: DocId) -> Option { + loop { + match Iterator::next(self) { + Some(val) if val >= target => { + return Some(val); + }, + None => { + return None; + }, + _ => {} + } + } + } + } + + impl Iterator for VecPostings { + type Item = DocId; + fn next(&mut self,) -> Option { + if self.cursor >= self.doc_ids.len() { + None + } + else { + self.cursor += 1; + Some(self.doc_ids[self.cursor - 1]) + } + } + } + + + // use test::Bencher; + // #[test] + // fn test_intersection() { + // { + // let left = VecPostings::new(vec!(1, 3, 9)); + // let right = VecPostings::new(vec!(3, 4, 9, 18)); + // let inter = IntersectionPostings::from_postings(vec!(left, right)); + // let vals: Vec = inter.collect(); + // assert_eq!(vals, vec!(3, 9)); + // } + // { + // let a = VecPostings::new(vec!(1, 3, 9)); + // let b = VecPostings::new(vec!(3, 4, 9, 18)); + // let c = VecPostings::new(vec!(1, 5, 9, 111)); + // let inter = IntersectionPostings::from_postings(vec!(a, b, c)); + // let vals: Vec = inter.collect(); + // assert_eq!(vals, vec!(9)); + // } + // } + // + // #[bench] + // fn bench_single_intersection(b: &mut Bencher) { + // b.iter(|| { + // let docs = VecPostings::new((0..1_000_000).collect()); + // let intersection = IntersectionPostings::from_postings(vec!(docs)); + // intersection.count() + // }); + // } +} diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs new file mode 100644 index 000000000..ba5e0a0d4 --- /dev/null +++ b/src/postings/serializer.rs @@ -0,0 +1,74 @@ +use datastruct::FstMapBuilder; +use super::TermInfo; +use core::schema::Term; +use directory::WritePtr; +use compression; +use DocId; +use core::index::Segment; +use std::io; +use core::index::SegmentComponent; +use core::serialize::BinarySerializable; + +pub struct PostingsSerializer { + terms_fst_builder: FstMapBuilder, // TODO find an alternative to work around the "move" + postings_write: WritePtr, + positions_write: WritePtr, + written_bytes_postings: usize, + written_bytes_positions: usize, + encoder: compression::Encoder, + doc_ids: Vec, +} + +impl PostingsSerializer { + + pub fn open(segment: &Segment) -> io::Result { + let terms_write = try!(segment.open_write(SegmentComponent::TERMS)); + let terms_fst_builder = try!(FstMapBuilder::new(terms_write)); + let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS)); + let positions_write = try!(segment.open_write(SegmentComponent::POSITIONS)); + Ok(PostingsSerializer { + terms_fst_builder: terms_fst_builder, + postings_write: postings_write, + positions_write: positions_write, + written_bytes_postings: 0, + written_bytes_positions: 0, + encoder: compression::Encoder::new(), + doc_ids: Vec::new(), + }) + } + + pub fn new_term(&mut self, term: &Term, doc_freq: DocId) -> io::Result<()> { + try!(self.close_term()); + self.doc_ids.clear(); + let term_info = TermInfo { + doc_freq: doc_freq, + postings_offset: self.written_bytes_postings as u32, + }; + self.terms_fst_builder + .insert(term.as_slice(), &term_info) + } + + pub fn close_term(&mut self,) -> io::Result<()> { + if !self.doc_ids.is_empty() { + let docs_data = self.encoder.encode_sorted(&self.doc_ids); + self.written_bytes_postings += try!((docs_data.len() as u32).serialize(&mut self.postings_write)); + for num in docs_data { + self.written_bytes_postings += try!(num.serialize(&mut self.postings_write)); + } + } + Ok(()) + } + + pub fn write_doc(&mut self, doc_id: DocId, positions: Option<&[u32]>) -> io::Result<()> { + self.doc_ids.push(doc_id); + Ok(()) + } + + + pub fn close(mut self,) -> io::Result<()> { + try!(self.close_term()); + try!(self.terms_fst_builder.finish()); + try!(self.postings_write.flush()); + Ok(()) + } +} diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs new file mode 100644 index 000000000..8dc0728f6 --- /dev/null +++ b/src/postings/term_info.rs @@ -0,0 +1,26 @@ +use core::serialize::BinarySerializable; +use std::io; + +#[derive(Debug,Ord,PartialOrd,Eq,PartialEq,Clone)] +pub struct TermInfo { + pub doc_freq: u32, + pub postings_offset: u32, +} + + +impl BinarySerializable for TermInfo { + fn serialize(&self, writer: &mut io::Write) -> io::Result { + Ok( + try!(self.doc_freq.serialize(writer)) + + try!(self.postings_offset.serialize(writer)) + ) + } + fn deserialize(reader: &mut io::Read) -> io::Result { + let doc_freq = try!(u32::deserialize(reader)); + let offset = try!(u32::deserialize(reader)); + Ok(TermInfo { + doc_freq: doc_freq, + postings_offset: offset, + }) + } +} diff --git a/src/postings/writer.rs b/src/postings/writer.rs new file mode 100644 index 000000000..6a16c3282 --- /dev/null +++ b/src/postings/writer.rs @@ -0,0 +1,141 @@ +use DocId; +use std::collections::BTreeMap; +use core::schema::Term; +use postings::PostingsSerializer; +use std::io; + +pub trait U32sRecorder { + fn new() -> Self; + fn record(&mut self, val: u32); +} + +pub struct VecRecorder(Vec); + +impl U32sRecorder for VecRecorder { + fn new() -> VecRecorder { + VecRecorder(Vec::new()) + } + fn record(&mut self, val: u32) { + self.0.push(val); + } +} + +pub struct ObliviousRecorder; + +impl U32sRecorder for ObliviousRecorder { + fn new() -> ObliviousRecorder { + ObliviousRecorder + } + fn record(&mut self, _: u32) { + } +} + + + + +struct TermPostingsWriter { + doc_ids: Vec, + term_freqs: TermFreqsRec, + positions: PositionsRec, + current_position: u32, + current_freq: u32, +} + +impl TermPostingsWriter { + pub fn new() -> TermPostingsWriter { + TermPostingsWriter { + doc_ids: Vec::new(), + term_freqs: TermFreqsRec::new(), + positions: PositionsRec::new(), + current_position: 0u32, + current_freq: 0u32, + } + } + + fn close_doc(&mut self,) { + self.term_freqs.record(self.current_freq); + self.current_freq = 0; + self.current_position = 0; + } + + fn close(&mut self,) { + if self.current_freq > 0 { + self.close_doc(); + } + } + + fn is_new_doc(&self, doc: &DocId) -> bool { + match self.doc_ids.last() { + Some(&last_doc) => last_doc != *doc, + None => true, + } + } + + pub fn doc_freq(&self) -> u32 { + self.doc_ids.len() as u32 + } + + pub fn suscribe(&mut self, doc: DocId, pos: u32) { + if self.is_new_doc(&doc) { + // this is the first time we meet this term for this document + // first close the previous document, and write its doc_freq. + self.close_doc(); + self.doc_ids.push(doc); + } + self.current_freq += 1; + self.positions.record(pos - self.current_position); + self.current_position = pos; + } +} + + + + + + +pub struct PostingsWriter { + postings: Vec>, + term_index: BTreeMap, +} + +impl PostingsWriter { + + pub fn new() -> PostingsWriter { + PostingsWriter { + postings: Vec::new(), + term_index: BTreeMap::new(), + } + } + + pub fn suscribe(&mut self, doc: DocId, pos: u32, term: Term) { + let doc_ids: &mut TermPostingsWriter = self.get_term_postings(term); + doc_ids.suscribe(doc, pos); + } + + fn get_term_postings(&mut self, term: Term) -> &mut TermPostingsWriter { + match self.term_index.get(&term) { + Some(unord_id) => { + return &mut self.postings[*unord_id]; + }, + None => {} + } + let unord_id = self.term_index.len(); + self.postings.push(TermPostingsWriter::new()); + self.term_index.insert(term, unord_id.clone()); + &mut self.postings[unord_id] + } + + pub fn serialize(&self, serializer: &mut PostingsSerializer) -> io::Result<()> { + for (term, postings_id) in self.term_index.iter() { + let term_postings_writer = &self.postings[postings_id.clone()]; + let term_docfreq = term_postings_writer.doc_freq(); + try!(serializer.new_term(&term, term_docfreq)); + for doc in term_postings_writer.doc_ids.iter() { + try!(serializer.write_doc(doc.clone(), None)); + } + } + Ok(()) + } + + +} From 6f148653ff7ce2b3c74b5cefdf9e52aadb6d6d47 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 30 Apr 2016 19:18:54 +0900 Subject: [PATCH 07/14] moved fastfield to its own mod. --- src/core/codec.rs | 2 +- src/core/collector.rs | 2 +- src/core/merger.rs | 2 +- src/core/mod.rs | 2 - src/core/reader.rs | 3 +- src/core/writer.rs | 2 +- src/{core => fastfield}/fastdivide.rs | 1 + src/{core/fastfield.rs => fastfield/mod.rs} | 284 +------------------- src/fastfield/reader.rs | 111 ++++++++ src/fastfield/serializer.rs | 87 ++++++ src/fastfield/writer.rs | 76 ++++++ src/lib.rs | 1 + 12 files changed, 295 insertions(+), 278 deletions(-) rename src/{core => fastfield}/fastdivide.rs (99%) rename src/{core/fastfield.rs => fastfield/mod.rs} (52%) create mode 100644 src/fastfield/reader.rs create mode 100644 src/fastfield/serializer.rs create mode 100644 src/fastfield/writer.rs diff --git a/src/core/codec.rs b/src/core/codec.rs index a65cf2e0b..fcddad101 100644 --- a/src/core/codec.rs +++ b/src/core/codec.rs @@ -4,7 +4,7 @@ use rustc_serialize::json; use core::index::Segment; use core::index::SegmentInfo; use core::index::SegmentComponent; -use core::fastfield::FastFieldSerializer; +use fastfield::FastFieldSerializer; use core::store::StoreWriter; use core::convert_to_ioerror; diff --git a/src/core/collector.rs b/src/core/collector.rs index def40c60b..5067b222a 100644 --- a/src/core/collector.rs +++ b/src/core/collector.rs @@ -2,7 +2,7 @@ use DocId; use core::reader::SegmentReader; use core::searcher::SegmentLocalId; use core::searcher::DocAddress; -use core::fastfield::U32FastFieldReader; +use fastfield::U32FastFieldReader; use core::schema::U32Field; use std::io; diff --git a/src/core/merger.rs b/src/core/merger.rs index 8f7437640..f719cbe8c 100644 --- a/src/core/merger.rs +++ b/src/core/merger.rs @@ -12,7 +12,7 @@ use std::collections::BinaryHeap; use datastruct::FstMapIter; use core::schema::Term; use core::schema::Schema; -use core::fastfield::FastFieldSerializer; +use fastfield::FastFieldSerializer; use core::store::StoreWriter; use core::index::SegmentInfo; use std::cmp::Ordering; diff --git a/src/core/mod.rs b/src/core/mod.rs index a374e2b4b..d1f5d8152 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -8,8 +8,6 @@ pub mod collector; pub mod serialize; pub mod store; pub mod index; -pub mod fastfield; -pub mod fastdivide; pub mod merger; pub mod timer; diff --git a/src/core/reader.rs b/src/core/reader.rs index ef1aedbe1..0e06036fc 100644 --- a/src/core/reader.rs +++ b/src/core/reader.rs @@ -17,8 +17,7 @@ use core::timer::OpenTimer; use core::schema::U32Field; use core::convert_to_ioerror; use core::serialize::BinarySerializable; -use core::fastfield::U32FastFieldsReader; -use core::fastfield::U32FastFieldReader; +use fastfield::{U32FastFieldsReader, U32FastFieldReader}; use compression; use std::mem; diff --git a/src/core/writer.rs b/src/core/writer.rs index 3c21979b6..96923cc37 100644 --- a/src/core/writer.rs +++ b/src/core/writer.rs @@ -8,7 +8,7 @@ use core::analyzer::StreamingIterator; use core::index::Segment; use core::index::SegmentInfo; use postings::PostingsWriter; -use core::fastfield::U32FastFieldsWriter; +use fastfield::U32FastFieldsWriter; use std::clone::Clone; use std::sync::mpsc; use std::thread; diff --git a/src/core/fastdivide.rs b/src/fastfield/fastdivide.rs similarity index 99% rename from src/core/fastdivide.rs rename to src/fastfield/fastdivide.rs index 7e0d06620..b568a3142 100644 --- a/src/core/fastdivide.rs +++ b/src/fastfield/fastdivide.rs @@ -5,6 +5,7 @@ use std::num::Wrapping; // ported from libdivide.h by ridiculous_fish + const LIBDIVIDE_32_SHIFT_MASK: u8 = 0x1F; const LIBDIVIDE_ADD_MARKER: u8 = 0x40; const LIBDIVIDE_U32_SHIFT_PATH: u8 = 0x80; diff --git a/src/core/fastfield.rs b/src/fastfield/mod.rs similarity index 52% rename from src/core/fastfield.rs rename to src/fastfield/mod.rs index 8daed2604..33f1d5788 100644 --- a/src/core/fastfield.rs +++ b/src/fastfield/mod.rs @@ -1,295 +1,39 @@ -use std::io; -use std::io::{SeekFrom, Seek, Write}; -use directory::{WritePtr, ReadOnlySource}; -use core::serialize::BinarySerializable; -use std::collections::HashMap; -use DocId; -use core::schema::Schema; -use core::schema::Document; -use std::ops::Deref; -use core::fastdivide::count_leading_zeros; -use core::fastdivide::DividerU32; -use core::schema::U32Field; +mod fastdivide; +mod reader; +mod writer; +mod serializer; -pub fn compute_num_bits(amplitude: u32) -> u8 { + +pub use self::fastdivide::DividerU32; +pub use self::writer::{U32FastFieldsWriter, U32FastFieldWriter}; +pub use self::reader::{U32FastFieldsReader, U32FastFieldReader}; +pub use self::serializer::FastFieldSerializer; + +use self::fastdivide::count_leading_zeros; + +fn compute_num_bits(amplitude: u32) -> u8 { 32u8 - count_leading_zeros(amplitude) } -pub struct FastFieldSerializer { - write: WritePtr, - written_size: usize, - fields: Vec<(U32Field, u32)>, - num_bits: u8, - - min_value: u32, - - field_open: bool, - mini_buffer_written: usize, - mini_buffer: u64, -} - -impl FastFieldSerializer { - pub fn new(mut write: WritePtr) -> io::Result { - // just making room for the pointer to header. - let written_size: usize = try!(0u32.serialize(&mut write)); - Ok(FastFieldSerializer { - write: write, - written_size: written_size, - fields: Vec::new(), - num_bits: 0u8, - field_open: false, - mini_buffer_written: 0, - mini_buffer: 0, - min_value: 0, - }) - } - - pub fn new_u32_fast_field(&mut self, field: U32Field, min_value: u32, max_value: u32) -> io::Result<()> { - if self.field_open { - return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed")); - } - self.min_value = min_value; - self.field_open = true; - self.fields.push((field, self.written_size as u32)); - let write: &mut Write = &mut self.write; - self.written_size += try!(min_value.serialize(write)); - let amplitude = max_value - min_value; - self.written_size += try!(amplitude.serialize(write)); - self.num_bits = compute_num_bits(amplitude); - Ok(()) - } - - pub fn add_val(&mut self, val: u32) -> io::Result<()> { - let write: &mut Write = &mut self.write; - if self.mini_buffer_written + (self.num_bits as usize) > 64 { - self.written_size += try!(self.mini_buffer.serialize(write)); - self.mini_buffer = 0; - self.mini_buffer_written = 0; - } - self.mini_buffer |= ((val - self.min_value) as u64) << self.mini_buffer_written; - self.mini_buffer_written += self.num_bits as usize; - Ok(()) - } - - pub fn close_field(&mut self,) -> io::Result<()> { - if !self.field_open { - return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed")); - } - self.field_open = false; - if self.mini_buffer_written > 0 { - self.mini_buffer_written = 0; - self.written_size += try!(self.mini_buffer.serialize(&mut self.write)); - } - self.mini_buffer = 0; - Ok(()) - } - - pub fn close(mut self,) -> io::Result { - if self.field_open { - return Err(io::Error::new(io::ErrorKind::Other, "Last field not closed")); - } - let header_offset: usize = self.written_size; - self.written_size += try!(self.fields.serialize(&mut self.write)); - try!(self.write.seek(SeekFrom::Start(0))); - try!((header_offset as u32).serialize(&mut self.write)); - Ok(self.written_size) - } -} - -pub struct U32FastFieldsWriter { - field_writers: Vec, -} - -impl U32FastFieldsWriter { - - pub fn from_schema(schema: &Schema) -> U32FastFieldsWriter { - let u32_fields: Vec = schema.get_u32_fields() - .iter() - .enumerate() - .filter(|&(_, field_entry)| field_entry.option.is_fast()) - .map(|(field_id, _)| U32Field(field_id as u8)) - .collect(); - U32FastFieldsWriter::new(u32_fields) - } - - pub fn new(fields: Vec) -> U32FastFieldsWriter { - U32FastFieldsWriter { - field_writers: fields - .iter() - .map(|field| U32FastFieldWriter::new(&field)) - .collect(), - } - } - - pub fn add_document(&mut self, doc: &Document) { - for field_writer in self.field_writers.iter_mut() { - field_writer.add_document(doc); - } - } - - pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> { - for field_writer in self.field_writers.iter() { - try!(field_writer.serialize(serializer)); - } - Ok(()) - } -} - -pub struct U32FastFieldWriter { - field: U32Field, - vals: Vec, -} - -impl U32FastFieldWriter { - pub fn new(field: &U32Field) -> U32FastFieldWriter { - U32FastFieldWriter { - field: field.clone(), - vals: Vec::new(), - } - } - - pub fn add_val(&mut self, val: u32) { - self.vals.push(val); - } - - pub fn add_document(&mut self, doc: &Document) { - let val = doc.get_u32(&self.field).unwrap_or(0u32); - self.add_val(val); - } - - pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> { - let zero = 0; - let min = self.vals.iter().min().unwrap_or(&zero).clone(); - let max = self.vals.iter().max().unwrap_or(&min).clone(); - try!(serializer.new_u32_fast_field(self.field.clone(), min, max)); - for val in self.vals.iter() { - try!(serializer.add_val(val.clone())); - } - serializer.close_field() - } -} - -pub struct U32FastFieldReader { - _data: ReadOnlySource, - data_ptr: *const u64, - min_val: u32, - max_val: u32, - num_bits: u8, - mask: u32, - num_in_pack: u32, - divider: DividerU32, -} - -impl U32FastFieldReader { - - pub fn min_val(&self,) -> u32 { - self.min_val - } - - pub fn max_val(&self,) -> u32 { - self.max_val - } - - pub fn open(data: ReadOnlySource) -> io::Result { - let min_val; - let amplitude; - { - let mut cursor = data.cursor(); - min_val = try!(u32::deserialize(&mut cursor)); - amplitude = try!(u32::deserialize(&mut cursor)); - } - let num_bits = compute_num_bits(amplitude); - let mask = (1 << num_bits) - 1; - let num_in_pack = 64u32 / (num_bits as u32); - let ptr: *const u8 = &(data.deref()[8 as usize]); - Ok(U32FastFieldReader { - _data: data, - data_ptr: ptr as *const u64, - min_val: min_val, - max_val: min_val + amplitude, - num_bits: num_bits, - mask: mask, - num_in_pack: num_in_pack, - divider: DividerU32::divide_by(num_in_pack), - }) - } - - pub fn get(&self, doc: DocId) -> u32 { - let long_addr = self.divider.divide(doc); - let ord_within_long = doc - long_addr * self.num_in_pack; - let bit_shift = (self.num_bits as u32) * ord_within_long; - let val_unshifted_unmasked: u64 = unsafe { *self.data_ptr.offset(long_addr as isize) }; - let val_shifted = (val_unshifted_unmasked >> bit_shift) as u32; - return self.min_val + (val_shifted & self.mask); - } -} - -pub struct U32FastFieldsReader { - source: ReadOnlySource, - field_offsets: HashMap, -} - -impl U32FastFieldsReader { - pub fn open(source: ReadOnlySource) -> io::Result { - let header_offset; - let field_offsets: Vec<(U32Field, u32)>; - { - let mut cursor = source.cursor(); - header_offset = try!(u32::deserialize(&mut cursor)); - try!(cursor.seek(SeekFrom::Start(header_offset as u64))); - field_offsets = try!(Vec::deserialize(&mut cursor)); - } - let mut end_offsets: Vec = field_offsets - .iter() - .map(|&(_, offset)| offset.clone()) - .collect(); - end_offsets.push(header_offset); - let mut field_offsets_map: HashMap = HashMap::new(); - for (field_start_offsets, stop_offset) in field_offsets.iter().zip(end_offsets.iter().skip(1)) { - let (field, start_offset) = field_start_offsets.clone(); - field_offsets_map.insert(field.clone(), (start_offset.clone(), stop_offset.clone())); - } - Ok(U32FastFieldsReader { - field_offsets: field_offsets_map, - source: source, - }) - } - - pub fn get_field(&self, field: &U32Field) -> io::Result { - match self.field_offsets.get(field) { - Some(&(start, stop)) => { - let field_source = self.source.slice(start as usize, stop as usize); - U32FastFieldReader::open(field_source) - } - None => { - Err(io::Error::new(io::ErrorKind::InvalidInput, "Could not find field, has it been set as a fast field?")) - } - - } - - } -} - #[cfg(test)] mod tests { use super::compute_num_bits; use super::U32FastFieldsReader; use super::U32FastFieldsWriter; + use super::FastFieldSerializer; use core::schema::U32Field; use std::path::Path; use directory::{Directory, WritePtr, RAMDirectory}; use core::schema::Document; use core::schema::Schema; use core::schema::FAST_U32; - use core::fastfield::FastFieldSerializer; use test::Bencher; use test; use rand::Rng; use rand::SeedableRng; use rand::XorShiftRng; - #[test] fn test_compute_num_bits() { assert_eq!(compute_num_bits(1), 1u8); diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs new file mode 100644 index 000000000..38a34834b --- /dev/null +++ b/src/fastfield/reader.rs @@ -0,0 +1,111 @@ +use std::io; +use directory::ReadOnlySource; +use fastfield::DividerU32; +use core::serialize::BinarySerializable; +use DocId; +use std::collections::HashMap; +use core::schema::U32Field; +use std::io::{SeekFrom, Seek}; +use std::ops::Deref; +use super::compute_num_bits; + +pub struct U32FastFieldReader { + _data: ReadOnlySource, + data_ptr: *const u64, + min_val: u32, + max_val: u32, + num_bits: u8, + mask: u32, + num_in_pack: u32, + divider: DividerU32, +} + +impl U32FastFieldReader { + + pub fn min_val(&self,) -> u32 { + self.min_val + } + + pub fn max_val(&self,) -> u32 { + self.max_val + } + + pub fn open(data: ReadOnlySource) -> io::Result { + let min_val; + let amplitude; + { + let mut cursor = data.cursor(); + min_val = try!(u32::deserialize(&mut cursor)); + amplitude = try!(u32::deserialize(&mut cursor)); + } + let num_bits = compute_num_bits(amplitude); + let mask = (1 << num_bits) - 1; + let num_in_pack = 64u32 / (num_bits as u32); + let ptr: *const u8 = &(data.deref()[8 as usize]); + Ok(U32FastFieldReader { + _data: data, + data_ptr: ptr as *const u64, + min_val: min_val, + max_val: min_val + amplitude, + num_bits: num_bits, + mask: mask, + num_in_pack: num_in_pack, + divider: DividerU32::divide_by(num_in_pack), + }) + } + + pub fn get(&self, doc: DocId) -> u32 { + let long_addr = self.divider.divide(doc); + let ord_within_long = doc - long_addr * self.num_in_pack; + let bit_shift = (self.num_bits as u32) * ord_within_long; + let val_unshifted_unmasked: u64 = unsafe { *self.data_ptr.offset(long_addr as isize) }; + let val_shifted = (val_unshifted_unmasked >> bit_shift) as u32; + return self.min_val + (val_shifted & self.mask); + } +} + +pub struct U32FastFieldsReader { + source: ReadOnlySource, + field_offsets: HashMap, +} + +impl U32FastFieldsReader { + pub fn open(source: ReadOnlySource) -> io::Result { + let header_offset; + let field_offsets: Vec<(U32Field, u32)>; + { + let mut cursor = source.cursor(); + header_offset = try!(u32::deserialize(&mut cursor)); + try!(cursor.seek(SeekFrom::Start(header_offset as u64))); + field_offsets = try!(Vec::deserialize(&mut cursor)); + } + let mut end_offsets: Vec = field_offsets + .iter() + .map(|&(_, offset)| offset.clone()) + .collect(); + end_offsets.push(header_offset); + let mut field_offsets_map: HashMap = HashMap::new(); + for (field_start_offsets, stop_offset) in field_offsets.iter().zip(end_offsets.iter().skip(1)) { + let (field, start_offset) = field_start_offsets.clone(); + field_offsets_map.insert(field.clone(), (start_offset.clone(), stop_offset.clone())); + } + Ok(U32FastFieldsReader { + field_offsets: field_offsets_map, + source: source, + }) + } + + pub fn get_field(&self, field: &U32Field) -> io::Result { + match self.field_offsets.get(field) { + Some(&(start, stop)) => { + let field_source = self.source.slice(start as usize, stop as usize); + U32FastFieldReader::open(field_source) + } + None => { + Err(io::Error::new(io::ErrorKind::InvalidInput, "Could not find field, has it been set as a fast field?")) + } + + } + + } +} diff --git a/src/fastfield/serializer.rs b/src/fastfield/serializer.rs new file mode 100644 index 000000000..7167630af --- /dev/null +++ b/src/fastfield/serializer.rs @@ -0,0 +1,87 @@ +use core::serialize::BinarySerializable; +use directory::WritePtr; +use core::schema::U32Field; +use std::io; +use std::io::{SeekFrom, Write}; +use super::compute_num_bits; + +pub struct FastFieldSerializer { + write: WritePtr, + written_size: usize, + fields: Vec<(U32Field, u32)>, + num_bits: u8, + + min_value: u32, + + field_open: bool, + mini_buffer_written: usize, + mini_buffer: u64, +} + +impl FastFieldSerializer { + pub fn new(mut write: WritePtr) -> io::Result { + // just making room for the pointer to header. + let written_size: usize = try!(0u32.serialize(&mut write)); + Ok(FastFieldSerializer { + write: write, + written_size: written_size, + fields: Vec::new(), + num_bits: 0u8, + field_open: false, + mini_buffer_written: 0, + mini_buffer: 0, + min_value: 0, + }) + } + + pub fn new_u32_fast_field(&mut self, field: U32Field, min_value: u32, max_value: u32) -> io::Result<()> { + if self.field_open { + return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed")); + } + self.min_value = min_value; + self.field_open = true; + self.fields.push((field, self.written_size as u32)); + let write: &mut Write = &mut self.write; + self.written_size += try!(min_value.serialize(write)); + let amplitude = max_value - min_value; + self.written_size += try!(amplitude.serialize(write)); + self.num_bits = compute_num_bits(amplitude); + Ok(()) + } + + pub fn add_val(&mut self, val: u32) -> io::Result<()> { + let write: &mut Write = &mut self.write; + if self.mini_buffer_written + (self.num_bits as usize) > 64 { + self.written_size += try!(self.mini_buffer.serialize(write)); + self.mini_buffer = 0; + self.mini_buffer_written = 0; + } + self.mini_buffer |= ((val - self.min_value) as u64) << self.mini_buffer_written; + self.mini_buffer_written += self.num_bits as usize; + Ok(()) + } + + pub fn close_field(&mut self,) -> io::Result<()> { + if !self.field_open { + return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed")); + } + self.field_open = false; + if self.mini_buffer_written > 0 { + self.mini_buffer_written = 0; + self.written_size += try!(self.mini_buffer.serialize(&mut self.write)); + } + self.mini_buffer = 0; + Ok(()) + } + + pub fn close(mut self,) -> io::Result { + if self.field_open { + return Err(io::Error::new(io::ErrorKind::Other, "Last field not closed")); + } + let header_offset: usize = self.written_size; + self.written_size += try!(self.fields.serialize(&mut self.write)); + try!(self.write.seek(SeekFrom::Start(0))); + try!((header_offset as u32).serialize(&mut self.write)); + Ok(self.written_size) + } +} diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs new file mode 100644 index 000000000..b7e0e0633 --- /dev/null +++ b/src/fastfield/writer.rs @@ -0,0 +1,76 @@ +use core::schema::{Schema, U32Field, Document}; +use fastfield::FastFieldSerializer; +use std::io; + +pub struct U32FastFieldsWriter { + field_writers: Vec, +} + +impl U32FastFieldsWriter { + + pub fn from_schema(schema: &Schema) -> U32FastFieldsWriter { + let u32_fields: Vec = schema.get_u32_fields() + .iter() + .enumerate() + .filter(|&(_, field_entry)| field_entry.option.is_fast()) + .map(|(field_id, _)| U32Field(field_id as u8)) + .collect(); + U32FastFieldsWriter::new(u32_fields) + } + + pub fn new(fields: Vec) -> U32FastFieldsWriter { + U32FastFieldsWriter { + field_writers: fields + .iter() + .map(|field| U32FastFieldWriter::new(&field)) + .collect(), + } + } + + pub fn add_document(&mut self, doc: &Document) { + for field_writer in self.field_writers.iter_mut() { + field_writer.add_document(doc); + } + } + + pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> { + for field_writer in self.field_writers.iter() { + try!(field_writer.serialize(serializer)); + } + Ok(()) + } +} + +pub struct U32FastFieldWriter { + field: U32Field, + vals: Vec, +} + +impl U32FastFieldWriter { + pub fn new(field: &U32Field) -> U32FastFieldWriter { + U32FastFieldWriter { + field: field.clone(), + vals: Vec::new(), + } + } + + pub fn add_val(&mut self, val: u32) { + self.vals.push(val); + } + + pub fn add_document(&mut self, doc: &Document) { + let val = doc.get_u32(&self.field).unwrap_or(0u32); + self.add_val(val); + } + + pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> { + let zero = 0; + let min = self.vals.iter().min().unwrap_or(&zero).clone(); + let max = self.vals.iter().max().unwrap_or(&min).clone(); + try!(serializer.new_u32_fast_field(self.field.clone(), min, max)); + for val in self.vals.iter() { + try!(serializer.add_val(val.clone())); + } + serializer.close_field() + } +} diff --git a/src/lib.rs b/src/lib.rs index 8b7eb3e58..5831e8c63 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -32,6 +32,7 @@ mod datastruct; mod postings; mod directory; mod compression; +mod fastfield; pub use directory::Directory; pub use core::analyzer; From c9810e0b801d28efe2a9cc1ecd6fbe89f357c18e Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 30 Apr 2016 21:20:21 +0900 Subject: [PATCH 08/14] isolated store --- src/core/codec.rs | 2 +- src/core/merger.rs | 2 +- src/core/mod.rs | 1 - src/core/reader.rs | 2 +- src/core/store.rs | 286 -------------------------------------------- src/lib.rs | 1 + src/store/mod.rs | 91 ++++++++++++++ src/store/reader.rs | 96 +++++++++++++++ src/store/writer.rs | 112 +++++++++++++++++ 9 files changed, 303 insertions(+), 290 deletions(-) delete mode 100644 src/core/store.rs create mode 100644 src/store/mod.rs create mode 100644 src/store/reader.rs create mode 100644 src/store/writer.rs diff --git a/src/core/codec.rs b/src/core/codec.rs index fcddad101..06ad5c86f 100644 --- a/src/core/codec.rs +++ b/src/core/codec.rs @@ -5,7 +5,7 @@ use core::index::Segment; use core::index::SegmentInfo; use core::index::SegmentComponent; use fastfield::FastFieldSerializer; -use core::store::StoreWriter; +use store::StoreWriter; use core::convert_to_ioerror; use postings::PostingsSerializer; diff --git a/src/core/merger.rs b/src/core/merger.rs index f719cbe8c..7ca0041db 100644 --- a/src/core/merger.rs +++ b/src/core/merger.rs @@ -13,7 +13,7 @@ use datastruct::FstMapIter; use core::schema::Term; use core::schema::Schema; use fastfield::FastFieldSerializer; -use core::store::StoreWriter; +use store::StoreWriter; use core::index::SegmentInfo; use std::cmp::Ordering; use core::schema::U32Field; diff --git a/src/core/mod.rs b/src/core/mod.rs index d1f5d8152..c8512e9a0 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -6,7 +6,6 @@ pub mod codec; pub mod searcher; pub mod collector; pub mod serialize; -pub mod store; pub mod index; pub mod merger; pub mod timer; diff --git a/src/core/reader.rs b/src/core/reader.rs index 0e06036fc..7e69fe127 100644 --- a/src/core/reader.rs +++ b/src/core/reader.rs @@ -1,6 +1,6 @@ use core::index::{Segment, SegmentId}; use core::schema::Term; -use core::store::StoreReader; +use store::StoreReader; use core::schema::Document; use directory::ReadOnlySource; use std::io::Cursor; diff --git a/src/core/store.rs b/src/core/store.rs deleted file mode 100644 index 71872f240..000000000 --- a/src/core/store.rs +++ /dev/null @@ -1,286 +0,0 @@ -use directory::{WritePtr, ReadOnlySource}; -use std::cell::RefCell; -use DocId; -use core::schema::Document; -use core::schema::TextFieldValue; -use core::serialize::BinarySerializable; - -use std::io::Write; -use std::io::Read; -use std::io::Cursor; -use std::io; -use std::io::SeekFrom; -use std::io::Seek; -use std::cmp::Ordering; -use lz4; - -// TODO cache uncompressed pages - -const BLOCK_SIZE: usize = 131_072; - -#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd)] -struct OffsetIndex(DocId, u64); - -pub struct StoreWriter { - doc: DocId, - offsets: Vec, // TODO have a better index. - written: u64, - writer: WritePtr, - intermediary_buffer: Vec, - current_block: Vec, -} - -impl BinarySerializable for OffsetIndex { - fn serialize(&self, writer: &mut Write) -> io::Result { - let OffsetIndex(a, b) = *self; - Ok(try!(a.serialize(writer)) + try!(b.serialize(writer))) - } - fn deserialize(reader: &mut Read) -> io::Result { - let a = try!(DocId::deserialize(reader)); - let b = try!(u64::deserialize(reader)); - Ok(OffsetIndex(a, b)) - } -} - -impl StoreWriter { - - pub fn new(writer: WritePtr) -> StoreWriter { - StoreWriter { - doc: 0, - written: 0, - offsets: Vec::new(), - writer: writer, - intermediary_buffer: Vec::new(), - current_block: Vec::new(), - } - } - - pub fn stack_reader(&mut self, reader: &StoreReader) -> io::Result<()> { - if self.current_block.len() > 0 { - try!(self.write_and_compress_block()); - } - match reader.offsets.last() { - Some(&OffsetIndex(ref num_docs, ref body_size)) => { - try!(self.writer.write_all(&reader.data.as_slice()[0..*body_size as usize])); - for &OffsetIndex(doc, offset) in reader.offsets.iter() { - self.offsets.push(OffsetIndex(self.doc + doc, self.written + offset)); - } - self.written += *body_size; - self.doc += *num_docs; - Ok(()) - }, - None => { - Err(io::Error::new(io::ErrorKind::Other, "No offset for reader")) - } - } - } - - pub fn store<'a>(&mut self, field_values: &Vec<&'a TextFieldValue>) -> io::Result<()> { - self.intermediary_buffer.clear(); - try!((field_values.len() as u32).serialize(&mut self.intermediary_buffer)); - for field_value in field_values.iter() { - try!((*field_value).serialize(&mut self.intermediary_buffer)); - } - try!((self.intermediary_buffer.len() as u32).serialize(&mut self.current_block)); - try!(self.current_block.write_all(&self.intermediary_buffer[..])); - self.doc += 1; - if self.current_block.len() > BLOCK_SIZE { - try!(self.write_and_compress_block()); - } - Ok(()) - } - - fn write_and_compress_block(&mut self,) -> io::Result<()> { - self.intermediary_buffer.clear(); - { - let mut encoder = lz4::EncoderBuilder::new() - .build(&mut self.intermediary_buffer) - .unwrap(); - try!(encoder.write_all(&self.current_block)); - let (_, encoder_result) = encoder.finish(); - try!(encoder_result); - } - let compressed_block_size = self.intermediary_buffer.len() as u64; - self.written += try!((compressed_block_size as u32).serialize(&mut self.writer)) as u64; - try!(self.writer.write_all(&self.intermediary_buffer)); - self.written += compressed_block_size; - self.offsets.push(OffsetIndex(self.doc, self.written)); - self.current_block.clear(); - Ok(()) - } - - pub fn close(&mut self,) -> io::Result<()> { - if self.current_block.len() > 0 { - try!(self.write_and_compress_block()); - } - let header_offset: u64 = self.written; - try!(self.offsets.serialize(&mut self.writer)); - try!(header_offset.serialize(&mut self.writer)); - self.writer.flush() - } - -} - - -pub struct StoreReader { - data: ReadOnlySource, - offsets: Vec, - current_block: RefCell>, -} - -impl StoreReader { - - fn read_header(data: &ReadOnlySource) -> Vec { - // TODO err - // the first offset is implicitely (0, 0) - let mut offsets = vec!(OffsetIndex(0, 0)); - let mut cursor = Cursor::new(data.as_slice()); - cursor.seek(SeekFrom::End(-8)).unwrap(); - let offset = u64::deserialize(&mut cursor).unwrap(); - cursor.seek(SeekFrom::Start(offset)).unwrap(); - offsets.append(&mut Vec::deserialize(&mut cursor).unwrap()); - offsets - } - - fn block_offset(&self, seek: &DocId) -> OffsetIndex { - fn search(offsets: &[OffsetIndex], seek: &DocId) -> OffsetIndex { - let m = offsets.len() / 2; - let pivot_offset = &offsets[m]; - if offsets.len() <= 1 { - return pivot_offset.clone() - } - match pivot_offset.0.cmp(seek) { - Ordering::Less => search(&offsets[m..], seek), - Ordering::Equal => pivot_offset.clone(), - Ordering::Greater => search(&offsets[..m], seek), - } - } - search(&self.offsets, seek) - } - - fn read_block(&self, block_offset: usize) -> io::Result<()> { - let mut current_block_mut = self.current_block.borrow_mut(); - current_block_mut.clear(); - let total_buffer = self.data.as_slice(); - let mut cursor = Cursor::new(&total_buffer[block_offset..]); - let block_length = u32::deserialize(&mut cursor).unwrap(); - let block_array: &[u8] = &total_buffer[(block_offset + 4 as usize)..(block_offset + 4 + block_length as usize)]; - let mut lz4_decoder = lz4::Decoder::new(Cursor::new(block_array)).unwrap(); - lz4_decoder.read_to_end(&mut current_block_mut).map(|_| ()) - } - - pub fn get(&self, doc_id: &DocId) -> io::Result { - let OffsetIndex(first_doc_id, block_offset) = self.block_offset(doc_id); - try!(self.read_block(block_offset as usize)); - let mut current_block_mut = self.current_block.borrow_mut(); - let mut cursor = Cursor::new(&mut current_block_mut[..]); - for _ in first_doc_id..*doc_id { - let block_length = try!(u32::deserialize(&mut cursor)); - try!(cursor.seek(SeekFrom::Current(block_length as i64))); - } - try!(u32::deserialize(&mut cursor)); - let mut text_field_values = Vec::new(); - let num_fields = try!(u32::deserialize(&mut cursor)); - for _ in 0..num_fields { - let text_field_value = try!(TextFieldValue::deserialize(&mut cursor)); - text_field_values.push(text_field_value); - } - let u32_field_values = Vec::new(); - Ok(Document { - text_field_values: text_field_values, - u32_field_values: u32_field_values, - }) - } - - pub fn new(data: ReadOnlySource) -> StoreReader { - let offsets = StoreReader::read_header(&data); - StoreReader { - data: data, - offsets: offsets, - current_block: RefCell::new(Vec::new()), - } - } -} - - -#[cfg(test)] -mod tests { - - use super::*; - use test::Bencher; - use std::path::PathBuf; - use core::schema::Schema; - use core::schema::TextOptions; - use core::schema::TextFieldValue; - use directory::{RAMDirectory, Directory, MmapDirectory, WritePtr}; - - fn write_lorem_ipsum_store(writer: WritePtr) -> Schema { - let mut schema = Schema::new(); - let field_body = schema.add_text_field("body", &TextOptions::new().set_stored()); - let field_title = schema.add_text_field("title", &TextOptions::new().set_stored()); - let lorem = String::from("Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."); - { - let mut store_writer = StoreWriter::new(writer); - for i in 0..1000 { - let mut fields: Vec = Vec::new(); - { - let field_value = TextFieldValue { - field: field_body.clone(), - text: lorem.clone(), - }; - fields.push(field_value); - } - { - let title_text = format!("Doc {}", i); - let field_value = TextFieldValue { - field: field_title.clone(), - text: title_text, - }; - fields.push(field_value); - } - let fields_refs: Vec<&TextFieldValue> = fields.iter().collect(); - store_writer.store(&fields_refs).unwrap(); - } - store_writer.close().unwrap(); - } - schema - } - - - #[test] - fn test_store() { - let path = PathBuf::from("store"); - let mut directory = RAMDirectory::create(); - let store_file = directory.open_write(&path).unwrap(); - let schema = write_lorem_ipsum_store(store_file); - let field_title = schema.text_field("title"); - let store_source = directory.open_read(&path).unwrap(); - let store = StoreReader::new(store_source); - for i in (0..10).map(|i| i * 3 / 2) { - assert_eq!(*store.get(&i).unwrap().get_first_text(&field_title).unwrap(), format!("Doc {}", i)); - } - } - - #[bench] - fn bench_store_encode(b: &mut Bencher) { - let mut directory = MmapDirectory::create_from_tempdir().unwrap(); - let path = PathBuf::from("store"); - b.iter(|| { - write_lorem_ipsum_store(directory.open_write(&path).unwrap()); - }); - } - - - #[bench] - fn bench_store_decode(b: &mut Bencher) { - let mut directory = MmapDirectory::create_from_tempdir().unwrap(); - let path = PathBuf::from("store"); - write_lorem_ipsum_store(directory.open_write(&path).unwrap()); - let store_source = directory.open_read(&path).unwrap(); - let store = StoreReader::new(store_source); - b.iter(|| { - store.get(&12).unwrap(); - }); - - } -} diff --git a/src/lib.rs b/src/lib.rs index 5831e8c63..d6b0f618b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -33,6 +33,7 @@ mod postings; mod directory; mod compression; mod fastfield; +mod store; pub use directory::Directory; pub use core::analyzer; diff --git a/src/store/mod.rs b/src/store/mod.rs new file mode 100644 index 000000000..e7f6c7ae2 --- /dev/null +++ b/src/store/mod.rs @@ -0,0 +1,91 @@ +mod reader; +mod writer; + +use DocId; +pub use self::reader::StoreReader; +pub use self::writer::StoreWriter; + +#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd)] +pub struct OffsetIndex(DocId, u64); + +#[cfg(test)] +mod tests { + + use super::*; + use test::Bencher; + use std::path::PathBuf; + use core::schema::Schema; + use core::schema::TextOptions; + use core::schema::TextFieldValue; + use directory::{RAMDirectory, Directory, MmapDirectory, WritePtr}; + + fn write_lorem_ipsum_store(writer: WritePtr) -> Schema { + let mut schema = Schema::new(); + let field_body = schema.add_text_field("body", &TextOptions::new().set_stored()); + let field_title = schema.add_text_field("title", &TextOptions::new().set_stored()); + let lorem = String::from("Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."); + { + let mut store_writer = StoreWriter::new(writer); + for i in 0..1000 { + let mut fields: Vec = Vec::new(); + { + let field_value = TextFieldValue { + field: field_body.clone(), + text: lorem.clone(), + }; + fields.push(field_value); + } + { + let title_text = format!("Doc {}", i); + let field_value = TextFieldValue { + field: field_title.clone(), + text: title_text, + }; + fields.push(field_value); + } + let fields_refs: Vec<&TextFieldValue> = fields.iter().collect(); + store_writer.store(&fields_refs).unwrap(); + } + store_writer.close().unwrap(); + } + schema + } + + + #[test] + fn test_store() { + let path = PathBuf::from("store"); + let mut directory = RAMDirectory::create(); + let store_file = directory.open_write(&path).unwrap(); + let schema = write_lorem_ipsum_store(store_file); + let field_title = schema.text_field("title"); + let store_source = directory.open_read(&path).unwrap(); + let store = StoreReader::new(store_source); + for i in (0..10).map(|i| i * 3 / 2) { + assert_eq!(*store.get(&i).unwrap().get_first_text(&field_title).unwrap(), format!("Doc {}", i)); + } + } + + #[bench] + fn bench_store_encode(b: &mut Bencher) { + let mut directory = MmapDirectory::create_from_tempdir().unwrap(); + let path = PathBuf::from("store"); + b.iter(|| { + write_lorem_ipsum_store(directory.open_write(&path).unwrap()); + }); + } + + + #[bench] + fn bench_store_decode(b: &mut Bencher) { + let mut directory = MmapDirectory::create_from_tempdir().unwrap(); + let path = PathBuf::from("store"); + write_lorem_ipsum_store(directory.open_write(&path).unwrap()); + let store_source = directory.open_read(&path).unwrap(); + let store = StoreReader::new(store_source); + b.iter(|| { + store.get(&12).unwrap(); + }); + + } +} diff --git a/src/store/reader.rs b/src/store/reader.rs new file mode 100644 index 000000000..dd6d56288 --- /dev/null +++ b/src/store/reader.rs @@ -0,0 +1,96 @@ +use directory::ReadOnlySource; +use std::cell::RefCell; +use DocId; +use core::schema::Document; +use core::schema::TextFieldValue; +use core::serialize::BinarySerializable; + +use std::io::Read; +use std::io::Cursor; +use std::io; +use std::io::SeekFrom; +use std::io::Seek; +use std::cmp::Ordering; +use lz4; + +use super::OffsetIndex; + +pub struct StoreReader { + pub data: ReadOnlySource, + pub offsets: Vec, + current_block: RefCell>, +} + +impl StoreReader { + + fn read_header(data: &ReadOnlySource) -> Vec { + // TODO err + // the first offset is implicitely (0, 0) + let mut offsets = vec!(OffsetIndex(0, 0)); + let mut cursor = Cursor::new(data.as_slice()); + cursor.seek(SeekFrom::End(-8)).unwrap(); + let offset = u64::deserialize(&mut cursor).unwrap(); + cursor.seek(SeekFrom::Start(offset)).unwrap(); + offsets.append(&mut Vec::deserialize(&mut cursor).unwrap()); + offsets + } + + fn block_offset(&self, seek: &DocId) -> OffsetIndex { + fn search(offsets: &[OffsetIndex], seek: &DocId) -> OffsetIndex { + let m = offsets.len() / 2; + let pivot_offset = &offsets[m]; + if offsets.len() <= 1 { + return pivot_offset.clone() + } + match pivot_offset.0.cmp(seek) { + Ordering::Less => search(&offsets[m..], seek), + Ordering::Equal => pivot_offset.clone(), + Ordering::Greater => search(&offsets[..m], seek), + } + } + search(&self.offsets, seek) + } + + fn read_block(&self, block_offset: usize) -> io::Result<()> { + let mut current_block_mut = self.current_block.borrow_mut(); + current_block_mut.clear(); + let total_buffer = self.data.as_slice(); + let mut cursor = Cursor::new(&total_buffer[block_offset..]); + let block_length = u32::deserialize(&mut cursor).unwrap(); + let block_array: &[u8] = &total_buffer[(block_offset + 4 as usize)..(block_offset + 4 + block_length as usize)]; + let mut lz4_decoder = lz4::Decoder::new(Cursor::new(block_array)).unwrap(); + lz4_decoder.read_to_end(&mut current_block_mut).map(|_| ()) + } + + pub fn get(&self, doc_id: &DocId) -> io::Result { + let OffsetIndex(first_doc_id, block_offset) = self.block_offset(doc_id); + try!(self.read_block(block_offset as usize)); + let mut current_block_mut = self.current_block.borrow_mut(); + let mut cursor = Cursor::new(&mut current_block_mut[..]); + for _ in first_doc_id..*doc_id { + let block_length = try!(u32::deserialize(&mut cursor)); + try!(cursor.seek(SeekFrom::Current(block_length as i64))); + } + try!(u32::deserialize(&mut cursor)); + let mut text_field_values = Vec::new(); + let num_fields = try!(u32::deserialize(&mut cursor)); + for _ in 0..num_fields { + let text_field_value = try!(TextFieldValue::deserialize(&mut cursor)); + text_field_values.push(text_field_value); + } + let u32_field_values = Vec::new(); + Ok(Document { + text_field_values: text_field_values, + u32_field_values: u32_field_values, + }) + } + + pub fn new(data: ReadOnlySource) -> StoreReader { + let offsets = StoreReader::read_header(&data); + StoreReader { + data: data, + offsets: offsets, + current_block: RefCell::new(Vec::new()), + } + } +} diff --git a/src/store/writer.rs b/src/store/writer.rs new file mode 100644 index 000000000..d6d637261 --- /dev/null +++ b/src/store/writer.rs @@ -0,0 +1,112 @@ +use directory::WritePtr; +use DocId; +use core::schema::TextFieldValue; +use core::serialize::BinarySerializable; +use std::io::Write; +use std::io::Read; +use std::io; +use lz4; +use super::StoreReader; +use super::OffsetIndex; + +const BLOCK_SIZE: usize = 131_072; + +pub struct StoreWriter { + doc: DocId, + offsets: Vec, // TODO have a better index. + written: u64, + writer: WritePtr, + intermediary_buffer: Vec, + current_block: Vec, +} + +impl BinarySerializable for OffsetIndex { + fn serialize(&self, writer: &mut Write) -> io::Result { + let OffsetIndex(a, b) = *self; + Ok(try!(a.serialize(writer)) + try!(b.serialize(writer))) + } + fn deserialize(reader: &mut Read) -> io::Result { + let a = try!(DocId::deserialize(reader)); + let b = try!(u64::deserialize(reader)); + Ok(OffsetIndex(a, b)) + } +} + +impl StoreWriter { + + pub fn new(writer: WritePtr) -> StoreWriter { + StoreWriter { + doc: 0, + written: 0, + offsets: Vec::new(), + writer: writer, + intermediary_buffer: Vec::new(), + current_block: Vec::new(), + } + } + + pub fn stack_reader(&mut self, reader: &StoreReader) -> io::Result<()> { + if self.current_block.len() > 0 { + try!(self.write_and_compress_block()); + } + match reader.offsets.last() { + Some(&OffsetIndex(ref num_docs, ref body_size)) => { + try!(self.writer.write_all(&reader.data.as_slice()[0..*body_size as usize])); + for &OffsetIndex(doc, offset) in reader.offsets.iter() { + self.offsets.push(OffsetIndex(self.doc + doc, self.written + offset)); + } + self.written += *body_size; + self.doc += *num_docs; + Ok(()) + }, + None => { + Err(io::Error::new(io::ErrorKind::Other, "No offset for reader")) + } + } + } + + pub fn store<'a>(&mut self, field_values: &Vec<&'a TextFieldValue>) -> io::Result<()> { + self.intermediary_buffer.clear(); + try!((field_values.len() as u32).serialize(&mut self.intermediary_buffer)); + for field_value in field_values.iter() { + try!((*field_value).serialize(&mut self.intermediary_buffer)); + } + try!((self.intermediary_buffer.len() as u32).serialize(&mut self.current_block)); + try!(self.current_block.write_all(&self.intermediary_buffer[..])); + self.doc += 1; + if self.current_block.len() > BLOCK_SIZE { + try!(self.write_and_compress_block()); + } + Ok(()) + } + + fn write_and_compress_block(&mut self,) -> io::Result<()> { + self.intermediary_buffer.clear(); + { + let mut encoder = lz4::EncoderBuilder::new() + .build(&mut self.intermediary_buffer) + .unwrap(); + try!(encoder.write_all(&self.current_block)); + let (_, encoder_result) = encoder.finish(); + try!(encoder_result); + } + let compressed_block_size = self.intermediary_buffer.len() as u64; + self.written += try!((compressed_block_size as u32).serialize(&mut self.writer)) as u64; + try!(self.writer.write_all(&self.intermediary_buffer)); + self.written += compressed_block_size; + self.offsets.push(OffsetIndex(self.doc, self.written)); + self.current_block.clear(); + Ok(()) + } + + pub fn close(&mut self,) -> io::Result<()> { + if self.current_block.len() > 0 { + try!(self.write_and_compress_block()); + } + let header_offset: u64 = self.written; + try!(self.offsets.serialize(&mut self.writer)); + try!(header_offset.serialize(&mut self.writer)); + self.writer.flush() + } + +} From 99cc5447208505cd5e9eed08af8c4989980010ee Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 1 May 2016 09:54:51 +0900 Subject: [PATCH 09/14] moved schema in a different mod --- src/core/collector.rs | 2 +- src/core/index.rs | 2 +- src/core/merger.rs | 14 +- src/core/mod.rs | 1 - src/core/reader.rs | 6 +- src/core/schema.rs | 553 ------------------------------------ src/core/searcher.rs | 3 +- src/core/writer.rs | 5 +- src/fastfield/mod.rs | 8 +- src/fastfield/reader.rs | 2 +- src/fastfield/serializer.rs | 2 +- src/fastfield/writer.rs | 2 +- src/lib.rs | 6 +- src/postings/serializer.rs | 2 +- src/postings/writer.rs | 2 +- src/schema/document.rs | 97 +++++++ src/schema/mod.rs | 21 ++ src/schema/schema.rs | 180 ++++++++++++ src/schema/term.rs | 54 ++++ src/schema/text_field.rs | 167 +++++++++++ src/schema/u32_field.rs | 76 +++++ src/store/mod.rs | 6 +- src/store/reader.rs | 4 +- src/store/writer.rs | 2 +- 24 files changed, 628 insertions(+), 589 deletions(-) delete mode 100644 src/core/schema.rs create mode 100644 src/schema/document.rs create mode 100644 src/schema/mod.rs create mode 100644 src/schema/schema.rs create mode 100644 src/schema/term.rs create mode 100644 src/schema/text_field.rs create mode 100644 src/schema/u32_field.rs diff --git a/src/core/collector.rs b/src/core/collector.rs index 5067b222a..d9dfc3960 100644 --- a/src/core/collector.rs +++ b/src/core/collector.rs @@ -3,7 +3,7 @@ use core::reader::SegmentReader; use core::searcher::SegmentLocalId; use core::searcher::DocAddress; use fastfield::U32FastFieldReader; -use core::schema::U32Field; +use schema::U32Field; use std::io; pub trait Collector { diff --git a/src/core/index.rs b/src/core/index.rs index 6be29a140..d5a88e420 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -1,6 +1,6 @@ use std::path::{PathBuf, Path}; use std::io; -use core::schema::Schema; +use schema::Schema; use DocId; use std::io::Write; use std::sync::{Arc, RwLock, RwLockWriteGuard, RwLockReadGuard}; diff --git a/src/core/merger.rs b/src/core/merger.rs index 7ca0041db..4d7456510 100644 --- a/src/core/merger.rs +++ b/src/core/merger.rs @@ -10,15 +10,11 @@ use postings::TermInfo; use std::collections::BinaryHeap; use datastruct::FstMapIter; -use core::schema::Term; -use core::schema::Schema; +use schema::{Term, Schema, U32Field}; use fastfield::FastFieldSerializer; use store::StoreWriter; use core::index::SegmentInfo; -use std::cmp::Ordering; -use core::schema::U32Field; -use std::cmp::min; -use std::cmp::max; +use std::cmp::{min, max, Ordering}; struct PostingsMerger<'a> { doc_ids: Vec, @@ -214,10 +210,10 @@ impl SerializableSegment for IndexMerger { #[cfg(test)] mod tests { - use core::schema; - use core::schema::Document; + use schema; + use schema::Document; + use schema::Term; use core::index::Index; - use core::schema::Term; use core::searcher::DocAddress; use core::collector::FastFieldTestCollector; use core::collector::TestCollector; diff --git a/src/core/mod.rs b/src/core/mod.rs index c8512e9a0..f6661d3f6 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -1,4 +1,3 @@ -pub mod schema; pub mod writer; pub mod analyzer; pub mod reader; diff --git a/src/core/reader.rs b/src/core/reader.rs index 7e69fe127..410631453 100644 --- a/src/core/reader.rs +++ b/src/core/reader.rs @@ -1,7 +1,7 @@ use core::index::{Segment, SegmentId}; -use core::schema::Term; +use schema::Term; use store::StoreReader; -use core::schema::Document; +use schema::Document; use directory::ReadOnlySource; use std::io::Cursor; use DocId; @@ -14,7 +14,7 @@ use std::fmt; use rustc_serialize::json; use core::index::SegmentInfo; use core::timer::OpenTimer; -use core::schema::U32Field; +use schema::U32Field; use core::convert_to_ioerror; use core::serialize::BinarySerializable; use fastfield::{U32FastFieldsReader, U32FastFieldReader}; diff --git a/src/core/schema.rs b/src/core/schema.rs deleted file mode 100644 index e1457dee8..000000000 --- a/src/core/schema.rs +++ /dev/null @@ -1,553 +0,0 @@ -use std::io::Write; -use std::collections::HashMap; -use std::slice; -use std::fmt; -use std::io; - -use std::io::Read; -use core::serialize::BinarySerializable; -use rustc_serialize::Decodable; -use rustc_serialize::Encodable; -use rustc_serialize::Decoder; -use rustc_serialize::Encoder; -use std::ops::BitOr; -use std::borrow::Borrow; - - -#[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)] -pub struct TextOptions { - tokenized_indexed: bool, - stored: bool, - fast: bool, -} - -#[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)] -pub struct U32Options { - indexed: bool, - fast: bool, - stored: bool, -} - -/// The field will be tokenized and indexed -pub const TEXT: TextOptions = TextOptions { - tokenized_indexed: true, - stored: false, - fast: false, -}; - -/// The field will be tokenized and indexed -pub const FAST_U32: U32Options = U32Options { - indexed: false, - stored: false, - fast: true, -}; - -/// A stored fields of a document can be retrieved given its DocId. -/// Stored field are stored together and LZ4 compressed. -/// Reading the stored fields of a document is relatively slow. -/// (100 microsecs) -pub const STORED: TextOptions = TextOptions { - tokenized_indexed: false, - stored: true, - fast: false, -}; - -/// Fast field are used for field you need to access many times during -/// collection. (e.g: for sort, aggregates). -pub const FAST: TextOptions = TextOptions { - tokenized_indexed: false, - stored: false, - fast: true -}; - -impl BitOr for TextOptions { - - type Output = TextOptions; - - fn bitor(self, other: TextOptions) -> TextOptions { - let mut res = TextOptions::new(); - res.tokenized_indexed = self.tokenized_indexed || other.tokenized_indexed; - res.stored = self.stored || other.stored; - res.fast = self.fast || other.fast; - res - } -} - -/// Field handle -#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)] -pub struct U32Field(pub u8); - -/// Field handle -#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)] -pub struct TextField(pub u8); - - -impl U32Options { - - pub fn new() -> U32Options { - U32Options { - fast: false, - indexed: false, - stored: false, - } - } - - pub fn is_indexed(&self,) -> bool { - self.indexed - } - - pub fn set_indexed(mut self,) -> U32Options { - self.indexed = true; - self - } - - pub fn is_fast(&self,) -> bool { - self.fast - } - - pub fn set_fast(mut self,) -> U32Options { - self.fast = true; - self - } -} - -impl TextOptions { - pub fn is_tokenized_indexed(&self,) -> bool { - self.tokenized_indexed - } - - pub fn is_stored(&self,) -> bool { - self.stored - } - - pub fn is_fast(&self,) -> bool { - self.fast - } - - pub fn set_stored(mut self,) -> TextOptions { - self.stored = true; - self - } - - pub fn set_fast(mut self,) -> TextOptions { - self.fast = true; - self - } - - pub fn set_tokenized_indexed(mut self,) -> TextOptions { - self.tokenized_indexed = true; - self - } - - pub fn new() -> TextOptions { - TextOptions { - fast: false, - tokenized_indexed: false, - stored: false, - } - } -} - -#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)] -pub struct U32FieldValue { - pub field: U32Field, - pub value: u32, -} - -#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)] -pub struct TextFieldValue { - pub field: TextField, - pub text: String, -} - -impl BinarySerializable for TextField { - fn serialize(&self, writer: &mut Write) -> io::Result { - let TextField(field_id) = *self; - field_id.serialize(writer) - } - - fn deserialize(reader: &mut Read) -> io::Result { - u8::deserialize(reader).map(TextField) - } -} - -impl BinarySerializable for U32Field { - fn serialize(&self, writer: &mut Write) -> io::Result { - let U32Field(field_id) = *self; - field_id.serialize(writer) - } - - fn deserialize(reader: &mut Read) -> io::Result { - u8::deserialize(reader).map(U32Field) - } -} - -impl BinarySerializable for TextFieldValue { - fn serialize(&self, writer: &mut Write) -> io::Result { - Ok( - try!(self.field.serialize(writer)) + - try!(self.text.serialize(writer)) - ) - } - fn deserialize(reader: &mut Read) -> io::Result { - let field = try!(TextField::deserialize(reader)); - let text = try!(String::deserialize(reader)); - Ok(TextFieldValue { - field: field, - text: text, - }) - } -} - - - -#[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)] -pub struct Term { - data: Vec, -} - -#[derive(Clone, Debug, RustcDecodable, RustcEncodable)] -struct TextFieldEntry { - name: String, - option: TextOptions, -} - -#[derive(Clone, Debug, RustcDecodable, RustcEncodable)] -pub struct U32FieldEntry { - pub name: String, - pub option: U32Options, -} - - -/// Tantivy has a very strict schema. -/// You need to specify in advance, whether a field is indexed or not, -/// stored or not, and RAM-based or not. -/// -/// This is done by creating a schema object, and -/// setting up the fields one by one. -/// It is for the moment impossible to remove fields. -/// -/// # Examples -/// -/// ``` -/// use tantivy::schema::{Schema, TextOptions}; -/// -/// fn create_schema() -> Schema { -/// let mut schema = Schema::new(); -/// let str_fieldtype = TextOptions::new(); -/// let text_fieldtype = TextOptions::new().set_tokenized_indexed(); -/// let id_field = schema.add_text_field("id", &str_fieldtype); -/// let url_field = schema.add_text_field("url", &str_fieldtype); -/// let body_field = schema.add_text_field("body", &text_fieldtype); -/// let id_field = schema.add_text_field("id", &str_fieldtype); -/// let url_field = schema.add_text_field("url", &str_fieldtype); -/// let title_field = schema.add_text_field("title", &text_fieldtype); -/// let body_field = schema.add_text_field("body", &text_fieldtype); -/// schema -/// } -/// -/// let schema = create_schema(); -#[derive(Clone, Debug)] -pub struct Schema { - text_fields: Vec, - text_fields_map: HashMap, // transient - u32_fields: Vec, - u32_fields_map: HashMap, // transient -} - -impl Decodable for Schema { - fn decode(d: &mut D) -> Result { - let mut schema = Schema::new(); - try!(d.read_seq(|d, num_fields| { - for _ in 0..num_fields { - let field_entry = try!(TextFieldEntry::decode(d)); - let field_options: &TextOptions = &field_entry.option; - schema.add_text_field(&field_entry.name, field_options); - } - Ok(()) - })); - Ok(schema) - } -} - -impl Encodable for Schema { - fn encode(&self, s: &mut S) -> Result<(), S::Error> { - try!(s.emit_seq(self.text_fields.len(), - |mut e| { - for (ord, field) in self.text_fields.iter().enumerate() { - try!(e.emit_seq_elt(ord, |e| field.encode(e))); - } - Ok(()) - })); - Ok(()) - } -} - -impl Schema { - - /// Creates a new, empty schema. - pub fn new() -> Schema { - Schema { - text_fields: Vec::new(), - text_fields_map: HashMap::new(), - u32_fields: Vec::new(), - u32_fields_map: HashMap::new(), - } - } - - pub fn get_u32_fields(&self,) -> &Vec { - &self.u32_fields - } - - /// Given a name, returns the field handle, as well as its associated TextOptions - pub fn get_text_field(&self, field_name: &str) -> Option<(TextField, TextOptions)> { - self.text_fields_map - .get(field_name) - .map(|&TextField(field_id)| { - let field_options = self.text_fields[field_id as usize].option.clone(); - (TextField(field_id), field_options) - }) - } - - pub fn get_u32_field(&self, field_name: &str) -> Option<(U32Field, U32Options)> { - self.u32_fields_map - .get(field_name) - .map(|&U32Field(field_id)| { - let u32_field_options = self.u32_fields[field_id as usize].option.clone(); - (U32Field(field_id), u32_field_options) - }) - } - - /// Returns the field options associated with a given name. - /// - /// # Panics - /// Panics if the field name does not exist. - /// It is meant as an helper for user who created - /// and control the content of their schema. - /// - /// If panicking is not an option for you, - /// you may use `get(&self, field_name: &str)`. - pub fn text_field(&self, fieldname: &str) -> TextField { - self.text_fields_map.get(fieldname).map(|field| field.clone()).unwrap() - } - - pub fn u32_field(&self, fieldname: &str) -> U32Field { - self.u32_fields_map.get(fieldname).map(|field| field.clone()).unwrap() - } - - /// Returns the field options associated to a field handle. - pub fn text_field_options(&self, field: &TextField) -> TextOptions { - let TextField(field_id) = *field; - self.text_fields[field_id as usize].option.clone() - } - - pub fn u32_field_options(&self, field: &U32Field) -> U32Options { - let U32Field(field_id) = *field; - self.u32_fields[field_id as usize].option.clone() - } - - /// Creates a new field. - /// Return the associated field handle. - pub fn add_text_field>(&mut self, field_name_str: &str, field_options: RefTextOptions) -> TextField { - let field = TextField(self.text_fields.len() as u8); - // TODO case if field already exists - let field_name = String::from(field_name_str); - self.text_fields.push(TextFieldEntry { - name: field_name.clone(), - option: field_options.borrow().clone(), - }); - self.text_fields_map.insert(field_name, field.clone()); - field - } - - /// Creates a new field. - /// Return the associated field handle. - pub fn add_u32_field>(&mut self, field_name_str: &str, field_options: RefU32Options) -> U32Field { - let field = U32Field(self.u32_fields.len() as u8); - // TODO case if field already exists - let field_name = String::from(field_name_str); - self.u32_fields.push(U32FieldEntry { - name: field_name.clone(), - option: field_options.borrow().clone(), - }); - self.u32_fields_map.insert(field_name, field.clone()); - field - } - -} - - -impl Term { - // pub fn field_text(&self,) -> TextField { - // TextField(self.data[0]) - // } - // - // pub fn text(&self,) -> &str { - // str::from_utf8(&self.data[1..]).unwrap() - // } - - pub fn from_field_u32(field: &U32Field, val: u32) -> Term { - let mut buffer = Vec::with_capacity(1 + 4); - let U32Field(field_idx) = *field; - buffer.clear(); - buffer.push(128 | field_idx); - val.serialize(&mut buffer).unwrap(); - Term { - data: buffer, - } - } - - pub fn from_field_text(field: &TextField, text: &str) -> Term { - let mut buffer = Vec::with_capacity(1 + text.len()); - let TextField(field_idx) = *field; - buffer.clear(); - buffer.push(field_idx); - buffer.extend(text.as_bytes()); - Term { - data: buffer, - } - } - - pub fn from(data: &[u8]) -> Term { - Term { - data: Vec::from(data), - } - } - - pub fn as_slice(&self,)->&[u8] { - &self.data - } -} - -impl fmt::Debug for Term { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Term({})", self.data[0]) - } -} - -/// -/// Document are really just a list of field values. -/// -/// # Examples -/// -/// ``` -/// use tantivy::schema::Schema; -/// use tantivy::schema::TEXT; -/// -/// let mut schema = Schema::new(); -/// schema.add_text_field("body", &TEXT); -/// let field_text = schema.text_field("body"); -/// ``` -/// -#[derive(Debug)] -pub struct Document { - pub text_field_values: Vec, - pub u32_field_values: Vec, -} - - -impl Document { - - pub fn new() -> Document { - Document { - text_field_values: Vec::new(), - u32_field_values: Vec::new(), - } - } - - pub fn from(text_field_values: Vec, - u32_field_values: Vec) -> Document { - Document { - text_field_values: text_field_values, - u32_field_values: u32_field_values - } - } - - pub fn len(&self,) -> usize { - self.text_field_values.len() - } - - pub fn set(&mut self, field: &TextField, text: &str) { - self.add(TextFieldValue { - field: field.clone(), - text: String::from(text) - }); - } - - pub fn set_u32(&mut self, field: &U32Field, value: u32) { - self.u32_field_values.push(U32FieldValue { - field: field.clone(), - value: value - }); - } - - pub fn add(&mut self, field_value: TextFieldValue) { - self.text_field_values.push(field_value); - } - - - pub fn text_fields<'a>(&'a self,) -> slice::Iter<'a, TextFieldValue> { - self.text_field_values.iter() - } - - pub fn u32_fields<'a>(&'a self,) -> slice::Iter<'a, U32FieldValue> { - self.u32_field_values.iter() - } - - pub fn get_u32(&self, field: &U32Field) -> Option { - self.u32_field_values - .iter() - .filter(|field_value| field_value.field == *field) - .map(|field_value| &field_value.value) - .cloned() - .next() - } - - pub fn get_texts<'a>(&'a self, field: &TextField) -> Vec<&'a String> { - self.text_field_values - .iter() - .filter(|field_value| field_value.field == *field) - .map(|field_value| &field_value.text) - .collect() - } - - pub fn get_first_text<'a>(&'a self, field: &TextField) -> Option<&'a String> { - self.text_field_values - .iter() - .filter(|field_value| field_value.field == *field) - .map(|field_value| &field_value.text) - .next() - } -} - - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_field_options() { - { - let field_options = STORED | FAST; - assert!(field_options.is_stored()); - assert!(field_options.is_fast()); - assert!(!field_options.is_tokenized_indexed()); - } - { - let field_options = STORED | TEXT; - assert!(field_options.is_stored()); - assert!(!field_options.is_fast()); - assert!(field_options.is_tokenized_indexed()); - } - } - - #[test] - fn test_schema() { - { - let mut schema = Schema::new(); - schema.add_text_field("body", &TEXT); - let field = schema.text_field("body"); - assert!(schema.text_field_options(&field).is_tokenized_indexed()); - } - } -} diff --git a/src/core/searcher.rs b/src/core/searcher.rs index a475016a4..6197fff3d 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -2,11 +2,10 @@ use core::reader::SegmentReader; use core::index::Index; use core::index::Segment; use DocId; -use core::schema::Document; +use schema::{Document, Term}; use core::collector::Collector; use std::io; use core::timer::TimerTree; -use core::schema::Term; #[derive(Debug)] pub struct Searcher { diff --git a/src/core/writer.rs b/src/core/writer.rs index 96923cc37..4d4f1ba2c 100644 --- a/src/core/writer.rs +++ b/src/core/writer.rs @@ -1,5 +1,8 @@ use DocId; -use core::schema::*; +use schema::Schema; +use schema::Document; +use schema::Term; +use schema::TextFieldValue; use core::codec::*; use core::index::Index; use core::analyzer::SimpleTokenizer; diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 33f1d5788..2f2d74fbb 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -22,12 +22,12 @@ mod tests { use super::U32FastFieldsReader; use super::U32FastFieldsWriter; use super::FastFieldSerializer; - use core::schema::U32Field; + use schema::U32Field; use std::path::Path; use directory::{Directory, WritePtr, RAMDirectory}; - use core::schema::Document; - use core::schema::Schema; - use core::schema::FAST_U32; + use schema::Document; + use schema::Schema; + use schema::FAST_U32; use test::Bencher; use test; use rand::Rng; diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 38a34834b..4aa9c1e6d 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -4,7 +4,7 @@ use fastfield::DividerU32; use core::serialize::BinarySerializable; use DocId; use std::collections::HashMap; -use core::schema::U32Field; +use schema::U32Field; use std::io::{SeekFrom, Seek}; use std::ops::Deref; use super::compute_num_bits; diff --git a/src/fastfield/serializer.rs b/src/fastfield/serializer.rs index 7167630af..a9abc4432 100644 --- a/src/fastfield/serializer.rs +++ b/src/fastfield/serializer.rs @@ -1,6 +1,6 @@ use core::serialize::BinarySerializable; use directory::WritePtr; -use core::schema::U32Field; +use schema::U32Field; use std::io; use std::io::{SeekFrom, Write}; use super::compute_num_bits; diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index b7e0e0633..0408a71fb 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -1,4 +1,4 @@ -use core::schema::{Schema, U32Field, Document}; +use schema::{Schema, U32Field, Document}; use fastfield::FastFieldSerializer; use std::io; diff --git a/src/lib.rs b/src/lib.rs index d6b0f618b..f5f227b26 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -34,14 +34,14 @@ mod directory; mod compression; mod fastfield; mod store; +pub mod schema; pub use directory::Directory; pub use core::analyzer; pub use core::searcher::Searcher; pub use core::index::Index; -pub use core::schema; -pub use core::schema::Term; -pub use core::schema::Document; +pub use schema::Term; +pub use schema::Document; pub use core::collector; pub use core::reader::SegmentReader; pub use core::searcher::SegmentLocalId; diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index ba5e0a0d4..b48191ca7 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -1,6 +1,6 @@ use datastruct::FstMapBuilder; use super::TermInfo; -use core::schema::Term; +use schema::Term; use directory::WritePtr; use compression; use DocId; diff --git a/src/postings/writer.rs b/src/postings/writer.rs index 6a16c3282..a8bb70f1b 100644 --- a/src/postings/writer.rs +++ b/src/postings/writer.rs @@ -1,6 +1,6 @@ use DocId; use std::collections::BTreeMap; -use core::schema::Term; +use schema::Term; use postings::PostingsSerializer; use std::io; diff --git a/src/schema/document.rs b/src/schema/document.rs new file mode 100644 index 000000000..e7728d969 --- /dev/null +++ b/src/schema/document.rs @@ -0,0 +1,97 @@ +use std::slice; + +use super::*; + +/// +/// Document are really just a list of field values. +/// +/// # Examples +/// +/// ``` +/// use tantivy::schema::Schema; +/// use tantivy::schema::TEXT; +/// +/// let mut schema = Schema::new(); +/// schema.add_text_field("body", &TEXT); +/// let field_text = schema.text_field("body"); +/// ``` +/// +#[derive(Debug)] +pub struct Document { + pub text_field_values: Vec, + pub u32_field_values: Vec, +} + +impl Document { + + pub fn new() -> Document { + Document { + text_field_values: Vec::new(), + u32_field_values: Vec::new(), + } + } + + pub fn from(text_field_values: Vec, + u32_field_values: Vec) -> Document { + Document { + text_field_values: text_field_values, + u32_field_values: u32_field_values + } + } + + pub fn len(&self,) -> usize { + self.text_field_values.len() + } + + pub fn set(&mut self, field: &TextField, text: &str) { + self.add(TextFieldValue { + field: field.clone(), + text: String::from(text) + }); + } + + pub fn set_u32(&mut self, field: &U32Field, value: u32) { + self.u32_field_values.push(U32FieldValue { + field: field.clone(), + value: value + }); + } + + pub fn add(&mut self, field_value: TextFieldValue) { + self.text_field_values.push(field_value); + } + + + pub fn text_fields<'a>(&'a self,) -> slice::Iter<'a, TextFieldValue> { + self.text_field_values.iter() + } + + pub fn u32_fields<'a>(&'a self,) -> slice::Iter<'a, U32FieldValue> { + self.u32_field_values.iter() + } + + pub fn get_u32(&self, field: &U32Field) -> Option { + self.u32_field_values + .iter() + .filter(|field_value| field_value.field == *field) + .map(|field_value| &field_value.value) + .cloned() + .next() + } + + pub fn get_texts<'a>(&'a self, field: &TextField) -> Vec<&'a String> { + self.text_field_values + .iter() + .filter(|field_value| field_value.field == *field) + .map(|field_value| &field_value.text) + .collect() + } + + pub fn get_first_text<'a>(&'a self, field: &TextField) -> Option<&'a String> { + self.text_field_values + .iter() + .filter(|field_value| field_value.field == *field) + .map(|field_value| &field_value.text) + .next() + } +} diff --git a/src/schema/mod.rs b/src/schema/mod.rs new file mode 100644 index 000000000..62a22dc0f --- /dev/null +++ b/src/schema/mod.rs @@ -0,0 +1,21 @@ +mod schema; +mod term; +mod document; +mod text_field; +mod u32_field; + +pub use self::schema::Schema; +pub use self::document::Document; +pub use self::term::Term; +pub use self::text_field::TextField; +pub use self::text_field::TextFieldValue; +pub use self::text_field::TextOptions; +pub use self::text_field::FAST; +pub use self::text_field::TEXT; +pub use self::text_field::STORED; + + +pub use self::u32_field::U32Field; +pub use self::u32_field::U32FieldValue; +pub use self::u32_field::U32Options; +pub use self::u32_field::FAST_U32; diff --git a/src/schema/schema.rs b/src/schema/schema.rs new file mode 100644 index 000000000..1a72c7f03 --- /dev/null +++ b/src/schema/schema.rs @@ -0,0 +1,180 @@ +use std::collections::HashMap; + +use rustc_serialize::Decodable; +use rustc_serialize::Encodable; +use rustc_serialize::Decoder; +use rustc_serialize::Encoder; +use std::borrow::Borrow; +use super::*; + +#[derive(Clone, Debug, RustcDecodable, RustcEncodable)] +pub struct TextFieldEntry { + name: String, + option: TextOptions, +} + + +#[derive(Clone, Debug, RustcDecodable, RustcEncodable)] +pub struct U32FieldEntry { + pub name: String, + pub option: U32Options, +} + + + +/// Tantivy has a very strict schema. +/// You need to specify in advance, whether a field is indexed or not, +/// stored or not, and RAM-based or not. +/// +/// This is done by creating a schema object, and +/// setting up the fields one by one. +/// It is for the moment impossible to remove fields. +/// +/// # Examples +/// +/// ``` +/// use tantivy::schema::{Schema, TextOptions}; +/// +/// fn create_schema() -> Schema { +/// let mut schema = Schema::new(); +/// let str_fieldtype = TextOptions::new(); +/// let text_fieldtype = TextOptions::new().set_tokenized_indexed(); +/// let id_field = schema.add_text_field("id", &str_fieldtype); +/// let url_field = schema.add_text_field("url", &str_fieldtype); +/// let body_field = schema.add_text_field("body", &text_fieldtype); +/// let id_field = schema.add_text_field("id", &str_fieldtype); +/// let url_field = schema.add_text_field("url", &str_fieldtype); +/// let title_field = schema.add_text_field("title", &text_fieldtype); +/// let body_field = schema.add_text_field("body", &text_fieldtype); +/// schema +/// } +/// +/// let schema = create_schema(); +#[derive(Clone, Debug)] +pub struct Schema { + text_fields: Vec, + text_fields_map: HashMap, // transient + u32_fields: Vec, + u32_fields_map: HashMap, // transient +} + +impl Decodable for Schema { + fn decode(d: &mut D) -> Result { + let mut schema = Schema::new(); + try!(d.read_seq(|d, num_fields| { + for _ in 0..num_fields { + let field_entry = try!(TextFieldEntry::decode(d)); + let field_options: &TextOptions = &field_entry.option; + schema.add_text_field(&field_entry.name, field_options); + } + Ok(()) + })); + Ok(schema) + } +} + +impl Encodable for Schema { + fn encode(&self, s: &mut S) -> Result<(), S::Error> { + try!(s.emit_seq(self.text_fields.len(), + |mut e| { + for (ord, field) in self.text_fields.iter().enumerate() { + try!(e.emit_seq_elt(ord, |e| field.encode(e))); + } + Ok(()) + })); + Ok(()) + } +} + +impl Schema { + + /// Creates a new, empty schema. + pub fn new() -> Schema { + Schema { + text_fields: Vec::new(), + text_fields_map: HashMap::new(), + u32_fields: Vec::new(), + u32_fields_map: HashMap::new(), + } + } + + pub fn get_u32_fields(&self,) -> &Vec { + &self.u32_fields + } + + /// Given a name, returns the field handle, as well as its associated TextOptions + pub fn get_text_field(&self, field_name: &str) -> Option<(TextField, TextOptions)> { + self.text_fields_map + .get(field_name) + .map(|&TextField(field_id)| { + let field_options = self.text_fields[field_id as usize].option.clone(); + (TextField(field_id), field_options) + }) + } + + pub fn get_u32_field(&self, field_name: &str) -> Option<(U32Field, U32Options)> { + self.u32_fields_map + .get(field_name) + .map(|&U32Field(field_id)| { + let u32_field_options = self.u32_fields[field_id as usize].option.clone(); + (U32Field(field_id), u32_field_options) + }) + } + + /// Returns the field options associated with a given name. + /// + /// # Panics + /// Panics if the field name does not exist. + /// It is meant as an helper for user who created + /// and control the content of their schema. + /// + /// If panicking is not an option for you, + /// you may use `get(&self, field_name: &str)`. + pub fn text_field(&self, fieldname: &str) -> TextField { + self.text_fields_map.get(fieldname).map(|field| field.clone()).unwrap() + } + + pub fn u32_field(&self, fieldname: &str) -> U32Field { + self.u32_fields_map.get(fieldname).map(|field| field.clone()).unwrap() + } + + /// Returns the field options associated to a field handle. + pub fn text_field_options(&self, field: &TextField) -> TextOptions { + let TextField(field_id) = *field; + self.text_fields[field_id as usize].option.clone() + } + + pub fn u32_field_options(&self, field: &U32Field) -> U32Options { + let U32Field(field_id) = *field; + self.u32_fields[field_id as usize].option.clone() + } + + /// Creates a new field. + /// Return the associated field handle. + pub fn add_text_field>(&mut self, field_name_str: &str, field_options: RefTextOptions) -> TextField { + let field = TextField(self.text_fields.len() as u8); + // TODO case if field already exists + let field_name = String::from(field_name_str); + self.text_fields.push(TextFieldEntry { + name: field_name.clone(), + option: field_options.borrow().clone(), + }); + self.text_fields_map.insert(field_name, field.clone()); + field + } + + /// Creates a new field. + /// Return the associated field handle. + pub fn add_u32_field>(&mut self, field_name_str: &str, field_options: RefU32Options) -> U32Field { + let field = U32Field(self.u32_fields.len() as u8); + // TODO case if field already exists + let field_name = String::from(field_name_str); + self.u32_fields.push(U32FieldEntry { + name: field_name.clone(), + option: field_options.borrow().clone(), + }); + self.u32_fields_map.insert(field_name, field.clone()); + field + } + +} diff --git a/src/schema/term.rs b/src/schema/term.rs new file mode 100644 index 000000000..5494eef40 --- /dev/null +++ b/src/schema/term.rs @@ -0,0 +1,54 @@ +use std::io::Write; +use std::fmt; + +use core::serialize::BinarySerializable; +use super::U32Field; +use super::TextField; + +#[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)] +pub struct Term { + data: Vec, +} + + + +impl Term { + + pub fn from_field_u32(field: &U32Field, val: u32) -> Term { + let mut buffer = Vec::with_capacity(1 + 4); + let U32Field(field_idx) = *field; + buffer.clear(); + buffer.push(128 | field_idx); + val.serialize(&mut buffer).unwrap(); + Term { + data: buffer, + } + } + + pub fn from_field_text(field: &TextField, text: &str) -> Term { + let mut buffer = Vec::with_capacity(1 + text.len()); + let TextField(field_idx) = *field; + buffer.clear(); + buffer.push(field_idx); + buffer.extend(text.as_bytes()); + Term { + data: buffer, + } + } + + pub fn from(data: &[u8]) -> Term { + Term { + data: Vec::from(data), + } + } + + pub fn as_slice(&self,)->&[u8] { + &self.data + } +} + +impl fmt::Debug for Term { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Term({})", self.data[0]) + } +} diff --git a/src/schema/text_field.rs b/src/schema/text_field.rs new file mode 100644 index 000000000..a72faaf9b --- /dev/null +++ b/src/schema/text_field.rs @@ -0,0 +1,167 @@ +use std::io::Write; +use std::io; + +use std::io::Read; +use core::serialize::BinarySerializable; +use rustc_serialize::Encodable; +use rustc_serialize::Decoder; +use rustc_serialize::Encoder; +use std::ops::BitOr; + +#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)] +pub struct TextField(pub u8); + + +#[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)] +pub struct TextOptions { + tokenized_indexed: bool, + stored: bool, + fast: bool, +} + +impl TextOptions { + pub fn is_tokenized_indexed(&self,) -> bool { + self.tokenized_indexed + } + + pub fn is_stored(&self,) -> bool { + self.stored + } + + pub fn is_fast(&self,) -> bool { + self.fast + } + + pub fn set_stored(mut self,) -> TextOptions { + self.stored = true; + self + } + + pub fn set_fast(mut self,) -> TextOptions { + self.fast = true; + self + } + + pub fn set_tokenized_indexed(mut self,) -> TextOptions { + self.tokenized_indexed = true; + self + } + + pub fn new() -> TextOptions { + TextOptions { + fast: false, + tokenized_indexed: false, + stored: false, + } + } +} + + +impl BinarySerializable for TextField { + fn serialize(&self, writer: &mut Write) -> io::Result { + let TextField(field_id) = *self; + field_id.serialize(writer) + } + + fn deserialize(reader: &mut Read) -> io::Result { + u8::deserialize(reader).map(TextField) + } +} + + +impl BinarySerializable for TextFieldValue { + fn serialize(&self, writer: &mut Write) -> io::Result { + Ok( + try!(self.field.serialize(writer)) + + try!(self.text.serialize(writer)) + ) + } + fn deserialize(reader: &mut Read) -> io::Result { + let field = try!(TextField::deserialize(reader)); + let text = try!(String::deserialize(reader)); + Ok(TextFieldValue { + field: field, + text: text, + }) + } +} + + + +#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)] +pub struct TextFieldValue { + pub field: TextField, + pub text: String, +} + + + + + +/// The field will be tokenized and indexed +pub const TEXT: TextOptions = TextOptions { + tokenized_indexed: true, + stored: false, + fast: false, +}; + +/// A stored fields of a document can be retrieved given its DocId. +/// Stored field are stored together and LZ4 compressed. +/// Reading the stored fields of a document is relatively slow. +/// (100 microsecs) +pub const STORED: TextOptions = TextOptions { + tokenized_indexed: false, + stored: true, + fast: false, +}; + +/// Fast field are used for field you need to access many times during +/// collection. (e.g: for sort, aggregates). +pub const FAST: TextOptions = TextOptions { + tokenized_indexed: false, + stored: false, + fast: true +}; + + +impl BitOr for TextOptions { + + type Output = TextOptions; + + fn bitor(self, other: TextOptions) -> TextOptions { + let mut res = TextOptions::new(); + res.tokenized_indexed = self.tokenized_indexed || other.tokenized_indexed; + res.stored = self.stored || other.stored; + res.fast = self.fast || other.fast; + res + } +} + + +#[cfg(test)] +mod tests { + use schema::Schema; + use super::*; + + #[test] + fn test_field_options() { + { + let field_options = STORED | FAST; + assert!(field_options.is_stored()); + assert!(field_options.is_fast()); + assert!(!field_options.is_tokenized_indexed()); + } + { + let field_options = STORED | TEXT; + assert!(field_options.is_stored()); + assert!(!field_options.is_fast()); + assert!(field_options.is_tokenized_indexed()); + } + { + let mut schema = Schema::new(); + let _body_field: TextField = schema.add_text_field("body", &TEXT); + let field = schema.text_field("body"); + assert!(schema.text_field_options(&field).is_tokenized_indexed()); + } + } +} diff --git a/src/schema/u32_field.rs b/src/schema/u32_field.rs new file mode 100644 index 000000000..21a1ff54d --- /dev/null +++ b/src/schema/u32_field.rs @@ -0,0 +1,76 @@ +use std::io; +use std::io::Write; +use std::io::Read; + +use core::serialize::BinarySerializable; +use rustc_serialize::Encodable; +use rustc_serialize::Decoder; +use rustc_serialize::Encoder; + +#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)] +pub struct U32Field(pub u8); + +impl BinarySerializable for U32Field { + fn serialize(&self, writer: &mut Write) -> io::Result { + let U32Field(field_id) = *self; + field_id.serialize(writer) + } + + fn deserialize(reader: &mut Read) -> io::Result { + u8::deserialize(reader).map(U32Field) + } +} + +#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)] +pub struct U32FieldValue { + pub field: U32Field, + pub value: u32, +} + + +#[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)] +pub struct U32Options { + indexed: bool, + fast: bool, + stored: bool, +} + + +impl U32Options { + + pub fn new() -> U32Options { + U32Options { + fast: false, + indexed: false, + stored: false, + } + } + + pub fn is_indexed(&self,) -> bool { + self.indexed + } + + pub fn set_indexed(mut self,) -> U32Options { + self.indexed = true; + self + } + + pub fn is_fast(&self,) -> bool { + self.fast + } + + pub fn set_fast(mut self,) -> U32Options { + self.fast = true; + self + } +} + + + + +/// The field will be tokenized and indexed +pub const FAST_U32: U32Options = U32Options { + indexed: false, + stored: false, + fast: true, +}; diff --git a/src/store/mod.rs b/src/store/mod.rs index e7f6c7ae2..3b6a4f6f7 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -14,9 +14,9 @@ mod tests { use super::*; use test::Bencher; use std::path::PathBuf; - use core::schema::Schema; - use core::schema::TextOptions; - use core::schema::TextFieldValue; + use schema::Schema; + use schema::TextOptions; + use schema::TextFieldValue; use directory::{RAMDirectory, Directory, MmapDirectory, WritePtr}; fn write_lorem_ipsum_store(writer: WritePtr) -> Schema { diff --git a/src/store/reader.rs b/src/store/reader.rs index dd6d56288..9aab2034c 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -1,8 +1,8 @@ use directory::ReadOnlySource; use std::cell::RefCell; use DocId; -use core::schema::Document; -use core::schema::TextFieldValue; +use schema::Document; +use schema::TextFieldValue; use core::serialize::BinarySerializable; use std::io::Read; diff --git a/src/store/writer.rs b/src/store/writer.rs index d6d637261..8ebcb5ec0 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -1,6 +1,6 @@ use directory::WritePtr; use DocId; -use core::schema::TextFieldValue; +use schema::TextFieldValue; use core::serialize::BinarySerializable; use std::io::Write; use std::io::Read; From 4cb3a7f077c527eeecfa8efa68a0889d9e5b6ac8 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 1 May 2016 11:26:34 +0900 Subject: [PATCH 10/14] moved common --- src/common/mod.rs | 9 +++++ src/{core => common}/serialize.rs | 64 +++---------------------------- src/{core => common}/timer.rs | 0 src/common/vint.rs | 58 ++++++++++++++++++++++++++++ src/core/mod.rs | 2 - src/core/reader.rs | 4 +- src/core/searcher.rs | 2 +- src/datastruct/fstmap.rs | 2 +- src/fastfield/reader.rs | 10 +++-- src/fastfield/serializer.rs | 2 +- src/lib.rs | 2 + src/postings/serializer.rs | 2 +- src/postings/term_info.rs | 2 +- src/schema/term.rs | 2 +- src/schema/text_field.rs | 2 +- src/schema/u32_field.rs | 2 +- src/store/reader.rs | 2 +- src/store/writer.rs | 2 +- 18 files changed, 93 insertions(+), 76 deletions(-) create mode 100644 src/common/mod.rs rename src/{core => common}/serialize.rs (77%) rename src/{core => common}/timer.rs (100%) create mode 100644 src/common/vint.rs diff --git a/src/common/mod.rs b/src/common/mod.rs new file mode 100644 index 000000000..b7c0bfb5e --- /dev/null +++ b/src/common/mod.rs @@ -0,0 +1,9 @@ +mod serialize; +mod timer; +mod vint; + +pub use self::serialize::BinarySerializable; +pub use self::timer::Timing; +pub use self::timer::TimerTree; +pub use self::timer::OpenTimer; +pub use self::vint::VInt; diff --git a/src/core/serialize.rs b/src/common/serialize.rs similarity index 77% rename from src/core/serialize.rs rename to src/common/serialize.rs index 48705f7a0..1a8e32fbb 100644 --- a/src/core/serialize.rs +++ b/src/common/serialize.rs @@ -4,6 +4,7 @@ use std::fmt; use std::io::Write; use std::io::Read; use std::io; +use common::VInt; use byteorder; pub trait BinarySerializable : fmt::Debug + Sized { @@ -11,61 +12,6 @@ pub trait BinarySerializable : fmt::Debug + Sized { fn deserialize(reader: &mut Read) -> io::Result; } -#[derive(Debug, Eq, PartialEq)] -pub struct VInt(pub u64); - -impl VInt { - pub fn val(&self,) -> u64 { - self.0.clone() - } -} - -impl BinarySerializable for VInt { - fn serialize(&self, writer: &mut Write) -> io::Result { - let mut remaining = self.0.clone(); - let mut written: usize = 0; - let mut buffer = [0u8; 10]; - loop { - let mut next_byte: u8 = (remaining % 128u64) as u8; - remaining /= 128u64; - if remaining == 0u64 { - buffer[written] = next_byte; - written += 1; - break; - } - else { - next_byte |= 128u8; - buffer[written] = next_byte; - written += 1; - } - } - try!(writer.write_all(&buffer[0..written])); - Ok(written) - } - - fn deserialize(reader: &mut Read) -> io::Result { - let mut bytes = reader.bytes(); - let mut result = 0u64; - let mut shift = 0u64; - loop { - match bytes.next() { - Some(Ok(b)) => { - result += ((b % 128u8) as u64) << shift; - if b & 128 == 0u8 { - break; - } - shift += 7; - } - _ => { - return Err(io::Error::new(io::ErrorKind::InvalidData, "Reach end of buffer")) - } - } - } - Ok(VInt(result)) - } -} - - fn convert_byte_order_error(byteorder_error: byteorder::Error) -> io::Error { match byteorder_error { byteorder::Error::UnexpectedEOF => io::Error::new(io::ErrorKind::InvalidData, "Reached EOF unexpectedly"), @@ -171,6 +117,11 @@ impl BinarySerializable for String { #[cfg(test)] mod test { + + use std::io::Cursor; + use common::VInt; + use super::*; + fn serialize_test(v: T, num_bytes: usize) { let mut buffer: Vec = Vec::new(); assert_eq!(v.serialize(&mut buffer).unwrap(), num_bytes); @@ -180,9 +131,6 @@ mod test { assert_eq!(deser, v); } - use std::io::Cursor; - use super::*; - #[test] fn test_serialize_u8() { serialize_test(3u8, 1); diff --git a/src/core/timer.rs b/src/common/timer.rs similarity index 100% rename from src/core/timer.rs rename to src/common/timer.rs diff --git a/src/common/vint.rs b/src/common/vint.rs new file mode 100644 index 000000000..5c3eeabe9 --- /dev/null +++ b/src/common/vint.rs @@ -0,0 +1,58 @@ +use super::BinarySerializable; +use std::io; +use std::io::Write; +use std::io::Read; + +#[derive(Debug, Eq, PartialEq)] +pub struct VInt(pub u64); + +impl VInt { + pub fn val(&self,) -> u64 { + self.0.clone() + } +} + +impl BinarySerializable for VInt { + fn serialize(&self, writer: &mut Write) -> io::Result { + let mut remaining = self.0.clone(); + let mut written: usize = 0; + let mut buffer = [0u8; 10]; + loop { + let mut next_byte: u8 = (remaining % 128u64) as u8; + remaining /= 128u64; + if remaining == 0u64 { + buffer[written] = next_byte; + written += 1; + break; + } + else { + next_byte |= 128u8; + buffer[written] = next_byte; + written += 1; + } + } + try!(writer.write_all(&buffer[0..written])); + Ok(written) + } + + fn deserialize(reader: &mut Read) -> io::Result { + let mut bytes = reader.bytes(); + let mut result = 0u64; + let mut shift = 0u64; + loop { + match bytes.next() { + Some(Ok(b)) => { + result += ((b % 128u8) as u64) << shift; + if b & 128 == 0u8 { + break; + } + shift += 7; + } + _ => { + return Err(io::Error::new(io::ErrorKind::InvalidData, "Reach end of buffer")) + } + } + } + Ok(VInt(result)) + } +} diff --git a/src/core/mod.rs b/src/core/mod.rs index f6661d3f6..d5fc86b7e 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -4,10 +4,8 @@ pub mod reader; pub mod codec; pub mod searcher; pub mod collector; -pub mod serialize; pub mod index; pub mod merger; -pub mod timer; use std::error; use std::io; diff --git a/src/core/reader.rs b/src/core/reader.rs index 410631453..59afdfe1f 100644 --- a/src/core/reader.rs +++ b/src/core/reader.rs @@ -13,10 +13,10 @@ use datastruct::FstMap; use std::fmt; use rustc_serialize::json; use core::index::SegmentInfo; -use core::timer::OpenTimer; +use common::OpenTimer; use schema::U32Field; use core::convert_to_ioerror; -use core::serialize::BinarySerializable; +use common::BinarySerializable; use fastfield::{U32FastFieldsReader, U32FastFieldReader}; use compression; use std::mem; diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 6197fff3d..ddc836f8c 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -5,7 +5,7 @@ use DocId; use schema::{Document, Term}; use core::collector::Collector; use std::io; -use core::timer::TimerTree; +use common::TimerTree; #[derive(Debug)] pub struct Searcher { diff --git a/src/datastruct/fstmap.rs b/src/datastruct/fstmap.rs index 424bda684..e84923a11 100644 --- a/src/datastruct/fstmap.rs +++ b/src/datastruct/fstmap.rs @@ -7,7 +7,7 @@ use fst::raw::Fst; use fst::Streamer; use directory::ReadOnlySource; -use core::serialize::BinarySerializable; +use common::BinarySerializable; use std::marker::PhantomData; fn convert_fst_error(e: fst::Error) -> io::Error { diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 4aa9c1e6d..05d1f6f94 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -1,12 +1,14 @@ use std::io; +use std::io::{SeekFrom, Seek}; +use std::collections::HashMap; +use std::ops::Deref; + use directory::ReadOnlySource; use fastfield::DividerU32; -use core::serialize::BinarySerializable; +use common::BinarySerializable; use DocId; -use std::collections::HashMap; use schema::U32Field; -use std::io::{SeekFrom, Seek}; -use std::ops::Deref; + use super::compute_num_bits; pub struct U32FastFieldReader { diff --git a/src/fastfield/serializer.rs b/src/fastfield/serializer.rs index a9abc4432..a30a3dc43 100644 --- a/src/fastfield/serializer.rs +++ b/src/fastfield/serializer.rs @@ -1,4 +1,4 @@ -use core::serialize::BinarySerializable; +use common::BinarySerializable; use directory::WritePtr; use schema::U32Field; use std::io; diff --git a/src/lib.rs b/src/lib.rs index f5f227b26..883b43e5e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -34,6 +34,8 @@ mod directory; mod compression; mod fastfield; mod store; +mod common; + pub mod schema; pub use directory::Directory; diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index b48191ca7..5b497650a 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -7,7 +7,7 @@ use DocId; use core::index::Segment; use std::io; use core::index::SegmentComponent; -use core::serialize::BinarySerializable; +use common::BinarySerializable; pub struct PostingsSerializer { terms_fst_builder: FstMapBuilder, // TODO find an alternative to work around the "move" diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs index 8dc0728f6..a49f91b9d 100644 --- a/src/postings/term_info.rs +++ b/src/postings/term_info.rs @@ -1,4 +1,4 @@ -use core::serialize::BinarySerializable; +use common::BinarySerializable; use std::io; #[derive(Debug,Ord,PartialOrd,Eq,PartialEq,Clone)] diff --git a/src/schema/term.rs b/src/schema/term.rs index 5494eef40..1e9dbdcd2 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -1,7 +1,7 @@ use std::io::Write; use std::fmt; -use core::serialize::BinarySerializable; +use common::BinarySerializable; use super::U32Field; use super::TextField; diff --git a/src/schema/text_field.rs b/src/schema/text_field.rs index a72faaf9b..8915b7648 100644 --- a/src/schema/text_field.rs +++ b/src/schema/text_field.rs @@ -2,7 +2,7 @@ use std::io::Write; use std::io; use std::io::Read; -use core::serialize::BinarySerializable; +use common::BinarySerializable; use rustc_serialize::Encodable; use rustc_serialize::Decoder; use rustc_serialize::Encoder; diff --git a/src/schema/u32_field.rs b/src/schema/u32_field.rs index 21a1ff54d..06a214bb2 100644 --- a/src/schema/u32_field.rs +++ b/src/schema/u32_field.rs @@ -2,7 +2,7 @@ use std::io; use std::io::Write; use std::io::Read; -use core::serialize::BinarySerializable; +use common::BinarySerializable; use rustc_serialize::Encodable; use rustc_serialize::Decoder; use rustc_serialize::Encoder; diff --git a/src/store/reader.rs b/src/store/reader.rs index 9aab2034c..a5b240c51 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -3,7 +3,7 @@ use std::cell::RefCell; use DocId; use schema::Document; use schema::TextFieldValue; -use core::serialize::BinarySerializable; +use common::BinarySerializable; use std::io::Read; use std::io::Cursor; diff --git a/src/store/writer.rs b/src/store/writer.rs index 8ebcb5ec0..53f48a0ce 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -1,7 +1,7 @@ use directory::WritePtr; use DocId; use schema::TextFieldValue; -use core::serialize::BinarySerializable; +use common::BinarySerializable; use std::io::Write; use std::io::Read; use std::io; From 5b7f2f7100060fc71f08dc04b00559cc3eced792 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 1 May 2016 11:58:54 +0900 Subject: [PATCH 11/14] Renamed S4BP128* --- cpp/encode.cpp | 23 +++++++++++++---------- src/compression/mod.rs | 36 ++++++++++++++++++------------------ src/core/reader.rs | 3 ++- src/postings/serializer.rs | 6 +++--- 4 files changed, 36 insertions(+), 32 deletions(-) diff --git a/cpp/encode.cpp b/cpp/encode.cpp index fa538b0f1..f4f6f24af 100644 --- a/cpp/encode.cpp +++ b/cpp/encode.cpp @@ -70,7 +70,7 @@ extern "C" { } - size_t encode_sorted_native( + size_t encode_s4_bp128_dm_native( uint32_t* begin, const size_t num_els, uint32_t* output, @@ -83,6 +83,17 @@ extern "C" { return output_length; } + size_t decode_s4_bp128_dm_native( + const uint32_t* compressed_data, + const size_t compressed_size, + uint32_t* uncompressed, + const size_t uncompressed_capacity) { + size_t num_ints = uncompressed_capacity; + codec_sorted -> decodeArray(compressed_data, compressed_size, uncompressed, num_ints); + return num_ints; + } + + size_t encode_unsorted_native( uint32_t* begin, const size_t num_els, @@ -96,15 +107,7 @@ extern "C" { return output_length; } - size_t decode_sorted_native( - const uint32_t* compressed_data, - const size_t compressed_size, - uint32_t* uncompressed, - const size_t uncompressed_capacity) { - size_t num_ints = uncompressed_capacity; - codec_sorted -> decodeArray(compressed_data, compressed_size, uncompressed, num_ints); - return num_ints; - } + size_t decode_unsorted_native( const uint32_t* compressed_data, diff --git a/src/compression/mod.rs b/src/compression/mod.rs index 9e816ca64..ca97f6b5b 100644 --- a/src/compression/mod.rs +++ b/src/compression/mod.rs @@ -9,8 +9,8 @@ extern { fn intersection_native(left_data: *const u32, left_size: size_t, right_data: *const u32, right_size: size_t, output: *mut u32) -> size_t; // complete s4-bp128-dm - fn encode_sorted_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; - fn decode_sorted_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; + fn encode_s4_bp128_dm_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; + fn decode_s4_bp128_dm_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; // bp128, only encodes group of 128 u32 at a time fn encode_sorted_block128_native(data: *mut u32, output: *mut u32, output_capacity: size_t) -> size_t; @@ -115,7 +115,7 @@ impl Block128Encoder { let written_size: usize; unsafe { ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), 128); - written_size = encode_sorted_native( + written_size = encode_s4_bp128_dm_native( self.input_buffer.as_mut_ptr(), 128, self.output_buffer.as_mut_ptr(), @@ -139,7 +139,7 @@ impl Block128Decoder { compressed_data: &[u32], uncompressed_values: &mut [u32]) -> size_t { unsafe { - return decode_sorted_native( + return decode_s4_bp128_dm_native( compressed_data.as_ptr(), compressed_data.len() as size_t, uncompressed_values.as_mut_ptr(), @@ -152,15 +152,15 @@ impl Block128Decoder { // s4-bp128-dm -pub struct Encoder { +pub struct S4BP128Encoder { input_buffer: Vec, output_buffer: Vec, } -impl Encoder { +impl S4BP128Encoder { - pub fn new() -> Encoder { - Encoder { + pub fn new() -> S4BP128Encoder { + S4BP128Encoder { input_buffer: Vec::new(), output_buffer: Vec::new(), } @@ -177,7 +177,7 @@ impl Encoder { // TODO use clone_from when available unsafe { ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len); - let written_size = encode_sorted_native( + let written_size = encode_s4_bp128_dm_native( self.input_buffer.as_mut_ptr(), input_len as size_t, self.output_buffer.as_mut_ptr(), @@ -190,19 +190,19 @@ impl Encoder { -pub struct Decoder; +pub struct S4BP128Decoder; -impl Decoder { +impl S4BP128Decoder { - pub fn new() -> Decoder { - Decoder + pub fn new() -> S4BP128Decoder { + S4BP128Decoder } pub fn decode_sorted(&self, compressed_data: &[u32], uncompressed_values: &mut [u32]) -> size_t { unsafe { - return decode_sorted_native( + return decode_s4_bp128_dm_native( compressed_data.as_ptr(), compressed_data.len() as size_t, uncompressed_values.as_mut_ptr(), @@ -281,7 +281,7 @@ mod tests { #[test] fn test_encode_big() { - let mut encoder = Encoder::new(); + let mut encoder = S4BP128Encoder::new(); let num_ints = 10000 as usize; let expected_length = 1274; let input: Vec = (0..num_ints as u32) @@ -289,7 +289,7 @@ mod tests { .into_iter().collect(); let encoded_data = encoder.encode_sorted(&input); assert_eq!(encoded_data.len(), expected_length); - let decoder = Decoder::new(); + let decoder = S4BP128Decoder::new(); let mut decoded_data: Vec = (0..num_ints as u32).collect(); assert_eq!(num_ints, decoder.decode_sorted(&encoded_data[..], &mut decoded_data)); assert_eq!(decoded_data, input); @@ -368,10 +368,10 @@ mod tests { fn bench_decode(b: &mut Bencher) { const TEST_SIZE: usize = 1_000_000; let arr = generate_array(TEST_SIZE, 0.1); - let mut encoder = Encoder::new(); + let mut encoder = S4BP128Encoder::new(); let encoded = encoder.encode_sorted(&arr); let mut uncompressed: Vec = (0..TEST_SIZE as u32).collect(); - let decoder = Decoder; + let decoder = S4BP128Decoder; b.iter(|| { decoder.decode_sorted(&encoded, &mut uncompressed); }); diff --git a/src/core/reader.rs b/src/core/reader.rs index 59afdfe1f..c271efa04 100644 --- a/src/core/reader.rs +++ b/src/core/reader.rs @@ -19,6 +19,7 @@ use core::convert_to_ioerror; use common::BinarySerializable; use fastfield::{U32FastFieldsReader, U32FastFieldReader}; use compression; +use compression::S4BP128Decoder; use std::mem; impl fmt::Debug for SegmentReader { @@ -74,7 +75,7 @@ impl SegmentPostings { let mut doc_ids: Vec = Vec::with_capacity(doc_freq as usize); unsafe { doc_ids.set_len(doc_freq as usize); } { - let decoder = compression::Decoder::new(); + let decoder = compression::S4BP128Decoder::new(); decoder.decode_sorted(&data_u32[1..(num_u32s+1) as usize], &mut doc_ids); SegmentPostings(doc_ids) } diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 5b497650a..1e4eda50e 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -2,7 +2,7 @@ use datastruct::FstMapBuilder; use super::TermInfo; use schema::Term; use directory::WritePtr; -use compression; +use compression::S4BP128Encoder; use DocId; use core::index::Segment; use std::io; @@ -15,7 +15,7 @@ pub struct PostingsSerializer { positions_write: WritePtr, written_bytes_postings: usize, written_bytes_positions: usize, - encoder: compression::Encoder, + encoder: S4BP128Encoder, doc_ids: Vec, } @@ -32,7 +32,7 @@ impl PostingsSerializer { positions_write: positions_write, written_bytes_postings: 0, written_bytes_positions: 0, - encoder: compression::Encoder::new(), + encoder: S4BP128Encoder::new(), doc_ids: Vec::new(), }) } From 056e4e6cf3dbc76bea4b1f03fc503351384dd47d Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 1 May 2016 14:58:46 +0900 Subject: [PATCH 12/14] Moved collector out. --- src/collector/count_collector.rs | 32 +++ src/collector/first_n_collector.rs | 40 +++ src/collector/mod.rs | 109 +++++++++ src/collector/multi_collector.rs | 33 +++ src/compression/block128.rs | 87 +++++++ src/compression/intersection.rs | 14 ++ src/compression/mod.rs | 376 +---------------------------- src/compression/s4bp128.rs | 122 ++++++++++ src/compression/vints.rs | 97 ++++++++ src/core/reader.rs | 2 +- 10 files changed, 545 insertions(+), 367 deletions(-) create mode 100644 src/collector/count_collector.rs create mode 100644 src/collector/first_n_collector.rs create mode 100644 src/collector/mod.rs create mode 100644 src/collector/multi_collector.rs create mode 100644 src/compression/block128.rs create mode 100644 src/compression/intersection.rs create mode 100644 src/compression/s4bp128.rs create mode 100644 src/compression/vints.rs diff --git a/src/collector/count_collector.rs b/src/collector/count_collector.rs new file mode 100644 index 000000000..74794550c --- /dev/null +++ b/src/collector/count_collector.rs @@ -0,0 +1,32 @@ +use std::io; +use super::Collector; +use DocId; +use SegmentReader; +use SegmentLocalId; + +pub struct CountCollector { + count: usize, +} + +impl CountCollector { + pub fn new() -> CountCollector { + CountCollector { + count: 0, + } + } + + pub fn count(&self,) -> usize { + self.count + } +} + +impl Collector for CountCollector { + + fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> io::Result<()> { + Ok(()) + } + + fn collect(&mut self, _: DocId) { + self.count += 1; + } +} diff --git a/src/collector/first_n_collector.rs b/src/collector/first_n_collector.rs new file mode 100644 index 000000000..9e570247b --- /dev/null +++ b/src/collector/first_n_collector.rs @@ -0,0 +1,40 @@ +use std::io; +use super::Collector; +use DocId; +use SegmentReader; +use SegmentLocalId; +use core::searcher::DocAddress; + +pub struct FirstNCollector { + docs: Vec, + current_segment: u32, + limit: usize, +} + +impl FirstNCollector { + pub fn with_limit(limit: usize) -> FirstNCollector { + FirstNCollector { + docs: Vec::new(), + limit: limit, + current_segment: 0, + } + } + + pub fn docs(self,) -> Vec { + self.docs + } +} + +impl Collector for FirstNCollector { + + fn set_segment(&mut self, segment_local_id: SegmentLocalId, _: &SegmentReader) -> io::Result<()> { + self.current_segment = segment_local_id; + Ok(()) + } + + fn collect(&mut self, doc_id: DocId) { + if self.docs.len() < self.limit { + self.docs.push(DocAddress(self.current_segment.clone(), doc_id)); + } + } +} diff --git a/src/collector/mod.rs b/src/collector/mod.rs new file mode 100644 index 000000000..048d7db32 --- /dev/null +++ b/src/collector/mod.rs @@ -0,0 +1,109 @@ +use DocId; +use SegmentReader; +use SegmentLocalId; +use fastfield::U32FastFieldReader; +use schema::U32Field; +use std::io; + + +mod count_collector; +pub use self::count_collector::CountCollector; + +mod first_n_collector; +pub use self::first_n_collector::FirstNCollector; + +mod multi_collector; +pub use self::multi_collector::MultiCollector; + +pub trait Collector { + fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()>; + fn collect(&mut self, doc_id: DocId); +} + +pub struct TestCollector { + offset: DocId, + segment_max_doc: DocId, + docs: Vec, +} + +impl TestCollector { + pub fn new() -> TestCollector { + TestCollector { + docs: Vec::new(), + offset: 0, + segment_max_doc: 0, + } + } + + pub fn docs(self,) -> Vec { + self.docs + } +} + +impl Collector for TestCollector { + + fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> { + self.offset += self.segment_max_doc; + self.segment_max_doc = reader.max_doc(); + Ok(()) + } + + fn collect(&mut self, doc_id: DocId) { + self.docs.push(doc_id + self.offset); + } +} + + +pub struct FastFieldTestCollector { + vals: Vec, + u32_field: U32Field, + ff_reader: Option, +} + +impl FastFieldTestCollector { + pub fn for_field(u32_field: U32Field) -> FastFieldTestCollector { + FastFieldTestCollector { + vals: Vec::new(), + u32_field: u32_field, + ff_reader: None, + } + } + + pub fn vals(&self,) -> &Vec { + &self.vals + } +} + +impl Collector for FastFieldTestCollector { + + fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> { + self.ff_reader = Some(try!(reader.get_fast_field_reader(&self.u32_field))); + Ok(()) + } + + fn collect(&mut self, doc_id: DocId) { + let val = self.ff_reader.as_ref().unwrap().get(doc_id); + self.vals.push(val); + } +} + + + +#[cfg(test)] +mod tests { + + use super::*; + use test::Bencher; + + #[bench] + fn build_collector(b: &mut Bencher) { + b.iter(|| { + let mut count_collector = CountCollector::new(); + let docs: Vec = (0..1_000_000).collect(); + for doc in docs { + count_collector.collect(doc); + } + count_collector.count() + }); + } +} diff --git a/src/collector/multi_collector.rs b/src/collector/multi_collector.rs new file mode 100644 index 000000000..8fafb458a --- /dev/null +++ b/src/collector/multi_collector.rs @@ -0,0 +1,33 @@ +use std::io; +use super::Collector; +use DocId; +use SegmentReader; +use SegmentLocalId; + +pub struct MultiCollector<'a> { + collectors: Vec<&'a mut Collector>, +} + +impl<'a> MultiCollector<'a> { + pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector { + MultiCollector { + collectors: collectors, + } + } +} + +impl<'a> Collector for MultiCollector<'a> { + + fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> { + for collector in self.collectors.iter_mut() { + try!(collector.set_segment(segment_local_id, segment)); + } + Ok(()) + } + + fn collect(&mut self, doc_id: DocId) { + for collector in self.collectors.iter_mut() { + collector.collect(doc_id); + } + } +} diff --git a/src/compression/block128.rs b/src/compression/block128.rs new file mode 100644 index 000000000..f48bd5011 --- /dev/null +++ b/src/compression/block128.rs @@ -0,0 +1,87 @@ +use libc::size_t; +use std::ptr; +use std::iter; + +extern { + fn encode_sorted_block128_native(data: *mut u32, output: *mut u32, output_capacity: size_t) -> size_t; + fn decode_sorted_block128_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; +} + +//------------------------- +// Block128 + +pub struct Block128Encoder { + input_buffer: Vec, + output_buffer: Vec, +} + +impl Block128Encoder { + + pub fn new() -> Block128Encoder { + Block128Encoder { + input_buffer: Vec::with_capacity(128), + output_buffer: iter::repeat(0u32).take(256).collect(), + } + } + + pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] { + assert_eq!(input.len(), 128); + // TODO use clone_from when available + let written_size: usize; + unsafe { + ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), 128); + written_size = encode_sorted_block128_native( + self.input_buffer.as_mut_ptr(), + self.output_buffer.as_mut_ptr(), + 256, + ); + } + return &self.output_buffer[0..written_size]; + } +} + +pub struct Block128Decoder; + +impl Block128Decoder { + + pub fn new() -> Block128Decoder { + Block128Decoder + } + + pub fn decode_sorted( + &self, + compressed_data: &[u32], + uncompressed_values: &mut [u32]) -> size_t { + unsafe { + return decode_sorted_block128_native( + compressed_data.as_ptr(), + compressed_data.len() as size_t, + uncompressed_values.as_mut_ptr(), + uncompressed_values.len() as size_t); + } + } +} + + +#[cfg(test)] +mod tests { + + use super::*; + use std::iter; + + #[test] + fn test_encode_block() { + let mut encoder = Block128Encoder::new(); + let expected_length = 21; + let input: Vec = (0u32..128u32) + .map(|i| i * 7 / 2) + .into_iter() + .collect(); + let encoded_data = encoder.encode_sorted(&input); + assert_eq!(encoded_data.len(), expected_length); + let decoder = Block128Decoder::new(); + let mut decoded_data: Vec = iter::repeat(0u32).take(128).collect(); + assert_eq!(128, decoder.decode_sorted(&encoded_data[..], &mut decoded_data)); + assert_eq!(decoded_data, input); + } +} diff --git a/src/compression/intersection.rs b/src/compression/intersection.rs new file mode 100644 index 000000000..ddc24df42 --- /dev/null +++ b/src/compression/intersection.rs @@ -0,0 +1,14 @@ +use libc::size_t; + +extern { + fn intersection_native(left_data: *const u32, left_size: size_t, right_data: *const u32, right_size: size_t, output: *mut u32) -> size_t; +} + +pub fn intersection(left: &[u32], right: &[u32], output: &mut [u32]) -> usize { + unsafe { + intersection_native( + left.as_ptr(), left.len(), + right.as_ptr(), right.len(), + output.as_mut_ptr()) + } +} diff --git a/src/compression/mod.rs b/src/compression/mod.rs index ca97f6b5b..e4ed74ab7 100644 --- a/src/compression/mod.rs +++ b/src/compression/mod.rs @@ -1,267 +1,20 @@ -use libc::size_t; -use std::ptr; -use std::iter; +mod intersection; +pub use self::intersection::intersection; -extern { - // fn encode_unsorted_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; - // fn decode_unsorted_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; +mod s4bp128; +pub use self::s4bp128::{S4BP128Encoder, S4BP128Decoder}; - fn intersection_native(left_data: *const u32, left_size: size_t, right_data: *const u32, right_size: size_t, output: *mut u32) -> size_t; +mod block128; +pub use self::block128::{Block128Encoder, Block128Decoder}; - // complete s4-bp128-dm - fn encode_s4_bp128_dm_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; - fn decode_s4_bp128_dm_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; +mod vints; +pub use self::vints::{SortedVIntsEncoder, SortedVIntsDecoder}; - // bp128, only encodes group of 128 u32 at a time - fn encode_sorted_block128_native(data: *mut u32, output: *mut u32, output_capacity: size_t) -> size_t; - fn decode_sorted_block128_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; - - // vints, used as the left over codec for the <128 remaining values - fn encode_sorted_vint_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; - fn decode_sorted_vint_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; - -} - -pub fn intersection(left: &[u32], right: &[u32], output: &mut [u32]) -> usize { - unsafe { - intersection_native( - left.as_ptr(), left.len(), - right.as_ptr(), right.len(), - output.as_mut_ptr()) - } -} - - - -//------------------------- -// Vint - - -pub struct VIntEncoder { - input_buffer: Vec, - output_buffer: Vec, -} - -impl VIntEncoder { - - pub fn new() -> VIntEncoder { - VIntEncoder { - input_buffer: Vec::with_capacity(128), - output_buffer: iter::repeat(0u32).take(256).collect(), - } - } - - pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] { - assert!(input.len() < 128); - let input_len = input.len(); - let written_size: usize; - // TODO use clone_from when available - unsafe { - ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len); - written_size = encode_sorted_vint_native( - self.input_buffer.as_mut_ptr(), - input_len as size_t, - self.output_buffer.as_mut_ptr(), - 256, - ); - } - return &self.output_buffer[0..written_size]; - } -} - - - -pub struct VIntDecoder; - -impl VIntDecoder { - - pub fn new() -> VIntDecoder { - VIntDecoder - } - - pub fn decode_sorted(&self, - compressed_data: &[u32], - uncompressed_values: &mut [u32]) -> size_t { - unsafe { - return decode_sorted_vint_native( - compressed_data.as_ptr(), - compressed_data.len() as size_t, - uncompressed_values.as_mut_ptr(), - uncompressed_values.len() as size_t); - } - } -} - -//------------------------- -// Block128 - -pub struct Block128Encoder { - input_buffer: Vec, - output_buffer: Vec, -} - -impl Block128Encoder { - - pub fn new() -> Block128Encoder { - Block128Encoder { - input_buffer: Vec::with_capacity(128), - output_buffer: iter::repeat(0u32).take(256).collect(), - } - } - - pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] { - assert_eq!(input.len(), 128); - // TODO use clone_from when available - let written_size: usize; - unsafe { - ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), 128); - written_size = encode_s4_bp128_dm_native( - self.input_buffer.as_mut_ptr(), - 128, - self.output_buffer.as_mut_ptr(), - 256, - ); - } - return &self.output_buffer[0..written_size]; - } -} - -pub struct Block128Decoder; - -impl Block128Decoder { - - pub fn new() -> Block128Decoder { - Block128Decoder - } - - pub fn decode_sorted( - &self, - compressed_data: &[u32], - uncompressed_values: &mut [u32]) -> size_t { - unsafe { - return decode_s4_bp128_dm_native( - compressed_data.as_ptr(), - compressed_data.len() as size_t, - uncompressed_values.as_mut_ptr(), - uncompressed_values.len() as size_t); - } - } -} - -//------------------------- -// s4-bp128-dm - - -pub struct S4BP128Encoder { - input_buffer: Vec, - output_buffer: Vec, -} - -impl S4BP128Encoder { - - pub fn new() -> S4BP128Encoder { - S4BP128Encoder { - input_buffer: Vec::new(), - output_buffer: Vec::new(), - } - } - - pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] { - self.input_buffer.clear(); - let input_len = input.len(); - if input_len + 10000 >= self.input_buffer.len() { - let target_length = input_len + 1024; - self.input_buffer.resize(target_length, 0); - self.output_buffer.resize(target_length, 0); - } - // TODO use clone_from when available - unsafe { - ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len); - let written_size = encode_s4_bp128_dm_native( - self.input_buffer.as_mut_ptr(), - input_len as size_t, - self.output_buffer.as_mut_ptr(), - self.output_buffer.len() as size_t, - ); - return &self.output_buffer[0..written_size]; - } - } -} - - - -pub struct S4BP128Decoder; - -impl S4BP128Decoder { - - pub fn new() -> S4BP128Decoder { - S4BP128Decoder - } - - pub fn decode_sorted(&self, - compressed_data: &[u32], - uncompressed_values: &mut [u32]) -> size_t { - unsafe { - return decode_s4_bp128_dm_native( - compressed_data.as_ptr(), - compressed_data.len() as size_t, - uncompressed_values.as_mut_ptr(), - uncompressed_values.len() as size_t); - } - } - - // pub fn decode_unsorted(&self, - // compressed_data: &[u32], - // uncompressed_values: &mut [u32]) -> size_t { - // unsafe { - // return decode_unsorted_native( - // compressed_data.as_ptr(), - // compressed_data.len() as size_t, - // uncompressed_values.as_mut_ptr(), - // uncompressed_values.len() as size_t); - // } - // } -} - - - - -// -// pub struct Intersector { -// output_buffer: Vec, -// } -// -// impl Intersector { -// fn new() -> Intersector { -// Intersector::with_capacity(1_000_000) -// } -// fn with_capacity(capacity: usize) -> Intersector { -// Intersector { -// output_buffer: iter::repeat(0u32).take(capacity).collect() -// } -// } -// fn intersection(&mut self, left: &[u32], right: &[u32]) -> &[u32] { -// let max_intersection_length = min(left.len(), right.len()); -// if self.output_buffer.len() < max_intersection_length { -// self.output_buffer.resize(max_intersection_length, 0); -// } -// unsafe { -// let intersection_len = intersection_native( -// left.as_ptr(), left.len() as size_t, -// right.as_ptr(), right.len() as size_t, -// self.output_buffer.as_mut_ptr()); -// return &self.output_buffer[0..intersection_len]; -// } -// } -// } #[cfg(test)] -mod tests { +pub mod tests { - use super::*; - use test::Bencher; - use std::iter; use rand::Rng; use rand::SeedableRng; use rand::XorShiftRng; @@ -275,116 +28,7 @@ mod tests { .collect() } - fn generate_array(n: usize, ratio: f32) -> Vec { + pub fn generate_array(n: usize, ratio: f32) -> Vec { generate_array_with_seed(n, ratio, 4) } - - #[test] - fn test_encode_big() { - let mut encoder = S4BP128Encoder::new(); - let num_ints = 10000 as usize; - let expected_length = 1274; - let input: Vec = (0..num_ints as u32) - .map(|i| i * 7 / 2) - .into_iter().collect(); - let encoded_data = encoder.encode_sorted(&input); - assert_eq!(encoded_data.len(), expected_length); - let decoder = S4BP128Decoder::new(); - let mut decoded_data: Vec = (0..num_ints as u32).collect(); - assert_eq!(num_ints, decoder.decode_sorted(&encoded_data[..], &mut decoded_data)); - assert_eq!(decoded_data, input); - } - - #[test] - fn test_encode_block() { - let mut encoder = Block128Encoder::new(); - let expected_length = 21; - let input: Vec = (0u32..128u32) - .map(|i| i * 7 / 2) - .into_iter() - .collect(); - let encoded_data = encoder.encode_sorted(&input); - assert_eq!(encoded_data.len(), expected_length); - let decoder = Block128Decoder::new(); - let mut decoded_data: Vec = iter::repeat(0u32).take(128).collect(); - assert_eq!(128, decoder.decode_sorted(&encoded_data[..], &mut decoded_data)); - assert_eq!(decoded_data, input); - } - - - - #[test] - fn test_encode_vint() { - { - let mut encoder = VIntEncoder::new(); - let expected_length = 31; - let input: Vec = (0u32..123u32) - .map(|i| i * 7 / 2) - .into_iter() - .collect(); - let encoded_data = encoder.encode_sorted(&input); - assert_eq!(encoded_data.len(), expected_length); - let decoder = VIntDecoder::new(); - let mut decoded_data: Vec = iter::repeat(0u32).take(128).collect(); - assert_eq!(123, decoder.decode_sorted(&encoded_data[..], &mut decoded_data)); - assert_eq!(&decoded_data[0..123], &input[..]); - } - { - let mut encoder = VIntEncoder::new(); - let input = vec!(3, 17u32, 187); - let encoded_data = encoder.encode_sorted(&input); - assert_eq!(encoded_data.len(), 1); - assert_eq!(encoded_data[0], 2167049859u32); - } - } - - // #[test] - // fn test_encode_unsorted() { - // let mut encoder = Encoder::new(); - // let num_ints = 10_000 as usize; - // let expected_length = 4361; - // let input: Vec = (0..num_ints as u32) - // .map(|i| i * 213_127 % 501) - // .into_iter().collect(); - // assert_eq!(input.len(), 10_000); - // let encoded_data = encoder.encode_unsorted(&input); - // assert_eq!(encoded_data.len(), expected_length); - // let decoder = Decoder::new(); - // let mut decoded_data: Vec = (0..num_ints as u32).collect(); - // assert_eq!(num_ints, decoder.decode_unsorted(&encoded_data[..], &mut decoded_data)); - // assert_eq!(decoded_data, input); - // } - // - // #[test] - // fn test_simd_intersection() { - // let mut intersector = Intersector::new(); - // let arr1 = generate_array_with_seed(1_000_000, 0.1, 2); - // let arr2 = generate_array_with_seed(5_000_000, 0.5, 3); - // let intersection = intersector.intersection(&arr1[..], &arr2[..]) ; - // assert_eq!(intersection.len(), 500_233); - // } - - #[bench] - fn bench_decode(b: &mut Bencher) { - const TEST_SIZE: usize = 1_000_000; - let arr = generate_array(TEST_SIZE, 0.1); - let mut encoder = S4BP128Encoder::new(); - let encoded = encoder.encode_sorted(&arr); - let mut uncompressed: Vec = (0..TEST_SIZE as u32).collect(); - let decoder = S4BP128Decoder; - b.iter(|| { - decoder.decode_sorted(&encoded, &mut uncompressed); - }); - } - - - // #[bench] - // fn bench_simd_intersection(b: &mut Bencher) { - // let mut intersector = Intersector::new(); - // let arr1 = generate_array_with_seed(1_000_000, 0.1, 2); - // let arr2 = generate_array_with_seed(5_000_000, 0.5, 3); - // b.iter(|| { - // intersector.intersection(&arr1[..], &arr2[..]).len() - // }); - // } } diff --git a/src/compression/s4bp128.rs b/src/compression/s4bp128.rs new file mode 100644 index 000000000..2069e53bb --- /dev/null +++ b/src/compression/s4bp128.rs @@ -0,0 +1,122 @@ + +use libc::size_t; +use std::ptr; + +extern { + // complete s4-bp128-dm + fn encode_s4_bp128_dm_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; + fn decode_s4_bp128_dm_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; +} + +//------------------------- +// s4-bp128-dm + + +pub struct S4BP128Encoder { + input_buffer: Vec, + output_buffer: Vec, +} + +impl S4BP128Encoder { + + pub fn new() -> S4BP128Encoder { + S4BP128Encoder { + input_buffer: Vec::new(), + output_buffer: Vec::new(), + } + } + + pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] { + self.input_buffer.clear(); + let input_len = input.len(); + if input_len + 10000 >= self.input_buffer.len() { + let target_length = input_len + 1024; + self.input_buffer.resize(target_length, 0); + self.output_buffer.resize(target_length, 0); + } + // TODO use clone_from when available + unsafe { + ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len); + let written_size = encode_s4_bp128_dm_native( + self.input_buffer.as_mut_ptr(), + input_len as size_t, + self.output_buffer.as_mut_ptr(), + self.output_buffer.len() as size_t, + ); + return &self.output_buffer[0..written_size]; + } + } +} + + +pub struct S4BP128Decoder; + +impl S4BP128Decoder { + + pub fn new() -> S4BP128Decoder { + S4BP128Decoder + } + + pub fn decode_sorted(&self, + compressed_data: &[u32], + uncompressed_values: &mut [u32]) -> size_t { + unsafe { + return decode_s4_bp128_dm_native( + compressed_data.as_ptr(), + compressed_data.len() as size_t, + uncompressed_values.as_mut_ptr(), + uncompressed_values.len() as size_t); + } + } + + // pub fn decode_unsorted(&self, + // compressed_data: &[u32], + // uncompressed_values: &mut [u32]) -> size_t { + // unsafe { + // return decode_unsorted_native( + // compressed_data.as_ptr(), + // compressed_data.len() as size_t, + // uncompressed_values.as_mut_ptr(), + // uncompressed_values.len() as size_t); + // } + // } +} + + + +#[cfg(test)] +mod tests { + + use super::*; + use test::Bencher; + use compression::tests::generate_array; + + #[test] + fn test_encode_big() { + let mut encoder = S4BP128Encoder::new(); + let num_ints = 10000 as usize; + let expected_length = 1274; + let input: Vec = (0..num_ints as u32) + .map(|i| i * 7 / 2) + .into_iter().collect(); + let encoded_data = encoder.encode_sorted(&input); + assert_eq!(encoded_data.len(), expected_length); + let decoder = S4BP128Decoder::new(); + let mut decoded_data: Vec = (0..num_ints as u32).collect(); + assert_eq!(num_ints, decoder.decode_sorted(&encoded_data[..], &mut decoded_data)); + assert_eq!(decoded_data, input); + } + + #[bench] + fn bench_decode(b: &mut Bencher) { + const TEST_SIZE: usize = 1_000_000; + let arr = generate_array(TEST_SIZE, 0.1); + let mut encoder = S4BP128Encoder::new(); + let encoded = encoder.encode_sorted(&arr); + let mut uncompressed: Vec = (0..TEST_SIZE as u32).collect(); + let decoder = S4BP128Decoder; + b.iter(|| { + decoder.decode_sorted(&encoded, &mut uncompressed); + }); + } +} diff --git a/src/compression/vints.rs b/src/compression/vints.rs new file mode 100644 index 000000000..ae0cdbf8c --- /dev/null +++ b/src/compression/vints.rs @@ -0,0 +1,97 @@ +use libc::size_t; +use std::ptr; +use std::iter; + +extern { + fn encode_sorted_vint_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; + fn decode_sorted_vint_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; +} + +pub struct SortedVIntsEncoder { + input_buffer: Vec, + output_buffer: Vec, +} + +impl SortedVIntsEncoder { + + pub fn new() -> SortedVIntsEncoder { + SortedVIntsEncoder { + input_buffer: Vec::with_capacity(128), + output_buffer: iter::repeat(0u32).take(256).collect(), + } + } + + pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] { + assert!(input.len() < 128); + let input_len = input.len(); + let written_size: usize; + // TODO use clone_from when available + unsafe { + ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len); + written_size = encode_sorted_vint_native( + self.input_buffer.as_mut_ptr(), + input_len as size_t, + self.output_buffer.as_mut_ptr(), + 256, + ); + } + return &self.output_buffer[0..written_size]; + } +} + + + +pub struct SortedVIntsDecoder; + +impl SortedVIntsDecoder { + + pub fn new() -> SortedVIntsDecoder { + SortedVIntsDecoder + } + + pub fn decode_sorted(&self, + compressed_data: &[u32], + uncompressed_values: &mut [u32]) -> size_t { + unsafe { + return decode_sorted_vint_native( + compressed_data.as_ptr(), + compressed_data.len() as size_t, + uncompressed_values.as_mut_ptr(), + uncompressed_values.len() as size_t); + } + } +} + + +#[cfg(test)] +mod tests { + + use std::iter; + use super::*; + + #[test] + fn test_encode_vint() { + { + let mut encoder = SortedVIntsEncoder::new(); + let expected_length = 31; + let input: Vec = (0u32..123u32) + .map(|i| i * 7 / 2) + .into_iter() + .collect(); + let encoded_data = encoder.encode_sorted(&input); + assert_eq!(encoded_data.len(), expected_length); + let decoder = SortedVIntsDecoder::new(); + let mut decoded_data: Vec = iter::repeat(0u32).take(128).collect(); + assert_eq!(123, decoder.decode_sorted(&encoded_data[..], &mut decoded_data)); + assert_eq!(&decoded_data[0..123], &input[..]); + } + { + let mut encoder = SortedVIntsEncoder::new(); + let input = vec!(3, 17u32, 187); + let encoded_data = encoder.encode_sorted(&input); + assert_eq!(encoded_data.len(), 1); + assert_eq!(encoded_data[0], 2167049859u32); + } + } + +} diff --git a/src/core/reader.rs b/src/core/reader.rs index c271efa04..77d8f0094 100644 --- a/src/core/reader.rs +++ b/src/core/reader.rs @@ -75,7 +75,7 @@ impl SegmentPostings { let mut doc_ids: Vec = Vec::with_capacity(doc_freq as usize); unsafe { doc_ids.set_len(doc_freq as usize); } { - let decoder = compression::S4BP128Decoder::new(); + let decoder = S4BP128Decoder::new(); decoder.decode_sorted(&data_u32[1..(num_u32s+1) as usize], &mut doc_ids); SegmentPostings(doc_ids) } From 3a2af1aa654c25695116ea4abde60cefda595f95 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 1 May 2016 15:06:40 +0900 Subject: [PATCH 13/14] added travis ci conf. --- src/analyzer/mod.rs | 95 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 src/analyzer/mod.rs diff --git a/src/analyzer/mod.rs b/src/analyzer/mod.rs new file mode 100644 index 000000000..09bb9eff5 --- /dev/null +++ b/src/analyzer/mod.rs @@ -0,0 +1,95 @@ +extern crate regex; + +use std::str::Chars; + +pub struct TokenIter<'a> { + chars: Chars<'a>, + term_buffer: String, +} + +fn append_char_lowercase(c: char, term_buffer: &mut String) { + for c_lower in c.to_lowercase() { + term_buffer.push(c_lower); + } +} + +pub trait StreamingIterator<'a, T> { + fn next(&'a mut self) -> Option; +} + +impl<'a, 'b> TokenIter<'b> { + fn consume_token(&'a mut self) -> Option<&'a str> { + loop { + match self.chars.next() { + Some(c) => { + if c.is_alphanumeric() { + append_char_lowercase(c, &mut self.term_buffer); + } + else { + break; + } + }, + None => { + break; + } + } + } + return Some(&self.term_buffer); + } +} + + +impl<'a, 'b> StreamingIterator<'a, &'a str> for TokenIter<'b> { + + fn next(&'a mut self,) -> Option<&'a str> { + self.term_buffer.clear(); + // skipping non-letter characters. + loop { + match self.chars.next() { + Some(c) => { + if c.is_alphanumeric() { + append_char_lowercase(c, &mut self.term_buffer); + return self.consume_token(); + } + } + None => { return None; } + } + } + } +} + +pub struct SimpleTokenizer; + + +impl SimpleTokenizer { + pub fn new() -> SimpleTokenizer { + SimpleTokenizer + } + + pub fn tokenize<'a>(&self, text: &'a str) -> TokenIter<'a> { + TokenIter { + term_buffer: String::new(), + chars: text.chars(), + } + } +} + + +#[test] +fn test_tokenizer() { + let simple_tokenizer = SimpleTokenizer::new(); + let mut term_reader = simple_tokenizer.tokenize("hello, happy tax payer!"); + assert_eq!(term_reader.next().unwrap(), "hello"); + assert_eq!(term_reader.next().unwrap(), "happy"); + assert_eq!(term_reader.next().unwrap(), "tax"); + assert_eq!(term_reader.next().unwrap(), "payer"); + assert_eq!(term_reader.next(), None); +} + + +#[test] +fn test_tokenizer_empty() { + let simple_tokenizer = SimpleTokenizer::new(); + let mut term_reader = simple_tokenizer.tokenize(""); + assert_eq!(term_reader.next(), None); +} From 389cdffb4b55cd5ec75cdac7e9cdec5ffb06f0f0 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 1 May 2016 15:08:44 +0900 Subject: [PATCH 14/14] blop --- src/core/analyzer.rs | 95 -------------------- src/core/collector.rs | 201 ------------------------------------------ src/core/merger.rs | 4 +- src/core/mod.rs | 2 - src/core/searcher.rs | 2 +- src/core/writer.rs | 4 +- src/lib.rs | 4 +- 7 files changed, 7 insertions(+), 305 deletions(-) delete mode 100644 src/core/analyzer.rs delete mode 100644 src/core/collector.rs diff --git a/src/core/analyzer.rs b/src/core/analyzer.rs deleted file mode 100644 index 09bb9eff5..000000000 --- a/src/core/analyzer.rs +++ /dev/null @@ -1,95 +0,0 @@ -extern crate regex; - -use std::str::Chars; - -pub struct TokenIter<'a> { - chars: Chars<'a>, - term_buffer: String, -} - -fn append_char_lowercase(c: char, term_buffer: &mut String) { - for c_lower in c.to_lowercase() { - term_buffer.push(c_lower); - } -} - -pub trait StreamingIterator<'a, T> { - fn next(&'a mut self) -> Option; -} - -impl<'a, 'b> TokenIter<'b> { - fn consume_token(&'a mut self) -> Option<&'a str> { - loop { - match self.chars.next() { - Some(c) => { - if c.is_alphanumeric() { - append_char_lowercase(c, &mut self.term_buffer); - } - else { - break; - } - }, - None => { - break; - } - } - } - return Some(&self.term_buffer); - } -} - - -impl<'a, 'b> StreamingIterator<'a, &'a str> for TokenIter<'b> { - - fn next(&'a mut self,) -> Option<&'a str> { - self.term_buffer.clear(); - // skipping non-letter characters. - loop { - match self.chars.next() { - Some(c) => { - if c.is_alphanumeric() { - append_char_lowercase(c, &mut self.term_buffer); - return self.consume_token(); - } - } - None => { return None; } - } - } - } -} - -pub struct SimpleTokenizer; - - -impl SimpleTokenizer { - pub fn new() -> SimpleTokenizer { - SimpleTokenizer - } - - pub fn tokenize<'a>(&self, text: &'a str) -> TokenIter<'a> { - TokenIter { - term_buffer: String::new(), - chars: text.chars(), - } - } -} - - -#[test] -fn test_tokenizer() { - let simple_tokenizer = SimpleTokenizer::new(); - let mut term_reader = simple_tokenizer.tokenize("hello, happy tax payer!"); - assert_eq!(term_reader.next().unwrap(), "hello"); - assert_eq!(term_reader.next().unwrap(), "happy"); - assert_eq!(term_reader.next().unwrap(), "tax"); - assert_eq!(term_reader.next().unwrap(), "payer"); - assert_eq!(term_reader.next(), None); -} - - -#[test] -fn test_tokenizer_empty() { - let simple_tokenizer = SimpleTokenizer::new(); - let mut term_reader = simple_tokenizer.tokenize(""); - assert_eq!(term_reader.next(), None); -} diff --git a/src/core/collector.rs b/src/core/collector.rs deleted file mode 100644 index d9dfc3960..000000000 --- a/src/core/collector.rs +++ /dev/null @@ -1,201 +0,0 @@ -use DocId; -use core::reader::SegmentReader; -use core::searcher::SegmentLocalId; -use core::searcher::DocAddress; -use fastfield::U32FastFieldReader; -use schema::U32Field; -use std::io; - -pub trait Collector { - fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()>; - fn collect(&mut self, doc_id: DocId); -} - -pub struct FirstNCollector { - docs: Vec, - current_segment: u32, - limit: usize, -} - -impl FirstNCollector { - pub fn with_limit(limit: usize) -> FirstNCollector { - FirstNCollector { - docs: Vec::new(), - limit: limit, - current_segment: 0, - } - } - - pub fn docs(self,) -> Vec { - self.docs - } -} - -impl Collector for FirstNCollector { - - fn set_segment(&mut self, segment_local_id: SegmentLocalId, _: &SegmentReader) -> io::Result<()> { - self.current_segment = segment_local_id; - Ok(()) - } - - fn collect(&mut self, doc_id: DocId) { - if self.docs.len() < self.limit { - self.docs.push(DocAddress(self.current_segment.clone(), doc_id)); - } - } -} - -pub struct CountCollector { - count: usize, -} - -impl CountCollector { - pub fn new() -> CountCollector { - CountCollector { - count: 0, - } - } - - pub fn count(&self,) -> usize { - self.count - } -} - -impl Collector for CountCollector { - - fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> io::Result<()> { - Ok(()) - } - - fn collect(&mut self, _: DocId) { - self.count += 1; - } -} - -pub struct MultiCollector<'a> { - collectors: Vec<&'a mut Collector>, -} - -impl<'a> MultiCollector<'a> { - pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector { - MultiCollector { - collectors: collectors, - } - } -} - -impl<'a> Collector for MultiCollector<'a> { - - fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> { - for collector in self.collectors.iter_mut() { - try!(collector.set_segment(segment_local_id, segment)); - } - Ok(()) - } - - fn collect(&mut self, doc_id: DocId) { - for collector in self.collectors.iter_mut() { - collector.collect(doc_id); - } - } -} - -pub struct TestCollector { - offset: DocId, - segment_max_doc: DocId, - docs: Vec, -} - -impl TestCollector { - pub fn new() -> TestCollector { - TestCollector { - docs: Vec::new(), - offset: 0, - segment_max_doc: 0, - } - } - - pub fn docs(self,) -> Vec { - self.docs - } -} - -impl Collector for TestCollector { - - fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> { - self.offset += self.segment_max_doc; - self.segment_max_doc = reader.max_doc(); - Ok(()) - } - - fn collect(&mut self, doc_id: DocId) { - self.docs.push(doc_id + self.offset); - } -} - - -pub struct FastFieldTestCollector { - vals: Vec, - u32_field: U32Field, - ff_reader: Option, -} - -impl FastFieldTestCollector { - pub fn for_field(u32_field: U32Field) -> FastFieldTestCollector { - FastFieldTestCollector { - vals: Vec::new(), - u32_field: u32_field, - ff_reader: None, - } - } - - pub fn vals(&self,) -> &Vec { - &self.vals - } -} - -impl Collector for FastFieldTestCollector { - - fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> { - self.ff_reader = Some(try!(reader.get_fast_field_reader(&self.u32_field))); - Ok(()) - } - - fn collect(&mut self, doc_id: DocId) { - let val = self.ff_reader.as_ref().unwrap().get(doc_id); - self.vals.push(val); - } -} - - - -#[cfg(test)] -mod tests { - - use super::*; - use test::Bencher; - - #[bench] - fn build_collector(b: &mut Bencher) { - b.iter(|| { - let mut count_collector = CountCollector::new(); - let docs: Vec = (0..1_000_000).collect(); - for doc in docs { - count_collector.collect(doc); - } - count_collector.count() - }); - } - - // #[bench] - // fn build_first_3_collector(b: &mut Bencher) { - // b.iter(|| { - // let mut first3collector = FirstNCollector::with_limit(3); - // let docs: Vec = (0..1_000_000).collect(); - // for doc in docs { - // first3collector.collect(doc); - // } - // first3collector.docs() - // }); - // } -} diff --git a/src/core/merger.rs b/src/core/merger.rs index 4d7456510..10a6723a1 100644 --- a/src/core/merger.rs +++ b/src/core/merger.rs @@ -215,8 +215,8 @@ mod tests { use schema::Term; use core::index::Index; use core::searcher::DocAddress; - use core::collector::FastFieldTestCollector; - use core::collector::TestCollector; + use collector::FastFieldTestCollector; + use collector::TestCollector; #[test] fn test_index_merger() { diff --git a/src/core/mod.rs b/src/core/mod.rs index d5fc86b7e..b97d15b35 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -1,9 +1,7 @@ pub mod writer; -pub mod analyzer; pub mod reader; pub mod codec; pub mod searcher; -pub mod collector; pub mod index; pub mod merger; diff --git a/src/core/searcher.rs b/src/core/searcher.rs index ddc836f8c..35857dd80 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -3,7 +3,7 @@ use core::index::Index; use core::index::Segment; use DocId; use schema::{Document, Term}; -use core::collector::Collector; +use collector::Collector; use std::io; use common::TimerTree; diff --git a/src/core/writer.rs b/src/core/writer.rs index 4d4f1ba2c..177131032 100644 --- a/src/core/writer.rs +++ b/src/core/writer.rs @@ -5,9 +5,9 @@ use schema::Term; use schema::TextFieldValue; use core::codec::*; use core::index::Index; -use core::analyzer::SimpleTokenizer; +use analyzer::SimpleTokenizer; use core::index::SerializableSegment; -use core::analyzer::StreamingIterator; +use analyzer::StreamingIterator; use core::index::Segment; use core::index::SegmentInfo; use postings::PostingsWriter; diff --git a/src/lib.rs b/src/lib.rs index 883b43e5e..746d02748 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -35,16 +35,16 @@ mod compression; mod fastfield; mod store; mod common; +pub mod analyzer; +pub mod collector; pub mod schema; pub use directory::Directory; -pub use core::analyzer; pub use core::searcher::Searcher; pub use core::index::Index; pub use schema::Term; pub use schema::Document; -pub use core::collector; pub use core::reader::SegmentReader; pub use core::searcher::SegmentLocalId;