diff --git a/Cargo.toml b/Cargo.toml index 952dc55d7..845f1d31d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tantivy" -version = "0.4.3" +version = "0.5.0-dev" authors = ["Paul Masurel "] build = "build.rs" license = "MIT" diff --git a/examples/simple_search.rs b/examples/simple_search.rs index 0d35f0e42..3cc82ae4d 100644 --- a/examples/simple_search.rs +++ b/examples/simple_search.rs @@ -91,9 +91,11 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { let mut old_man_doc = Document::default(); old_man_doc.add_text(title, "The Old Man and the Sea"); - old_man_doc.add_text(body, - "He was an old man who fished alone in a skiff in the Gulf Stream and \ - he had gone eighty-four days now without taking a fish."); + old_man_doc.add_text( + body, + "He was an old man who fished alone in a skiff in the Gulf Stream and \ + he had gone eighty-four days now without taking a fish.", + ); // ... and add it to the `IndexWriter`. index_writer.add_document(old_man_doc); diff --git a/src/collector/chained_collector.rs b/src/collector/chained_collector.rs index 6cc5785b4..1dff3e3c6 100644 --- a/src/collector/chained_collector.rs +++ b/src/collector/chained_collector.rs @@ -38,10 +38,11 @@ impl ChainedCollector { } impl Collector for ChainedCollector { - fn set_segment(&mut self, - segment_local_id: SegmentLocalId, - segment: &SegmentReader) - -> Result<()> { + fn set_segment( + &mut self, + segment_local_id: SegmentLocalId, + segment: &SegmentReader, + ) -> Result<()> { try!(self.left.set_segment(segment_local_id, segment)); try!(self.right.set_segment(segment_local_id, segment)); Ok(()) diff --git a/src/collector/count_collector.rs b/src/collector/count_collector.rs index bfb17eb3c..1fd9613ec 100644 --- a/src/collector/count_collector.rs +++ b/src/collector/count_collector.rs @@ -45,11 +45,11 @@ mod tests { #[bench] fn build_collector(b: &mut Bencher) { b.iter(|| { - let mut count_collector = CountCollector::default(); - for doc in 0..1_000_000 { - count_collector.collect(doc, 1f32); - } - count_collector.count() - }); + let mut count_collector = CountCollector::default(); + for doc in 0..1_000_000 { + count_collector.collect(doc, 1f32); + } + count_collector.count() + }); } } diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index 2d760dfc6..b99822089 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -15,8 +15,9 @@ use SegmentLocalId; /// Facet collector for i64/u64 fast field pub struct FacetCollector - where T: FastFieldReader, - T::ValueType: Eq + Hash +where + T: FastFieldReader, + T::ValueType: Eq + Hash, { counters: HashMap, field: Field, @@ -25,8 +26,9 @@ pub struct FacetCollector impl FacetCollector - where T: FastFieldReader, - T::ValueType: Eq + Hash +where + T: FastFieldReader, + T::ValueType: Eq + Hash, { /// Creates a new facet collector for aggregating a given field. pub fn new(field: Field) -> FacetCollector { @@ -40,8 +42,9 @@ impl FacetCollector impl Collector for FacetCollector - where T: FastFieldReader, - T::ValueType: Eq + Hash +where + T: FastFieldReader, + T::ValueType: Eq + Hash, { fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> { self.ff_reader = Some(reader.get_fast_field_reader(self.field)?); @@ -51,7 +54,9 @@ impl Collector for FacetCollector fn collect(&mut self, doc: DocId, _: Score) { let val = self.ff_reader .as_ref() - .expect("collect() was called before set_segment. 
This should never happen.") + .expect( + "collect() was called before set_segment. This should never happen.", + ) .get(doc); *(self.counters.entry(val).or_insert(0)) += 1; } diff --git a/src/collector/mod.rs b/src/collector/mod.rs index 44193399d..aea552e58 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -54,20 +54,22 @@ pub use self::chained_collector::chain; pub trait Collector { /// `set_segment` is called before beginning to enumerate /// on this segment. - fn set_segment(&mut self, - segment_local_id: SegmentLocalId, - segment: &SegmentReader) - -> Result<()>; + fn set_segment( + &mut self, + segment_local_id: SegmentLocalId, + segment: &SegmentReader, + ) -> Result<()>; /// The query pushes the scored document to the collector via this method. fn collect(&mut self, doc: DocId, score: Score); } impl<'a, C: Collector> Collector for &'a mut C { - fn set_segment(&mut self, - segment_local_id: SegmentLocalId, - segment: &SegmentReader) - -> Result<()> { + fn set_segment( + &mut self, + segment_local_id: SegmentLocalId, + segment: &SegmentReader, + ) -> Result<()> { (*self).set_segment(segment_local_id, segment) } /// The query pushes the scored document to the collector via this method. @@ -172,12 +174,12 @@ pub mod tests { #[bench] fn build_collector(b: &mut Bencher) { b.iter(|| { - let mut count_collector = CountCollector::default(); - let docs: Vec = (0..1_000_000).collect(); - for doc in docs { - count_collector.collect(doc, 1f32); - } - count_collector.count() - }); + let mut count_collector = CountCollector::default(); + let docs: Vec = (0..1_000_000).collect(); + for doc in docs { + count_collector.collect(doc, 1f32); + } + count_collector.count() + }); } } diff --git a/src/collector/multi_collector.rs b/src/collector/multi_collector.rs index c2515782d..2e6bf0628 100644 --- a/src/collector/multi_collector.rs +++ b/src/collector/multi_collector.rs @@ -23,10 +23,11 @@ impl<'a> MultiCollector<'a> { impl<'a> Collector for MultiCollector<'a> { - fn set_segment(&mut self, - segment_local_id: SegmentLocalId, - segment: &SegmentReader) - -> Result<()> { + fn set_segment( + &mut self, + segment_local_id: SegmentLocalId, + segment: &SegmentReader, + ) -> Result<()> { for collector in &mut self.collectors { try!(collector.set_segment(segment_local_id, segment)); } @@ -53,8 +54,8 @@ mod tests { let mut top_collector = TopCollector::with_limit(2); let mut count_collector = CountCollector::default(); { - let mut collectors = MultiCollector::from(vec![&mut top_collector, - &mut count_collector]); + let mut collectors = + MultiCollector::from(vec![&mut top_collector, &mut count_collector]); collectors.collect(1, 0.2); collectors.collect(2, 0.1); collectors.collect(3, 0.5); diff --git a/src/collector/top_collector.rs b/src/collector/top_collector.rs index 7d3c33c9e..e022c4ba9 100644 --- a/src/collector/top_collector.rs +++ b/src/collector/top_collector.rs @@ -24,10 +24,9 @@ impl PartialOrd for GlobalScoredDoc { impl Ord for GlobalScoredDoc { #[inline] fn cmp(&self, other: &GlobalScoredDoc) -> Ordering { - other - .score - .partial_cmp(&self.score) - .unwrap_or_else(|| other.doc_address.cmp(&self.doc_address)) + other.score.partial_cmp(&self.score).unwrap_or_else(|| { + other.doc_address.cmp(&self.doc_address) + }) } } @@ -87,7 +86,9 @@ impl TopCollector { scored_docs.sort(); scored_docs .into_iter() - .map(|GlobalScoredDoc { score, doc_address }| (score, doc_address)) + .map(|GlobalScoredDoc { score, doc_address }| { + (score, doc_address) + }) .collect() } @@ -108,14 +109,13 @@ 
impl Collector for TopCollector { fn collect(&mut self, doc: DocId, score: Score) { if self.at_capacity() { // It's ok to unwrap as long as a limit of 0 is forbidden. - let limit_doc: GlobalScoredDoc = - *self.heap - .peek() - .expect("Top collector with size 0 is forbidden"); + let limit_doc: GlobalScoredDoc = *self.heap.peek().expect( + "Top collector with size 0 is forbidden", + ); if limit_doc.score < score { - let mut mut_head = self.heap - .peek_mut() - .expect("Top collector with size 0 is forbidden"); + let mut mut_head = self.heap.peek_mut().expect( + "Top collector with size 0 is forbidden", + ); mut_head.score = score; mut_head.doc_address = DocAddress(self.segment_id, doc); } diff --git a/src/common/bitpacker.rs b/src/common/bitpacker.rs index 3c3049877..423890101 100644 --- a/src/common/bitpacker.rs +++ b/src/common/bitpacker.rs @@ -15,7 +15,7 @@ use std::ops::Deref; /// reasons, we want to ensure that a value spawns over at most 8 bytes /// of aligns bytes. /// -/// Spawning over 9 bytes is possible for instance, if we do +/// Spanning over 9 bytes is possible for instance, if we do /// bitpacking with an amplitude of 63 bits. /// In this case, the second int will start on bit /// 63 (which belongs to byte 7) and ends at byte 15; @@ -88,7 +88,8 @@ impl BitPacker { pub struct BitUnpacker - where Data: Deref +where + Data: Deref, { num_bits: usize, mask: u64, @@ -96,7 +97,8 @@ pub struct BitUnpacker } impl BitUnpacker - where Data: Deref +where + Data: Deref, { pub fn new(data: Data, num_bits: usize) -> BitUnpacker { let mask: u64 = if num_bits == 64 { @@ -121,33 +123,13 @@ impl BitUnpacker let addr_in_bits = idx * num_bits; let addr = addr_in_bits >> 3; let bit_shift = addr_in_bits & 7; - if cfg!(feature = "simdcompression") { - // for simdcompression, - // the bitpacker is only used for fastfields, - // and we expect them to be always padded. - debug_assert!( - addr + 8 <= data.len(), - "The fast field field should have been padded with 7 bytes." - ); - let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) }; - let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; - (val_shifted & mask) - } - else { - let val_unshifted_unmasked: u64; - if addr + 8 <= data.len() { - val_unshifted_unmasked = unsafe { *(data[addr..].as_ptr() as *const u64) }; - } - else { - let mut buffer = [0u8; 8]; - for i in addr..data.len() { - buffer[i - addr] += data[i]; - } - val_unshifted_unmasked = unsafe { *(buffer[..].as_ptr() as *const u64) }; - } - let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; - (val_shifted & mask) - } + debug_assert!( + addr + 8 <= data.len(), + "The fast field field should have been padded with 7 bytes." + ); + let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) }; + let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; + (val_shifted & mask) } pub fn get_range(&self, start: u32, output: &mut [u64]) { diff --git a/src/common/composite_file.rs b/src/common/composite_file.rs new file mode 100644 index 000000000..4ab843d38 --- /dev/null +++ b/src/common/composite_file.rs @@ -0,0 +1,191 @@ +use std::io::Write; +use common::CountingWriter; +use std::collections::HashMap; +use schema::Field; +use common::VInt; +use directory::WritePtr; +use std::io; +use directory::ReadOnlySource; +use common::BinarySerializable; + + +/// A `CompositeWrite` is used to write a `CompositeFile`. 
+pub struct CompositeWrite { + write: CountingWriter, + offsets: HashMap, +} + +impl CompositeWrite { + /// Crate a new API writer that writes a composite file + /// in a given write. + pub fn wrap(w: W) -> CompositeWrite { + CompositeWrite { + write: CountingWriter::wrap(w), + offsets: HashMap::new(), + } + } + + /// Start writing a new field. + pub fn for_field(&mut self, field: Field) -> &mut CountingWriter { + let offset = self.write.written_bytes(); + assert!(!self.offsets.contains_key(&field)); + self.offsets.insert(field, offset); + &mut self.write + } + + + /// Close the composite file. + /// + /// An index of the different field offsets + /// will be written as a footer. + pub fn close(mut self) -> io::Result<()> { + let footer_offset = self.write.written_bytes(); + VInt(self.offsets.len() as u64).serialize(&mut self.write)?; + + let mut offset_fields: Vec<_> = self.offsets + .iter() + .map(|(field, offset)| (offset, field)) + .collect(); + + offset_fields.sort(); + + let mut prev_offset = 0; + for (offset, field) in offset_fields { + VInt((offset - prev_offset) as u64).serialize( + &mut self.write, + )?; + field.serialize(&mut self.write)?; + prev_offset = *offset; + } + + let footer_len = (self.write.written_bytes() - footer_offset) as u32; + footer_len.serialize(&mut self.write)?; + self.write.flush()?; + Ok(()) + } +} + + +/// A composite file is an abstraction to store a +/// file partitioned by field. +/// +/// The file needs to be written field by field. +/// A footer describes the start and stop offsets +/// for each field. +#[derive(Clone)] +pub struct CompositeFile { + data: ReadOnlySource, + offsets_index: HashMap, +} + +impl CompositeFile { + /// Opens a composite file stored in a given + /// `ReadOnlySource`. + pub fn open(data: ReadOnlySource) -> io::Result { + let end = data.len(); + let footer_len_data = data.slice_from(end - 4); + let footer_len = u32::deserialize(&mut footer_len_data.as_slice())? as usize; + + let footer_start = end - 4 - footer_len; + let footer_data = data.slice(footer_start, footer_start + footer_len); + let mut footer_buffer = footer_data.as_slice(); + let num_fields = VInt::deserialize(&mut footer_buffer)?.0 as usize; + + let mut fields = vec![]; + let mut offsets = vec![]; + + let mut field_index = HashMap::new(); + + let mut offset = 0; + for _ in 0..num_fields { + offset += VInt::deserialize(&mut footer_buffer)?.0 as usize; + let field = Field::deserialize(&mut footer_buffer)?; + offsets.push(offset); + fields.push(field); + } + offsets.push(footer_start); + for i in 0..num_fields { + let field = fields[i]; + let start_offset = offsets[i]; + let end_offset = offsets[i + 1]; + field_index.insert(field, (start_offset, end_offset)); + } + + Ok(CompositeFile { + data: data.slice_to(footer_start), + offsets_index: field_index, + }) + } + + /// Returns a composite file that stores + /// no fields. + pub fn empty() -> CompositeFile { + CompositeFile { + offsets_index: HashMap::new(), + data: ReadOnlySource::empty(), + } + } + + /// Returns the `ReadOnlySource` associated + /// to a given `Field` and stored in a `CompositeFile`. 
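// Illustrative sketch (not part of the diff): the intended round trip for the
// new composite-file API, condensed from the unit test further down. Data is
// written field by field through `CompositeWrite::for_field`, `close()`
// appends the footer of per-field offsets, and `CompositeFile::open_read`
// later returns only the byte range belonging to one field. The path name and
// the VInt payloads below are arbitrary illustration values.
fn composite_file_roundtrip_sketch() -> std::io::Result<()> {
    use std::io::Write;
    use std::path::Path;
    use directory::{Directory, RAMDirectory};
    use schema::Field;
    use common::{BinarySerializable, VInt};

    let mut directory = RAMDirectory::create();
    let path = Path::new("composite_example");
    {
        let write = directory.open_write(path).unwrap();
        let mut composite_write = CompositeWrite::wrap(write);
        {
            // Everything written in this scope belongs to field 0.
            let mut write_0 = composite_write.for_field(Field(0u32));
            VInt(17u64).serialize(&mut write_0)?;
            write_0.flush()?;
        }
        {
            // ... and this byte range belongs to field 2.
            let mut write_2 = composite_write.for_field(Field(2u32));
            VInt(42u64).serialize(&mut write_2)?;
            write_2.flush()?;
        }
        // Serializes the (field -> offset) footer and flushes the underlying writer.
        composite_write.close()?;
    }
    let source = directory.open_read(path).unwrap();
    let composite_file = CompositeFile::open(source)?;
    // `open_read` returns `None` for a field that was never written.
    let field_0_data = composite_file.open_read(Field(0u32)).expect("field 0 was written");
    let mut field_0_bytes = field_0_data.as_slice();
    assert_eq!(VInt::deserialize(&mut field_0_bytes)?.0, 17u64);
    Ok(())
}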
+ pub fn open_read(&self, field: Field) -> Option { + self.offsets_index.get(&field).map(|&(from, to)| { + self.data.slice(from, to) + }) + } +} + + +#[cfg(test)] +mod test { + + use std::io::Write; + use super::{CompositeWrite, CompositeFile}; + use directory::{RAMDirectory, Directory}; + use schema::Field; + use common::VInt; + use common::BinarySerializable; + use std::path::Path; + + #[test] + fn test_composite_file() { + let path = Path::new("test_path"); + let mut directory = RAMDirectory::create(); + { + let w = directory.open_write(path).unwrap(); + let mut composite_write = CompositeWrite::wrap(w); + { + let mut write_0 = composite_write.for_field(Field(0u32)); + VInt(32431123u64).serialize(&mut write_0).unwrap(); + write_0.flush().unwrap(); + } + + { + let mut write_4 = composite_write.for_field(Field(4u32)); + VInt(2).serialize(&mut write_4).unwrap(); + write_4.flush().unwrap(); + } + composite_write.close().unwrap(); + } + { + let r = directory.open_read(path).unwrap(); + let composite_file = CompositeFile::open(r).unwrap(); + { + let file0 = composite_file.open_read(Field(0u32)).unwrap(); + let mut file0_buf = file0.as_slice(); + let payload_0 = VInt::deserialize(&mut file0_buf).unwrap().0; + assert_eq!(file0_buf.len(), 0); + assert_eq!(payload_0, 32431123u64); + } + { + let file4 = composite_file.open_read(Field(4u32)).unwrap(); + let mut file4_buf = file4.as_slice(); + let payload_4 = VInt::deserialize(&mut file4_buf).unwrap().0; + assert_eq!(file4_buf.len(), 0); + assert_eq!(payload_4, 2u64); + } + } + + } + +} diff --git a/src/common/counting_writer.rs b/src/common/counting_writer.rs index db13e368f..d9ea877d2 100644 --- a/src/common/counting_writer.rs +++ b/src/common/counting_writer.rs @@ -2,7 +2,7 @@ use std::io::Write; use std::io; -pub struct CountingWriter { +pub struct CountingWriter { underlying: W, written_bytes: usize, } diff --git a/src/common/mod.rs b/src/common/mod.rs index 0af9d2417..803fe8bde 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -1,9 +1,13 @@ + mod serialize; mod timer; mod vint; mod counting_writer; +mod composite_file; pub mod bitpacker; + +pub(crate) use self::composite_file::{CompositeWrite, CompositeFile}; pub use self::serialize::BinarySerializable; pub use self::timer::Timing; pub use self::timer::TimerTree; diff --git a/src/common/serialize.rs b/src/common/serialize.rs index ee86247c5..87b735769 100644 --- a/src/common/serialize.rs +++ b/src/common/serialize.rs @@ -101,9 +101,9 @@ impl BinarySerializable for String { fn deserialize(reader: &mut R) -> io::Result { let string_length = VInt::deserialize(reader)?.val() as usize; let mut result = String::with_capacity(string_length); - reader - .take(string_length as u64) - .read_to_string(&mut result)?; + reader.take(string_length as u64).read_to_string( + &mut result, + )?; Ok(result) } } diff --git a/src/common/timer.rs b/src/common/timer.rs index 035bd65de..84e0f8c3a 100644 --- a/src/common/timer.rs +++ b/src/common/timer.rs @@ -24,16 +24,14 @@ impl<'a> OpenTimer<'a> { impl<'a> Drop for OpenTimer<'a> { fn drop(&mut self) { - self.timer_tree - .timings - .push(Timing { - name: self.name, - duration: self.start - .to(PreciseTime::now()) - .num_microseconds() - .unwrap(), - depth: self.depth, - }); + self.timer_tree.timings.push(Timing { + name: self.name, + duration: self.start + .to(PreciseTime::now()) + .num_microseconds() + .unwrap(), + depth: self.depth, + }); } } diff --git a/src/common/vint.rs b/src/common/vint.rs index 39653e8a7..07cdfa24c 100644 --- a/src/common/vint.rs +++ 
b/src/common/vint.rs @@ -47,7 +47,12 @@ impl BinarySerializable for VInt { } shift += 7; } - _ => return Err(io::Error::new(io::ErrorKind::InvalidData, "Reach end of buffer")), + _ => { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "Reach end of buffer", + )) + } } } Ok(VInt(result)) diff --git a/src/compression/composite.rs b/src/compression/composite.rs deleted file mode 100644 index c363860ee..000000000 --- a/src/compression/composite.rs +++ /dev/null @@ -1,170 +0,0 @@ -use super::{BlockEncoder, BlockDecoder}; -use super::NUM_DOCS_PER_BLOCK; -use compression::{VIntEncoder, VIntDecoder}; - -pub struct CompositeEncoder { - block_encoder: BlockEncoder, - output: Vec, -} - -impl CompositeEncoder { - pub fn new() -> CompositeEncoder { - CompositeEncoder { - block_encoder: BlockEncoder::new(), - output: Vec::with_capacity(500_000), - } - } - - pub fn compress_sorted(&mut self, vals: &[u32]) -> &[u8] { - self.output.clear(); - let num_blocks = vals.len() / NUM_DOCS_PER_BLOCK; - let mut offset = 0u32; - for i in 0..num_blocks { - let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK..(i + 1) * NUM_DOCS_PER_BLOCK]; - let block_compressed = self.block_encoder.compress_block_sorted(vals_slice, offset); - offset = vals_slice[NUM_DOCS_PER_BLOCK - 1]; - self.output.extend_from_slice(block_compressed); - } - let vint_compressed = - self.block_encoder - .compress_vint_sorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..], offset); - self.output.extend_from_slice(vint_compressed); - &self.output - } - - pub fn compress_unsorted(&mut self, vals: &[u32]) -> &[u8] { - self.output.clear(); - let num_blocks = vals.len() / NUM_DOCS_PER_BLOCK; - for i in 0..num_blocks { - let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK..(i + 1) * NUM_DOCS_PER_BLOCK]; - let block_compressed = self.block_encoder.compress_block_unsorted(vals_slice); - self.output.extend_from_slice(block_compressed); - } - let vint_compressed = self.block_encoder - .compress_vint_unsorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..]); - self.output.extend_from_slice(vint_compressed); - &self.output - } -} - - -pub struct CompositeDecoder { - block_decoder: BlockDecoder, - vals: Vec, -} - - -impl CompositeDecoder { - pub fn new() -> CompositeDecoder { - CompositeDecoder { - block_decoder: BlockDecoder::new(), - vals: Vec::with_capacity(500_000), - } - } - - pub fn uncompress_sorted(&mut self, - mut compressed_data: &[u8], - uncompressed_len: usize) - -> &[u32] { - if uncompressed_len > self.vals.capacity() { - let extra_capacity = uncompressed_len - self.vals.capacity(); - self.vals.reserve(extra_capacity); - } - let mut offset = 0u32; - self.vals.clear(); - let num_blocks = uncompressed_len / NUM_DOCS_PER_BLOCK; - for _ in 0..num_blocks { - compressed_data = self.block_decoder - .uncompress_block_sorted(compressed_data, offset); - offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1); - self.vals - .extend_from_slice(self.block_decoder.output_array()); - } - self.block_decoder - .uncompress_vint_sorted(compressed_data, - offset, - uncompressed_len % NUM_DOCS_PER_BLOCK); - self.vals - .extend_from_slice(self.block_decoder.output_array()); - &self.vals - } - - pub fn uncompress_unsorted(&mut self, - mut compressed_data: &[u8], - uncompressed_len: usize) - -> &[u32] { - self.vals.clear(); - let num_blocks = uncompressed_len / NUM_DOCS_PER_BLOCK; - for _ in 0..num_blocks { - compressed_data = self.block_decoder - .uncompress_block_unsorted(compressed_data); - self.vals - .extend_from_slice(self.block_decoder.output_array()); - } - self.block_decoder - 
.uncompress_vint_unsorted(compressed_data, uncompressed_len % NUM_DOCS_PER_BLOCK); - self.vals - .extend_from_slice(self.block_decoder.output_array()); - &self.vals - } -} - -impl Into> for CompositeDecoder { - fn into(self) -> Vec { - self.vals - } -} - - -#[cfg(test)] -pub mod tests { - - use test::Bencher; - use super::*; - use tests; - - #[test] - fn test_composite_unsorted() { - let data = tests::generate_array(10_000, 0.1); - let mut encoder = CompositeEncoder::new(); - let compressed = encoder.compress_unsorted(&data); - assert!(compressed.len() <= 19_794); - let mut decoder = CompositeDecoder::new(); - let result = decoder.uncompress_unsorted(&compressed, data.len()); - for i in 0..data.len() { - assert_eq!(data[i], result[i]); - } - } - - #[test] - fn test_composite_sorted() { - let data = tests::generate_array(10_000, 0.1); - let mut encoder = CompositeEncoder::new(); - let compressed = encoder.compress_sorted(&data); - assert!(compressed.len() <= 7_826); - let mut decoder = CompositeDecoder::new(); - let result = decoder.uncompress_sorted(&compressed, data.len()); - for i in 0..data.len() { - assert_eq!(data[i], result[i]); - } - } - - - const BENCH_NUM_INTS: usize = 99_968; - - #[bench] - fn bench_compress(b: &mut Bencher) { - let mut encoder = CompositeEncoder::new(); - let data = tests::generate_array(BENCH_NUM_INTS, 0.1); - b.iter(|| { encoder.compress_sorted(&data); }); - } - - #[bench] - fn bench_uncompress(b: &mut Bencher) { - let mut encoder = CompositeEncoder::new(); - let data = tests::generate_array(BENCH_NUM_INTS, 0.1); - let compressed = encoder.compress_sorted(&data); - let mut decoder = CompositeDecoder::new(); - b.iter(|| { decoder.uncompress_sorted(compressed, BENCH_NUM_INTS); }); - } -} diff --git a/src/compression/mod.rs b/src/compression/mod.rs index 0c3df4b2f..cd40e4f1a 100644 --- a/src/compression/mod.rs +++ b/src/compression/mod.rs @@ -1,52 +1,88 @@ #![allow(dead_code)] -mod composite; -pub use self::composite::{CompositeEncoder, CompositeDecoder}; +mod stream; +pub use self::stream::CompressedIntStream; -#[cfg(not(feature="simdcompression"))] +#[cfg(not(feature = "simdcompression"))] mod pack { mod compression_pack_nosimd; - pub use self::compression_pack_nosimd::*; + pub use self::compression_pack_nosimd::{BlockEncoder, BlockDecoder}; } -#[cfg(feature="simdcompression")] +#[cfg(feature = "simdcompression")] mod pack { mod compression_pack_simd; - pub use self::compression_pack_simd::*; + pub use self::compression_pack_simd::{BlockEncoder, BlockDecoder}; } pub use self::pack::{BlockEncoder, BlockDecoder}; -#[cfg( any(not(feature="simdcompression"), target_env="msvc") )] +#[cfg(any(not(feature = "simdcompression"), target_env = "msvc"))] mod vint { mod compression_vint_nosimd; - pub use self::compression_vint_nosimd::*; + pub(crate) use self::compression_vint_nosimd::*; } -#[cfg( all(feature="simdcompression", not(target_env="msvc")) )] +#[cfg(all(feature = "simdcompression", not(target_env = "msvc")))] mod vint { mod compression_vint_simd; - pub use self::compression_vint_simd::*; + pub(crate) use self::compression_vint_simd::*; } +/// Returns the size in bytes of a compressed block, given num_bits. +pub fn compressed_block_size(num_bits: u8) -> usize { + 1 + (num_bits as usize) * 16 +} pub trait VIntEncoder { + /// Compresses an array of `u32` integers, + /// using [delta-encoding](https://en.wikipedia.org/wiki/Delta_encoding) + /// and variable bytes encoding. 
+ /// + /// The method takes an array of ints to compress, and returns + /// a `&[u8]` representing the compressed data. + /// + /// The method also takes an offset to give the value of the + /// hypothetical previous element in the delta-encoding. fn compress_vint_sorted(&mut self, input: &[u32], offset: u32) -> &[u8]; + + /// Compresses an array of `u32` integers, + /// using variable bytes encoding. + /// + /// The method takes an array of ints to compress, and returns + /// a `&[u8]` representing the compressed data. fn compress_vint_unsorted(&mut self, input: &[u32]) -> &[u8]; } pub trait VIntDecoder { - fn uncompress_vint_sorted<'a>(&mut self, - compressed_data: &'a [u8], - offset: u32, - num_els: usize) - -> &'a [u8]; - fn uncompress_vint_unsorted<'a>(&mut self, - compressed_data: &'a [u8], - num_els: usize) - -> &'a [u8]; + /// Uncompress an array of `u32` integers, + /// that were compressed using [delta-encoding](https://en.wikipedia.org/wiki/Delta_encoding) + /// and variable bytes encoding. + /// + /// The method takes a number of int to decompress, and returns + /// the amount of bytes that were read to decompress them. + /// + /// The method also takes an offset to give the value of the + /// hypothetical previous element in the delta-encoding. + /// + /// For instance, if delta encoded are `1, 3, 9`, and the + /// `offset` is 5, then the output will be: + /// `5 + 1 = 6, 6 + 3= 9, 9 + 9 = 18` + fn uncompress_vint_sorted<'a>( + &mut self, + compressed_data: &'a [u8], + offset: u32, + num_els: usize, + ) -> usize; + + /// Uncompress an array of `u32s`, compressed using variable + /// byte encoding. + /// + /// The method takes a number of int to decompress, and returns + /// the amount of bytes that were read to decompress them. + fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize; } impl VIntEncoder for BlockEncoder { @@ -60,26 +96,24 @@ impl VIntEncoder for BlockEncoder { } impl VIntDecoder for BlockDecoder { - fn uncompress_vint_sorted<'a>(&mut self, - compressed_data: &'a [u8], - offset: u32, - num_els: usize) - -> &'a [u8] { + fn uncompress_vint_sorted<'a>( + &mut self, + compressed_data: &'a [u8], + offset: u32, + num_els: usize, + ) -> usize { self.output_len = num_els; vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset) } - fn uncompress_vint_unsorted<'a>(&mut self, - compressed_data: &'a [u8], - num_els: usize) - -> &'a [u8] { + fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize { self.output_len = num_els; vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els]) } } -pub const NUM_DOCS_PER_BLOCK: usize = 128; //< should be a power of 2 to let the compiler optimize. 
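// Illustrative sketch (not part of the diff): with COMPRESSION_BLOCK_SIZE = 128
// integers per block, a bit-packed block is one header byte (the bit width)
// followed by 128 * num_bits bits, i.e. `1 + 16 * num_bits` bytes. This is
// exactly what `compressed_block_size` above computes, and what the new
// `test_all_docs_compression_numbits` test below checks against the encoder.
fn compressed_block_size_sketch() {
    assert_eq!(compressed_block_size(0), 1); // header byte only
    assert_eq!(compressed_block_size(7), 1 + 16 * 7);
    assert_eq!(compressed_block_size(32), 1 + 16 * 32); // worst case: full 32-bit values
}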
+pub const COMPRESSION_BLOCK_SIZE: usize = 128; #[cfg(test)] pub mod tests { @@ -95,8 +129,8 @@ pub mod tests { let compressed_data = encoder.compress_block_sorted(&vals, 0); let mut decoder = BlockDecoder::new(); { - let remaining_data = decoder.uncompress_block_sorted(compressed_data, 0); - assert_eq!(remaining_data.len(), 0); + let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 0); + assert_eq!(consumed_num_bytes, compressed_data.len()); } for i in 0..128 { assert_eq!(vals[i], decoder.output(i)); @@ -110,8 +144,8 @@ pub mod tests { let compressed_data = encoder.compress_block_sorted(&vals, 10); let mut decoder = BlockDecoder::new(); { - let remaining_data = decoder.uncompress_block_sorted(compressed_data, 10); - assert_eq!(remaining_data.len(), 0); + let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 10); + assert_eq!(consumed_num_bytes, compressed_data.len()); } for i in 0..128 { assert_eq!(vals[i], decoder.output(i)); @@ -129,9 +163,9 @@ pub mod tests { compressed.push(173u8); let mut decoder = BlockDecoder::new(); { - let remaining_data = decoder.uncompress_block_sorted(&compressed, 10); - assert_eq!(remaining_data.len(), 1); - assert_eq!(remaining_data[0], 173u8); + let consumed_num_bytes = decoder.uncompress_block_sorted(&compressed, 10); + assert_eq!(consumed_num_bytes, compressed.len() - 1); + assert_eq!(compressed[consumed_num_bytes], 173u8); } for i in 0..n { assert_eq!(vals[i], decoder.output(i)); @@ -149,9 +183,9 @@ pub mod tests { compressed.push(173u8); let mut decoder = BlockDecoder::new(); { - let remaining_data = decoder.uncompress_block_unsorted(&compressed); - assert_eq!(remaining_data.len(), 1); - assert_eq!(remaining_data[0], 173u8); + let consumed_num_bytes = decoder.uncompress_block_unsorted(&compressed); + assert_eq!(consumed_num_bytes + 1, compressed.len()); + assert_eq!(compressed[consumed_num_bytes], 173u8); } for i in 0..n { assert_eq!(vals[i], decoder.output(i)); @@ -169,9 +203,9 @@ pub mod tests { let encoded_data = encoder.compress_vint_sorted(&input, *offset); assert!(encoded_data.len() <= expected_length); let mut decoder = BlockDecoder::new(); - let remaining_data = + let consumed_num_bytes = decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len()); - assert_eq!(0, remaining_data.len()); + assert_eq!(consumed_num_bytes, encoded_data.len()); assert_eq!(input, decoder.output_array()); } } @@ -181,19 +215,32 @@ pub mod tests { #[bench] fn bench_compress(b: &mut Bencher) { let mut encoder = BlockEncoder::new(); - let data = tests::generate_array(NUM_DOCS_PER_BLOCK, 0.1); + let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1); b.iter(|| { encoder.compress_block_sorted(&data, 0u32); }); } #[bench] fn bench_uncompress(b: &mut Bencher) { let mut encoder = BlockEncoder::new(); - let data = tests::generate_array(NUM_DOCS_PER_BLOCK, 0.1); + let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1); let compressed = encoder.compress_block_sorted(&data, 0u32); let mut decoder = BlockDecoder::new(); b.iter(|| { decoder.uncompress_block_sorted(compressed, 0u32); }); } + #[test] + fn test_all_docs_compression_numbits() { + for num_bits in 0..33 { + let mut data = [0u32; 128]; + if num_bits > 0 { + data[0] = 1 << (num_bits - 1); + } + let mut encoder = BlockEncoder::new(); + let compressed = encoder.compress_block_unsorted(&data); + assert_eq!(compressed[0] as usize, num_bits); + assert_eq!(compressed.len(), compressed_block_size(compressed[0])); + } + } const NUM_INTS_BENCH_VINT: usize = 10; @@ -210,7 
+257,9 @@ pub mod tests { let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001); let compressed = encoder.compress_vint_sorted(&data, 0u32); let mut decoder = BlockDecoder::new(); - b.iter(|| { decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT); }); + b.iter(|| { + decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT); + }); } } diff --git a/src/compression/pack/compression_pack_nosimd.rs b/src/compression/pack/compression_pack_nosimd.rs index 2e33d0b1e..ee5ab7c39 100644 --- a/src/compression/pack/compression_pack_nosimd.rs +++ b/src/compression/pack/compression_pack_nosimd.rs @@ -3,15 +3,15 @@ use common::bitpacker::{BitPacker, BitUnpacker}; use common::CountingWriter; use std::cmp; use std::io::Write; -use super::super::NUM_DOCS_PER_BLOCK; +use super::super::COMPRESSION_BLOCK_SIZE; -const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1; +const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1; pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usize { let mut max_delta = 0; { let mut local_offset = offset; - for i in 0..NUM_DOCS_PER_BLOCK { + for i in 0..COMPRESSION_BLOCK_SIZE { let val = vals[i]; let delta = val - local_offset; max_delta = cmp::max(max_delta, delta); @@ -22,6 +22,7 @@ pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usiz let mut counting_writer = CountingWriter::wrap(output); let num_bits = compute_num_bits(max_delta as u64); counting_writer.write_all(&[num_bits]).unwrap(); + let mut bit_packer = BitPacker::new(num_bits as usize); for val in vals { bit_packer.write(*val as u64, &mut counting_writer).unwrap(); @@ -34,7 +35,7 @@ pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usiz pub struct BlockEncoder { pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE], pub output_len: usize, - input_buffer: [u32; NUM_DOCS_PER_BLOCK], + input_buffer: [u32; COMPRESSION_BLOCK_SIZE], } impl BlockEncoder { @@ -42,7 +43,7 @@ impl BlockEncoder { BlockEncoder { output: [0u8; COMPRESSED_BLOCK_MAX_SIZE], output_len: 0, - input_buffer: [0u32; NUM_DOCS_PER_BLOCK], + input_buffer: [0u32; COMPRESSION_BLOCK_SIZE], } } @@ -55,10 +56,9 @@ impl BlockEncoder { pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] { let compressed_size = { let output: &mut [u8] = &mut self.output; - let max = vals.iter() - .cloned() - .max() - .expect("compress unsorted called with an empty array"); + let max = vals.iter().cloned().max().expect( + "compress unsorted called with an empty array", + ); let num_bits = compute_num_bits(max as u64); let mut counting_writer = CountingWriter::wrap(output); counting_writer.write_all(&[num_bits]).unwrap(); @@ -66,8 +66,16 @@ impl BlockEncoder { for val in vals { bit_packer.write(*val as u64, &mut counting_writer).unwrap(); } - bit_packer.flush(&mut counting_writer); - // we voluntarility avoid writing "closing", because we + for _ in vals.len()..COMPRESSION_BLOCK_SIZE { + bit_packer + .write(vals[0] as u64, &mut counting_writer) + .unwrap(); + } + bit_packer.flush(&mut counting_writer).expect( + "Flushing the bitpacking \ + in an in RAM buffer should never fail", + ); + // we avoid writing "closing", because we // do not want 7 bytes of padding here. 
counting_writer.written_bytes() }; @@ -93,34 +101,35 @@ impl BlockDecoder { } } - pub fn uncompress_block_sorted<'a>(&mut self, - compressed_data: &'a [u8], - mut offset: u32) - -> &'a [u8] { + pub fn uncompress_block_sorted<'a>( + &mut self, + compressed_data: &'a [u8], + mut offset: u32, + ) -> usize { let consumed_size = { let num_bits = compressed_data[0]; let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize); - for i in 0..NUM_DOCS_PER_BLOCK { + for i in 0..COMPRESSION_BLOCK_SIZE { let delta = bit_unpacker.get(i); let val = offset + delta as u32; self.output[i] = val; offset = val; } - 1 + (num_bits as usize * NUM_DOCS_PER_BLOCK + 7) / 8 + 1 + (num_bits as usize * COMPRESSION_BLOCK_SIZE + 7) / 8 }; - self.output_len = NUM_DOCS_PER_BLOCK; - &compressed_data[consumed_size..] + self.output_len = COMPRESSION_BLOCK_SIZE; + consumed_size } - pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a [u8] { + pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize { let num_bits = compressed_data[0]; let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize); - for i in 0..NUM_DOCS_PER_BLOCK { + for i in 0..COMPRESSION_BLOCK_SIZE { self.output[i] = bit_unpacker.get(i) as u32; } - let consumed_size = 1 + (num_bits as usize * NUM_DOCS_PER_BLOCK + 7) / 8; - self.output_len = NUM_DOCS_PER_BLOCK; - &compressed_data[consumed_size..] + let consumed_size = 1 + (num_bits as usize * COMPRESSION_BLOCK_SIZE + 7) / 8; + self.output_len = COMPRESSION_BLOCK_SIZE; + consumed_size } #[inline] diff --git a/src/compression/pack/compression_pack_simd.rs b/src/compression/pack/compression_pack_simd.rs index 78cf58c37..498eb7852 100644 --- a/src/compression/pack/compression_pack_simd.rs +++ b/src/compression/pack/compression_pack_simd.rs @@ -1,6 +1,6 @@ -use super::super::NUM_DOCS_PER_BLOCK; +use super::super::COMPRESSION_BLOCK_SIZE; -const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1; +const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1; mod simdcomp { use libc::size_t; @@ -8,10 +8,11 @@ mod simdcomp { extern "C" { pub fn compress_sorted(data: *const u32, output: *mut u8, offset: u32) -> size_t; - pub fn uncompress_sorted(compressed_data: *const u8, - output: *mut u32, - offset: u32) - -> size_t; + pub fn uncompress_sorted( + compressed_data: *const u8, + output: *mut u32, + offset: u32, + ) -> size_t; pub fn compress_unsorted(data: *const u32, output: *mut u8) -> size_t; @@ -78,19 +79,16 @@ impl BlockDecoder { } } - pub fn uncompress_block_sorted<'a>(&mut self, - compressed_data: &'a [u8], - offset: u32) - -> &'a [u8] { + pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32) -> usize { let consumed_size = uncompress_sorted(compressed_data, &mut self.output, offset); - self.output_len = NUM_DOCS_PER_BLOCK; - &compressed_data[consumed_size..] + self.output_len = COMPRESSION_BLOCK_SIZE; + consumed_size } - pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a [u8] { + pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize { let consumed_size = uncompress_unsorted(compressed_data, &mut self.output); - self.output_len = NUM_DOCS_PER_BLOCK; - &compressed_data[consumed_size..] 
+ self.output_len = COMPRESSION_BLOCK_SIZE; + consumed_size } #[inline] @@ -117,4 +115,5 @@ mod tests { let compressed = encoder.compress_block_sorted(&data, 0u32); assert_eq!(compressed.len(), 17); } + } diff --git a/src/compression/stream.rs b/src/compression/stream.rs new file mode 100644 index 000000000..9071d0fff --- /dev/null +++ b/src/compression/stream.rs @@ -0,0 +1,135 @@ +use compression::BlockDecoder; +use compression::COMPRESSION_BLOCK_SIZE; +use compression::compressed_block_size; +use directory::{ReadOnlySource, SourceRead}; + +/// Reads a stream of compressed ints. +/// +/// Tantivy uses `CompressedIntStream` to read +/// the position file. +/// The `.skip(...)` makes it possible to avoid +/// decompressing blocks that are not required. +pub struct CompressedIntStream { + buffer: SourceRead, + block_decoder: BlockDecoder, + inner_offset: usize, +} + +impl CompressedIntStream { + /// Opens a compressed int stream. + pub(crate) fn wrap(source: ReadOnlySource) -> CompressedIntStream { + CompressedIntStream { + buffer: SourceRead::from(source), + block_decoder: BlockDecoder::new(), + inner_offset: COMPRESSION_BLOCK_SIZE, + } + } + + /// Fills a buffer with the next `output.len()` integers, + /// and advance the stream by that many els. + pub fn read(&mut self, output: &mut [u32]) { + let mut num_els: usize = output.len(); + let mut start: usize = 0; + loop { + let available = COMPRESSION_BLOCK_SIZE - self.inner_offset; + if num_els >= available { + if available > 0 { + let uncompressed_block = &self.block_decoder.output_array() + [self.inner_offset..]; + &mut output[start..start + available].clone_from_slice(uncompressed_block); + } + num_els -= available; + start += available; + let num_consumed_bytes = self.block_decoder.uncompress_block_unsorted( + self.buffer.as_ref(), + ); + self.buffer.advance(num_consumed_bytes); + self.inner_offset = 0; + } else { + let uncompressed_block = &self.block_decoder.output_array()[self.inner_offset.. + self.inner_offset + + num_els]; + &output[start..start + num_els].clone_from_slice(uncompressed_block); + self.inner_offset += num_els; + break; + } + } + } + + + /// Skip the next `skip_len` integer. + /// + /// If a full block is skipped, calling + /// `.skip(...)` will avoid decompressing it. + pub fn skip(&mut self, mut skip_len: usize) { + let available = COMPRESSION_BLOCK_SIZE - self.inner_offset; + if available >= skip_len { + self.inner_offset += skip_len; + } else { + skip_len -= available; + // entirely skip decompressing some blocks. 
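// Note (not part of the diff): each compressed block begins with a one-byte
// bit width, so a whole block can be skipped by reading that byte and
// advancing `compressed_block_size(num_bits)` bytes, without decoding the
// block itself. Only the final, partially skipped block is decompressed.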
+ while skip_len >= COMPRESSION_BLOCK_SIZE { + skip_len -= COMPRESSION_BLOCK_SIZE; + let num_bits: u8 = self.buffer.as_ref()[0]; + let block_len = compressed_block_size(num_bits); + self.buffer.advance(block_len); + } + let num_consumed_bytes = self.block_decoder.uncompress_block_unsorted( + self.buffer.as_ref(), + ); + self.buffer.advance(num_consumed_bytes); + self.inner_offset = skip_len; + } + } +} + + +#[cfg(test)] +pub mod tests { + + use super::CompressedIntStream; + use compression::compressed_block_size; + use compression::COMPRESSION_BLOCK_SIZE; + use compression::BlockEncoder; + use directory::ReadOnlySource; + + fn create_stream_buffer() -> ReadOnlySource { + let mut buffer: Vec = vec![]; + let mut encoder = BlockEncoder::new(); + let vals: Vec = (0u32..1_025u32).collect(); + for chunk in vals.chunks(COMPRESSION_BLOCK_SIZE) { + let compressed_block = encoder.compress_block_unsorted(chunk); + let num_bits = compressed_block[0]; + assert_eq!(compressed_block_size(num_bits), compressed_block.len()); + buffer.extend_from_slice(compressed_block); + } + if cfg!(simd) { + buffer.extend_from_slice(&[0u8; 7]); + } + ReadOnlySource::from(buffer) + } + + #[test] + fn test_compressed_int_stream() { + let buffer = create_stream_buffer(); + let mut stream = CompressedIntStream::wrap(buffer); + let mut block: [u32; COMPRESSION_BLOCK_SIZE] = [0u32; COMPRESSION_BLOCK_SIZE]; + + stream.read(&mut block[0..2]); + assert_eq!(block[0], 0); + assert_eq!(block[1], 1); + stream.skip(5); + stream.read(&mut block[0..3]); + assert_eq!(block[0], 7); + assert_eq!(block[1], 8); + assert_eq!(block[2], 9); + stream.skip(500); + stream.read(&mut block[0..3]); + assert_eq!(block[0], 510); + assert_eq!(block[1], 511); + assert_eq!(block[2], 512); + stream.skip(511); + stream.read(&mut block[..1]); + assert_eq!(block[0], 1024); + } +} diff --git a/src/compression/vint/compression_vint_nosimd.rs b/src/compression/vint/compression_vint_nosimd.rs index a3af5e489..a87b6bb51 100644 --- a/src/compression/vint/compression_vint_nosimd.rs +++ b/src/compression/vint/compression_vint_nosimd.rs @@ -1,6 +1,10 @@ #[inline(always)] -pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] { +pub(crate) fn compress_sorted<'a>( + input: &[u32], + output: &'a mut [u8], + mut offset: u32, +) -> &'a [u8] { let mut byte_written = 0; for &v in input { let mut to_encode: u32 = v - offset; @@ -22,7 +26,7 @@ pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) } #[inline(always)] -pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] { +pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] { let mut byte_written = 0; for &v in input { let mut to_encode: u32 = v; @@ -43,10 +47,11 @@ pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] { } #[inline(always)] -pub fn uncompress_sorted<'a>(compressed_data: &'a [u8], - output: &mut [u32], - offset: u32) - -> &'a [u8] { +pub(crate) fn uncompress_sorted<'a>( + compressed_data: &'a [u8], + output: &mut [u32], + offset: u32, +) -> usize { let mut read_byte = 0; let mut result = offset; let num_els = output.len(); @@ -63,11 +68,11 @@ pub fn uncompress_sorted<'a>(compressed_data: &'a [u8], } output[i] = result; } - &compressed_data[read_byte..] 
+ read_byte } #[inline(always)] -pub fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> &'a [u8] { +pub(crate) fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> usize { let mut read_byte = 0; let num_els = output.len(); for i in 0..num_els { @@ -84,5 +89,5 @@ pub fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> } output[i] = result; } - &compressed_data[read_byte..] + read_byte } diff --git a/src/compression/vint/compression_vint_simd.rs b/src/compression/vint/compression_vint_simd.rs index dbeca660c..0b508a812 100644 --- a/src/compression/vint/compression_vint_simd.rs +++ b/src/compression/vint/compression_vint_simd.rs @@ -4,41 +4,47 @@ mod streamvbyte { use libc::size_t; extern "C" { - pub fn streamvbyte_delta_encode(data: *const u32, - num_els: u32, - output: *mut u8, - offset: u32) - -> size_t; + pub fn streamvbyte_delta_encode( + data: *const u32, + num_els: u32, + output: *mut u8, + offset: u32, + ) -> size_t; - pub fn streamvbyte_delta_decode(compressed_data: *const u8, - output: *mut u32, - num_els: u32, - offset: u32) - -> size_t; + pub fn streamvbyte_delta_decode( + compressed_data: *const u8, + output: *mut u32, + num_els: u32, + offset: u32, + ) -> size_t; pub fn streamvbyte_encode(data: *const u32, num_els: u32, output: *mut u8) -> size_t; - pub fn streamvbyte_decode(compressed_data: *const u8, - output: *mut u32, - num_els: usize) - -> size_t; + pub fn streamvbyte_decode( + compressed_data: *const u8, + output: *mut u32, + num_els: usize, + ) -> size_t; } } #[inline(always)] -pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] { +pub(crate) fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] { let compress_length = unsafe { - streamvbyte::streamvbyte_delta_encode(input.as_ptr(), - input.len() as u32, - output.as_mut_ptr(), - offset) + streamvbyte::streamvbyte_delta_encode( + input.as_ptr(), + input.len() as u32, + output.as_mut_ptr(), + offset, + ) }; &output[..compress_length] } + #[inline(always)] -pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] { +pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] { let compress_length = unsafe { streamvbyte::streamvbyte_encode(input.as_ptr(), input.len() as u32, output.as_mut_ptr()) }; @@ -46,23 +52,24 @@ pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] { } #[inline(always)] -pub fn uncompress_sorted<'a>(compressed_data: &'a [u8], - output: &mut [u32], - offset: u32) - -> &'a [u8] { - let consumed_bytes = unsafe { - streamvbyte::streamvbyte_delta_decode(compressed_data.as_ptr(), - output.as_mut_ptr(), - output.len() as u32, - offset) - }; - &compressed_data[consumed_bytes..] +pub(crate) fn uncompress_sorted<'a>( + compressed_data: &'a [u8], + output: &mut [u32], + offset: u32, +) -> usize { + unsafe { + streamvbyte::streamvbyte_delta_decode( + compressed_data.as_ptr(), + output.as_mut_ptr(), + output.len() as u32, + offset, + ) + } } #[inline(always)] -pub fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> &'a [u8] { - let consumed_bytes = unsafe { +pub(crate) fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> usize { + unsafe { streamvbyte::streamvbyte_decode(compressed_data.as_ptr(), output.as_mut_ptr(), output.len()) - }; - &compressed_data[consumed_bytes..] 
+ } } diff --git a/src/core/index.rs b/src/core/index.rs index 01a0abe54..e4acb8a07 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -48,9 +48,10 @@ impl Index { pub fn create_in_ram(schema: Schema) -> Index { let ram_directory = RAMDirectory::create(); // unwrap is ok here - let directory = ManagedDirectory::new(ram_directory) - .expect("Creating a managed directory from a brand new RAM directory \ - should never fail."); + let directory = ManagedDirectory::new(ram_directory).expect( + "Creating a managed directory from a brand new RAM directory \ + should never fail.", + ); Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail") } @@ -127,10 +128,11 @@ impl Index { /// If the lockfile already exists, returns `Error::FileAlreadyExists`. /// # Panics /// If the heap size per thread is too small, panics. - pub fn writer_with_num_threads(&self, - num_threads: usize, - heap_size_in_bytes: usize) - -> Result { + pub fn writer_with_num_threads( + &self, + num_threads: usize, + heap_size_in_bytes: usize, + ) -> Result { open_index_writer(self, num_threads, heap_size_in_bytes) } @@ -155,10 +157,12 @@ impl Index { /// Returns the list of segments that are searchable pub fn searchable_segments(&self) -> Result> { - Ok(self.searchable_segment_metas()? - .into_iter() - .map(|segment_meta| self.segment(segment_meta)) - .collect()) + Ok( + self.searchable_segment_metas()? + .into_iter() + .map(|segment_meta| self.segment(segment_meta)) + .collect(), + ) } #[doc(hidden)] @@ -190,10 +194,12 @@ impl Index { /// Returns the list of segment ids that are searchable. pub fn searchable_segment_ids(&self) -> Result> { - Ok(self.searchable_segment_metas()? - .iter() - .map(|segment_meta| segment_meta.id()) - .collect()) + Ok( + self.searchable_segment_metas()? + .iter() + .map(|segment_meta| segment_meta.id()) + .collect(), + ) } /// Creates a new generation of searchers after @@ -203,10 +209,12 @@ impl Index { /// published or after a merge. pub fn load_searchers(&self) -> Result<()> { let searchable_segments = self.searchable_segments()?; - let segment_readers: Vec = try!(searchable_segments - .into_iter() - .map(SegmentReader::open) - .collect()); + let segment_readers: Vec = try!( + searchable_segments + .into_iter() + .map(SegmentReader::open) + .collect() + ); let searchers = (0..NUM_SEARCHERS) .map(|_| Searcher::from(segment_readers.clone())) .collect(); diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs index 785846a0d..6eafddf77 100644 --- a/src/core/index_meta.rs +++ b/src/core/index_meta.rs @@ -9,7 +9,7 @@ use core::SegmentMeta; /// * the index docstamp /// * the schema /// -#[derive(Clone,Debug,Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct IndexMeta { pub segments: Vec, pub schema: Schema, diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs new file mode 100644 index 000000000..06816f361 --- /dev/null +++ b/src/core/inverted_index_reader.rs @@ -0,0 +1,164 @@ +use directory::{SourceRead, ReadOnlySource}; +use termdict::{TermDictionary, TermDictionaryImpl}; +use postings::{SegmentPostings, BlockSegmentPostings}; +use postings::TermInfo; +use postings::SegmentPostingsOption; +use schema::Term; +use std::cmp; +use fastfield::DeleteBitSet; +use schema::Schema; +use compression::CompressedIntStream; + + +/// The inverted index reader is in charge of accessing +/// the inverted index associated to a specific field. 
+/// +/// # Note +/// +/// It is safe to delete the segment associated to +/// an `InvertedIndexReader`. As long as it is open, +/// the `ReadOnlySource` it is relying on should +/// stay available. +/// +/// +/// `InvertedIndexReader` are created by calling +/// the `SegmentReader`'s [`.inverted_index(...)`] method +pub struct InvertedIndexReader { + termdict: TermDictionaryImpl, + postings_source: ReadOnlySource, + positions_source: ReadOnlySource, + delete_bitset: DeleteBitSet, + schema: Schema, +} + +impl InvertedIndexReader { + pub(crate) fn new( + termdict_source: ReadOnlySource, + postings_source: ReadOnlySource, + positions_source: ReadOnlySource, + delete_bitset: DeleteBitSet, + schema: Schema, + ) -> InvertedIndexReader { + + InvertedIndexReader { + termdict: TermDictionaryImpl::from_source(termdict_source), + postings_source: postings_source, + positions_source: positions_source, + delete_bitset: delete_bitset, + schema: schema, + } + } + + /// Returns the term info associated with the term. + pub fn get_term_info(&self, term: &Term) -> Option { + self.termdict.get(term.as_slice()) + } + + + /// Return the term dictionary datastructure. + pub fn terms(&self) -> &TermDictionaryImpl { + &self.termdict + } + + /// Resets the block segment to another position of the postings + /// file. + /// + /// This is useful for enumerating through a list of terms, + /// and consuming the associated posting lists while avoiding + /// reallocating a `BlockSegmentPostings`. + /// + /// # Warning + /// + /// This does not reset the positions list. + pub fn reset_block_postings_from_terminfo( + &self, + term_info: &TermInfo, + block_postings: &mut BlockSegmentPostings, + ) { + let offset = term_info.postings_offset as usize; + let end_source = self.postings_source.len(); + let postings_slice = self.postings_source.slice(offset, end_source); + let postings_reader = SourceRead::from(postings_slice); + block_postings.reset(term_info.doc_freq as usize, postings_reader); + } + + + + /// Returns a block postings given a `term_info`. + /// This method is for an advanced usage only. + /// + /// Most user should prefer using `read_postings` instead. + pub fn read_block_postings_from_terminfo( + &self, + term_info: &TermInfo, + option: SegmentPostingsOption, + ) -> BlockSegmentPostings { + let offset = term_info.postings_offset as usize; + let postings_data = self.postings_source.slice_from(offset); + let has_freq = option.has_freq(); + BlockSegmentPostings::from_data( + term_info.doc_freq as usize, + SourceRead::from(postings_data), + has_freq, + ) + } + + /// Returns a posting object given a `term_info`. + /// This method is for an advanced usage only. + /// + /// Most user should prefer using `read_postings` instead. 
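// Illustrative sketch (not part of the diff): the preferred, non-advanced path
// mentioned just above. The caller obtains the field's `InvertedIndexReader`
// from a `SegmentReader` and asks for the term's postings directly; the
// best-effort downgrade described below applies if positions were not indexed.
// `Term::from_field_text` and the `DocSet`-style cursor (`advance` / `doc`)
// are assumed from the rest of the crate, `SegmentReader` is the type from
// src/core/segment_reader.rs below, and the term text is an illustration value.
fn read_postings_sketch(segment_reader: &SegmentReader, body: Field) {
    let term = Term::from_field_text(body, "whale");
    let inverted_index = segment_reader.inverted_index(term.field());
    // `None` means the term never occurs in this segment.
    if let Some(mut postings) =
        inverted_index.read_postings(&term, SegmentPostingsOption::FreqAndPositions)
    {
        while postings.advance() {
            let _doc = postings.doc();
        }
    }
}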
+ pub fn read_postings_from_terminfo( + &self, + term_info: &TermInfo, + option: SegmentPostingsOption, + ) -> SegmentPostings { + let block_postings = self.read_block_postings_from_terminfo(term_info, option); + let delete_bitset = self.delete_bitset.clone(); + let position_stream = { + if option.has_positions() { + let position_offset = term_info.positions_offset; + let positions_source = self.positions_source.slice_from(position_offset as usize); + let mut stream = CompressedIntStream::wrap(positions_source); + stream.skip(term_info.positions_inner_offset as usize); + Some(stream) + } else { + None + } + }; + SegmentPostings::from_block_postings(block_postings, delete_bitset, position_stream) + } + + /// Returns the segment postings associated with the term, and with the given option, + /// or `None` if the term has never been encountered and indexed. + /// + /// If the field was not indexed with the indexing options that cover + /// the requested options, the returned `SegmentPostings` the method does not fail + /// and returns a `SegmentPostings` with as much information as possible. + /// + /// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a + /// `TextIndexingOptions` that does not index position will return a `SegmentPostings` + /// with `DocId`s and frequencies. + pub fn read_postings( + &self, + term: &Term, + option: SegmentPostingsOption, + ) -> Option { + let field = term.field(); + let field_entry = self.schema.get_field_entry(field); + let term_info = get!(self.get_term_info(term)); + let maximum_option = get!(field_entry.field_type().get_segment_postings_option()); + let best_effort_option = cmp::min(maximum_option, option); + Some(self.read_postings_from_terminfo( + &term_info, + best_effort_option, + )) + } + + /// Returns the number of documents containing the term. + pub fn doc_freq(&self, term: &Term) -> u32 { + match self.get_term_info(term) { + Some(term_info) => term_info.doc_freq, + None => 0, + } + } +} diff --git a/src/core/mod.rs b/src/core/mod.rs index dca8b5ccd..3a6c9568a 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -7,7 +7,9 @@ mod segment; mod index_meta; mod pool; mod segment_meta; +mod inverted_index_reader; +pub use self::inverted_index_reader::InvertedIndexReader; pub use self::searcher::Searcher; pub use self::segment_component::SegmentComponent; pub use self::segment_id::SegmentId; @@ -18,7 +20,6 @@ pub use self::index::Index; pub use self::segment_meta::SegmentMeta; pub use self::index_meta::IndexMeta; - use std::path::PathBuf; lazy_static! { diff --git a/src/core/pool.rs b/src/core/pool.rs index 805ea3467..1796fc32c 100644 --- a/src/core/pool.rs +++ b/src/core/pool.rs @@ -76,8 +76,11 @@ impl Pool { if former_generation >= generation { break; } - self.freshest_generation - .compare_and_swap(former_generation, generation, Ordering::SeqCst); + self.freshest_generation.compare_and_swap( + former_generation, + generation, + Ordering::SeqCst, + ); } } @@ -91,9 +94,9 @@ impl Pool { let gen_item = self.queue.pop(); if gen_item.generation >= generation { return LeasedItem { - gen_item: Some(gen_item), - recycle_queue: self.queue.clone(), - }; + gen_item: Some(gen_item), + recycle_queue: self.queue.clone(), + }; } else { // this searcher is obsolete, // removing it from the pool. 
@@ -113,25 +116,26 @@ impl Deref for LeasedItem { fn deref(&self) -> &T { &self.gen_item - .as_ref() - .expect("Unwrapping a leased item should never fail") - .item // unwrap is safe here + .as_ref() + .expect("Unwrapping a leased item should never fail") + .item // unwrap is safe here } } impl DerefMut for LeasedItem { fn deref_mut(&mut self) -> &mut T { &mut self.gen_item - .as_mut() - .expect("Unwrapping a mut leased item should never fail") - .item // unwrap is safe here + .as_mut() + .expect("Unwrapping a mut leased item should never fail") + .item // unwrap is safe here } } impl Drop for LeasedItem { fn drop(&mut self) { - let gen_item: GenerationItem = mem::replace(&mut self.gen_item, None) - .expect("Unwrapping a leased item should never fail"); + let gen_item: GenerationItem = mem::replace(&mut self.gen_item, None).expect( + "Unwrapping a leased item should never fail", + ); self.recycle_queue.push(gen_item); } } diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 6579698e2..14f1cb141 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -6,10 +6,11 @@ use common::TimerTree; use query::Query; use DocId; use DocAddress; -use schema::Term; -use termdict::TermMerger; +use schema::{Term, Field}; +use termdict::{TermMerger, TermDictionary}; +use std::sync::Arc; use std::fmt; -use postings::TermInfo; +use core::InvertedIndexReader; /// Holds a list of `SegmentReader`s ready for search. @@ -21,7 +22,6 @@ pub struct Searcher { segment_readers: Vec, } - impl Searcher { /// Fetches a document from tantivy's store given a `DocAddress`. /// @@ -46,7 +46,9 @@ impl Searcher { pub fn doc_freq(&self, term: &Term) -> u32 { self.segment_readers .iter() - .map(|segment_reader| segment_reader.doc_freq(term)) + .map(|segment_reader| { + segment_reader.inverted_index(term.field()).doc_freq(term) + }) .fold(0u32, |acc, val| acc + val) } @@ -65,20 +67,41 @@ impl Searcher { query.search(self, collector) } - /// Returns a Stream over all of the sorted unique terms of - /// the searcher. - /// - /// This includes all of the fields from all of the segment_readers. - /// See [`TermIterator`](struct.TermIterator.html). - /// - /// # Warning - /// This API is very likely to change in the future. - pub fn terms(&self) -> TermMerger { - TermMerger::from(self.segment_readers()) + /// Return the field searcher associated to a `Field`. + pub fn field(&self, field: Field) -> FieldSearcher { + let inv_index_readers = self.segment_readers + .iter() + .map(|segment_reader| segment_reader.inverted_index(field)) + .collect::>(); + FieldSearcher::new(inv_index_readers) } } + + +pub struct FieldSearcher { + inv_index_readers: Vec>, +} + + +impl FieldSearcher { + fn new(inv_index_readers: Vec>) -> FieldSearcher { + FieldSearcher { inv_index_readers: inv_index_readers } + } + + + /// Returns a Stream over all of the sorted unique terms of + /// for the given field. + pub fn terms(&self) -> TermMerger { + let term_streamers: Vec<_> = self.inv_index_readers + .iter() + .map(|inverted_index| inverted_index.terms().stream()) + .collect(); + TermMerger::new(term_streamers) + } +} + impl From> for Searcher { fn from(segment_readers: Vec) -> Searcher { Searcher { segment_readers: segment_readers } diff --git a/src/core/segment.rs b/src/core/segment.rs index 16cb214d2..59b5eaa13 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -76,18 +76,20 @@ impl Segment { } /// Open one of the component file for a *regular* read. 
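// Illustrative sketch (not part of the diff), referring to the new
// `Searcher::field` / `FieldSearcher` API in src/core/searcher.rs above:
// term enumeration is now scoped to a single field, merging that field's term
// streams across all segments. The `advance` / `key` cursor on `TermMerger`
// is assumed from the `termdict` module's streaming API.
fn list_field_terms(searcher: &Searcher, field: Field) {
    let field_searcher = searcher.field(field);
    let mut terms = field_searcher.terms();
    while terms.advance() {
        let _term_bytes: &[u8] = terms.key();
    }
}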
- pub fn open_read(&self, - component: SegmentComponent) - -> result::Result { + pub fn open_read( + &self, + component: SegmentComponent, + ) -> result::Result { let path = self.relative_path(component); let source = try!(self.index.directory().open_read(&path)); Ok(source) } /// Open one of the component file for *regular* write. - pub fn open_write(&mut self, - component: SegmentComponent) - -> result::Result { + pub fn open_write( + &mut self, + component: SegmentComponent, + ) -> result::Result { let path = self.relative_path(component); let write = try!(self.index.directory_mut().open_write(&path)); Ok(write) @@ -125,11 +127,11 @@ mod tests { { let _file_protection = segment.protect_from_delete(SegmentComponent::POSTINGS); assert!(directory.exists(&*path)); - directory.garbage_collect(|| { living_files.clone() }); + directory.garbage_collect(|| living_files.clone()); assert!(directory.exists(&*path)); } - directory.garbage_collect(|| { living_files }); + directory.garbage_collect(|| living_files); assert!(!directory.exists(&*path)); } diff --git a/src/core/segment_component.rs b/src/core/segment_component.rs index e4cbc0068..b460258c7 100644 --- a/src/core/segment_component.rs +++ b/src/core/segment_component.rs @@ -28,13 +28,15 @@ pub enum SegmentComponent { impl SegmentComponent { /// Iterates through the components. pub fn iterator() -> impl Iterator { - static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [SegmentComponent::POSTINGS, - SegmentComponent::POSITIONS, - SegmentComponent::FASTFIELDS, - SegmentComponent::FIELDNORMS, - SegmentComponent::TERMS, - SegmentComponent::STORE, - SegmentComponent::DELETE]; + static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [ + SegmentComponent::POSTINGS, + SegmentComponent::POSITIONS, + SegmentComponent::FASTFIELDS, + SegmentComponent::FIELDNORMS, + SegmentComponent::TERMS, + SegmentComponent::STORE, + SegmentComponent::DELETE, + ]; SEGMENT_COMPONENTS.into_iter() } } diff --git a/src/core/segment_meta.rs b/src/core/segment_meta.rs index 623b22442..1abe95652 100644 --- a/src/core/segment_meta.rs +++ b/src/core/segment_meta.rs @@ -64,16 +64,14 @@ impl SegmentMeta { pub fn relative_path(&self, component: SegmentComponent) -> PathBuf { let mut path = self.id().uuid_string(); path.push_str(&*match component { - SegmentComponent::POSITIONS => ".pos".to_string(), - SegmentComponent::POSTINGS => ".idx".to_string(), - SegmentComponent::TERMS => ".term".to_string(), - SegmentComponent::STORE => ".store".to_string(), - SegmentComponent::FASTFIELDS => ".fast".to_string(), - SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(), - SegmentComponent::DELETE => { - format!(".{}.del", self.delete_opstamp().unwrap_or(0)) - } - }); + SegmentComponent::POSITIONS => ".pos".to_string(), + SegmentComponent::POSTINGS => ".idx".to_string(), + SegmentComponent::TERMS => ".term".to_string(), + SegmentComponent::STORE => ".store".to_string(), + SegmentComponent::FASTFIELDS => ".fast".to_string(), + SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(), + SegmentComponent::DELETE => format!(".{}.del", self.delete_opstamp().unwrap_or(0)), + }); PathBuf::from(path) } @@ -111,8 +109,8 @@ impl SegmentMeta { #[doc(hidden)] pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) { self.deletes = Some(DeleteMeta { - num_deleted_docs: num_deleted_docs, - opstamp: opstamp, - }); + num_deleted_docs: num_deleted_docs, + opstamp: opstamp, + }); } } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index ed155e56b..c77c71a7b 100644 --- 
a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -2,28 +2,24 @@ use Result; use core::Segment; use core::SegmentId; use core::SegmentComponent; -use schema::Term; +use std::sync::RwLock; use common::HasLen; use core::SegmentMeta; use fastfield::{self, FastFieldNotAvailableError}; use fastfield::DeleteBitSet; use store::StoreReader; -use schema::Document; use directory::ReadOnlySource; +use schema::Document; use DocId; use std::str; -use termdict::TermDictionary; -use std::cmp; -use postings::TermInfo; -use termdict::TermDictionaryImpl; use std::sync::Arc; +use std::collections::HashMap; +use common::CompositeFile; use std::fmt; +use core::InvertedIndexReader; use schema::Field; -use postings::SegmentPostingsOption; -use postings::{SegmentPostings, BlockSegmentPostings}; -use fastfield::{FastFieldsReader, FastFieldReader, U64FastFieldReader}; +use fastfield::{FastFieldReader, U64FastFieldReader}; use schema::Schema; -use postings::FreqHandler; @@ -40,15 +36,19 @@ use postings::FreqHandler; /// #[derive(Clone)] pub struct SegmentReader { + inv_idx_reader_cache: Arc>>>, + segment_id: SegmentId, segment_meta: SegmentMeta, - terms: Arc, - postings_data: ReadOnlySource, + + termdict_composite: CompositeFile, + postings_composite: CompositeFile, + positions_composite: CompositeFile, + fast_fields_composite: CompositeFile, + fieldnorms_composite: CompositeFile, + store_reader: StoreReader, - fast_fields_reader: Arc, - fieldnorms_reader: Arc, delete_bitset: DeleteBitSet, - positions_data: ReadOnlySource, schema: Schema, } @@ -76,11 +76,6 @@ impl SegmentReader { self.delete_bitset.len() as DocId } - #[doc(hidden)] - pub fn fast_fields_reader(&self) -> &FastFieldsReader { - &*self.fast_fields_reader - } - /// Accessor to a segment's fast field reader given a field. /// /// Returns the u64 fast value reader if the field @@ -91,17 +86,18 @@ impl SegmentReader { /// /// # Panics /// May panic if the index is corrupted. - pub fn get_fast_field_reader - (&self, - field: Field) - -> fastfield::Result { + pub fn get_fast_field_reader( + &self, + field: Field, + ) -> fastfield::Result { let field_entry = self.schema.get_field_entry(field); if !TFastFieldReader::is_enabled(field_entry.field_type()) { Err(FastFieldNotAvailableError::new(field_entry)) } else { - Ok(self.fast_fields_reader - .open_reader(field) - .expect("Fast field file corrupted.")) + self.fast_fields_composite + .open_read(field) + .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) + .map(TFastFieldReader::open) } } @@ -114,15 +110,9 @@ impl SegmentReader { /// They are simply stored as a fast field, serialized in /// the `.fieldnorm` file of the segment. pub fn get_fieldnorms_reader(&self, field: Field) -> Option { - self.fieldnorms_reader.open_reader(field) - } - - /// Returns the number of documents containing the term. - pub fn doc_freq(&self, term: &Term) -> u32 { - match self.get_term_info(term) { - Some(term_info) => term_info.doc_freq, - None => 0, - } + self.fieldnorms_composite.open_read(field).map( + U64FastFieldReader::open, + ) } /// Accessor to the segment's `StoreReader`. @@ -133,23 +123,30 @@ impl SegmentReader { /// Open a new segment for reading. 
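`get_fast_field_reader` now resolves the field inside the fast-field `CompositeFile` and reports a `FastFieldNotAvailableError` instead of panicking when the component is missing. A short usage sketch; obtaining the `SegmentReader` is left to the caller, and the `FastFieldReader` trait is assumed to be in scope for `get()`.

    // Sketch: read one u64 fast-field value, falling back to 0 when the
    // field was not declared as a fast field in the schema.
    fn fast_value_or_zero(reader: &SegmentReader, field: Field, doc: DocId) -> u64 {
        match reader.get_fast_field_reader::<U64FastFieldReader>(field) {
            Ok(ff_reader) => ff_reader.get(doc),
            Err(_not_available) => 0,
        }
    }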
pub fn open(segment: Segment) -> Result<SegmentReader> { - let source = segment.open_read(SegmentComponent::TERMS)?; - let terms = TermDictionaryImpl::from_source(source)?; + let termdict_source = segment.open_read(SegmentComponent::TERMS)?; + let termdict_composite = CompositeFile::open(termdict_source)?; let store_source = segment.open_read(SegmentComponent::STORE)?; let store_reader = StoreReader::from_source(store_source); - let postings_shared_mmap = segment.open_read(SegmentComponent::POSTINGS)?; + let postings_source = segment.open_read(SegmentComponent::POSTINGS)?; + let postings_composite = CompositeFile::open(postings_source)?; - let fast_field_data = segment.open_read(SegmentComponent::FASTFIELDS)?; - let fast_fields_reader = FastFieldsReader::from_source(fast_field_data)?; + let positions_composite = { + if let Ok(source) = segment.open_read(SegmentComponent::POSITIONS) { + CompositeFile::open(source)? + } else { + CompositeFile::empty() + } + }; + + + let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?; + let fast_fields_composite = CompositeFile::open(fast_fields_data)?; let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?; - let fieldnorms_reader = FastFieldsReader::from_source(fieldnorms_data)?; + let fieldnorms_composite = CompositeFile::open(fieldnorms_data)?; - let positions_data = segment - .open_read(SegmentComponent::POSITIONS) - .unwrap_or_else(|_| ReadOnlySource::empty()); let delete_bitset = if segment.meta().has_deletes() { let delete_data = segment.open_read(SegmentComponent::DELETE)?; @@ -160,22 +157,66 @@ impl SegmentReader { let schema = segment.schema(); Ok(SegmentReader { - segment_meta: segment.meta().clone(), - postings_data: postings_shared_mmap, - terms: Arc::new(terms), - segment_id: segment.id(), - store_reader: store_reader, - fast_fields_reader: Arc::new(fast_fields_reader), - fieldnorms_reader: Arc::new(fieldnorms_reader), - delete_bitset: delete_bitset, - positions_data: positions_data, - schema: schema, - }) + inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())), + segment_meta: segment.meta().clone(), + termdict_composite: termdict_composite, + postings_composite: postings_composite, + fast_fields_composite: fast_fields_composite, + fieldnorms_composite: fieldnorms_composite, + segment_id: segment.id(), + store_reader: store_reader, + delete_bitset: delete_bitset, + positions_composite: positions_composite, + schema: schema, + }) } - /// Return the term dictionary datastructure. - pub fn terms(&self) -> &TermDictionaryImpl { - &self.terms + + /// Returns a field reader associated to the field given in argument. + /// + /// The field reader is in charge of iterating through the + /// term dictionary associated to a specific field, + /// and opening the posting list associated to any term. + pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> { + if let Some(inv_idx_reader) = + self.inv_idx_reader_cache + .read() + .expect("Lock poisoned. This should never happen") + .get(&field) + { + return inv_idx_reader.clone(); + } + + let termdict_source: ReadOnlySource = self.termdict_composite.open_read(field).expect( + "Index corrupted. Failed to open field term dictionary in composite file.", + ); + + let postings_source = self.postings_composite.open_read(field).expect( + "Index corrupted. Failed to open field postings in composite file.", + ); + + let positions_source = self.positions_composite.open_read(field).expect( + "Index corrupted. 
Failed to open field positions in composite file.", + ); + + let inv_idx_reader = Arc::new(InvertedIndexReader::new( + termdict_source, + postings_source, + positions_source, + self.delete_bitset.clone(), + self.schema.clone(), + )); + + // by releasing the lock in between, we may end up opening the inverting index + // twice, but this is fine. + self.inv_idx_reader_cache + .write() + .expect( + "Field reader cache lock poisoned. This should never happen.", + ) + .insert(field, inv_idx_reader.clone()); + + inv_idx_reader } /// Returns the document (or to be accurate, its stored field) @@ -187,89 +228,6 @@ impl SegmentReader { } - /// Returns the segment postings associated with the term, and with the given option, - /// or `None` if the term has never been encountered and indexed. - /// - /// If the field was not indexed with the indexing options that cover - /// the requested options, the returned `SegmentPostings` the method does not fail - /// and returns a `SegmentPostings` with as much information as possible. - /// - /// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a - /// `TextIndexingOptions` that does not index position will return a `SegmentPostings` - /// with `DocId`s and frequencies. - pub fn read_postings(&self, - term: &Term, - option: SegmentPostingsOption) - -> Option { - let field = term.field(); - let field_entry = self.schema.get_field_entry(field); - let term_info = get!(self.get_term_info(term)); - let maximum_option = get!(field_entry.field_type().get_segment_postings_option()); - let best_effort_option = cmp::min(maximum_option, option); - Some(self.read_postings_from_terminfo(&term_info, best_effort_option)) - } - - - /// Returns a posting object given a `term_info`. - /// This method is for an advanced usage only. - /// - /// Most user should prefer using `read_postings` instead. - pub fn read_postings_from_terminfo(&self, - term_info: &TermInfo, - option: SegmentPostingsOption) - -> SegmentPostings { - let block_postings = self.read_block_postings_from_terminfo(term_info, option); - let delete_bitset = self.delete_bitset.clone(); - SegmentPostings::from_block_postings(block_postings, delete_bitset) - } - - - /// Returns a block postings given a `term_info`. - /// This method is for an advanced usage only. - /// - /// Most user should prefer using `read_postings` instead. - pub fn read_block_postings_from_terminfo(&self, - term_info: &TermInfo, - option: SegmentPostingsOption) - -> BlockSegmentPostings { - let offset = term_info.postings_offset as usize; - let postings_data = &self.postings_data[offset..]; - let freq_handler = match option { - SegmentPostingsOption::NoFreq => FreqHandler::new_without_freq(), - SegmentPostingsOption::Freq => FreqHandler::new_with_freq(), - SegmentPostingsOption::FreqAndPositions => { - let offset = term_info.positions_offset as usize; - let offseted_position_data = &self.positions_data[offset..]; - FreqHandler::new_with_freq_and_position(offseted_position_data) - } - }; - BlockSegmentPostings::from_data(term_info.doc_freq as usize, postings_data, freq_handler) - } - - - /// Resets the block segment to another position of the postings - /// file. - /// - /// This is useful for enumerating through a list of terms, - /// and consuming the associated posting lists while avoiding - /// reallocating a `BlockSegmentPostings`. - /// - /// # Warning - /// - /// This does not reset the positions list. 
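The per-field cache in `inverted_index` follows a deliberately simple protocol: look up under the read lock, build the reader with no lock held, then insert under the write lock, accepting that two threads may occasionally build the same reader. A self-contained sketch of that shape using only `std`, to make the trade-off explicit; it is not tantivy API.

    use std::collections::HashMap;
    use std::hash::Hash;
    use std::sync::{Arc, RwLock};

    // Check under a read lock, build without a lock, insert under a write lock.
    // A racing thread may build a second copy; the extra Arc is simply dropped.
    fn get_or_build<K: Hash + Eq, V, F: FnOnce() -> V>(
        cache: &RwLock<HashMap<K, Arc<V>>>,
        key: K,
        build: F,
    ) -> Arc<V> {
        if let Some(cached) = cache.read().expect("lock poisoned").get(&key) {
            return cached.clone();
        }
        let fresh = Arc::new(build());
        cache
            .write()
            .expect("lock poisoned")
            .insert(key, fresh.clone());
        fresh
    }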
- pub fn reset_block_postings_from_terminfo<'a>(&'a self, - term_info: &TermInfo, - block_postings: &mut BlockSegmentPostings<'a>) { - let offset = term_info.postings_offset as usize; - let postings_data: &'a [u8] = &self.postings_data[offset..]; - block_postings.reset(term_info.doc_freq as usize, postings_data); - } - - /// Returns the term info associated with the term. - pub fn get_term_info(&self, term: &Term) -> Option { - self.terms.get(term.as_slice()) - } - /// Returns the segment id pub fn segment_id(&self) -> SegmentId { self.segment_id diff --git a/src/datastruct/skip/skiplist_builder.rs b/src/datastruct/skip/skiplist_builder.rs index eaa439d08..af665ab3c 100644 --- a/src/datastruct/skip/skiplist_builder.rs +++ b/src/datastruct/skip/skiplist_builder.rs @@ -39,11 +39,11 @@ impl LayerBuilder { doc_id.serialize(&mut self.buffer)?; value.serialize(&mut self.buffer)?; Ok(if self.remaining == 0 { - self.remaining = self.period; - Some((doc_id, offset)) - } else { - None - }) + self.remaining = self.period; + Some((doc_id, offset)) + } else { + None + }) } } @@ -78,8 +78,10 @@ impl SkipListBuilder { loop { skip_pointer = match skip_pointer { Some((skip_doc_id, skip_offset)) => { - try!(self.get_skip_layer(layer_id) - .insert(skip_doc_id, &skip_offset)) + try!(self.get_skip_layer(layer_id).insert( + skip_doc_id, + &skip_offset, + )) } None => { return Ok(()); diff --git a/src/datastruct/stacker/hashmap.rs b/src/datastruct/stacker/hashmap.rs index 03f18ed51..c9054dff2 100644 --- a/src/datastruct/stacker/hashmap.rs +++ b/src/datastruct/stacker/hashmap.rs @@ -68,9 +68,14 @@ pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) { }; let table_num_bits: usize = (1..) .into_iter() - .take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit) + .take_while(|num_bits: &usize| { + compute_table_size(*num_bits) < table_size_limit + }) .last() - .expect(&format!("Per thread memory is too small: {}", per_thread_memory_budget)); + .expect(&format!( + "Per thread memory is too small: {}", + per_thread_memory_budget + )); let table_size = compute_table_size(table_num_bits); let heap_size = per_thread_memory_budget - table_size; (heap_size, table_num_bits) @@ -174,13 +179,10 @@ impl<'a> HashMap<'a> { } pub fn iter<'b: 'a>(&'b self) -> impl Iterator + 'b { - self.occupied - .iter() - .cloned() - .map(move |bucket: usize| { - let kv = self.table[bucket]; - self.get_key_value(kv.key_value_addr) - }) + self.occupied.iter().cloned().map(move |bucket: usize| { + let kv = self.table[bucket]; + self.get_key_value(kv.key_value_addr) + }) } @@ -282,8 +284,10 @@ mod tests { let s1 = "abcdef"; let s2 = "abcdeg"; for i in 0..5 { - assert_eq!(murmurhash2(&s1[i..5].as_bytes()), - murmurhash2(&s2[i..5].as_bytes())); + assert_eq!( + murmurhash2(&s1[i..5].as_bytes()), + murmurhash2(&s2[i..5].as_bytes()) + ); } } @@ -303,13 +307,13 @@ mod tests { let keys: Vec<&'static str> = vec!["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "]; b.iter(|| { - keys.iter() - .map(|&s| s.as_bytes()) - .map(murmurhash2::murmurhash2) - .map(|h| h as u64) - .last() - .unwrap() - }); + keys.iter() + .map(|&s| s.as_bytes()) + .map(murmurhash2::murmurhash2) + .map(|h| h as u64) + .last() + .unwrap() + }); } diff --git a/src/datastruct/stacker/heap.rs b/src/datastruct/stacker/heap.rs index 9d7a8f885..0bfd01fc2 100644 --- a/src/datastruct/stacker/heap.rs +++ b/src/datastruct/stacker/heap.rs @@ -144,7 +144,8 @@ impl InnerHeap { addr } else { if self.next_heap.is_none() { - info!(r#"Exceeded 
heap size. The segment will be committed right after indexing this document."#,); + info!(r#"Exceeded heap size. The segment will be committed right + after indexing this document."#,); self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize))); } self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len @@ -154,10 +155,9 @@ impl InnerHeap { fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] { let start = bytes_ref.0; if start >= self.buffer_len { - self.next_heap - .as_ref() - .unwrap() - .get_slice(BytesRef(start - self.buffer_len)) + self.next_heap.as_ref().unwrap().get_slice(BytesRef( + start - self.buffer_len, + )) } else { let start = start as usize; let len = NativeEndian::read_u16(&self.buffer[start..start + 2]) as usize; @@ -167,10 +167,10 @@ impl InnerHeap { fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] { if start >= self.buffer_len { - self.next_heap - .as_mut() - .unwrap() - .get_mut_slice(start - self.buffer_len, stop - self.buffer_len) + self.next_heap.as_mut().unwrap().get_mut_slice( + start - self.buffer_len, + stop - self.buffer_len, + ) } else { &mut self.buffer[start as usize..stop as usize] } @@ -188,10 +188,9 @@ impl InnerHeap { fn get_mut(&mut self, addr: u32) -> *mut u8 { if addr >= self.buffer_len { - self.next_heap - .as_mut() - .unwrap() - .get_mut(addr - self.buffer_len) + self.next_heap.as_mut().unwrap().get_mut( + addr - self.buffer_len, + ) } else { let addr_isize = addr as isize; unsafe { self.buffer.as_mut_ptr().offset(addr_isize) } @@ -200,10 +199,9 @@ impl InnerHeap { fn get_mut_ref(&mut self, addr: u32) -> &mut Item { if addr >= self.buffer_len { - self.next_heap - .as_mut() - .unwrap() - .get_mut_ref(addr - self.buffer_len) + self.next_heap.as_mut().unwrap().get_mut_ref( + addr - self.buffer_len, + ) } else { let v_ptr_u8 = self.get_mut(addr) as *mut u8; let v_ptr = v_ptr_u8 as *mut Item; @@ -213,10 +211,10 @@ impl InnerHeap { pub fn set(&mut self, addr: u32, val: &Item) { if addr >= self.buffer_len { - self.next_heap - .as_mut() - .unwrap() - .set(addr - self.buffer_len, val); + self.next_heap.as_mut().unwrap().set( + addr - self.buffer_len, + val, + ); } else { let v_ptr: *const Item = val as *const Item; let v_ptr_u8: *const u8 = v_ptr as *const u8; diff --git a/src/directory/error.rs b/src/directory/error.rs index d864012ea..73424f2e0 100644 --- a/src/directory/error.rs +++ b/src/directory/error.rs @@ -103,9 +103,11 @@ impl fmt::Display for OpenWriteError { write!(f, "the file '{:?}' already exists", path) } OpenWriteError::IOError(ref err) => { - write!(f, - "an io error occurred while opening a file for writing: '{}'", - err) + write!( + f, + "an io error occurred while opening a file for writing: '{}'", + err + ) } } } @@ -147,9 +149,11 @@ impl fmt::Display for OpenReadError { write!(f, "the file '{:?}' does not exist", path) } OpenReadError::IOError(ref err) => { - write!(f, - "an io error occurred while opening a file for reading: '{}'", - err) + write!( + f, + "an io error occurred while opening a file for reading: '{}'", + err + ) } } } diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index 5f4e7e773..8005c62b4 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -45,10 +45,9 @@ pub struct FileProtection { } fn unprotect_file_from_delete(directory: &ManagedDirectory, path: &Path) { - let mut meta_informations_wlock = directory - .meta_informations - .write() - .expect("Managed file lock poisoned"); + let mut 
meta_informations_wlock = directory.meta_informations.write().expect( + "Managed file lock poisoned", + ); if let Some(counter_ref_mut) = meta_informations_wlock.protected_files.get_mut(path) { (*counter_ref_mut) -= 1; } @@ -68,9 +67,10 @@ impl Drop for FileProtection { /// Saves the file containing the list of existing files /// that were created by tantivy. -fn save_managed_paths(directory: &mut Directory, - wlock: &RwLockWriteGuard) - -> io::Result<()> { +fn save_managed_paths( + directory: &mut Directory, + wlock: &RwLockWriteGuard, +) -> io::Result<()> { let mut w = serde_json::to_vec(&wlock.managed_paths)?; write!(&mut w, "\n")?; directory.atomic_write(&MANAGED_FILEPATH, &w[..])?; @@ -84,22 +84,22 @@ impl ManagedDirectory { Ok(data) => { let managed_files_json = String::from_utf8_lossy(&data); let managed_files: HashSet = - serde_json::from_str(&managed_files_json) - .chain_err(|| ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone()))?; + serde_json::from_str(&managed_files_json).chain_err(|| { + ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone()) + })?; Ok(ManagedDirectory { - directory: box directory, - meta_informations: Arc::new(RwLock::new(MetaInformation { - managed_paths: managed_files, - protected_files: - HashMap::default(), - })), - }) + directory: box directory, + meta_informations: Arc::new(RwLock::new(MetaInformation { + managed_paths: managed_files, + protected_files: HashMap::default(), + })), + }) } Err(OpenReadError::FileDoesNotExist(_)) => { Ok(ManagedDirectory { - directory: box directory, - meta_informations: Arc::default(), - }) + directory: box directory, + meta_informations: Arc::default(), + }) } Err(OpenReadError::IOError(e)) => Err(From::from(e)), } @@ -116,15 +116,14 @@ impl ManagedDirectory { /// If a file cannot be deleted (for permission reasons for instance) /// an error is simply logged, and the file remains in the list of managed /// files. - pub fn garbage_collect HashSet >(&mut self, get_living_files: L) { + pub fn garbage_collect HashSet>(&mut self, get_living_files: L) { info!("Garbage collect"); let mut files_to_delete = vec![]; { // releasing the lock as .delete() will use it too. - let meta_informations_rlock = - self.meta_informations - .read() - .expect("Managed directory rlock poisoned in garbage collect."); + let meta_informations_rlock = self.meta_informations.read().expect( + "Managed directory rlock poisoned in garbage collect.", + ); // It is crucial to get the living files after acquiring the // read lock of meta informations. That way, we @@ -177,9 +176,9 @@ impl ManagedDirectory { if !deleted_files.is_empty() { // update the list of managed files by removing // the file that were removed. 
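`garbage_collect` takes the living-file set as a closure on purpose: as the comment above notes, it must be evaluated only after the managed-file lock has been taken, so a file registered concurrently cannot be collected by mistake. A usage sketch in the spirit of the tests below.

    use std::collections::HashSet;
    use std::path::PathBuf;

    // Sketch: drop every managed file the index no longer references.
    // `living_files` would normally be derived from the current index metas.
    fn collect_garbage(directory: &mut ManagedDirectory, living_files: HashSet<PathBuf>) {
        // The closure runs only once the managed-file lock is held.
        directory.garbage_collect(|| living_files);
    }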
- let mut meta_informations_wlock = self.meta_informations - .write() - .expect("Managed directory wlock poisoned (2)."); + let mut meta_informations_wlock = self.meta_informations.write().expect( + "Managed directory wlock poisoned (2).", + ); { let managed_paths_write = &mut meta_informations_wlock.managed_paths; for delete_file in &deleted_files { @@ -202,13 +201,13 @@ impl ManagedDirectory { pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection { let pathbuf = path.to_owned(); { - let mut meta_informations_wlock = self.meta_informations - .write() - .expect("Managed file lock poisoned on protect"); + let mut meta_informations_wlock = self.meta_informations.write().expect( + "Managed file lock poisoned on protect", + ); *meta_informations_wlock - .protected_files - .entry(pathbuf.clone()) - .or_insert(0) += 1; + .protected_files + .entry(pathbuf.clone()) + .or_insert(0) += 1; } FileProtection { directory: self.clone(), @@ -224,9 +223,9 @@ impl ManagedDirectory { /// will not lead to garbage files that will /// never get removed. fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> { - let mut meta_wlock = self.meta_informations - .write() - .expect("Managed file lock poisoned"); + let mut meta_wlock = self.meta_informations.write().expect( + "Managed file lock poisoned", + ); let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned()); if has_changed { save_managed_paths(self.directory.as_mut(), &meta_wlock)?; @@ -241,8 +240,9 @@ impl Directory for ManagedDirectory { } fn open_write(&mut self, path: &Path) -> result::Result { - self.register_file_as_managed(path) - .map_err(|e| IOError::with_path(path.to_owned(), e))?; + self.register_file_as_managed(path).map_err(|e| { + IOError::with_path(path.to_owned(), e) + })?; self.directory.open_write(path) } @@ -257,9 +257,9 @@ impl Directory for ManagedDirectory { fn delete(&self, path: &Path) -> result::Result<(), DeleteError> { { - let metas_rlock = self.meta_informations - .read() - .expect("poisoned lock in managed directory meta"); + let metas_rlock = self.meta_informations.read().expect( + "poisoned lock in managed directory meta", + ); if let Some(counter) = metas_rlock.protected_files.get(path) { if *counter > 0 { return Err(DeleteError::FileProtected(path.to_owned())); @@ -327,7 +327,7 @@ mod tests { { let living_files: HashSet = [TEST_PATH1.to_owned()].into_iter().cloned().collect(); - managed_directory.garbage_collect(|| { living_files }); + managed_directory.garbage_collect(|| living_files); } { assert!(managed_directory.exists(*TEST_PATH1)); @@ -343,7 +343,7 @@ mod tests { } { let living_files: HashSet = HashSet::new(); - managed_directory.garbage_collect(|| { living_files }); + managed_directory.garbage_collect(|| living_files); } { assert!(!managed_directory.exists(*TEST_PATH1)); @@ -366,7 +366,7 @@ mod tests { assert!(managed_directory.exists(*TEST_PATH1)); let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap(); - managed_directory.garbage_collect(|| { living_files.clone() }); + managed_directory.garbage_collect(|| living_files.clone()); if cfg!(target_os = "windows") { // On Windows, gc should try and fail the file as it is mmapped. assert!(managed_directory.exists(*TEST_PATH1)); @@ -374,7 +374,7 @@ mod tests { drop(_mmap_read); // The file should still be in the list of managed file and // eventually be deleted once mmap is released. 
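`protect_file_from_delete` bumps a per-path counter and returns a `FileProtection` guard; while any guard is alive, `delete` and garbage collection leave the path alone, and dropping the guard decrements the counter again. A sketch mirroring the test further down (the path literal is illustrative only).

    use std::collections::HashSet;
    use std::path::{Path, PathBuf};

    // Sketch: keep one file alive across a garbage-collection pass.
    fn gc_with_protection(directory: &mut ManagedDirectory, living: HashSet<PathBuf>) {
        let path = Path::new("00000000000000000000000000000000.idx"); // illustrative
        {
            let _protection = directory.protect_file_from_delete(path);
            // The file survives this pass even if it is absent from `living`.
            directory.garbage_collect(|| living.clone());
        } // `_protection` dropped here: the counter goes back to zero.
        // A later pass is now free to delete it.
        directory.garbage_collect(|| living);
    }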
- managed_directory.garbage_collect(|| { living_files }); + managed_directory.garbage_collect(|| living_files); assert!(!managed_directory.exists(*TEST_PATH1)); } else { assert!(!managed_directory.exists(*TEST_PATH1)); @@ -398,11 +398,11 @@ mod tests { { let _file_protection = managed_directory.protect_file_from_delete(*TEST_PATH1); - managed_directory.garbage_collect(|| { living_files.clone() }); + managed_directory.garbage_collect(|| living_files.clone()); assert!(managed_directory.exists(*TEST_PATH1)); } - managed_directory.garbage_collect(|| { living_files.clone() }); + managed_directory.garbage_collect(|| living_files.clone()); assert!(!managed_directory.exists(*TEST_PATH1)); diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index a3d5748b8..970b987cb 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -24,15 +24,17 @@ use std::sync::Weak; use tempdir::TempDir; fn open_mmap(full_path: &PathBuf) -> result::Result>, OpenReadError> { - let file = File::open(&full_path) - .map_err(|e| if e.kind() == io::ErrorKind::NotFound { - OpenReadError::FileDoesNotExist(full_path.clone()) - } else { - OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e)) - })?; + let file = File::open(&full_path).map_err(|e| if e.kind() == + io::ErrorKind::NotFound + { + OpenReadError::FileDoesNotExist(full_path.clone()) + } else { + OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e)) + })?; - let meta_data = file.metadata() - .map_err(|e| IOError::with_path(full_path.to_owned(), e))?; + let meta_data = file.metadata().map_err(|e| { + IOError::with_path(full_path.to_owned(), e) + })?; if meta_data.len() == 0 { // if the file size is 0, it will not be possible // to mmap the file, so we return an anonymous mmap_cache @@ -46,7 +48,7 @@ fn open_mmap(full_path: &PathBuf) -> result::Result>, OpenReadE } -#[derive(Default,Clone,Debug,Serialize,Deserialize)] +#[derive(Default, Clone, Debug, Serialize, Deserialize)] pub struct CacheCounters { // Number of time the cache prevents to call `mmap` pub hit: usize, @@ -58,7 +60,7 @@ pub struct CacheCounters { pub miss_weak: usize, } -#[derive(Clone,Debug,Serialize,Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct CacheInfo { pub counters: CacheCounters, pub mmapped: Vec, @@ -113,31 +115,31 @@ impl MmapCache { self.cleanup(); } Ok(match self.cache.entry(full_path.clone()) { - HashMapEntry::Occupied(mut occupied_entry) => { - if let Some(mmap_arc) = occupied_entry.get().upgrade() { - self.counters.hit += 1; - Some(mmap_arc.clone()) - } else { - // The entry exists but the weak ref has been destroyed. - self.counters.miss_weak += 1; - if let Some(mmap_arc) = open_mmap(&full_path)? { - occupied_entry.insert(Arc::downgrade(&mmap_arc)); - Some(mmap_arc) - } else { - None - } - } - } - HashMapEntry::Vacant(vacant_entry) => { - self.counters.miss_empty += 1; - if let Some(mmap_arc) = open_mmap(&full_path)? { - vacant_entry.insert(Arc::downgrade(&mmap_arc)); - Some(mmap_arc) - } else { - None - } - } - }) + HashMapEntry::Occupied(mut occupied_entry) => { + if let Some(mmap_arc) = occupied_entry.get().upgrade() { + self.counters.hit += 1; + Some(mmap_arc.clone()) + } else { + // The entry exists but the weak ref has been destroyed. + self.counters.miss_weak += 1; + if let Some(mmap_arc) = open_mmap(&full_path)? 
{ + occupied_entry.insert(Arc::downgrade(&mmap_arc)); + Some(mmap_arc) + } else { + None + } + } + } + HashMapEntry::Vacant(vacant_entry) => { + self.counters.miss_empty += 1; + if let Some(mmap_arc) = open_mmap(&full_path)? { + vacant_entry.insert(Arc::downgrade(&mmap_arc)); + Some(mmap_arc) + } else { + None + } + } + }) } } @@ -180,15 +182,19 @@ impl MmapDirectory { /// exist or if it is not a directory. pub fn open(directory_path: &Path) -> Result { if !directory_path.exists() { - Err(OpenDirectoryError::DoesNotExist(PathBuf::from(directory_path))) + Err(OpenDirectoryError::DoesNotExist( + PathBuf::from(directory_path), + )) } else if !directory_path.is_dir() { - Err(OpenDirectoryError::NotADirectory(PathBuf::from(directory_path))) + Err(OpenDirectoryError::NotADirectory( + PathBuf::from(directory_path), + )) } else { Ok(MmapDirectory { - root_path: PathBuf::from(directory_path), - mmap_cache: Arc::new(RwLock::new(MmapCache::default())), - _temp_directory: Arc::new(None), - }) + root_path: PathBuf::from(directory_path), + mmap_cache: Arc::new(RwLock::new(MmapCache::default())), + _temp_directory: Arc::new(None), + }) } } @@ -215,9 +221,9 @@ impl MmapDirectory { use std::os::windows::fs::OpenOptionsExt; use winapi::winbase; - open_opts - .write(true) - .custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS); + open_opts.write(true).custom_flags( + winbase::FILE_FLAG_BACKUP_SEMANTICS, + ); } let fd = try!(open_opts.open(&self.root_path)); @@ -270,46 +276,50 @@ impl Directory for MmapDirectory { debug!("Open Read {:?}", path); let full_path = self.resolve_path(path); - let mut mmap_cache = self.mmap_cache - .write() - .map_err(|_| { - let msg = format!("Failed to acquired write lock \ + let mut mmap_cache = self.mmap_cache.write().map_err(|_| { + let msg = format!( + "Failed to acquired write lock \ on mmap cache while reading {:?}", - path); - IOError::with_path(path.to_owned(), make_io_err(msg)) - })?; + path + ); + IOError::with_path(path.to_owned(), make_io_err(msg)) + })?; - Ok(mmap_cache - .get_mmap(full_path)? - .map(MmapReadOnly::from) - .map(ReadOnlySource::Mmap) - .unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty()))) + Ok( + mmap_cache + .get_mmap(full_path)? + .map(MmapReadOnly::from) + .map(ReadOnlySource::Mmap) + .unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())), + ) } fn open_write(&mut self, path: &Path) -> Result { debug!("Open Write {:?}", path); let full_path = self.resolve_path(path); - let open_res = OpenOptions::new() - .write(true) - .create_new(true) - .open(full_path); + let open_res = OpenOptions::new().write(true).create_new(true).open( + full_path, + ); - let mut file = open_res - .map_err(|err| if err.kind() == io::ErrorKind::AlreadyExists { - OpenWriteError::FileAlreadyExists(path.to_owned()) - } else { - IOError::with_path(path.to_owned(), err).into() - })?; + let mut file = open_res.map_err(|err| if err.kind() == + io::ErrorKind::AlreadyExists + { + OpenWriteError::FileAlreadyExists(path.to_owned()) + } else { + IOError::with_path(path.to_owned(), err).into() + })?; // making sure the file is created. - file.flush() - .map_err(|e| IOError::with_path(path.to_owned(), e))?; + file.flush().map_err( + |e| IOError::with_path(path.to_owned(), e), + )?; // Apparetntly, on some filesystem syncing the parent // directory is required. 
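`MmapDirectory::open` now reports `DoesNotExist` and `NotADirectory` as distinct errors, and every `open_read` goes through the mmap cache, so repeated reads of the same path share one mapping. A small sketch; the file name is illustrative and error handling is collapsed into `expect`.

    use std::path::Path;

    // Sketch: open an on-disk index directory and mmap one of its files.
    fn open_and_read(dir_path: &Path) {
        // Fails with DoesNotExist / NotADirectory rather than a generic error.
        let directory = MmapDirectory::open(dir_path)
            .expect("path should exist and be a directory");
        // A zero-length file yields an empty anonymous source instead of an mmap.
        let _source = directory
            .open_read(Path::new("meta.json")) // illustrative file name
            .expect("file should exist");
    }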
- self.sync_directory() - .map_err(|e| IOError::with_path(path.to_owned(), e))?; + self.sync_directory().map_err(|e| { + IOError::with_path(path.to_owned(), e) + })?; let writer = SafeFileWriter::new(file); Ok(BufWriter::new(Box::new(writer))) @@ -318,22 +328,23 @@ impl Directory for MmapDirectory { fn delete(&self, path: &Path) -> result::Result<(), DeleteError> { debug!("Deleting file {:?}", path); let full_path = self.resolve_path(path); - let mut mmap_cache = self.mmap_cache - .write() - .map_err(|_| { - let msg = format!("Failed to acquired write lock \ + let mut mmap_cache = self.mmap_cache.write().map_err(|_| { + let msg = format!( + "Failed to acquired write lock \ on mmap cache while deleting {:?}", - path); - IOError::with_path(path.to_owned(), make_io_err(msg)) - })?; + path + ); + IOError::with_path(path.to_owned(), make_io_err(msg)) + })?; // Removing the entry in the MMap cache. // The munmap will appear on Drop, // when the last reference is gone. mmap_cache.cache.remove(&full_path); match fs::remove_file(&full_path) { Ok(_) => { - self.sync_directory() - .map_err(|e| IOError::with_path(path.to_owned(), e).into()) + self.sync_directory().map_err(|e| { + IOError::with_path(path.to_owned(), e).into() + }) } Err(e) => { if e.kind() == io::ErrorKind::NotFound { @@ -355,8 +366,9 @@ impl Directory for MmapDirectory { let mut buffer = Vec::new(); match File::open(&full_path) { Ok(mut file) => { - file.read_to_end(&mut buffer) - .map_err(|e| IOError::with_path(path.to_owned(), e))?; + file.read_to_end(&mut buffer).map_err(|e| { + IOError::with_path(path.to_owned(), e) + })?; Ok(buffer) } Err(e) => { diff --git a/src/directory/mod.rs b/src/directory/mod.rs index b107d78c5..b4c18b359 100644 --- a/src/directory/mod.rs +++ b/src/directory/mod.rs @@ -13,14 +13,15 @@ mod managed_directory; /// Errors specific to the directory module. pub mod error; -use std::io::{Write, Seek}; +use std::io::{Write, Seek, BufWriter}; -use std::io::BufWriter; pub use self::read_only_source::ReadOnlySource; pub use self::directory::Directory; pub use self::ram_directory::RAMDirectory; pub use self::mmap_directory::MmapDirectory; -pub use self::managed_directory::{ManagedDirectory, FileProtection}; + +pub(crate) use self::read_only_source::SourceRead; +pub(crate) use self::managed_directory::{ManagedDirectory, FileProtection}; /// Synonym of Seek + Write pub trait SeekableWrite: Seek + Write {} diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index 0f205c6f1..ca23bc07c 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -41,8 +41,10 @@ impl VecWriter { impl Drop for VecWriter { fn drop(&mut self) { if !self.is_flushed { - panic!("You forgot to flush {:?} before its writter got Drop. Do not rely on drop.", - self.path) + panic!( + "You forgot to flush {:?} before its writter got Drop. 
Do not rely on drop.", + self.path + ) } } } @@ -62,8 +64,10 @@ impl Write for VecWriter { fn flush(&mut self) -> io::Result<()> { self.is_flushed = true; - try!(self.shared_directory - .write(self.path.clone(), self.data.get_ref())); + try!(self.shared_directory.write( + self.path.clone(), + self.data.get_ref(), + )); Ok(()) } } @@ -79,11 +83,11 @@ impl InnerDirectory { } fn write(&self, path: PathBuf, data: &[u8]) -> io::Result { - let mut map = try!(self.0 - .write() - .map_err(|_| { - make_io_err(format!("Failed to lock the directory, when trying to write {:?}", - path)) + let mut map = try!(self.0.write().map_err(|_| { + make_io_err(format!( + "Failed to lock the directory, when trying to write {:?}", + path + )) })); let prev_value = map.insert(path, Arc::new(Vec::from(data))); Ok(prev_value.is_some()) @@ -93,17 +97,21 @@ impl InnerDirectory { self.0 .read() .map_err(|_| { - let msg = format!("Failed to acquire read lock for the \ + let msg = format!( + "Failed to acquire read lock for the \ directory when trying to read {:?}", - path); - let io_err = make_io_err(msg); - OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err)) - }) + path + ); + let io_err = make_io_err(msg); + OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err)) + }) .and_then(|readable_map| { readable_map .get(path) .ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path))) - .map(|data| ReadOnlySource::Anonymous(SharedVecSlice::new(data.clone()))) + .map(|data| { + ReadOnlySource::Anonymous(SharedVecSlice::new(data.clone())) + }) }) } @@ -111,16 +119,18 @@ impl InnerDirectory { self.0 .write() .map_err(|_| { - let msg = format!("Failed to acquire write lock for the \ + let msg = format!( + "Failed to acquire write lock for the \ directory when trying to delete {:?}", - path); - let io_err = make_io_err(msg); - DeleteError::IOError(IOError::with_path(path.to_owned(), io_err)) - }) + path + ); + let io_err = make_io_err(msg); + DeleteError::IOError(IOError::with_path(path.to_owned(), io_err)) + }) .and_then(|mut writable_map| match writable_map.remove(path) { - Some(_) => Ok(()), - None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))), - }) + Some(_) => Ok(()), + None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))), + }) } fn exists(&self, path: &Path) -> bool { @@ -164,9 +174,11 @@ impl Directory for RAMDirectory { let path_buf = PathBuf::from(path); let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone()); - let exists = self.fs - .write(path_buf.clone(), &Vec::new()) - .map_err(|err| IOError::with_path(path.to_owned(), err))?; + let exists = self.fs.write(path_buf.clone(), &Vec::new()).map_err( + |err| { + IOError::with_path(path.to_owned(), err) + }, + )?; // force the creation of the file to mimic the MMap directory. if exists { diff --git a/src/directory/read_only_source.rs b/src/directory/read_only_source.rs index d327f5a51..9b1506217 100644 --- a/src/directory/read_only_source.rs +++ b/src/directory/read_only_source.rs @@ -2,6 +2,8 @@ use fst::raw::MmapReadOnly; use std::ops::Deref; use super::shared_vec_slice::SharedVecSlice; use common::HasLen; +use std::slice; +use std::io::{self, Read}; use stable_deref_trait::StableDeref; /// Read object that represents files in tantivy. @@ -41,6 +43,14 @@ impl ReadOnlySource { } } + /// Splits into 2 `ReadOnlySource`, at the offset given + /// as an argument. 
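The new `ReadOnlySource` helpers below are thin wrappers over `slice(from, to)`: `split` consumes the source and returns the two halves, while `slice_from` / `slice_to` pin only one boundary. A tiny sketch (it assumes the source holds at least 8 bytes).

    // Sketch: carve a composite buffer into a header view and a payload view.
    fn carve(source: ReadOnlySource) {
        // `split` consumes `source` and yields two independent views.
        let (header, payload) = source.split(8);
        let _first_half_of_header = header.slice_to(4); // == header.slice(0, 4)
        let _payload_tail = payload.slice_from(4);      // == payload.slice(4, payload.len())
    }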
+ pub fn split(self, addr: usize) -> (ReadOnlySource, ReadOnlySource) { + let left = self.slice(0, addr); + let right = self.slice_from(addr); + (left, right) + } + /// Creates a ReadOnlySource that is just a /// view over a slice of the data. /// @@ -62,6 +72,23 @@ impl ReadOnlySource { } } } + + /// Like `.slice(...)` but enforcing only the `from` + /// boundary. + /// + /// Equivalent to `.slice(from_offset, self.len())` + pub fn slice_from(&self, from_offset: usize) -> ReadOnlySource { + let len = self.len(); + self.slice(from_offset, len) + } + + /// Like `.slice(...)` but enforcing only the `to` + /// boundary. + /// + /// Equivalent to `.slice(0, to_offset)` + pub fn slice_to(&self, to_offset: usize) -> ReadOnlySource { + self.slice(0, to_offset) + } } impl HasLen for ReadOnlySource { @@ -82,3 +109,42 @@ impl From> for ReadOnlySource { ReadOnlySource::Anonymous(shared_data) } } + + +/// Acts as a owning cursor over the data backed up by a ReadOnlySource +pub(crate) struct SourceRead { + _data_owner: ReadOnlySource, + cursor: &'static [u8], +} + +impl SourceRead { + // Advance the cursor by a given number of bytes. + pub fn advance(&mut self, len: usize) { + self.cursor = &self.cursor[len..]; + } +} + +impl AsRef<[u8]> for SourceRead { + fn as_ref(&self) -> &[u8] { + self.cursor + } +} + +impl From for SourceRead { + // Creates a new `SourceRead` from a given `ReadOnlySource` + fn from(source: ReadOnlySource) -> SourceRead { + let len = source.len(); + let slice_ptr = source.as_slice().as_ptr(); + let static_slice = unsafe { slice::from_raw_parts(slice_ptr, len) }; + SourceRead { + _data_owner: source, + cursor: static_slice, + } + } +} + +impl Read for SourceRead { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + self.cursor.read(buf) + } +} diff --git a/src/error.rs b/src/error.rs index d6ce4a33d..7a2db9d2b 100644 --- a/src/error.rs +++ b/src/error.rs @@ -10,6 +10,7 @@ use schema; use fastfield::FastFieldNotAvailableError; use serde_json; + error_chain!( errors { /// Path does not exist. @@ -111,12 +112,9 @@ impl From for Error { impl From for Error { fn from(error: OpenWriteError) -> Error { match error { - OpenWriteError::FileAlreadyExists(filepath) => { - ErrorKind::FileAlreadyExists(filepath) - } - OpenWriteError::IOError(io_error) => ErrorKind::IOError(io_error), - } - .into() + OpenWriteError::FileAlreadyExists(filepath) => ErrorKind::FileAlreadyExists(filepath), + OpenWriteError::IOError(io_error) => ErrorKind::IOError(io_error), + }.into() } } diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 37b398b9a..9748d07a1 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -32,7 +32,7 @@ mod delete; pub use self::delete::write_delete_bitset; pub use self::delete::DeleteBitSet; pub use self::writer::{FastFieldsWriter, IntFastFieldWriter}; -pub use self::reader::{FastFieldsReader, U64FastFieldReader, I64FastFieldReader}; +pub use self::reader::{U64FastFieldReader, I64FastFieldReader}; pub use self::reader::FastFieldReader; pub use self::serializer::FastFieldSerializer; pub use self::error::{Result, FastFieldNotAvailableError}; @@ -51,6 +51,7 @@ mod tests { use fastfield::FastFieldReader; use rand::Rng; use rand::SeedableRng; + use common::CompositeFile; use rand::XorShiftRng; lazy_static! 
{ @@ -84,7 +85,7 @@ mod tests { let mut directory: RAMDirectory = RAMDirectory::create(); { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::new(write).unwrap(); + let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); add_single_field_doc(&mut fast_field_writers, *FIELD, 13u64); add_single_field_doc(&mut fast_field_writers, *FIELD, 14u64); @@ -94,12 +95,12 @@ mod tests { } let source = directory.open_read(&path).unwrap(); { - assert_eq!(source.len(), 38 as usize); + assert_eq!(source.len(), 35 as usize); } { - let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); - let fast_field_reader: U64FastFieldReader = - fast_field_readers.open_reader(*FIELD).unwrap(); + let composite_file = CompositeFile::open(source).unwrap(); + let field_source = composite_file.open_read(*FIELD).unwrap(); + let fast_field_reader: U64FastFieldReader = U64FastFieldReader::open(field_source); assert_eq!(fast_field_reader.get(0), 13u64); assert_eq!(fast_field_reader.get(1), 14u64); assert_eq!(fast_field_reader.get(2), 2u64); @@ -112,7 +113,7 @@ mod tests { let mut directory: RAMDirectory = RAMDirectory::create(); { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::new(write).unwrap(); + let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); add_single_field_doc(&mut fast_field_writers, *FIELD, 4u64); add_single_field_doc(&mut fast_field_writers, *FIELD, 14_082_001u64); @@ -128,12 +129,12 @@ mod tests { } let source = directory.open_read(&path).unwrap(); { - assert_eq!(source.len(), 63 as usize); + assert_eq!(source.len(), 60 as usize); } { - let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); + let fast_fields_composite = CompositeFile::open(source).unwrap(); let fast_field_reader: U64FastFieldReader = - fast_field_readers.open_reader(*FIELD).unwrap(); + U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap()); assert_eq!(fast_field_reader.get(0), 4u64); assert_eq!(fast_field_reader.get(1), 14_082_001u64); assert_eq!(fast_field_reader.get(2), 3_052u64); @@ -154,7 +155,7 @@ mod tests { { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::new(write).unwrap(); + let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); for _ in 0..10_000 { add_single_field_doc(&mut fast_field_writers, *FIELD, 100_000u64); @@ -164,12 +165,12 @@ mod tests { } let source = directory.open_read(&path).unwrap(); { - assert_eq!(source.len(), 36 as usize); + assert_eq!(source.len(), 33 as usize); } { - let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); + let fast_fields_composite = CompositeFile::open(source).unwrap(); let fast_field_reader: U64FastFieldReader = - fast_field_readers.open_reader(*FIELD).unwrap(); + U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap()); for doc in 0..10_000 { assert_eq!(fast_field_reader.get(doc), 100_000u64); } @@ -183,30 +184,35 @@ mod tests { { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::new(write).unwrap(); + let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut 
fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); // forcing the amplitude to be high add_single_field_doc(&mut fast_field_writers, *FIELD, 0u64); for i in 0u64..10_000u64 { - add_single_field_doc(&mut fast_field_writers, - *FIELD, - 5_000_000_000_000_000_000u64 + i); + add_single_field_doc( + &mut fast_field_writers, + *FIELD, + 5_000_000_000_000_000_000u64 + i, + ); } fast_field_writers.serialize(&mut serializer).unwrap(); serializer.close().unwrap(); } let source = directory.open_read(&path).unwrap(); { - assert_eq!(source.len(), 80044 as usize); + assert_eq!(source.len(), 80041 as usize); } { - let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); + let fast_fields_composite = CompositeFile::open(source).unwrap(); let fast_field_reader: U64FastFieldReader = - fast_field_readers.open_reader(*FIELD).unwrap(); + U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap()); + assert_eq!(fast_field_reader.get(0), 0u64); for doc in 1..10_001 { - assert_eq!(fast_field_reader.get(doc), - 5_000_000_000_000_000_000u64 + doc as u64 - 1u64); + assert_eq!( + fast_field_reader.get(doc), + 5_000_000_000_000_000_000u64 + doc as u64 - 1u64 + ); } } } @@ -221,7 +227,7 @@ mod tests { let schema = schema_builder.build(); { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::new(write).unwrap(); + let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); for i in -100i64..10_000i64 { let mut doc = Document::default(); @@ -233,12 +239,13 @@ mod tests { } let source = directory.open_read(&path).unwrap(); { - assert_eq!(source.len(), 17711 as usize); + assert_eq!(source.len(), 17708 as usize); } { - let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); + let fast_fields_composite = CompositeFile::open(source).unwrap(); let fast_field_reader: I64FastFieldReader = - fast_field_readers.open_reader(i64_field).unwrap(); + I64FastFieldReader::open(fast_fields_composite.open_read(i64_field).unwrap()); + assert_eq!(fast_field_reader.min_value(), -100i64); assert_eq!(fast_field_reader.max_value(), 9_999i64); for (doc, i) in (-100i64..10_000i64).enumerate() { @@ -262,7 +269,7 @@ mod tests { { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::new(write).unwrap(); + let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); let doc = Document::default(); fast_field_writers.add_document(&doc); @@ -272,9 +279,10 @@ mod tests { let source = directory.open_read(&path).unwrap(); { - let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); + + let fast_fields_composite = CompositeFile::open(source).unwrap(); let fast_field_reader: I64FastFieldReader = - fast_field_readers.open_reader(i64_field).unwrap(); + I64FastFieldReader::open(fast_fields_composite.open_read(i64_field).unwrap()); assert_eq!(fast_field_reader.get(0u32), 0i64); } } @@ -295,7 +303,7 @@ mod tests { let mut directory = RAMDirectory::create(); { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::new(write).unwrap(); + let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); for x in &permutation { add_single_field_doc(&mut fast_field_writers, *FIELD, *x); 
@@ -305,9 +313,10 @@ mod tests { } let source = directory.open_read(&path).unwrap(); { - let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); + let fast_fields_composite = CompositeFile::open(source).unwrap(); let fast_field_reader: U64FastFieldReader = - fast_field_readers.open_reader(*FIELD).unwrap(); + U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap()); + let mut a = 0u64; for _ in 0..n { assert_eq!(fast_field_reader.get(a as u32), permutation[a as usize]); @@ -333,13 +342,13 @@ mod tests { fn bench_intfastfield_veclookup(b: &mut Bencher) { let permutation = generate_permutation(); b.iter(|| { - let n = test::black_box(1000u32); - let mut a = 0u64; - for _ in 0u32..n { - a = permutation[a as usize]; - } - a - }); + let n = test::black_box(1000u32); + let mut a = 0u64; + for _ in 0u32..n { + a = permutation[a as usize]; + } + a + }); } #[bench] @@ -349,7 +358,7 @@ mod tests { let mut directory: RAMDirectory = RAMDirectory::create(); { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::new(write).unwrap(); + let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); for x in &permutation { add_single_field_doc(&mut fast_field_writers, *FIELD, *x); @@ -359,9 +368,11 @@ mod tests { } let source = directory.open_read(&path).unwrap(); { - let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); + let fast_fields_composite = CompositeFile::open(source).unwrap(); let fast_field_reader: U64FastFieldReader = - fast_field_readers.open_reader(*FIELD).unwrap(); + U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap()); + + b.iter(|| { let n = test::black_box(7000u32); let mut a = 0u64; @@ -380,7 +391,7 @@ mod tests { let mut directory: RAMDirectory = RAMDirectory::create(); { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::new(write).unwrap(); + let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); for x in &permutation { add_single_field_doc(&mut fast_field_writers, *FIELD, *x); @@ -390,17 +401,18 @@ mod tests { } let source = directory.open_read(&path).unwrap(); { - let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); + let fast_fields_composite = CompositeFile::open(source).unwrap(); let fast_field_reader: U64FastFieldReader = - fast_field_readers.open_reader(*FIELD).unwrap(); + U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap()); + b.iter(|| { - let n = test::black_box(1000u32); - let mut a = 0u32; - for _ in 0u32..n { - a = fast_field_reader.get(a) as u32; - } - a - }); + let n = test::black_box(1000u32); + let mut a = 0u32; + for _ in 0u32..n { + a = fast_field_reader.get(a) as u32; + } + a + }); } } } diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index aae1dd797..d7544b28c 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -1,20 +1,15 @@ -use std::io; -use std::collections::HashMap; use directory::ReadOnlySource; -use common::BinarySerializable; +use common::{self, BinarySerializable}; +use common::bitpacker::{compute_num_bits, BitUnpacker}; use DocId; -use schema::{Field, SchemaBuilder}; +use schema::SchemaBuilder; use std::path::Path; use schema::FAST; use directory::{WritePtr, RAMDirectory, Directory}; -use fastfield::FastFieldSerializer; 
-use fastfield::FastFieldsWriter; -use common::bitpacker::compute_num_bits; -use common::bitpacker::BitUnpacker; +use fastfield::{FastFieldSerializer, FastFieldsWriter}; use schema::FieldType; -use error::ResultExt; use std::mem; -use common; +use common::CompositeFile; use owning_ref::OwningRef; /// Trait for accessing a fastfield. @@ -111,10 +106,10 @@ impl FastFieldReader for U64FastFieldReader { let amplitude: u64; { let mut cursor = data.as_slice(); - min_value = u64::deserialize(&mut cursor) - .expect("Failed to read the min_value of fast field."); - amplitude = u64::deserialize(&mut cursor) - .expect("Failed to read the amplitude of fast field."); + min_value = + u64::deserialize(&mut cursor).expect("Failed to read the min_value of fast field."); + amplitude = + u64::deserialize(&mut cursor).expect("Failed to read the amplitude of fast field."); } let max_value = min_value + amplitude; @@ -135,33 +130,36 @@ impl From> for U64FastFieldReader { let mut schema_builder = SchemaBuilder::default(); let field = schema_builder.add_u64_field("field", FAST); let schema = schema_builder.build(); - let path = Path::new("test"); + let path = Path::new("__dummy__"); let mut directory: RAMDirectory = RAMDirectory::create(); { - let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::new(write).unwrap(); + let write: WritePtr = directory.open_write(path).expect( + "With a RAMDirectory, this should never fail.", + ); + let mut serializer = FastFieldSerializer::from_write(write).expect( + "With a RAMDirectory, this should never fail.", + ); let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); - for val in vals { - let mut fast_field_writer = fast_field_writers.get_field_writer(field).unwrap(); - fast_field_writer.add_val(val); + { + let fast_field_writer = fast_field_writers.get_field_writer(field).expect( + "With a RAMDirectory, this should never fail.", + ); + for val in vals { + fast_field_writer.add_val(val); + } } fast_field_writers.serialize(&mut serializer).unwrap(); serializer.close().unwrap(); } - directory - .open_read(path) - .chain_err(|| "Failed to open the file") - .and_then(|source| { - FastFieldsReader::from_source(source) - .chain_err(|| "Failed to read the file.") - }) - .and_then(|ff_readers| { - ff_readers - .open_reader(field) - .ok_or_else(|| "Failed to find the requested field".into()) - }) - .expect("This should never happen, please report.") + let source = directory.open_read(path).expect("Failed to open the file"); + let composite_file = + CompositeFile::open(source).expect("Failed to read the composite file"); + + let field_source = composite_file.open_read(field).expect( + "File component not found", + ); + U64FastFieldReader::open(field_source) } } @@ -212,7 +210,7 @@ impl FastFieldReader for I64FastFieldReader { let output_u64: &mut [u64] = unsafe { mem::transmute(output) }; self.underlying.get_range(start, output_u64); for mut_val in output_u64.iter_mut() { - *mut_val ^= 1 << 63; + *mut_val = common::u64_to_i64(*mut_val as u64) as u64; } } @@ -231,67 +229,3 @@ impl FastFieldReader for I64FastFieldReader { } } } - - - -/// The `FastFieldsReader` is the datastructure containing -/// all of the fast fields' data. -/// -/// It contains a mapping that associated these fields to -/// the proper slice in the fastfield reader file. 
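The i64 fast-field reader keeps the order-preserving u64 encoding of signed values; the in-place `^= 1 << 63` is simply routed through `common::u64_to_i64` now. A local sketch of that mapping and why it preserves ordering, written with local helpers so it does not assume the crate helpers' visibility.

    // Flipping the sign bit maps i64::MIN..=i64::MAX onto 0..=u64::MAX while
    // preserving ordering, which is what the bit-packed u64 storage needs.
    fn i64_to_u64_sketch(val: i64) -> u64 {
        (val as u64) ^ (1u64 << 63)
    }

    fn u64_to_i64_sketch(val: u64) -> i64 {
        (val ^ (1u64 << 63)) as i64
    }

    #[test]
    fn sign_bit_mapping_preserves_order() {
        assert_eq!(i64_to_u64_sketch(i64::min_value()), 0u64);
        assert!(i64_to_u64_sketch(-1) < i64_to_u64_sketch(0));
        assert_eq!(u64_to_i64_sketch(i64_to_u64_sketch(-42)), -42);
    }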
-pub struct FastFieldsReader { - source: ReadOnlySource, - field_offsets: HashMap, -} - -impl FastFieldsReader { - /// Opens a `FastFieldsReader` - /// - /// When opening the fast field reader, the - /// the list of the offset is read (as a footer of the - /// data file). - pub fn from_source(source: ReadOnlySource) -> io::Result { - let header_offset; - let field_offsets: Vec<(Field, u32)>; - { - let buffer = source.as_slice(); - { - let mut cursor = buffer; - header_offset = u32::deserialize(&mut cursor)?; - } - { - let mut cursor = &buffer[header_offset as usize..]; - field_offsets = Vec::deserialize(&mut cursor)?; - } - } - let mut end_offsets: Vec = field_offsets.iter().map(|&(_, offset)| offset).collect(); - end_offsets.push(header_offset); - let mut field_offsets_map: HashMap = HashMap::new(); - for (field_start_offsets, stop_offset) in - field_offsets.iter().zip(end_offsets.iter().skip(1)) { - let (field, start_offset) = *field_start_offsets; - field_offsets_map.insert(field, (start_offset, *stop_offset)); - } - Ok(FastFieldsReader { - field_offsets: field_offsets_map, - source: source, - }) - } - - /// Returns the u64 fast value reader if the field - /// is a u64 field indexed as "fast". - /// - /// Return None if the field is not a u64 field - /// indexed with the fast option. - /// - /// # Panics - /// May panic if the index is corrupted. - pub fn open_reader(&self, field: Field) -> Option { - self.field_offsets - .get(&field) - .map(|&(start, stop)| { - let field_source = self.source.slice(start as usize, stop as usize); - FFReader::open(field_source) - }) - } -} diff --git a/src/fastfield/serializer.rs b/src/fastfield/serializer.rs index ef6ffedf9..d26366de0 100644 --- a/src/fastfield/serializer.rs +++ b/src/fastfield/serializer.rs @@ -3,7 +3,8 @@ use directory::WritePtr; use schema::Field; use common::bitpacker::{compute_num_bits, BitPacker}; use common::CountingWriter; -use std::io::{self, Write, Seek, SeekFrom}; +use common::CompositeWrite; +use std::io::{self, Write}; /// `FastFieldSerializer` is in charge of serializing /// fastfields on disk. @@ -26,51 +27,61 @@ use std::io::{self, Write, Seek, SeekFrom}; /// * `close_field()` /// * `close()` pub struct FastFieldSerializer { - write: CountingWriter, - fields: Vec<(Field, u32)>, - min_value: u64, - field_open: bool, - bit_packer: BitPacker, + composite_write: CompositeWrite, } - impl FastFieldSerializer { /// Constructor - pub fn new(write: WritePtr) -> io::Result { + pub fn from_write(write: WritePtr) -> io::Result { // just making room for the pointer to header. 
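        // A minimal usage sketch of the reworked serializer API, assuming a
        // `WritePtr` named `write` and placeholder values for `field`, `min`
        // and `max` (none of these names are defined by this patch):
        //
        //     let mut serializer = FastFieldSerializer::from_write(write)?;
        //     {
        //         let mut field_ser = serializer.new_u64_fast_field(field, min, max)?;
        //         field_ser.add_val(min)?;
        //         field_ser.close_field()?;
        //     }
        //     serializer.close()?;
        //
        // The matching read path opens the data through `CompositeFile::open`
        // and then `U64FastFieldReader::open` on the per-field component.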
- let mut counting_writer = CountingWriter::wrap(write); - 0u32.serialize(&mut counting_writer)?; - Ok(FastFieldSerializer { - write: counting_writer, - fields: Vec::new(), - min_value: 0, - field_open: false, - bit_packer: BitPacker::new(0), - }) + let composite_write = CompositeWrite::wrap(write); + Ok(FastFieldSerializer { composite_write: composite_write }) } /// Start serializing a new u64 fast field - pub fn new_u64_fast_field(&mut self, - field: Field, - min_value: u64, - max_value: u64) - -> io::Result<()> { - if self.field_open { - return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed")); - } - self.min_value = min_value; - self.field_open = true; - self.fields.push((field, self.write.written_bytes() as u32)); - let write = &mut self.write; + pub fn new_u64_fast_field( + &mut self, + field: Field, + min_value: u64, + max_value: u64, + ) -> io::Result>> { + let field_write = self.composite_write.for_field(field); + FastSingleFieldSerializer::open(field_write, min_value, max_value) + } + + + /// Closes the serializer + /// + /// After this call the data must be persistently save on disk. + pub fn close(self) -> io::Result<()> { + self.composite_write.close() + } +} + +pub struct FastSingleFieldSerializer<'a, W: Write + 'a> { + bit_packer: BitPacker, + write: &'a mut W, + min_value: u64, +} + +impl<'a, W: Write> FastSingleFieldSerializer<'a, W> { + fn open( + write: &'a mut W, + min_value: u64, + max_value: u64, + ) -> io::Result> { min_value.serialize(write)?; let amplitude = max_value - min_value; amplitude.serialize(write)?; let num_bits = compute_num_bits(amplitude); - self.bit_packer = BitPacker::new(num_bits as usize); - Ok(()) + let bit_packer = BitPacker::new(num_bits as usize); + Ok(FastSingleFieldSerializer { + write: write, + bit_packer: bit_packer, + min_value: min_value, + }) } - /// Pushes a new value to the currently open u64 fast field. pub fn add_val(&mut self, val: u64) -> io::Result<()> { let val_to_write: u64 = val - self.min_value; @@ -78,33 +89,7 @@ impl FastFieldSerializer { Ok(()) } - /// Close the u64 fast field. - pub fn close_field(&mut self) -> io::Result<()> { - if !self.field_open { - return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed")); - } - self.field_open = false; - // adding some padding to make sure we - // can read the last elements with our u64 - // cursor - self.bit_packer.close(&mut self.write)?; - Ok(()) - } - - - /// Closes the serializer - /// - /// After this call the data must be persistently save on disk. - pub fn close(self) -> io::Result { - if self.field_open { - return Err(io::Error::new(io::ErrorKind::Other, "Last field not closed")); - } - let header_offset: usize = self.write.written_bytes() as usize; - let (mut write, written_size) = self.write.finish()?; - self.fields.serialize(&mut write)?; - write.seek(SeekFrom::Start(0))?; - (header_offset as u32).serialize(&mut write)?; - write.flush()?; - Ok(written_size) + pub fn close_field(mut self) -> io::Result<()> { + self.bit_packer.close(&mut self.write) } } diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 52b29972f..1750f90ca 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -58,9 +58,9 @@ impl FastFieldsWriter { /// Get the `FastFieldWriter` associated to a field. 
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> { // TODO optimize - self.field_writers - .iter_mut() - .find(|field_writer| field_writer.field == field) + self.field_writers.iter_mut().find(|field_writer| { + field_writer.field == field + }) } @@ -155,9 +155,9 @@ impl IntFastFieldWriter { /// associated to the document with the `DocId` n. /// (Well, `n-1` actually because of 0-indexing) pub fn add_val(&mut self, val: u64) { - VInt(val) - .serialize(&mut self.vals) - .expect("unable to serialize VInt to Vec"); + VInt(val).serialize(&mut self.vals).expect( + "unable to serialize VInt to Vec", + ); if val > self.val_max { self.val_max = val; @@ -208,13 +208,14 @@ impl IntFastFieldWriter { (self.val_min, self.val_max) }; - serializer.new_u64_fast_field(self.field, min, max)?; + + let mut single_field_serializer = serializer.new_u64_fast_field(self.field, min, max)?; let mut cursor = self.vals.as_slice(); while let Ok(VInt(val)) = VInt::deserialize(&mut cursor) { - serializer.add_val(val)?; + single_field_serializer.add_val(val)?; } - serializer.close_field() + single_field_serializer.close_field() } } diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index fc22dbc84..da09c49c9 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -40,9 +40,9 @@ impl DeleteQueue { { let mut delete_queue_wlock = delete_queue.inner.write().unwrap(); delete_queue_wlock.last_block = Some(Arc::new(Block { - operations: Arc::default(), - next: next_block, - })); + operations: Arc::default(), + next: next_block, + })); } delete_queue @@ -59,9 +59,11 @@ impl DeleteQueue { .expect("Read lock poisoned when opening delete queue cursor") .last_block .clone() - .expect("Failed to unwrap last_block. This should never happen + .expect( + "Failed to unwrap last_block. This should never happen as the Option<> is only here to make - initialization possible"); + initialization possible", + ); let operations_len = last_block.operations.len(); DeleteCursor { block: last_block, @@ -92,9 +94,9 @@ impl DeleteQueue { // be some unflushed operations. 
// fn flush(&self) -> Option> { - let mut self_wlock = self.inner - .write() - .expect("Failed to acquire write lock on delete queue writer"); + let mut self_wlock = self.inner.write().expect( + "Failed to acquire write lock on delete queue writer", + ); let delete_operations; { @@ -108,9 +110,9 @@ impl DeleteQueue { let next_block = NextBlock::from(self.clone()); { self_wlock.last_block = Some(Arc::new(Block { - operations: Arc::new(delete_operations), - next: next_block, - })); + operations: Arc::new(delete_operations), + next: next_block, + })); } self_wlock.last_block.clone() } @@ -132,18 +134,18 @@ impl From for NextBlock { impl NextBlock { fn next_block(&self) -> Option> { { - let next_read_lock = self.0 - .read() - .expect("Failed to acquire write lock in delete queue"); + let next_read_lock = self.0.read().expect( + "Failed to acquire write lock in delete queue", + ); if let InnerNextBlock::Closed(ref block) = *next_read_lock { return Some(block.clone()); } } let next_block; { - let mut next_write_lock = self.0 - .write() - .expect("Failed to acquire write lock in delete queue"); + let mut next_write_lock = self.0.write().expect( + "Failed to acquire write lock in delete queue", + ); match *next_write_lock { InnerNextBlock::Closed(ref block) => { return Some(block.clone()); diff --git a/src/indexer/doc_opstamp_mapping.rs b/src/indexer/doc_opstamp_mapping.rs index 03556ef17..e7d277f00 100644 --- a/src/indexer/doc_opstamp_mapping.rs +++ b/src/indexer/doc_opstamp_mapping.rs @@ -56,8 +56,10 @@ mod tests { #[test] fn test_doc_to_opstamp_mapping_none() { let doc_to_opstamp_mapping = DocToOpstampMapping::None; - assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(1), - u32::max_value()); + assert_eq!( + doc_to_opstamp_mapping.compute_doc_limit(1), + u32::max_value() + ); } #[test] diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 565c3089e..57acc00a7 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -102,14 +102,17 @@ impl !Sync for IndexWriter {} /// If the lockfile already exists, returns `Error::FileAlreadyExists`. /// # Panics /// If the heap size per thread is too small, panics. -pub fn open_index_writer(index: &Index, - num_threads: usize, - heap_size_in_bytes_per_thread: usize) - -> Result { +pub fn open_index_writer( + index: &Index, + num_threads: usize, + heap_size_in_bytes_per_thread: usize, +) -> Result { if heap_size_in_bytes_per_thread <= HEAP_SIZE_LIMIT as usize { - panic!(format!("The heap size per thread needs to be at least {}.", - HEAP_SIZE_LIMIT)); + panic!(format!( + "The heap size per thread needs to be at least {}.", + HEAP_SIZE_LIMIT + )); } let directory_lock = DirectoryLock::lock(index.directory().box_clone())?; @@ -156,12 +159,13 @@ pub fn open_index_writer(index: &Index, -pub fn compute_deleted_bitset(delete_bitset: &mut BitSet, - segment_reader: &SegmentReader, - delete_cursor: &mut DeleteCursor, - doc_opstamps: &DocToOpstampMapping, - target_opstamp: u64) - -> Result { +pub fn compute_deleted_bitset( + delete_bitset: &mut BitSet, + segment_reader: &SegmentReader, + delete_cursor: &mut DeleteCursor, + doc_opstamps: &DocToOpstampMapping, + target_opstamp: u64, +) -> Result { let mut might_have_changed = false; @@ -177,8 +181,12 @@ pub fn compute_deleted_bitset(delete_bitset: &mut BitSet, // Limit doc helps identify the first document // that may be affected by the delete operation. 
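            // In other words, `limit_doc` is an exclusive upper bound: only
            // documents whose doc id is strictly below it were added before
            // this delete operation was stamped, so only those are candidates
            // for the delete bitset below.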
let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp); - if let Some(mut docset) = - segment_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) { + let inverted_index = segment_reader.inverted_index(delete_op.term.field()); + if let Some(mut docset) = inverted_index.read_postings( + &delete_op.term, + SegmentPostingsOption::NoFreq, + ) + { while docset.advance() { let deleted_doc = docset.doc(); if deleted_doc < limit_doc { @@ -198,10 +206,11 @@ pub fn compute_deleted_bitset(delete_bitset: &mut BitSet, /// Advance delete for the given segment up /// to the target opstamp. -pub fn advance_deletes(mut segment: Segment, - segment_entry: &mut SegmentEntry, - target_opstamp: u64) - -> Result> { +pub fn advance_deletes( + mut segment: Segment, + segment_entry: &mut SegmentEntry, + target_opstamp: u64, +) -> Result> { let mut file_protect: Option = None; @@ -222,11 +231,13 @@ pub fn advance_deletes(mut segment: Segment, let delete_cursor = segment_entry.delete_cursor(); - compute_deleted_bitset(&mut delete_bitset, - &segment_reader, - delete_cursor, - &DocToOpstampMapping::None, - target_opstamp)?; + compute_deleted_bitset( + &mut delete_bitset, + &segment_reader, + delete_cursor, + &DocToOpstampMapping::None, + target_opstamp, + )?; for doc in 0u32..max_doc { if segment_reader.is_deleted(doc) { @@ -247,15 +258,16 @@ pub fn advance_deletes(mut segment: Segment, Ok(file_protect) } -fn index_documents(heap: &mut Heap, - table_size: usize, - segment: Segment, - schema: &Schema, - generation: usize, - document_iterator: &mut Iterator, - segment_updater: &mut SegmentUpdater, - mut delete_cursor: DeleteCursor) - -> Result { +fn index_documents( + heap: &mut Heap, + table_size: usize, + segment: Segment, + schema: &Schema, + generation: usize, + document_iterator: &mut Iterator, + segment_updater: &mut SegmentUpdater, + mut delete_cursor: DeleteCursor, +) -> Result { heap.clear(); let segment_id = segment.id(); let mut segment_writer = SegmentWriter::for_segment(heap, table_size, segment.clone(), schema)?; @@ -265,8 +277,10 @@ fn index_documents(heap: &mut Heap, // One is the memory arena dedicated to the segment is // getting full. if segment_writer.is_buffer_full() { - info!("Buffer limit reached, flushing segment with maxdoc={}.", - segment_writer.max_doc()); + info!( + "Buffer limit reached, flushing segment with maxdoc={}.", + segment_writer.max_doc() + ); break; } // The second is the term dictionary hash table @@ -275,8 +289,10 @@ fn index_documents(heap: &mut Heap, // Tantivy does not resize its hashtable. When it reaches // capacity, we just stop indexing new document. 
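            // In that case the batch is simply cut short: the documents
            // buffered so far are flushed as this segment, and the rest of
            // `document_iterator` is picked up again by the worker loop and
            // indexed into a fresh segment.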
if segment_writer.is_term_saturated() { - info!("Term dic saturated, flushing segment with maxdoc={}.", - segment_writer.max_doc()); + info!( + "Term dic saturated, flushing segment with maxdoc={}.", + segment_writer.max_doc() + ); break; } } @@ -296,11 +312,13 @@ fn index_documents(heap: &mut Heap, let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps); let segment_reader = SegmentReader::open(segment)?; let mut deleted_bitset = BitSet::with_capacity(num_docs as usize); - let may_have_deletes = compute_deleted_bitset(&mut deleted_bitset, - &segment_reader, - &mut delete_cursor, - &doc_to_opstamps, - last_docstamp)?; + let may_have_deletes = compute_deleted_bitset( + &mut deleted_bitset, + &segment_reader, + &mut delete_cursor, + &doc_to_opstamps, + last_docstamp, + )?; let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, { if may_have_deletes { @@ -327,14 +345,15 @@ impl IndexWriter { join_handle .join() .expect("Indexing Worker thread panicked") - .chain_err(|| ErrorKind::ErrorInThread("Error in indexing worker thread.".into()))?; + .chain_err(|| { + ErrorKind::ErrorInThread("Error in indexing worker thread.".into()) + })?; } drop(self.workers_join_handle); - let result = - self.segment_updater - .wait_merging_thread() - .chain_err(|| ErrorKind::ErrorInThread("Failed to join merging thread.".into())); + let result = self.segment_updater.wait_merging_thread().chain_err(|| { + ErrorKind::ErrorInThread("Failed to join merging thread.".into()) + }); if let Err(ref e) = result { error!("Some merging thread failed {:?}", e); @@ -347,8 +366,10 @@ impl IndexWriter { pub fn add_segment(&mut self, segment_meta: SegmentMeta) { let delete_cursor = self.delete_queue.cursor(); let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None); - self.segment_updater - .add_segment(self.generation, segment_entry); + self.segment_updater.add_segment( + self.generation, + segment_entry, + ); } #[doc(hidden)] @@ -372,7 +393,11 @@ impl IndexWriter { let mut delete_cursor = self.delete_queue.cursor(); let join_handle: JoinHandle> = thread::Builder::new() - .name(format!("indexing thread {} for gen {}", self.worker_id, generation)) + .name(format!( + "indexing thread {} for gen {}", + self.worker_id, + generation + )) .spawn(move || { loop { @@ -396,14 +421,16 @@ impl IndexWriter { return Ok(()); } let segment = segment_updater.new_segment(); - index_documents(&mut heap, - table_size, - segment, - &schema, - generation, - &mut document_iterator, - &mut segment_updater, - delete_cursor.clone())?; + index_documents( + &mut heap, + table_size, + segment, + &schema, + generation, + &mut document_iterator, + &mut segment_updater, + delete_cursor.clone(), + )?; } })?; @@ -436,9 +463,10 @@ impl IndexWriter { } /// Merges a given list of segments - pub fn merge(&mut self, - segment_ids: &[SegmentId]) - -> impl Future { + pub fn merge( + &mut self, + segment_ids: &[SegmentId], + ) -> impl Future { self.segment_updater.start_merge(segment_ids) } @@ -522,14 +550,15 @@ impl IndexWriter { self.recreate_document_channel(); let mut former_workers_join_handle = Vec::new(); - swap(&mut former_workers_join_handle, - &mut self.workers_join_handle); + swap( + &mut former_workers_join_handle, + &mut self.workers_join_handle, + ); for worker_handle in former_workers_join_handle { - let indexing_worker_result = - worker_handle - .join() - .map_err(|e| Error::from_kind(ErrorKind::ErrorInThread(format!("{:?}", e))))?; + let indexing_worker_result = worker_handle.join().map_err(|e| { + 
Error::from_kind(ErrorKind::ErrorInThread(format!("{:?}", e))) + })?; indexing_worker_result?; // add a new worker for the next generation. @@ -623,13 +652,17 @@ mod tests { let schema_builder = schema::SchemaBuilder::default(); let index = Index::create_in_ram(schema_builder.build()); let index_writer = index.writer(40_000_000).unwrap(); - assert_eq!(format!("{:?}", index_writer.get_merge_policy()), - "LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \ - level_log_size: 0.75 }"); + assert_eq!( + format!("{:?}", index_writer.get_merge_policy()), + "LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \ + level_log_size: 0.75 }" + ); let merge_policy = box NoMergePolicy::default(); index_writer.set_merge_policy(merge_policy); - assert_eq!(format!("{:?}", index_writer.get_merge_policy()), - "NoMergePolicy"); + assert_eq!( + format!("{:?}", index_writer.get_merge_policy()), + "NoMergePolicy" + ); } #[test] @@ -719,9 +752,9 @@ mod tests { } // this should create 8 segments and trigger a merge. index_writer.commit().expect("commit failed"); - index_writer - .wait_merging_threads() - .expect("waiting merging thread failed"); + index_writer.wait_merging_threads().expect( + "waiting merging thread failed", + ); index.load_searchers().unwrap(); assert_eq!(num_docs_containing("a"), 200); diff --git a/src/indexer/log_merge_policy.rs b/src/indexer/log_merge_policy.rs index f565e5ae1..b5c860e25 100644 --- a/src/indexer/log_merge_policy.rs +++ b/src/indexer/log_merge_policy.rs @@ -62,7 +62,9 @@ impl MergePolicy for LogMergePolicy { let size_sorted_log_tuples: Vec<_> = size_sorted_tuples .into_iter() - .map(|(ind, num_docs)| (ind, (self.clip_min_size(num_docs) as f64).log2())) + .map(|(ind, num_docs)| { + (ind, (self.clip_min_size(num_docs) as f64).log2()) + }) .collect(); let (first_ind, first_score) = size_sorted_log_tuples[0]; @@ -79,7 +81,9 @@ impl MergePolicy for LogMergePolicy { levels .iter() .filter(|level| level.len() >= self.min_merge_size) - .map(|ind_vec| MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect())) + .map(|ind_vec| { + MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect()) + }) .collect() } @@ -138,17 +142,19 @@ mod tests { // * one with the 6 * 10-docs segments // * one with the 3 * 1000-docs segments // no MergeCandidate expected for the 2 * 10_000-docs segments as min_merge_size=3 - let test_input = vec![seg_meta(10), - seg_meta(10), - seg_meta(10), - seg_meta(1000), - seg_meta(1000), - seg_meta(1000), - seg_meta(10000), - seg_meta(10000), - seg_meta(10), - seg_meta(10), - seg_meta(10)]; + let test_input = vec![ + seg_meta(10), + seg_meta(10), + seg_meta(10), + seg_meta(1000), + seg_meta(1000), + seg_meta(1000), + seg_meta(10000), + seg_meta(10000), + seg_meta(10), + seg_meta(10), + seg_meta(10), + ]; let result_list = test_merge_policy().compute_merge_candidates(&test_input); assert_eq!(result_list.len(), 2); } @@ -156,24 +162,28 @@ mod tests { #[test] fn test_log_merge_policy_within_levels() { // multiple levels all get merged correctly - let test_input = vec![seg_meta(10), // log2(10) = ~3.32 (> 3.58 - 0.75) - seg_meta(11), // log2(11) = ~3.46 - seg_meta(12), // log2(12) = ~3.58 - seg_meta(800), // log2(800) = ~9.64 (> 9.97 - 0.75) - seg_meta(1000), // log2(1000) = ~9.97 - seg_meta(1000)]; // log2(1000) = ~9.97 + let test_input = vec![ + seg_meta(10), // log2(10) = ~3.32 (> 3.58 - 0.75) + seg_meta(11), // log2(11) = ~3.46 + seg_meta(12), // log2(12) = ~3.58 + seg_meta(800), // log2(800) = ~9.64 (> 9.97 - 0.75) + 
seg_meta(1000), // log2(1000) = ~9.97 + seg_meta(1000), + ]; // log2(1000) = ~9.97 let result_list = test_merge_policy().compute_merge_candidates(&test_input); assert_eq!(result_list.len(), 2); } #[test] fn test_log_merge_policy_small_segments() { // segments under min_layer_size are merged together - let test_input = vec![seg_meta(1), - seg_meta(1), - seg_meta(1), - seg_meta(2), - seg_meta(2), - seg_meta(2)]; + let test_input = vec![ + seg_meta(1), + seg_meta(1), + seg_meta(1), + seg_meta(2), + seg_meta(2), + seg_meta(2), + ]; let result_list = test_merge_policy().compute_merge_candidates(&test_input); assert_eq!(result_list.len(), 1); } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index b71774059..594f952d5 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -5,7 +5,7 @@ use DocId; use core::SerializableSegment; use schema::FieldValue; use indexer::SegmentSerializer; -use postings::PostingsSerializer; +use postings::InvertedIndexSerializer; use fastfield::U64FastFieldReader; use itertools::Itertools; use postings::Postings; @@ -17,9 +17,9 @@ use fastfield::FastFieldSerializer; use fastfield::FastFieldReader; use store::StoreWriter; use std::cmp::{min, max}; +use termdict::TermDictionary; use schema::Term; use termdict::TermStreamer; -use postings::SegmentPostingsOption; pub struct IndexMerger { schema: Schema, @@ -28,33 +28,11 @@ pub struct IndexMerger { } -struct DeltaPositionComputer { - buffer: Vec, -} - -impl DeltaPositionComputer { - fn new() -> DeltaPositionComputer { - DeltaPositionComputer { buffer: vec![0u32; 512] } - } - - fn compute_delta_positions(&mut self, positions: &[u32]) -> &[u32] { - if positions.len() > self.buffer.len() { - self.buffer.resize(positions.len(), 0u32); - } - let mut last_pos = 0u32; - for (i, position) in positions.iter().cloned().enumerate() { - self.buffer[i] = position - last_pos; - last_pos = position; - } - &self.buffer[..positions.len()] - } -} - - -fn compute_min_max_val(u64_reader: &U64FastFieldReader, - max_doc: DocId, - delete_bitset: &DeleteBitSet) - -> Option<(u64, u64)> { +fn compute_min_max_val( + u64_reader: &U64FastFieldReader, + max_doc: DocId, + delete_bitset: &DeleteBitSet, +) -> Option<(u64, u64)> { if max_doc == 0 { None } else if !delete_bitset.has_deletes() { @@ -72,18 +50,46 @@ fn compute_min_max_val(u64_reader: &U64FastFieldReader, } } -fn extract_fieldnorm_reader(segment_reader: &SegmentReader, - field: Field) - -> Option { +fn extract_fieldnorm_reader( + segment_reader: &SegmentReader, + field: Field, +) -> Option { segment_reader.get_fieldnorms_reader(field) } -fn extract_fast_field_reader(segment_reader: &SegmentReader, - field: Field) - -> Option { - segment_reader.fast_fields_reader().open_reader(field) +fn extract_fast_field_reader( + segment_reader: &SegmentReader, + field: Field, +) -> Option { + segment_reader.get_fast_field_reader(field).ok() + } +struct DeltaComputer { + buffer: Vec, +} + +impl DeltaComputer { + fn new() -> DeltaComputer { + DeltaComputer { buffer: vec![0u32; 512] } + } + + fn compute_delta(&mut self, positions: &[u32]) -> &[u32] { + if positions.len() > self.buffer.len() { + self.buffer.resize(positions.len(), 0u32); + } + let mut last_pos = 0u32; + let num_positions = positions.len(); + for i in 0..num_positions { + let cur_pos = positions[i]; + self.buffer[i] = cur_pos - last_pos; + last_pos = cur_pos; + } + &self.buffer[..positions.len()] + } +} + + impl IndexMerger { pub fn open(schema: Schema, segments: &[Segment]) -> Result { let mut readers = vec![]; @@ -96,10 
+102,10 @@ impl IndexMerger { } } Ok(IndexMerger { - schema: schema, - readers: readers, - max_doc: max_doc, - }) + schema: schema, + readers: readers, + max_doc: max_doc, + }) } fn write_fieldnorms(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> { @@ -110,9 +116,11 @@ impl IndexMerger { .filter(|&(_, field_entry)| field_entry.is_indexed()) .map(|(field_id, _)| Field(field_id as u32)) .collect(); - self.generic_write_fast_field(fieldnorm_fastfields, - &extract_fieldnorm_reader, - fast_field_serializer) + self.generic_write_fast_field( + fieldnorm_fastfields, + &extract_fieldnorm_reader, + fast_field_serializer, + ) } fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> { @@ -123,19 +131,21 @@ impl IndexMerger { .filter(|&(_, field_entry)| field_entry.is_int_fast()) .map(|(field_id, _)| Field(field_id as u32)) .collect(); - self.generic_write_fast_field(fast_fields, - &extract_fast_field_reader, - fast_field_serializer) + self.generic_write_fast_field( + fast_fields, + &extract_fast_field_reader, + fast_field_serializer, + ) } // used both to merge field norms and regular u64 fast fields. - fn generic_write_fast_field(&self, - fields: Vec, - field_reader_extractor: &Fn(&SegmentReader, Field) - -> Option, - fast_field_serializer: &mut FastFieldSerializer) - -> Result<()> { + fn generic_write_fast_field( + &self, + fields: Vec, + field_reader_extractor: &Fn(&SegmentReader, Field) -> Option, + fast_field_serializer: &mut FastFieldSerializer, + ) -> Result<()> { for field in fields { @@ -147,19 +157,25 @@ impl IndexMerger { match field_reader_extractor(reader, field) { Some(u64_reader) => { if let Some((seg_min_val, seg_max_val)) = - compute_min_max_val(&u64_reader, - reader.max_doc(), - reader.delete_bitset()) { + compute_min_max_val( + &u64_reader, + reader.max_doc(), + reader.delete_bitset(), + ) + { // the segment has some non-deleted documents min_val = min(min_val, seg_min_val); max_val = max(max_val, seg_max_val); - u64_readers - .push((reader.max_doc(), u64_reader, reader.delete_bitset())); + u64_readers.push(( + reader.max_doc(), + u64_reader, + reader.delete_bitset(), + )); } } None => { - let error_msg = format!("Failed to find a u64_reader for field {:?}", - field); + let error_msg = + format!("Failed to find a u64_reader for field {:?}", field); error!("{}", error_msg); bail!(ErrorKind::SchemaError(error_msg)); } @@ -174,50 +190,68 @@ impl IndexMerger { assert!(min_val <= max_val); - fast_field_serializer - .new_u64_fast_field(field, min_val, max_val)?; + + let mut fast_single_field_serializer = fast_field_serializer.new_u64_fast_field( + field, + min_val, + max_val, + )?; for (max_doc, u64_reader, delete_bitset) in u64_readers { for doc_id in 0..max_doc { if !delete_bitset.is_deleted(doc_id) { let val = u64_reader.get(doc_id); - fast_field_serializer.add_val(val)?; + fast_single_field_serializer.add_val(val)?; } } } - fast_field_serializer.close_field()?; + fast_single_field_serializer.close_field()?; } Ok(()) } - fn write_postings(&self, serializer: &mut PostingsSerializer) -> Result<()> { + fn write_postings(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> { - let mut merged_terms = TermMerger::from(&self.readers[..]); - let mut delta_position_computer = DeltaPositionComputer::new(); + let mut delta_computer = DeltaComputer::new(); - let mut max_doc = 0; - - // map from segment doc ids to the resulting merged segment doc id. 
- let mut merged_doc_id_map: Vec>> = Vec::with_capacity(self.readers.len()); - - for reader in &self.readers { - let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize); - for doc_id in 0..reader.max_doc() { - if reader.is_deleted(doc_id) { - segment_local_map.push(None); - } else { - segment_local_map.push(Some(max_doc)); - max_doc += 1u32; - } + let mut indexed_fields = vec![]; + for (field_ord, field_entry) in self.schema.fields().iter().enumerate() { + if field_entry.is_indexed() { + indexed_fields.push(Field(field_ord as u32)); } - merged_doc_id_map.push(segment_local_map); } - let mut last_field: Option = None; + for indexed_field in indexed_fields { - let mut segment_postings_option = SegmentPostingsOption::FreqAndPositions; + let field_readers = self.readers + .iter() + .map(|reader| reader.inverted_index(indexed_field)) + .collect::>(); - while merged_terms.advance() { + let field_term_streams = field_readers + .iter() + .map(|field_reader| field_reader.terms().stream()) + .collect(); + + let mut merged_terms = TermMerger::new(field_term_streams); + let mut max_doc = 0; + + // map from segment doc ids to the resulting merged segment doc id. + let mut merged_doc_id_map: Vec>> = + Vec::with_capacity(self.readers.len()); + + for reader in &self.readers { + let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize); + for doc_id in 0..reader.max_doc() { + if reader.is_deleted(doc_id) { + segment_local_map.push(None); + } else { + segment_local_map.push(Some(max_doc)); + max_doc += 1u32; + } + } + merged_doc_id_map.push(segment_local_map); + } // Create the total list of doc ids // by stacking the doc ids from the different segment. @@ -229,86 +263,92 @@ impl IndexMerger { // - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, // seg0.max_doc + seg1.max_doc + seg2.max_doc] // ... - let term = Term::wrap(merged_terms.key()); - let current_field = term.field(); - if last_field != Some(current_field) { - // we reached a new field. - let field_entry = self.schema.get_field_entry(current_field); - // ... set segment postings option the new field. - segment_postings_option = field_entry - .field_type() - .get_segment_postings_option() - .expect("Encountered a field that is not supposed to be - indexed. Have you modified the schema?"); + let mut field_serializer = serializer.new_field(indexed_field)?; - last_field = Some(current_field); + let field_entry = self.schema.get_field_entry(indexed_field); - // it is perfectly safe to call `.new_field` - // even if there is no postings associated. - serializer.new_field(current_field); - } + // ... set segment postings option the new field. + let segment_postings_option = field_entry + .field_type() + .get_segment_postings_option() + .expect( + "Encountered a field that is not supposed to be + indexed. 
Have you modified the schema?", + ); - // Let's compute the list of non-empty posting lists - let segment_postings: Vec<_> = merged_terms - .current_kvs() - .iter() - .flat_map(|heap_item| { - let segment_ord = heap_item.segment_ord; - let term_info = heap_item.streamer.value(); - let segment_reader = &self.readers[heap_item.segment_ord]; - let mut segment_postings = - segment_reader - .read_postings_from_terminfo(term_info, segment_postings_option); - if segment_postings.advance() { - Some((segment_ord, segment_postings)) - } else { - None + while merged_terms.advance() { + + let term = Term::wrap(merged_terms.key()); + + // Let's compute the list of non-empty posting lists + let segment_postings: Vec<_> = merged_terms + .current_kvs() + .iter() + .flat_map(|heap_item| { + let segment_ord = heap_item.segment_ord; + let term_info = heap_item.streamer.value(); + let segment_reader = &self.readers[heap_item.segment_ord]; + let inverted_index = segment_reader.inverted_index(term.field()); + let mut segment_postings = inverted_index.read_postings_from_terminfo( + term_info, + segment_postings_option, + ); + if segment_postings.advance() { + Some((segment_ord, segment_postings)) + } else { + None + } + }) + .collect(); + + // At this point, `segment_postings` contains the posting list + // of all of the segments containing the given term. + // + // These segments are non-empty and advance has already been called. + + if !segment_postings.is_empty() { + // If not, the `term` will be entirely removed. + + // We know that there is at least one document containing + // the term, so we add it. + field_serializer.new_term(term.as_ref())?; + + // We can now serialize this postings, by pushing each document to the + // postings serializer. + for (segment_ord, mut segment_postings) in segment_postings { + let old_to_new_doc_id = &merged_doc_id_map[segment_ord]; + loop { + // `.advance()` has been called once before the loop. + // Hence we cannot use a `while segment_postings.advance()` loop. + if let Some(remapped_doc_id) = + old_to_new_doc_id[segment_postings.doc() as usize] + { + // we make sure to only write the term iff + // there is at least one document. + let positions: &[u32] = segment_postings.positions(); + let term_freq = segment_postings.term_freq(); + let delta_positions = delta_computer.compute_delta(positions); + field_serializer.write_doc( + remapped_doc_id, + term_freq, + delta_positions, + )?; + } + if !segment_postings.advance() { + break; + } + } } - }) - .collect(); - // At this point, `segment_postings` contains the posting list - // of all of the segments containing the given term. - // - // These segments are non-empty and advance has already been called. - - if segment_postings.is_empty() { - // by continuing here, the `term` will be entirely removed. - continue; - } - - // We know that there is at least one document containing - // the term, so we add it. - serializer.new_term(term.as_ref())?; - - // We can now serialize this postings, by pushing each document to the - // postings serializer. - - for (segment_ord, mut segment_postings) in segment_postings { - let old_to_new_doc_id = &merged_doc_id_map[segment_ord]; - loop { - // `.advance()` has been called once before the loop. - // Hence we cannot use a `while segment_postings.advance()` loop. - if let Some(remapped_doc_id) = - old_to_new_doc_id[segment_postings.doc() as usize] { - // we make sure to only write the term iff - // there is at least one document. 
- let delta_positions: &[u32] = - delta_position_computer - .compute_delta_positions(segment_postings.positions()); - let term_freq = segment_postings.term_freq(); - serializer - .write_doc(remapped_doc_id, term_freq, delta_positions)?; - } - if !segment_postings.advance() { - break; - } + // closing the term. + field_serializer.close_term()?; } + } - // closing the term. - serializer.close_term()?; + field_serializer.close()?; + } Ok(()) } @@ -318,9 +358,9 @@ impl IndexMerger { let store_reader = reader.get_store_reader(); for doc_id in 0..reader.max_doc() { if !reader.is_deleted(doc_id) { - let doc = try!(store_reader.get(doc_id)); + let doc = store_reader.get(doc_id)?; let field_values: Vec<&FieldValue> = doc.field_values().iter().collect(); - try!(store_writer.store(&field_values)); + store_writer.store(&field_values)?; } } } @@ -330,11 +370,15 @@ impl IndexMerger { impl SerializableSegment for IndexMerger { fn write(&self, mut serializer: SegmentSerializer) -> Result { - try!(self.write_postings(serializer.get_postings_serializer())); - try!(self.write_fieldnorms(serializer.get_fieldnorms_serializer())); - try!(self.write_fast_fields(serializer.get_fast_field_serializer())); - try!(self.write_storable_fields(serializer.get_store_writer())); - try!(serializer.close()); + self.write_postings(serializer.get_postings_serializer())?; + self.write_fieldnorms( + serializer.get_fieldnorms_serializer(), + )?; + self.write_fast_fields( + serializer.get_fast_field_serializer(), + )?; + self.write_storable_fields(serializer.get_store_writer())?; + serializer.close()?; Ok(self.max_doc) } } @@ -411,14 +455,13 @@ mod tests { } } { - let segment_ids = index - .searchable_segment_ids() - .expect("Searchable segments failed."); + let segment_ids = index.searchable_segment_ids().expect( + "Searchable segments failed.", + ); let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); - index_writer - .merge(&segment_ids) - .wait() - .expect("Merging failed"); + index_writer.merge(&segment_ids).wait().expect( + "Merging failed", + ); index_writer.wait_merging_threads().unwrap(); } { @@ -431,14 +474,22 @@ mod tests { collector.docs() }; { - assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]), - vec![1, 2, 4]); - assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]), - vec![0, 3]); - assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "g")]), - vec![4]); - assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]), - vec![0, 1, 2, 3, 4]); + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "a")]), + vec![1, 2, 4] + ); + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "af")]), + vec![0, 3] + ); + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "g")]), + vec![4] + ); + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "b")]), + vec![0, 1, 2, 3, 4] + ); } { let doc = searcher.doc(&DocAddress(0, 0)).unwrap(); @@ -467,8 +518,10 @@ mod tests { assert!(searcher.search(&query, &mut collector).is_ok()); collector.vals() }; - assert_eq!(get_fast_vals(vec![Term::from_field_text(text_field, "a")]), - vec![5, 7, 13]); + assert_eq!( + get_fast_vals(vec![Term::from_field_text(text_field, "a")]), + vec![5, 7, 13] + ); } } } @@ -515,14 +568,22 @@ mod tests { assert_eq!(searcher.num_docs(), 2); assert_eq!(searcher.segment_readers()[0].num_docs(), 2); assert_eq!(searcher.segment_readers()[0].max_doc(), 3); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), - vec![1]); 
- assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), - vec![1]); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), - vec![3]); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), - vec![1, 3]); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "a")), + vec![1] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "b")), + vec![1] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "c")), + vec![3] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "d")), + vec![1, 3] + ); } { // a second commit @@ -554,20 +615,34 @@ mod tests { assert_eq!(searcher.segment_readers()[0].max_doc(), 3); assert_eq!(searcher.segment_readers()[1].num_docs(), 2); assert_eq!(searcher.segment_readers()[1].max_doc(), 4); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), - vec![3]); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), - vec![3]); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), - vec![6_000]); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), - vec![6_000, 7_000]); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "a")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "b")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "c")), + vec![3] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "d")), + vec![3] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "e")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "f")), + vec![6_000] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "g")), + vec![6_000, 7_000] + ); let score_field_reader: U64FastFieldReader = searcher .segment_reader(0) @@ -585,33 +660,46 @@ mod tests { } { // merging the segments - let segment_ids = index - .searchable_segment_ids() - .expect("Searchable segments failed."); - index_writer - .merge(&segment_ids) - .wait() - .expect("Merging failed"); + let segment_ids = index.searchable_segment_ids().expect( + "Searchable segments failed.", + ); + index_writer.merge(&segment_ids).wait().expect( + "Merging failed", + ); index.load_searchers().unwrap(); let ref searcher = *index.searcher(); assert_eq!(searcher.segment_readers().len(), 1); assert_eq!(searcher.num_docs(), 3); assert_eq!(searcher.segment_readers()[0].num_docs(), 3); assert_eq!(searcher.segment_readers()[0].max_doc(), 3); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), - vec![3]); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), - vec![3]); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), - vec![6_000]); - assert_eq!(search_term(&searcher, 
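            // After this single-segment merge the deleted documents are
            // physically dropped, so the resulting segment's `max_doc` is
            // expected to shrink back to its `num_docs` (2), as asserted
            // further down.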
Term::from_field_text(text_field, "g")), - vec![6_000, 7_000]); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "a")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "b")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "c")), + vec![3] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "d")), + vec![3] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "e")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "f")), + vec![6_000] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "g")), + vec![6_000, 7_000] + ); let score_field_reader: U64FastFieldReader = searcher .segment_reader(0) .get_fast_field_reader(score_field) @@ -630,20 +718,34 @@ mod tests { assert_eq!(searcher.num_docs(), 2); assert_eq!(searcher.segment_readers()[0].num_docs(), 2); assert_eq!(searcher.segment_readers()[0].max_doc(), 3); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), - vec![6_000]); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), - vec![6_000, 7_000]); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "a")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "b")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "c")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "d")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "e")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "f")), + vec![6_000] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "g")), + vec![6_000, 7_000] + ); let score_field_reader: U64FastFieldReader = searcher .segment_reader(0) .get_fast_field_reader(score_field) @@ -653,13 +755,12 @@ mod tests { } { // Test merging a single segment in order to remove deletes. 
- let segment_ids = index - .searchable_segment_ids() - .expect("Searchable segments failed."); - index_writer - .merge(&segment_ids) - .wait() - .expect("Merging failed"); + let segment_ids = index.searchable_segment_ids().expect( + "Searchable segments failed.", + ); + index_writer.merge(&segment_ids).wait().expect( + "Merging failed", + ); index.load_searchers().unwrap(); let ref searcher = *index.searcher(); @@ -667,20 +768,34 @@ mod tests { assert_eq!(searcher.num_docs(), 2); assert_eq!(searcher.segment_readers()[0].num_docs(), 2); assert_eq!(searcher.segment_readers()[0].max_doc(), 2); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), - vec![6_000]); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), - vec![6_000, 7_000]); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "a")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "b")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "c")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "d")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "e")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "f")), + vec![6_000] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "g")), + vec![6_000, 7_000] + ); let score_field_reader: U64FastFieldReader = searcher .segment_reader(0) .get_fast_field_reader(score_field) @@ -692,13 +807,12 @@ mod tests { { // Test removing all docs index_writer.delete_term(Term::from_field_text(text_field, "g")); - let segment_ids = index - .searchable_segment_ids() - .expect("Searchable segments failed."); - index_writer - .merge(&segment_ids) - .wait() - .expect("Merging failed"); + let segment_ids = index.searchable_segment_ids().expect( + "Searchable segments failed.", + ); + index_writer.merge(&segment_ids).wait().expect( + "Merging failed", + ); index.load_searchers().unwrap(); let ref searcher = *index.searcher(); diff --git a/src/indexer/segment_entry.rs b/src/indexer/segment_entry.rs index 082f9e1c1..9e8ad74a5 100644 --- a/src/indexer/segment_entry.rs +++ b/src/indexer/segment_entry.rs @@ -44,10 +44,11 @@ pub struct SegmentEntry { impl SegmentEntry { /// Create a new `SegmentEntry` - pub fn new(segment_meta: SegmentMeta, - delete_cursor: DeleteCursor, - delete_bitset: Option) - -> SegmentEntry { + pub fn new( + segment_meta: SegmentMeta, + delete_cursor: DeleteCursor, + delete_bitset: Option, + ) -> SegmentEntry { SegmentEntry { meta: segment_meta, state: SegmentState::Ready, diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 35c264cdc..a040c2ed5 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -32,31 +32,36 @@ pub struct SegmentManager { impl Debug for SegmentManager { fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { let lock = self.read(); - write!(f, - "{{ uncommitted: {:?}, 
committed: {:?} }}", - lock.uncommitted, - lock.committed) + write!( + f, + "{{ uncommitted: {:?}, committed: {:?} }}", + lock.uncommitted, + lock.committed + ) } } -pub fn get_mergeable_segments(segment_manager: &SegmentManager) - -> (Vec, Vec) { +pub fn get_mergeable_segments( + segment_manager: &SegmentManager, +) -> (Vec, Vec) { let registers_lock = segment_manager.read(); - (registers_lock.committed.get_mergeable_segments(), - registers_lock.uncommitted.get_mergeable_segments()) + ( + registers_lock.committed.get_mergeable_segments(), + registers_lock.uncommitted.get_mergeable_segments(), + ) } impl SegmentManager { - pub fn from_segments(segment_metas: Vec, - delete_cursor: DeleteCursor) - -> SegmentManager { + pub fn from_segments( + segment_metas: Vec, + delete_cursor: DeleteCursor, + ) -> SegmentManager { SegmentManager { registers: RwLock::new(SegmentRegisters { - uncommitted: SegmentRegister::default(), - committed: SegmentRegister::new(segment_metas, - delete_cursor), - writing: HashSet::new(), - }), + uncommitted: SegmentRegister::default(), + committed: SegmentRegister::new(segment_metas, delete_cursor), + writing: HashSet::new(), + }), } } @@ -94,25 +99,24 @@ impl SegmentManager { pub fn segment_entry(&self, segment_id: &SegmentId) -> Option { let registers = self.read(); - registers - .committed - .segment_entry(segment_id) - .or_else(|| registers.uncommitted.segment_entry(segment_id)) + registers.committed.segment_entry(segment_id).or_else(|| { + registers.uncommitted.segment_entry(segment_id) + }) } // Lock poisoning should never happen : // The lock is acquired and released within this class, // and the operations cannot panic. fn read(&self) -> RwLockReadGuard { - self.registers - .read() - .expect("Failed to acquire read lock on SegmentManager.") + self.registers.read().expect( + "Failed to acquire read lock on SegmentManager.", + ) } fn write(&self) -> RwLockWriteGuard { - self.registers - .write() - .expect("Failed to acquire write lock on SegmentManager.") + self.registers.write().expect( + "Failed to acquire write lock on SegmentManager.", + ) } pub fn commit(&self, segment_entries: Vec) { @@ -140,9 +144,11 @@ impl SegmentManager { } - pub fn cancel_merge(&self, - before_merge_segment_ids: &[SegmentId], - after_merge_segment_id: SegmentId) { + pub fn cancel_merge( + &self, + before_merge_segment_ids: &[SegmentId], + after_merge_segment_id: SegmentId, + ) { let mut registers_lock = self.write(); @@ -150,13 +156,15 @@ impl SegmentManager { { let target_segment_register: &mut SegmentRegister; target_segment_register = { - if registers_lock - .uncommitted - .contains_all(before_merge_segment_ids) { + if registers_lock.uncommitted.contains_all( + before_merge_segment_ids, + ) + { &mut registers_lock.uncommitted - } else if registers_lock - .committed - .contains_all(before_merge_segment_ids) { + } else if registers_lock.committed.contains_all( + before_merge_segment_ids, + ) + { &mut registers_lock.committed } else { warn!("couldn't find segment in SegmentManager"); @@ -185,23 +193,26 @@ impl SegmentManager { registers_lock.uncommitted.add_segment_entry(segment_entry); } - pub fn end_merge(&self, - before_merge_segment_ids: &[SegmentId], - after_merge_segment_entry: SegmentEntry) { + pub fn end_merge( + &self, + before_merge_segment_ids: &[SegmentId], + after_merge_segment_entry: SegmentEntry, + ) { let mut registers_lock = self.write(); - registers_lock - .writing - .remove(&after_merge_segment_entry.segment_id()); + 
registers_lock.writing.remove(&after_merge_segment_entry + .segment_id()); - let mut target_register: &mut SegmentRegister = { - if registers_lock - .uncommitted - .contains_all(before_merge_segment_ids) { + let target_register: &mut SegmentRegister = { + if registers_lock.uncommitted.contains_all( + before_merge_segment_ids, + ) + { &mut registers_lock.uncommitted - } else if registers_lock - .committed - .contains_all(before_merge_segment_ids) { + } else if registers_lock.committed.contains_all( + before_merge_segment_ids, + ) + { &mut registers_lock.committed } else { warn!("couldn't find segment in SegmentManager"); diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index af7e778d1..97be73c85 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -24,7 +24,12 @@ impl Debug for SegmentRegister { fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { try!(write!(f, "SegmentRegister(")); for (k, v) in &self.segment_states { - try!(write!(f, "{}:{}, ", k.short_uuid_string(), v.state().letter_code())); + try!(write!( + f, + "{}:{}, ", + k.short_uuid_string(), + v.state().letter_code() + )); } try!(write!(f, ")")); Ok(()) @@ -74,9 +79,9 @@ impl SegmentRegister { } pub fn contains_all(&mut self, segment_ids: &[SegmentId]) -> bool { - segment_ids - .iter() - .all(|segment_id| self.segment_states.contains_key(segment_id)) + segment_ids.iter().all(|segment_id| { + self.segment_states.contains_key(segment_id) + }) } pub fn add_segment_entry(&mut self, segment_entry: SegmentEntry) { @@ -91,14 +96,18 @@ impl SegmentRegister { pub fn cancel_merge(&mut self, segment_id: &SegmentId) { self.segment_states .get_mut(segment_id) - .expect("Received a merge notification for a segment that is not registered") + .expect( + "Received a merge notification for a segment that is not registered", + ) .cancel_merge(); } pub fn start_merge(&mut self, segment_id: &SegmentId) { self.segment_states .get_mut(segment_id) - .expect("Received a merge notification for a segment that is not registered") + .expect( + "Received a merge notification for a segment that is not registered", + ) .start_merge(); } @@ -144,34 +153,42 @@ mod tests { let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None); segment_register.add_segment_entry(segment_entry); } - assert_eq!(segment_register - .segment_entry(&segment_id_a) - .unwrap() - .state(), - SegmentState::Ready); + assert_eq!( + segment_register + .segment_entry(&segment_id_a) + .unwrap() + .state(), + SegmentState::Ready + ); assert_eq!(segment_ids(&segment_register), vec![segment_id_a]); { let segment_meta = SegmentMeta::new(segment_id_b); let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None); segment_register.add_segment_entry(segment_entry); } - assert_eq!(segment_register - .segment_entry(&segment_id_b) - .unwrap() - .state(), - SegmentState::Ready); + assert_eq!( + segment_register + .segment_entry(&segment_id_b) + .unwrap() + .state(), + SegmentState::Ready + ); segment_register.start_merge(&segment_id_a); segment_register.start_merge(&segment_id_b); - assert_eq!(segment_register - .segment_entry(&segment_id_a) - .unwrap() - .state(), - SegmentState::InMerge); - assert_eq!(segment_register - .segment_entry(&segment_id_b) - .unwrap() - .state(), - SegmentState::InMerge); + assert_eq!( + segment_register + .segment_entry(&segment_id_a) + .unwrap() + .state(), + SegmentState::InMerge + ); + assert_eq!( + segment_register + .segment_entry(&segment_id_b) + 
.unwrap() + .state(), + SegmentState::InMerge + ); segment_register.remove_segment(&segment_id_a); segment_register.remove_segment(&segment_id_b); { diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs index b75663927..c2aa4bcae 100644 --- a/src/indexer/segment_serializer.rs +++ b/src/indexer/segment_serializer.rs @@ -4,8 +4,7 @@ use core::Segment; use core::SegmentComponent; use fastfield::FastFieldSerializer; use store::StoreWriter; -use postings::PostingsSerializer; - +use postings::InvertedIndexSerializer; /// Segment serializer is in charge of laying out on disk /// the data accumulated and sorted by the `SegmentWriter`. @@ -13,7 +12,7 @@ pub struct SegmentSerializer { store_writer: StoreWriter, fast_field_serializer: FastFieldSerializer, fieldnorms_serializer: FastFieldSerializer, - postings_serializer: PostingsSerializer, + postings_serializer: InvertedIndexSerializer, } impl SegmentSerializer { @@ -22,22 +21,22 @@ impl SegmentSerializer { let store_write = try!(segment.open_write(SegmentComponent::STORE)); let fast_field_write = try!(segment.open_write(SegmentComponent::FASTFIELDS)); - let fast_field_serializer = try!(FastFieldSerializer::new(fast_field_write)); + let fast_field_serializer = try!(FastFieldSerializer::from_write(fast_field_write)); let fieldnorms_write = try!(segment.open_write(SegmentComponent::FIELDNORMS)); - let fieldnorms_serializer = try!(FastFieldSerializer::new(fieldnorms_write)); + let fieldnorms_serializer = try!(FastFieldSerializer::from_write(fieldnorms_write)); - let postings_serializer = try!(PostingsSerializer::open(segment)); + let postings_serializer = try!(InvertedIndexSerializer::open(segment)); Ok(SegmentSerializer { - postings_serializer: postings_serializer, - store_writer: StoreWriter::new(store_write), - fast_field_serializer: fast_field_serializer, - fieldnorms_serializer: fieldnorms_serializer, - }) + postings_serializer: postings_serializer, + store_writer: StoreWriter::new(store_write), + fast_field_serializer: fast_field_serializer, + fieldnorms_serializer: fieldnorms_serializer, + }) } /// Accessor to the `PostingsSerializer`. - pub fn get_postings_serializer(&mut self) -> &mut PostingsSerializer { + pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer { &mut self.postings_serializer } diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index feeb33d03..db7add226 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -61,11 +61,12 @@ pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) - /// and flushed. /// /// This method is not part of tantivy's public API -pub fn save_metas(segment_metas: Vec, - schema: Schema, - opstamp: u64, - directory: &mut Directory) - -> Result<()> { +pub fn save_metas( + segment_metas: Vec, + schema: Schema, + opstamp: u64, + directory: &mut Directory, +) -> Result<()> { let metas = IndexMeta { segments: segment_metas, schema: schema, @@ -89,11 +90,12 @@ pub struct SegmentUpdater(Arc); -fn perform_merge(segment_ids: &[SegmentId], - segment_updater: &SegmentUpdater, - mut merged_segment: Segment, - target_opstamp: u64) - -> Result { +fn perform_merge( + segment_ids: &[SegmentId], + segment_updater: &SegmentUpdater, + mut merged_segment: Segment, + target_opstamp: u64, +) -> Result { // first we need to apply deletes to our segment. 
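    // Every segment taking part in the merge is first advanced to the same
    // `target_opstamp` (see the `advance_deletes` call below), so that the
    // merge operates on a consistent view of the deletes across segments.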
info!("Start merge: {:?}", segment_ids); @@ -105,17 +107,21 @@ fn perform_merge(segment_ids: &[SegmentId], for segment_id in segment_ids { if let Some(mut segment_entry) = - segment_updater.0.segment_manager.segment_entry(segment_id) { + segment_updater.0.segment_manager.segment_entry(segment_id) + { let segment = index.segment(segment_entry.meta().clone()); if let Some(file_protection) = - advance_deletes(segment, &mut segment_entry, target_opstamp)? { + advance_deletes(segment, &mut segment_entry, target_opstamp)? + { file_protections.push(file_protection); } segment_entries.push(segment_entry); } else { error!("Error, had to abort merge as some of the segment is not managed anymore."); - let msg = format!("Segment {:?} requested for merge is not managed.", - segment_id); + let msg = format!( + "Segment {:?} requested for merge is not managed.", + segment_id + ); bail!(ErrorKind::InvalidArgument(msg)); } } @@ -134,12 +140,13 @@ fn perform_merge(segment_ids: &[SegmentId], // ... we just serialize this index merger in our new segment // to merge the two segments. - let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment) - .expect("Creating index serializer failed"); + let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment).expect( + "Creating index serializer failed", + ); - let num_docs = merger - .write(segment_serializer) - .expect("Serializing merged index failed"); + let num_docs = merger.write(segment_serializer).expect( + "Serializing merged index failed", + ); let mut segment_meta = SegmentMeta::new(merged_segment.id()); segment_meta.set_max_doc(num_docs); @@ -161,23 +168,24 @@ struct InnerSegmentUpdater { } impl SegmentUpdater { - pub fn new(index: Index, - stamper: Stamper, - delete_cursor: DeleteCursor) - -> Result { + pub fn new( + index: Index, + stamper: Stamper, + delete_cursor: DeleteCursor, + ) -> Result { let segments = index.searchable_segment_metas()?; let segment_manager = SegmentManager::from_segments(segments, delete_cursor); Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater { - pool: CpuPool::new(1), - index: index, - segment_manager: segment_manager, - merge_policy: RwLock::new(box DefaultMergePolicy::default()), - merging_thread_id: AtomicUsize::default(), - merging_threads: RwLock::new(HashMap::new()), - generation: AtomicUsize::default(), - killed: AtomicBool::new(false), - stamper: stamper, - }))) + pool: CpuPool::new(1), + index: index, + segment_manager: segment_manager, + merge_policy: RwLock::new(box DefaultMergePolicy::default()), + merging_thread_id: AtomicUsize::default(), + merging_threads: RwLock::new(HashMap::new()), + generation: AtomicUsize::default(), + killed: AtomicBool::new(false), + stamper: stamper, + }))) } pub fn new_segment(&self) -> Segment { @@ -199,10 +207,10 @@ impl SegmentUpdater { self.0.merging_thread_id.fetch_add(1, Ordering::SeqCst) } - fn run_async T> - (&self, - f: F) - -> CpuFuture { + fn run_async T>( + &self, + f: F, + ) -> CpuFuture { let me_clone = self.clone(); self.0.pool.spawn_fn(move || Ok(f(me_clone))) } @@ -211,11 +219,10 @@ impl SegmentUpdater { pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) -> bool { if generation >= self.0.generation.load(Ordering::Acquire) { self.run_async(|segment_updater| { - segment_updater.0.segment_manager.add_segment(segment_entry); - segment_updater.consider_merge_options(); - true - }) - .forget(); + segment_updater.0.segment_manager.add_segment(segment_entry); + segment_updater.consider_merge_options(); + true + }).forget(); 
true } else { false @@ -249,46 +256,46 @@ impl SegmentUpdater { if self.is_alive() { let index = &self.0.index; let directory = index.directory(); - save_metas(self.0.segment_manager.committed_segment_metas(), - index.schema(), - opstamp, - directory.box_clone().borrow_mut()) - .expect("Could not save metas."); + save_metas( + self.0.segment_manager.committed_segment_metas(), + index.schema(), + opstamp, + directory.box_clone().borrow_mut(), + ).expect("Could not save metas."); } } pub fn garbage_collect_files(&self) -> Result<()> { self.run_async(move |segment_updater| { segment_updater.garbage_collect_files_exec(); - }) - .wait() + }).wait() } fn garbage_collect_files_exec(&self) { info!("Running garbage collection"); let mut index = self.0.index.clone(); - index.directory_mut().garbage_collect(|| { - self.0.segment_manager.list_files() - }); + index.directory_mut().garbage_collect( + || self.0.segment_manager.list_files(), + ); } pub fn commit(&self, opstamp: u64) -> Result<()> { self.run_async(move |segment_updater| if segment_updater.is_alive() { - let segment_entries = segment_updater - .purge_deletes(opstamp) - .expect("Failed purge deletes"); - segment_updater.0.segment_manager.commit(segment_entries); - segment_updater.save_metas(opstamp); - segment_updater.garbage_collect_files_exec(); - segment_updater.consider_merge_options(); - }) - .wait() + let segment_entries = segment_updater.purge_deletes(opstamp).expect( + "Failed purge deletes", + ); + segment_updater.0.segment_manager.commit(segment_entries); + segment_updater.save_metas(opstamp); + segment_updater.garbage_collect_files_exec(); + segment_updater.consider_merge_options(); + }).wait() } - pub fn start_merge(&self, - segment_ids: &[SegmentId]) - -> impl Future { + pub fn start_merge( + &self, + segment_ids: &[SegmentId], + ) -> impl Future { self.0.segment_manager.start_merge(segment_ids); let segment_updater_clone = self.clone(); @@ -308,10 +315,12 @@ impl SegmentUpdater { // first we need to apply deletes to our segment. 
let merged_segment = segment_updater_clone.new_segment(); let merged_segment_id = merged_segment.id(); - let merge_result = perform_merge(&segment_ids_vec, - &segment_updater_clone, - merged_segment, - target_opstamp); + let merge_result = perform_merge( + &segment_ids_vec, + &segment_updater_clone, + merged_segment, + target_opstamp, + ); match merge_result { Ok(after_merge_segment_entry) => { @@ -345,11 +354,10 @@ impl SegmentUpdater { .remove(&merging_thread_id); Ok(()) }); - self.0 - .merging_threads - .write() - .unwrap() - .insert(merging_thread_id, merging_join_handle); + self.0.merging_threads.write().unwrap().insert( + merging_thread_id, + merging_join_handle, + ); merging_future_recv } @@ -368,19 +376,23 @@ impl SegmentUpdater { } } - fn cancel_merge(&self, - before_merge_segment_ids: &[SegmentId], - after_merge_segment_entry: SegmentId) { - self.0 - .segment_manager - .cancel_merge(before_merge_segment_ids, after_merge_segment_entry); + fn cancel_merge( + &self, + before_merge_segment_ids: &[SegmentId], + after_merge_segment_entry: SegmentId, + ) { + self.0.segment_manager.cancel_merge( + before_merge_segment_ids, + after_merge_segment_entry, + ); } - fn end_merge(&self, - before_merge_segment_ids: Vec, - mut after_merge_segment_entry: SegmentEntry) - -> Result<()> { + fn end_merge( + &self, + before_merge_segment_ids: Vec, + mut after_merge_segment_entry: SegmentEntry, + ) -> Result<()> { self.run_async(move |segment_updater| { info!("End merge {:?}", after_merge_segment_entry.meta()); @@ -391,28 +403,37 @@ impl SegmentUpdater { if delete_operation.opstamp < committed_opstamp { let index = &segment_updater.0.index; let segment = index.segment(after_merge_segment_entry.meta().clone()); - match advance_deletes(segment, - &mut after_merge_segment_entry, - committed_opstamp) { + match advance_deletes( + segment, + &mut after_merge_segment_entry, + committed_opstamp, + ) { Ok(file_protection_opt_res) => { _file_protection_opt = file_protection_opt_res; } Err(e) => { - error!("Merge of {:?} was cancelled (advancing deletes failed): {:?}", - before_merge_segment_ids, e); + error!( + "Merge of {:?} was cancelled (advancing deletes failed): {:?}", + before_merge_segment_ids, + e + ); // ... 
cancel merge if cfg!(test) { panic!("Merge failed."); } - segment_updater.cancel_merge(&before_merge_segment_ids, - after_merge_segment_entry.segment_id()); + segment_updater.cancel_merge( + &before_merge_segment_ids, + after_merge_segment_entry.segment_id(), + ); return; } } } } - segment_updater.0.segment_manager.end_merge(&before_merge_segment_ids, - after_merge_segment_entry); + segment_updater.0.segment_manager.end_merge( + &before_merge_segment_ids, + after_merge_segment_entry, + ); segment_updater.consider_merge_options(); info!("save metas"); segment_updater.save_metas(segment_updater.0.index.opstamp()); @@ -450,10 +471,9 @@ impl SegmentUpdater { } debug!("wait merging thread {}", new_merging_threads.len()); for (_, merging_thread_handle) in new_merging_threads { - merging_thread_handle - .join() - .map(|_| ()) - .map_err(|_| ErrorKind::ErrorInThread("Merging thread failed.".into()))?; + merging_thread_handle.join().map(|_| ()).map_err(|_| { + ErrorKind::ErrorInThread("Merging thread failed.".into()) + })?; } // Our merging thread may have queued their completed self.run_async(move |_| {}).wait()?; @@ -522,9 +542,9 @@ mod tests { assert_eq!(index.searcher().num_docs(), 302); { - index_writer - .wait_merging_threads() - .expect("waiting for merging threads"); + index_writer.wait_merging_threads().expect( + "waiting for merging threads", + ); } index.load_searchers().unwrap(); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index bbafb37ef..93c5ee5ee 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -54,22 +54,23 @@ impl<'a> SegmentWriter<'a> { /// the flushing behavior as a buffer limit /// - segment: The segment being written /// - schema - pub fn for_segment(heap: &'a Heap, - table_bits: usize, - mut segment: Segment, - schema: &Schema) - -> Result> { + pub fn for_segment( + heap: &'a Heap, + table_bits: usize, + mut segment: Segment, + schema: &Schema, + ) -> Result> { let segment_serializer = SegmentSerializer::for_segment(&mut segment)?; let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits, heap); Ok(SegmentWriter { - heap: heap, - max_doc: 0, - multifield_postings: multifield_postings, - fieldnorms_writer: create_fieldnorms_writer(schema), - segment_serializer: segment_serializer, - fast_field_writers: FastFieldsWriter::from_schema(schema), - doc_opstamps: Vec::with_capacity(1_000), - }) + heap: heap, + max_doc: 0, + multifield_postings: multifield_postings, + fieldnorms_writer: create_fieldnorms_writer(schema), + segment_serializer: segment_serializer, + fast_field_writers: FastFieldsWriter::from_schema(schema), + doc_opstamps: Vec::with_capacity(1_000), + }) } /// Lay on disk the current content of the `SegmentWriter` @@ -77,10 +78,12 @@ impl<'a> SegmentWriter<'a> { /// Finalize consumes the `SegmentWriter`, so that it cannot /// be used afterwards. pub fn finalize(self) -> Result> { - write(&self.multifield_postings, - &self.fast_field_writers, - &self.fieldnorms_writer, - self.segment_serializer)?; + write( + &self.multifield_postings, + &self.fast_field_writers, + &self.fieldnorms_writer, + self.segment_serializer, + )?; Ok(self.doc_opstamps) } @@ -107,10 +110,11 @@ impl<'a> SegmentWriter<'a> { /// Indexes a new document /// /// As a user, you should rather use `IndexWriter`'s add_document. 
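// A hedged sketch (not part of this patch) of the user-facing path referred to in the
// doc comment above: documents are added through `IndexWriter::add_document`, which
// feeds a `SegmentWriter` internally. `title` (a text field) and `index_writer` are
// assumed to already exist in scope.
let mut doc = Document::default();
doc.add_text(title, "an example value");
index_writer.add_document(doc);
// Commit so the added documents become part of the persisted index.
index_writer.commit()?;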
- pub fn add_document(&mut self, - add_operation: &AddOperation, - schema: &Schema) - -> io::Result<()> { + pub fn add_document( + &mut self, + add_operation: &AddOperation, + schema: &Schema, + ) -> io::Result<()> { let doc_id = self.max_doc; let doc = &add_operation.document; self.doc_opstamps.push(add_operation.opstamp); @@ -122,8 +126,11 @@ impl<'a> SegmentWriter<'a> { match *field_options.field_type() { FieldType::Str(ref text_options) => { let num_tokens: u32 = if text_options.get_indexing_options().is_tokenized() { - self.multifield_postings - .index_text(doc_id, field, &field_values) + self.multifield_postings.index_text( + doc_id, + field, + &field_values, + ) } else { let num_field_values = field_values.len() as u32; for field_value in field_values { @@ -132,15 +139,17 @@ impl<'a> SegmentWriter<'a> { } num_field_values }; - self.fieldnorms_writer - .get_field_writer(field) - .map(|field_norms_writer| field_norms_writer.add_val(num_tokens as u64)); + self.fieldnorms_writer.get_field_writer(field).map( + |field_norms_writer| field_norms_writer.add_val(num_tokens as u64), + ); } FieldType::U64(ref int_option) => { if int_option.is_indexed() { for field_value in field_values { - let term = Term::from_field_u64(field_value.field(), - field_value.value().u64_value()); + let term = Term::from_field_u64( + field_value.field(), + field_value.value().u64_value(), + ); self.multifield_postings.suscribe(doc_id, &term); } } @@ -148,8 +157,10 @@ impl<'a> SegmentWriter<'a> { FieldType::I64(ref int_option) => { if int_option.is_indexed() { for field_value in field_values { - let term = Term::from_field_i64(field_value.field(), - field_value.value().i64_value()); + let term = Term::from_field_i64( + field_value.field(), + field_value.value().i64_value(), + ); self.multifield_postings.suscribe(doc_id, &term); } } @@ -160,7 +171,9 @@ impl<'a> SegmentWriter<'a> { self.fast_field_writers.add_document(doc); let stored_fieldvalues: Vec<&FieldValue> = doc.field_values() .iter() - .filter(|field_value| schema.get_field_entry(field_value.field()).is_stored()) + .filter(|field_value| { + schema.get_field_entry(field_value.field()).is_stored() + }) .collect(); let doc_writer = self.segment_serializer.get_store_writer(); try!(doc_writer.store(&stored_fieldvalues)); @@ -191,15 +204,22 @@ impl<'a> SegmentWriter<'a> { } // This method is used as a trick to workaround the borrow checker -fn write(multifield_postings: &MultiFieldPostingsWriter, - fast_field_writers: &FastFieldsWriter, - fieldnorms_writer: &FastFieldsWriter, - mut serializer: SegmentSerializer) - -> Result<()> { +fn write( + multifield_postings: &MultiFieldPostingsWriter, + fast_field_writers: &FastFieldsWriter, + fieldnorms_writer: &FastFieldsWriter, + mut serializer: SegmentSerializer, +) -> Result<()> { - try!(multifield_postings.serialize(serializer.get_postings_serializer())); - try!(fast_field_writers.serialize(serializer.get_fast_field_serializer())); - try!(fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer())); + try!(multifield_postings.serialize( + serializer.get_postings_serializer(), + )); + try!(fast_field_writers.serialize( + serializer.get_fast_field_serializer(), + )); + try!(fieldnorms_writer.serialize( + serializer.get_fieldnorms_serializer(), + )); try!(serializer.close()); Ok(()) @@ -208,10 +228,12 @@ fn write(multifield_postings: &MultiFieldPostingsWriter, impl<'a> SerializableSegment for SegmentWriter<'a> { fn write(&self, serializer: SegmentSerializer) -> Result { let max_doc = self.max_doc; - 
write(&self.multifield_postings, - &self.fast_field_writers, - &self.fieldnorms_writer, - serializer)?; + write( + &self.multifield_postings, + &self.fast_field_writers, + &self.fieldnorms_writer, + serializer, + )?; Ok(max_doc) } } diff --git a/src/lib.rs b/src/lib.rs index 775267892..5ff378dd0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -68,7 +68,7 @@ extern crate stable_deref_trait; #[cfg(test)] extern crate env_logger; -#[cfg(feature="simdcompression")] +#[cfg(feature = "simdcompression")] extern crate libc; #[cfg(windows)] @@ -98,6 +98,8 @@ mod core; mod compression; mod indexer; mod common; + +#[allow(unused_doc_comment)] mod error; mod analyzer; mod datastruct; @@ -116,7 +118,7 @@ pub use directory::Directory; pub use core::{Index, Segment, SegmentId, SegmentMeta, Searcher}; pub use indexer::IndexWriter; pub use schema::{Term, Document}; -pub use core::SegmentReader; +pub use core::{SegmentReader, InvertedIndexReader}; pub use self::common::TimerTree; pub use postings::DocSet; @@ -254,7 +256,7 @@ mod tests { } #[test] - fn test_docfreq() { + fn test_docfreq1() { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); let index = Index::create_in_ram(schema_builder.build()); @@ -293,7 +295,6 @@ mod tests { } } - #[test] fn test_fieldnorm() { let mut schema_builder = SchemaBuilder::default(); @@ -382,15 +383,24 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); - assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none()); + let inverted_index = reader.inverted_index(text_field); + assert!( + inverted_index + .read_postings(&term_abcd, FreqAndPositions) + .is_none() + ); { - let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap(); + let mut postings = inverted_index + .read_postings(&term_a, FreqAndPositions) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 5); assert!(!postings.advance()); } { - let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap(); + let mut postings = inverted_index + .read_postings(&term_b, FreqAndPositions) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); @@ -416,16 +426,25 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); + let inverted_index = reader.inverted_index(term_abcd.field()); - assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none()); + assert!( + inverted_index + .read_postings(&term_abcd, FreqAndPositions) + .is_none() + ); { - let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap(); + let mut postings = inverted_index + .read_postings(&term_a, FreqAndPositions) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 5); assert!(!postings.advance()); } { - let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap(); + let mut postings = inverted_index + .read_postings(&term_b, FreqAndPositions) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); @@ -451,13 +470,22 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); - assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none()); + let inverted_index = reader.inverted_index(term_abcd.field()); + assert!( + inverted_index + .read_postings(&term_abcd, FreqAndPositions) + .is_none() + ); { - let mut 
postings = reader.read_postings(&term_a, FreqAndPositions).unwrap(); + let mut postings = inverted_index + .read_postings(&term_a, FreqAndPositions) + .unwrap(); assert!(!postings.advance()); } { - let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap(); + let mut postings = inverted_index + .read_postings(&term_b, FreqAndPositions) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); @@ -465,7 +493,9 @@ mod tests { assert!(!postings.advance()); } { - let mut postings = reader.read_postings(&term_c, FreqAndPositions).unwrap(); + let mut postings = inverted_index + .read_postings(&term_c, FreqAndPositions) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 4); assert!(!postings.advance()); @@ -489,6 +519,7 @@ mod tests { let term = Term::from_field_u64(field, 1u64); let mut postings = searcher .segment_reader(0) + .inverted_index(term.field()) .read_postings(&term, SegmentPostingsOption::NoFreq) .unwrap(); assert!(postings.advance()); @@ -512,6 +543,7 @@ mod tests { let term = Term::from_field_i64(value_field, negative_val); let mut postings = searcher .segment_reader(0) + .inverted_index(term.field()) .read_postings(&term, SegmentPostingsOption::NoFreq) .unwrap(); assert!(postings.advance()); @@ -574,10 +606,17 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); + let inverted_index = reader.inverted_index(text_field); let term_abcd = Term::from_field_text(text_field, "abcd"); - assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none()); + assert!( + inverted_index + .read_postings(&term_abcd, FreqAndPositions) + .is_none() + ); let term_af = Term::from_field_text(text_field, "af"); - let mut postings = reader.read_postings(&term_af, FreqAndPositions).unwrap(); + let mut postings = inverted_index + .read_postings(&term_af, FreqAndPositions) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 0); assert_eq!(postings.term_freq(), 3); @@ -619,29 +658,43 @@ mod tests { collector.docs() }; { - assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]), - vec![1, 2]); + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "a")]), + vec![1, 2] + ); } { - assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]), - vec![0]); + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "af")]), + vec![0] + ); } { - assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]), - vec![0, 1, 2]); + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "b")]), + vec![0, 1, 2] + ); } { - assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "c")]), - vec![1, 2]); + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "c")]), + vec![1, 2] + ); } { - assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "d")]), - vec![2]); + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "d")]), + vec![2] + ); } { - assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b"), - Term::from_field_text(text_field, "a")]), - vec![0, 1, 2]); + assert_eq!( + get_doc_ids(vec![ + Term::from_field_text(text_field, "b"), + Term::from_field_text(text_field, "a"), + ]), + vec![0, 1, 2] + ); } } } @@ -678,7 +731,8 @@ mod tests { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); let other_text_field = schema_builder.add_text_field("text2", TEXT); - let document = doc!(text_field => 
"tantivy", + let document = + doc!(text_field => "tantivy", text_field => "some other value", other_text_field => "short"); assert_eq!(document.len(), 3); diff --git a/src/postings/docset.rs b/src/postings/docset.rs index ea4211a5f..8aa665f53 100644 --- a/src/postings/docset.rs +++ b/src/postings/docset.rs @@ -52,6 +52,33 @@ pub trait DocSet { } } + + /// Fills a given mutable buffer with the next doc ids from the + /// `DocSet` + /// + /// If that many `DocId`s are available, the method should + /// fill the entire buffer and return the length of the buffer. + /// + /// If we reach the end of the `DocSet` before filling + /// it entirely, then the buffer is filled up to this point, and + /// return value is the number of elements that were filled. + /// + /// # Warning + /// + /// This method is only here for specific high-performance + /// use case where batching. The normal way to + /// go through the `DocId`'s is to call `.advance()`. + fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize { + for (i, buffer_val) in buffer.iter_mut().enumerate() { + if self.advance() { + *buffer_val = self.doc(); + } else { + return i; + } + } + return buffer.len(); + } + /// Returns the current document fn doc(&self) -> DocId; diff --git a/src/postings/freq_handler.rs b/src/postings/freq_handler.rs deleted file mode 100644 index f1e3f256c..000000000 --- a/src/postings/freq_handler.rs +++ /dev/null @@ -1,125 +0,0 @@ -use compression::BlockDecoder; -use common::VInt; -use common::BinarySerializable; -use compression::{CompositeDecoder, VIntDecoder}; -use postings::SegmentPostingsOption; -use compression::NUM_DOCS_PER_BLOCK; - - -/// `FreqHandler` is in charge of decompressing -/// frequencies and/or positions. -pub struct FreqHandler { - freq_decoder: BlockDecoder, - positions: Vec, - option: SegmentPostingsOption, - positions_offsets: [usize; NUM_DOCS_PER_BLOCK + 1], -} - - -fn read_positions(data: &[u8]) -> Vec { - let mut composite_reader = CompositeDecoder::new(); - let mut readable: &[u8] = data; - let uncompressed_len = VInt::deserialize(&mut readable).unwrap().0 as usize; - composite_reader.uncompress_unsorted(readable, uncompressed_len); - composite_reader.into() -} - - - -impl FreqHandler { - /// Returns a `FreqHandler` that just decodes `DocId`s. - pub fn new_without_freq() -> FreqHandler { - FreqHandler { - freq_decoder: BlockDecoder::with_val(1u32), - positions: Vec::new(), - option: SegmentPostingsOption::NoFreq, - positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1], - } - } - - /// Returns a `FreqHandler` that decodes `DocId`s and term frequencies. - pub fn new_with_freq() -> FreqHandler { - FreqHandler { - freq_decoder: BlockDecoder::new(), - positions: Vec::new(), - option: SegmentPostingsOption::Freq, - positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1], - } - } - - /// Returns a `FreqHandler` that decodes `DocId`s, term frequencies, and term positions. 
- pub fn new_with_freq_and_position(position_data: &[u8]) -> FreqHandler { - let positions = read_positions(position_data); - FreqHandler { - freq_decoder: BlockDecoder::new(), - positions: positions, - option: SegmentPostingsOption::FreqAndPositions, - positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1], - } - } - - fn fill_positions_offset(&mut self) { - let mut cur_position: usize = self.positions_offsets[NUM_DOCS_PER_BLOCK]; - let mut i: usize = 0; - self.positions_offsets[i] = cur_position; - let mut last_cur_position = cur_position; - for &doc_freq in self.freq_decoder.output_array() { - i += 1; - let mut cumulated_pos = 0u32; - // this next loop decodes delta positions into normal positions. - for j in last_cur_position..(last_cur_position + (doc_freq as usize)) { - cumulated_pos += self.positions[j]; - self.positions[j] = cumulated_pos; - } - cur_position += doc_freq as usize; - self.positions_offsets[i] = cur_position; - last_cur_position = cur_position; - } - } - - - /// Accessor to term frequency - /// - /// idx is the offset of the current doc in the block. - /// It takes value between 0 and 128. - pub fn freq(&self, idx: usize) -> u32 { - self.freq_decoder.output(idx) - } - - /// Accessor to the positions - /// - /// idx is the offset of the current doc in the block. - /// It takes value between 0 and 128. - pub fn positions(&self, idx: usize) -> &[u32] { - let start = self.positions_offsets[idx]; - let stop = self.positions_offsets[idx + 1]; - &self.positions[start..stop] - } - - /// Decompresses a complete frequency block - pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] { - match self.option { - SegmentPostingsOption::NoFreq => data, - SegmentPostingsOption::Freq => self.freq_decoder.uncompress_block_unsorted(data), - SegmentPostingsOption::FreqAndPositions => { - let remaining: &'a [u8] = self.freq_decoder.uncompress_block_unsorted(data); - self.fill_positions_offset(); - remaining - } - } - } - - /// Decompresses an incomplete frequency block - pub fn read_freq_vint(&mut self, data: &[u8], num_els: usize) { - match self.option { - SegmentPostingsOption::NoFreq => {} - SegmentPostingsOption::Freq => { - self.freq_decoder.uncompress_vint_unsorted(data, num_els); - } - SegmentPostingsOption::FreqAndPositions => { - self.freq_decoder.uncompress_vint_unsorted(data, num_els); - self.fill_positions_offset(); - } - } - } -} diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 3cb20eb82..cbe059e31 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -16,14 +16,14 @@ mod term_info; mod vec_postings; mod segment_postings; mod intersection; -mod freq_handler; mod docset; mod segment_postings_option; pub use self::docset::{SkipResult, DocSet}; use self::recorder::{Recorder, NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder}; -pub use self::serializer::PostingsSerializer; +pub use self::serializer::{InvertedIndexSerializer, FieldSerializer}; pub(crate) use self::postings_writer::MultiFieldPostingsWriter; + pub use self::term_info::TermInfo; pub use self::postings::Postings; @@ -32,7 +32,6 @@ pub use self::vec_postings::VecPostings; pub use self::segment_postings::{SegmentPostings, BlockSegmentPostings}; pub use self::intersection::IntersectionDocSet; -pub use self::freq_handler::FreqHandler; pub use self::segment_postings_option::SegmentPostingsOption; pub use common::HasLen; @@ -64,21 +63,25 @@ mod tests { let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut segment = index.new_segment(); - let mut 
posting_serializer = PostingsSerializer::open(&mut segment).unwrap(); - posting_serializer.new_field(text_field); - posting_serializer.new_term("abc".as_bytes()).unwrap(); - for doc_id in 0u32..3u32 { - let positions = vec![1, 2, 3, 2]; - posting_serializer.write_doc(doc_id, 2, &positions).unwrap(); + let mut posting_serializer = InvertedIndexSerializer::open(&mut segment).unwrap(); + { + let mut field_serializer = posting_serializer.new_field(text_field).unwrap(); + field_serializer.new_term("abc".as_bytes()).unwrap(); + for doc_id in 0u32..120u32 { + let delta_positions = vec![1, 2, 3, 2]; + field_serializer + .write_doc(doc_id, 2, &delta_positions) + .unwrap(); + } + field_serializer.close_term().unwrap(); } - posting_serializer.close_term().unwrap(); posting_serializer.close().unwrap(); let read = segment.open_read(SegmentComponent::POSITIONS).unwrap(); - assert!(read.len() <= 16); + assert!(read.len() <= 140); } #[test] - pub fn test_position_and_fieldnorm() { + pub fn test_position_and_fieldnorm1() { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); @@ -87,8 +90,8 @@ mod tests { let heap = Heap::with_capacity(10_000_000); { - let mut segment_writer = SegmentWriter::for_segment(&heap, 18, segment.clone(), &schema) - .unwrap(); + let mut segment_writer = + SegmentWriter::for_segment(&heap, 18, segment.clone(), &schema).unwrap(); { let mut doc = Document::default(); // checking that position works if the field has two values @@ -134,13 +137,17 @@ mod tests { } { let term_a = Term::from_field_text(text_field, "abcdef"); - assert!(segment_reader - .read_postings(&term_a, FreqAndPositions) - .is_none()); + assert!( + segment_reader + .inverted_index(term_a.field()) + .read_postings(&term_a, FreqAndPositions) + .is_none() + ); } { let term_a = Term::from_field_text(text_field, "a"); let mut postings_a = segment_reader + .inverted_index(term_a.field()) .read_postings(&term_a, FreqAndPositions) .unwrap(); assert_eq!(postings_a.len(), 1000); @@ -148,6 +155,7 @@ mod tests { assert_eq!(postings_a.doc(), 0); assert_eq!(postings_a.term_freq(), 6); assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]); + assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]); assert!(postings_a.advance()); assert_eq!(postings_a.doc(), 1u32); assert_eq!(postings_a.term_freq(), 1); @@ -162,6 +170,7 @@ mod tests { { let term_e = Term::from_field_text(text_field, "e"); let mut postings_e = segment_reader + .inverted_index(term_e.field()) .read_postings(&term_e, FreqAndPositions) .unwrap(); assert_eq!(postings_e.len(), 1000 - 2); @@ -201,8 +210,10 @@ mod tests { assert!(index_writer.commit().is_ok()); } index.load_searchers().unwrap(); - let term_query = TermQuery::new(Term::from_field_text(text_field, "a"), - SegmentPostingsOption::NoFreq); + let term_query = TermQuery::new( + Term::from_field_text(text_field, "a"), + SegmentPostingsOption::NoFreq, + ); let searcher = index.searcher(); let mut term_weight = term_query.specialized_weight(&*searcher); term_weight.segment_postings_options = SegmentPostingsOption::FreqAndPositions; @@ -249,6 +260,7 @@ mod tests { for i in 0..num_docs - 1 { for j in i + 1..num_docs { let mut segment_postings = segment_reader + .inverted_index(term_2.field()) .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); @@ -262,6 +274,7 @@ mod tests { { let mut segment_postings = segment_reader + .inverted_index(term_2.field()) .read_postings(&term_2, SegmentPostingsOption::NoFreq) 
.unwrap(); @@ -282,6 +295,7 @@ mod tests { // check that filtering works { let mut segment_postings = segment_reader + .inverted_index(term_0.field()) .read_postings(&term_0, SegmentPostingsOption::NoFreq) .unwrap(); @@ -291,6 +305,7 @@ mod tests { } let mut segment_postings = segment_reader + .inverted_index(term_0.field()) .read_postings(&term_0, SegmentPostingsOption::NoFreq) .unwrap(); @@ -315,6 +330,7 @@ mod tests { // make sure seeking still works for i in 0..num_docs { let mut segment_postings = segment_reader + .inverted_index(term_2.field()) .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); @@ -330,6 +346,7 @@ mod tests { // now try with a longer sequence { let mut segment_postings = segment_reader + .inverted_index(term_2.field()) .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); @@ -365,12 +382,14 @@ mod tests { // finally, check that it's empty { let mut segment_postings = segment_reader + .inverted_index(term_2.field()) .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); assert_eq!(segment_postings.skip_next(0), SkipResult::End); let mut segment_postings = segment_reader + .inverted_index(term_2.field()) .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); @@ -437,11 +456,12 @@ mod tests { let segment_reader = searcher.segment_reader(0); b.iter(|| { - let mut segment_postings = segment_reader - .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) - .unwrap(); - while segment_postings.advance() {} - }); + let mut segment_postings = segment_reader + .inverted_index(TERM_A.field()) + .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) + .unwrap(); + while segment_postings.advance() {} + }); } #[bench] @@ -450,21 +470,27 @@ mod tests { let segment_reader = searcher.segment_reader(0); b.iter(|| { let segment_postings_a = segment_reader + .inverted_index(TERM_A.field()) .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); let segment_postings_b = segment_reader + .inverted_index(TERM_B.field()) .read_postings(&*TERM_B, SegmentPostingsOption::NoFreq) .unwrap(); let segment_postings_c = segment_reader + .inverted_index(TERM_C.field()) .read_postings(&*TERM_C, SegmentPostingsOption::NoFreq) .unwrap(); let segment_postings_d = segment_reader + .inverted_index(TERM_D.field()) .read_postings(&*TERM_D, SegmentPostingsOption::NoFreq) .unwrap(); - let mut intersection = IntersectionDocSet::from(vec![segment_postings_a, - segment_postings_b, - segment_postings_c, - segment_postings_d]); + let mut intersection = IntersectionDocSet::from(vec![ + segment_postings_a, + segment_postings_b, + segment_postings_c, + segment_postings_d, + ]); while intersection.advance() {} }); } @@ -475,6 +501,7 @@ mod tests { let docs = tests::sample(segment_reader.num_docs(), p); let mut segment_postings = segment_reader + .inverted_index(TERM_A.field()) .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); @@ -491,6 +518,7 @@ mod tests { b.iter(|| { let mut segment_postings = segment_reader + .inverted_index(TERM_A.field()) .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); for doc in &existing_docs { @@ -528,6 +556,7 @@ mod tests { b.iter(|| { let n: u32 = test::black_box(17); let mut segment_postings = segment_reader + .inverted_index(TERM_A.field()) .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); let mut s = 0u32; diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 67a8f9c5e..1b62942c5 100644 --- a/src/postings/postings_writer.rs +++ 
b/src/postings/postings_writer.rs @@ -1,7 +1,7 @@ use DocId; use schema::Term; use schema::FieldValue; -use postings::PostingsSerializer; +use postings::{InvertedIndexSerializer, FieldSerializer}; use std::io; use postings::Recorder; use analyzer::SimpleTokenizer; @@ -16,9 +16,10 @@ use schema::FieldEntry; use schema::FieldType; use schema::TextIndexingOptions; -fn posting_from_field_entry<'a>(field_entry: &FieldEntry, - heap: &'a Heap) - -> Box { +fn posting_from_field_entry<'a>( + field_entry: &FieldEntry, + heap: &'a Heap, +) -> Box { match *field_entry.field_type() { FieldType::Str(ref text_options) => { match text_options.get_indexing_options() { @@ -51,9 +52,7 @@ impl<'a> MultiFieldPostingsWriter<'a> { let per_field_postings_writers: Vec<_> = schema .fields() .iter() - .map(|field_entry| { - posting_from_field_entry(field_entry, heap) - }) + .map(|field_entry| posting_from_field_entry(field_entry, heap)) .collect(); MultiFieldPostingsWriter { @@ -78,7 +77,7 @@ impl<'a> MultiFieldPostingsWriter<'a> { /// It pushes all term, one field at a time, towards the /// postings serializer. #[allow(needless_range_loop)] - pub fn serialize(&self, serializer: &mut PostingsSerializer) -> Result<()> { + pub fn serialize(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> { let mut term_offsets: Vec<(&[u8], u32)> = self.term_index.iter().collect(); term_offsets.sort_by_key(|&(k, _v)| k); @@ -101,8 +100,13 @@ impl<'a> MultiFieldPostingsWriter<'a> { let (field, start) = offsets[i]; let (_, stop) = offsets[i + 1]; let postings_writer = &self.per_field_postings_writers[field.0 as usize]; - postings_writer - .serialize(field, &term_offsets[start..stop], serializer, self.heap)?; + let mut field_serializer = serializer.new_field(field)?; + postings_writer.serialize( + &term_offsets[start..stop], + &mut field_serializer, + self.heap, + )?; + field_serializer.close()?; } Ok(()) } @@ -126,30 +130,33 @@ pub trait PostingsWriter { /// * term - the term /// * heap - heap used to store the postings informations as well as the terms /// in the hashmap. - fn suscribe(&mut self, - term_index: &mut HashMap, - doc: DocId, - pos: u32, - term: &Term, - heap: &Heap); + fn suscribe( + &mut self, + term_index: &mut HashMap, + doc: DocId, + pos: u32, + term: &Term, + heap: &Heap, + ); /// Serializes the postings on disk. /// The actual serialization format is handled by the `PostingsSerializer`. - fn serialize(&self, - field: Field, - term_addrs: &[(&[u8], u32)], - serializer: &mut PostingsSerializer, - heap: &Heap) - -> io::Result<()>; + fn serialize( + &self, + term_addrs: &[(&[u8], u32)], + serializer: &mut FieldSerializer, + heap: &Heap, + ) -> io::Result<()>; /// Tokenize a text and suscribe all of its token. 
- fn index_text<'a>(&mut self, - term_index: &mut HashMap, - doc_id: DocId, - field: Field, - field_values: &[&'a FieldValue], - heap: &Heap) - -> u32 { + fn index_text<'a>( + &mut self, + term_index: &mut HashMap, + doc_id: DocId, + field: Field, + field_values: &[&'a FieldValue], + heap: &Heap, + ) -> u32 { let mut pos = 0u32; let mut num_tokens: u32 = 0u32; let mut term = unsafe { Term::with_capacity(100) }; @@ -195,12 +202,14 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> { } impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> { - fn suscribe(&mut self, - term_index: &mut HashMap, - doc: DocId, - position: u32, - term: &Term, - heap: &Heap) { + fn suscribe( + &mut self, + term_index: &mut HashMap, + doc: DocId, + position: u32, + term: &Term, + heap: &Heap, + ) { debug_assert!(term.as_slice().len() >= 4); let recorder: &mut Rec = term_index.get_or_create(term); let current_doc = recorder.current_doc(); @@ -213,20 +222,18 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<' recorder.record_position(position, heap); } - fn serialize(&self, - field: Field, - term_addrs: &[(&[u8], u32)], - serializer: &mut PostingsSerializer, - heap: &Heap) - -> io::Result<()> { - serializer.new_field(field); + fn serialize( + &self, + term_addrs: &[(&[u8], u32)], + serializer: &mut FieldSerializer, + heap: &Heap, + ) -> io::Result<()> { for &(term_bytes, addr) in term_addrs { let recorder: &mut Rec = self.heap.get_mut_ref(addr); - try!(serializer.new_term(term_bytes)); - try!(recorder.serialize(addr, serializer, heap)); - try!(serializer.close_term()); + serializer.new_term(term_bytes)?; + recorder.serialize(addr, serializer, heap)?; + serializer.close_term()?; } Ok(()) } } - diff --git a/src/postings/recorder.rs b/src/postings/recorder.rs index c340d13fd..07c0c4e19 100644 --- a/src/postings/recorder.rs +++ b/src/postings/recorder.rs @@ -1,6 +1,6 @@ use DocId; use std::io; -use postings::PostingsSerializer; +use postings::FieldSerializer; use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable}; const EMPTY_ARRAY: [u32; 0] = [0u32; 0]; @@ -27,11 +27,12 @@ pub trait Recorder: HeapAllocable { /// Close the document. It will help record the term frequency. fn close_doc(&mut self, heap: &Heap); /// Pushes the postings information to the serializer. - fn serialize(&self, - self_addr: u32, - serializer: &mut PostingsSerializer, - heap: &Heap) - -> io::Result<()>; + fn serialize( + &self, + self_addr: u32, + serializer: &mut FieldSerializer, + heap: &Heap, + ) -> io::Result<()>; } /// Only records the doc ids @@ -64,13 +65,14 @@ impl Recorder for NothingRecorder { fn close_doc(&mut self, _heap: &Heap) {} - fn serialize(&self, - self_addr: u32, - serializer: &mut PostingsSerializer, - heap: &Heap) - -> io::Result<()> { + fn serialize( + &self, + self_addr: u32, + serializer: &mut FieldSerializer, + heap: &Heap, + ) -> io::Result<()> { for doc in self.stack.iter(self_addr, heap) { - try!(serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)); + serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?; } Ok(()) } @@ -116,21 +118,23 @@ impl Recorder for TermFrequencyRecorder { } - fn serialize(&self, - self_addr: u32, - serializer: &mut PostingsSerializer, - heap: &Heap) - -> io::Result<()> { + fn serialize( + &self, + self_addr: u32, + serializer: &mut FieldSerializer, + heap: &Heap, + ) -> io::Result<()> { // the last document has not been closed... // its term freq is self.current_tf. 
- let mut doc_iter = self.stack - .iter(self_addr, heap) - .chain(Some(self.current_tf).into_iter()); + let mut doc_iter = self.stack.iter(self_addr, heap).chain( + Some(self.current_tf) + .into_iter(), + ); while let Some(doc) = doc_iter.next() { - let term_freq = doc_iter - .next() - .expect("The IndexWriter recorded a doc without a term freq."); + let term_freq = doc_iter.next().expect( + "The IndexWriter recorded a doc without a term freq.", + ); serializer.write_doc(doc, term_freq, &EMPTY_ARRAY)?; } Ok(()) @@ -171,11 +175,12 @@ impl Recorder for TFAndPositionRecorder { self.stack.push(POSITION_END, heap); } - fn serialize(&self, - self_addr: u32, - serializer: &mut PostingsSerializer, - heap: &Heap) - -> io::Result<()> { + fn serialize( + &self, + self_addr: u32, + serializer: &mut FieldSerializer, + heap: &Heap, + ) -> io::Result<()> { let mut doc_positions = Vec::with_capacity(100); let mut positions_iter = self.stack.iter(self_addr, heap); while let Some(doc) = positions_iter.next() { @@ -189,7 +194,11 @@ impl Recorder for TFAndPositionRecorder { prev_position = position; } } - try!(serializer.write_doc(doc, doc_positions.len() as u32, &doc_positions)); + serializer.write_doc( + doc, + doc_positions.len() as u32, + &doc_positions, + )?; } Ok(()) } diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index ff283f24f..cadc85401 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -1,12 +1,65 @@ -use compression::{NUM_DOCS_PER_BLOCK, BlockDecoder, VIntDecoder}; +use compression::{COMPRESSION_BLOCK_SIZE, BlockDecoder, VIntDecoder, CompressedIntStream}; use DocId; -use postings::{Postings, FreqHandler, DocSet, HasLen, SkipResult}; +use postings::{Postings, DocSet, HasLen, SkipResult}; use std::cmp; -use fastfield::DeleteBitSet; use fst::Streamer; +use fastfield::DeleteBitSet; +use std::cell::UnsafeCell; +use directory::{SourceRead, ReadOnlySource}; -const EMPTY_DATA: [u8; 0] = [0u8; 0]; +const EMPTY_POSITIONS: [u32; 0] = [0u32; 0]; + + + + +struct PositionComputer { + // store the amount of position int + // before reading positions. + // + // if none, position are already loaded in + // the positions vec. + position_to_skip: Option, + positions: Vec, + positions_stream: CompressedIntStream, +} + +impl PositionComputer { + pub fn new(positions_stream: CompressedIntStream) -> PositionComputer { + PositionComputer { + position_to_skip: None, + positions: vec![], + positions_stream: positions_stream, + } + } + + pub fn add_skip(&mut self, num_skip: usize) { + self.position_to_skip = Some( + self.position_to_skip + .map(|prev_skip| prev_skip + num_skip) + .unwrap_or(0), + ); + } + + pub fn positions(&mut self, term_freq: usize) -> &[u32] { + if let Some(num_skip) = self.position_to_skip { + + self.positions.resize(term_freq, 0u32); + + self.positions_stream.skip(num_skip); + self.positions_stream.read(&mut self.positions[..term_freq]); + + let mut cum = 0u32; + for i in 0..term_freq as usize { + cum += self.positions[i]; + self.positions[i] = cum; + } + self.position_to_skip = None; + } + &self.positions[..term_freq] + } +} + /// `SegmentPostings` represents the inverted list or postings associated to @@ -14,42 +67,60 @@ const EMPTY_DATA: [u8; 0] = [0u8; 0]; /// /// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded. /// Positions on the other hand, are optionally entirely decoded upfront. 
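// Illustrative sketch of how a `SegmentPostings` is obtained and stepped through under
// this patch, mirroring the updated tests: postings are now reached via the per-field
// `InvertedIndexReader` (`segment_reader.inverted_index(field)`) rather than from the
// `SegmentReader` directly. `segment_reader` and `term` are assumed to be in scope.
if let Some(mut postings) = segment_reader
    .inverted_index(term.field())
    .read_postings(&term, SegmentPostingsOption::FreqAndPositions)
{
    while postings.advance() {
        let doc: DocId = postings.doc();
        let term_freq: u32 = postings.term_freq();
        // With `FreqAndPositions`, the positions of the current document are available.
        let _positions: &[u32] = postings.positions();
        let _ = (doc, term_freq);
    }
}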
-pub struct SegmentPostings<'a> { - block_cursor: BlockSegmentPostings<'a>, +pub struct SegmentPostings { + block_cursor: BlockSegmentPostings, cur: usize, delete_bitset: DeleteBitSet, + position_computer: Option>, } -impl<'a> SegmentPostings<'a> { + +impl SegmentPostings { /// Reads a Segment postings from an &[u8] /// /// * `len` - number of document in the posting lists. /// * `data` - data array. The complete data is not necessarily used. /// * `freq_handler` - the freq handler is in charge of decoding /// frequencies and/or positions - pub fn from_block_postings(segment_block_postings: BlockSegmentPostings<'a>, - delete_bitset: DeleteBitSet) - -> SegmentPostings<'a> { + pub fn from_block_postings( + segment_block_postings: BlockSegmentPostings, + delete_bitset: DeleteBitSet, + positions_stream_opt: Option, + ) -> SegmentPostings { + let position_computer = + positions_stream_opt.map(|stream| UnsafeCell::new(PositionComputer::new(stream))); SegmentPostings { block_cursor: segment_block_postings, - cur: NUM_DOCS_PER_BLOCK, // cursor within the block + cur: COMPRESSION_BLOCK_SIZE, // cursor within the block delete_bitset: delete_bitset, + position_computer: position_computer, } } /// Returns an empty segment postings object - pub fn empty() -> SegmentPostings<'static> { + pub fn empty() -> SegmentPostings { let empty_block_cursor = BlockSegmentPostings::empty(); SegmentPostings { block_cursor: empty_block_cursor, delete_bitset: DeleteBitSet::empty(), - cur: NUM_DOCS_PER_BLOCK, + cur: COMPRESSION_BLOCK_SIZE, + position_computer: None, + } + } + + + fn position_add_skip usize>(&self, num_skips_fn: F) { + if let Some(ref position_computer) = self.position_computer.as_ref() { + let num_skips = num_skips_fn(); + unsafe { + (*position_computer.get()).add_skip(num_skips); + } } } } -impl<'a> DocSet for SegmentPostings<'a> { +impl DocSet for SegmentPostings { // goes to the next element. // next needs to be called a first time to point to the correct element. #[inline] @@ -59,10 +130,11 @@ impl<'a> DocSet for SegmentPostings<'a> { if self.cur >= self.block_cursor.block_len() { self.cur = 0; if !self.block_cursor.advance() { - self.cur = NUM_DOCS_PER_BLOCK; + self.cur = COMPRESSION_BLOCK_SIZE; return false; } } + self.position_add_skip(|| self.term_freq() as usize); if !self.delete_bitset.is_deleted(self.doc()) { return true; } @@ -75,6 +147,10 @@ impl<'a> DocSet for SegmentPostings<'a> { return SkipResult::End; } + // in the following, thanks to the call to advance above, + // we know that the position is not loaded and we need + // to skip every doc_freq we cross. + // skip blocks until one that might contain the target loop { // check if we need to go to the next block @@ -83,13 +159,26 @@ impl<'a> DocSet for SegmentPostings<'a> { (block_docs[self.cur], block_docs[block_docs.len() - 1]) }; if target > last_doc_in_block { + + // we add skip for the current term independantly, + // so that position_add_skip will decide if it should + // just set itself to Some(0) or effectively + // add the term freq. + //let num_skips: u32 = ; + self.position_add_skip(|| { + let freqs_skipped = &self.block_cursor.freqs()[self.cur..]; + let sum_freq: u32 = freqs_skipped.iter().cloned().sum(); + sum_freq as usize + }); + if !self.block_cursor.advance() { return SkipResult::End; } + self.cur = 0; } else { if target < current_doc { - // We've overpassed the target after the first `advance` call + // We've passed the target after the first `advance` call // or we're at the beginning of a block. 
// Either way, we're on the first `DocId` greater than `target` return SkipResult::OverStep; @@ -135,6 +224,13 @@ impl<'a> DocSet for SegmentPostings<'a> { // `doc` is now >= `target` let doc = block_docs[start]; + + self.position_add_skip(|| { + let freqs_skipped = &self.block_cursor.freqs()[self.cur..start]; + let sum_freqs: u32 = freqs_skipped.iter().sum(); + sum_freqs as usize + }); + self.cur = start; if !self.delete_bitset.is_deleted(doc) { @@ -156,31 +252,41 @@ impl<'a> DocSet for SegmentPostings<'a> { self.len() } + /// Return the current document's `DocId`. #[inline] fn doc(&self) -> DocId { let docs = self.block_cursor.docs(); - assert!(self.cur < docs.len(), - "Have you forgotten to call `.advance()` at least once before calling .doc()."); + debug_assert!( + self.cur < docs.len(), + "Have you forgotten to call `.advance()` at least once before calling .doc()." + ); docs[self.cur] } } -impl<'a> HasLen for SegmentPostings<'a> { +impl HasLen for SegmentPostings { fn len(&self) -> usize { self.block_cursor.doc_freq() } } -impl<'a> Postings for SegmentPostings<'a> { +impl Postings for SegmentPostings { fn term_freq(&self) -> u32 { - self.block_cursor.freq_handler().freq(self.cur) + self.block_cursor.freq(self.cur) } fn positions(&self) -> &[u32] { - self.block_cursor.freq_handler().positions(self.cur) + let term_freq = self.term_freq(); + self.position_computer + .as_ref() + .map(|position_computer| unsafe { + (&mut *position_computer.get()).positions(term_freq as usize) + }) + .unwrap_or(&EMPTY_POSITIONS[..]) } } + /// `BlockSegmentPostings` is a cursor iterating over blocks /// of documents. /// @@ -188,28 +294,35 @@ impl<'a> Postings for SegmentPostings<'a> { /// /// While it is useful for some very specific high-performance /// use cases, you should prefer using `SegmentPostings` for most usage. -pub struct BlockSegmentPostings<'a> { - block_decoder: BlockDecoder, +pub struct BlockSegmentPostings { + doc_decoder: BlockDecoder, + freq_decoder: BlockDecoder, + has_freq: bool, + doc_freq: usize, doc_offset: DocId, num_binpacked_blocks: usize, num_vint_docs: usize, - remaining_data: &'a [u8], - freq_handler: FreqHandler, + remaining_data: SourceRead, } -impl<'a> BlockSegmentPostings<'a> { - pub(crate) fn from_data(doc_freq: usize, - data: &'a [u8], - freq_handler: FreqHandler) - -> BlockSegmentPostings<'a> { - let num_binpacked_blocks: usize = (doc_freq as usize) / NUM_DOCS_PER_BLOCK; - let num_vint_docs = (doc_freq as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks; +impl BlockSegmentPostings { + pub(crate) fn from_data( + doc_freq: usize, + data: SourceRead, + has_freq: bool, + ) -> BlockSegmentPostings { + let num_binpacked_blocks: usize = (doc_freq as usize) / COMPRESSION_BLOCK_SIZE; + let num_vint_docs = (doc_freq as usize) - COMPRESSION_BLOCK_SIZE * num_binpacked_blocks; BlockSegmentPostings { num_binpacked_blocks: num_binpacked_blocks, num_vint_docs: num_vint_docs, - block_decoder: BlockDecoder::new(), - freq_handler: freq_handler, + + doc_decoder: BlockDecoder::new(), + freq_decoder: BlockDecoder::with_val(1), + + has_freq: has_freq, + remaining_data: data, doc_offset: 0, doc_freq: doc_freq, @@ -226,9 +339,9 @@ impl<'a> BlockSegmentPostings<'a> { // # Warning // // This does not reset the positions list. 
- pub(crate) fn reset(&mut self, doc_freq: usize, postings_data: &'a [u8]) { - let num_binpacked_blocks: usize = doc_freq / NUM_DOCS_PER_BLOCK; - let num_vint_docs = doc_freq & (NUM_DOCS_PER_BLOCK - 1); + pub(crate) fn reset(&mut self, doc_freq: usize, postings_data: SourceRead) { + let num_binpacked_blocks: usize = doc_freq / COMPRESSION_BLOCK_SIZE; + let num_vint_docs = doc_freq & (COMPRESSION_BLOCK_SIZE - 1); self.num_binpacked_blocks = num_binpacked_blocks; self.num_vint_docs = num_vint_docs; self.remaining_data = postings_data; @@ -250,7 +363,25 @@ impl<'a> BlockSegmentPostings<'a> { /// returned by `.docs()` is empty. #[inline] pub fn docs(&self) -> &[DocId] { - self.block_decoder.output_array() + self.doc_decoder.output_array() + } + + /// Return the document at index `idx` of the block. + #[inline] + pub fn doc(&self, idx: usize) -> u32 { + self.doc_decoder.output(idx) + } + + /// Return the array of `term freq` in the block. + #[inline] + pub fn freqs(&self) -> &[u32] { + self.freq_decoder.output_array() + } + + /// Return the frequency at index `idx` of the block. + #[inline] + pub fn freq(&self, idx: usize) -> u32 { + self.freq_decoder.output(idx) } /// Returns the length of the current block. @@ -260,13 +391,7 @@ impl<'a> BlockSegmentPostings<'a> { /// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1` #[inline] fn block_len(&self) -> usize { - self.block_decoder.output_len - } - - - /// Returns a reference to the frequency handler. - pub fn freq_handler(&self) -> &FreqHandler { - &self.freq_handler + self.doc_decoder.output_len } /// Advance to the next block. @@ -274,21 +399,35 @@ impl<'a> BlockSegmentPostings<'a> { /// Returns false iff there was no remaining blocks. pub fn advance(&mut self) -> bool { if self.num_binpacked_blocks > 0 { - self.remaining_data = - self.block_decoder - .uncompress_block_sorted(self.remaining_data, self.doc_offset); - self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data); - self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1); + let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted( + self.remaining_data.as_ref(), + self.doc_offset, + ); + self.remaining_data.advance(num_consumed_bytes); + + if self.has_freq { + let num_consumed_bytes = self.freq_decoder.uncompress_block_unsorted( + self.remaining_data.as_ref(), + ); + self.remaining_data.advance(num_consumed_bytes); + } + // it will be used as the next offset. 
+ self.doc_offset = self.doc_decoder.output(COMPRESSION_BLOCK_SIZE - 1); self.num_binpacked_blocks -= 1; true } else if self.num_vint_docs > 0 { - self.remaining_data = - self.block_decoder - .uncompress_vint_sorted(self.remaining_data, - self.doc_offset, - self.num_vint_docs); - self.freq_handler - .read_freq_vint(self.remaining_data, self.num_vint_docs); + let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted( + self.remaining_data.as_ref(), + self.doc_offset, + self.num_vint_docs, + ); + self.remaining_data.advance(num_compressed_bytes); + if self.has_freq { + self.freq_decoder.uncompress_vint_unsorted( + self.remaining_data.as_ref(), + self.num_vint_docs, + ); + } self.num_vint_docs = 0; true } else { @@ -297,20 +436,23 @@ impl<'a> BlockSegmentPostings<'a> { } /// Returns an empty segment postings object - pub fn empty() -> BlockSegmentPostings<'static> { + pub fn empty() -> BlockSegmentPostings { BlockSegmentPostings { num_binpacked_blocks: 0, num_vint_docs: 0, - block_decoder: BlockDecoder::new(), - freq_handler: FreqHandler::new_without_freq(), - remaining_data: &EMPTY_DATA, + + doc_decoder: BlockDecoder::new(), + freq_decoder: BlockDecoder::with_val(1), + has_freq: false, + + remaining_data: From::from(ReadOnlySource::empty()), doc_offset: 0, doc_freq: 0, } } } -impl<'a, 'b> Streamer<'b> for BlockSegmentPostings<'a> { +impl<'b> Streamer<'b> for BlockSegmentPostings { type Item = &'b [DocId]; fn next(&'b mut self) -> Option<&'b [DocId]> { @@ -366,11 +508,13 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let segment_reader = searcher.segment_reader(0); + let inverted_index = segment_reader.inverted_index(int_field); let term = Term::from_field_u64(int_field, 0u64); - let term_info = segment_reader.get_term_info(&term).unwrap(); - let mut block_segments = - segment_reader - .read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq); + let term_info = inverted_index.get_term_info(&term).unwrap(); + let mut block_segments = inverted_index.read_block_postings_from_terminfo( + &term_info, + SegmentPostingsOption::NoFreq, + ); let mut offset: u32 = 0u32; // checking that the block before calling advance is empty assert!(block_segments.docs().is_empty()); @@ -406,17 +550,20 @@ mod tests { let mut block_segments; { let term = Term::from_field_u64(int_field, 0u64); - let term_info = segment_reader.get_term_info(&term).unwrap(); - block_segments = - segment_reader - .read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq); + let inverted_index = segment_reader.inverted_index(int_field); + let term_info = inverted_index.get_term_info(&term).unwrap(); + block_segments = inverted_index.read_block_postings_from_terminfo( + &term_info, + SegmentPostingsOption::NoFreq, + ); } assert!(block_segments.advance()); assert!(block_segments.docs() == &[0, 2, 4]); { let term = Term::from_field_u64(int_field, 1u64); - let term_info = segment_reader.get_term_info(&term).unwrap(); - segment_reader.reset_block_postings_from_terminfo(&term_info, &mut block_segments); + let inverted_index = segment_reader.inverted_index(int_field); + let term_info = inverted_index.get_term_info(&term).unwrap(); + inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments); } assert!(block_segments.advance()); assert!(block_segments.docs() == &[1, 3, 5]); diff --git a/src/postings/segment_postings_option.rs b/src/postings/segment_postings_option.rs index 51a07bb0b..b50e2eee4 100644 --- 
a/src/postings/segment_postings_option.rs +++ b/src/postings/segment_postings_option.rs @@ -16,6 +16,26 @@ pub enum SegmentPostingsOption { FreqAndPositions, } +impl SegmentPostingsOption { + /// Returns true iff this option includes encoding + /// term frequencies. + pub fn has_freq(&self) -> bool { + match *self { + SegmentPostingsOption::NoFreq => false, + _ => true, + } + } + + /// Returns true iff this option include encoding + /// term positions. + pub fn has_positions(&self) -> bool { + match *self { + SegmentPostingsOption::FreqAndPositions => true, + _ => false, + } + } +} + #[cfg(test)] mod tests { diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 8c6c4c1c9..4c37e015d 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -5,16 +5,14 @@ use schema::Field; use schema::FieldEntry; use schema::FieldType; use schema::Schema; -use schema::TextIndexingOptions; use directory::WritePtr; -use compression::{NUM_DOCS_PER_BLOCK, BlockEncoder, CompositeEncoder}; +use compression::{COMPRESSION_BLOCK_SIZE, BlockEncoder}; use DocId; use core::Segment; use std::io::{self, Write}; use compression::VIntEncoder; -use common::VInt; -use common::BinarySerializable; use common::CountingWriter; +use common::CompositeWrite; use termdict::TermDictionaryBuilder; @@ -49,74 +47,127 @@ use termdict::TermDictionaryBuilder; /// /// A description of the serialization format is /// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html). -pub struct PostingsSerializer { - terms_fst_builder: TermDictionaryBuilderImpl, - postings_write: CountingWriter, - positions_write: CountingWriter, - last_doc_id_encoded: u32, - positions_encoder: CompositeEncoder, - block_encoder: BlockEncoder, - doc_ids: Vec, - term_freqs: Vec, - position_deltas: Vec, +pub struct InvertedIndexSerializer { + terms_write: CompositeWrite, + postings_write: CompositeWrite, + positions_write: CompositeWrite, schema: Schema, - text_indexing_options: TextIndexingOptions, - term_open: bool, - current_term_info: TermInfo, } -impl PostingsSerializer { + +impl InvertedIndexSerializer { /// Open a new `PostingsSerializer` for the given segment - pub fn new(terms_write: WritePtr, - postings_write: WritePtr, - positions_write: WritePtr, - schema: Schema) - -> Result { - let terms_fst_builder = try!(TermDictionaryBuilderImpl::new(terms_write)); - Ok(PostingsSerializer { - terms_fst_builder: terms_fst_builder, - postings_write: CountingWriter::wrap(postings_write), - positions_write: CountingWriter::wrap(positions_write), - last_doc_id_encoded: 0u32, - positions_encoder: CompositeEncoder::new(), - block_encoder: BlockEncoder::new(), - doc_ids: Vec::new(), - term_freqs: Vec::new(), - position_deltas: Vec::new(), - schema: schema, - text_indexing_options: TextIndexingOptions::Unindexed, - term_open: false, - current_term_info: TermInfo::default(), - }) + fn new( + terms_write: CompositeWrite, + postings_write: CompositeWrite, + positions_write: CompositeWrite, + schema: Schema, + ) -> Result { + Ok(InvertedIndexSerializer { + terms_write: terms_write, + postings_write: postings_write, + positions_write: positions_write, + schema: schema, + }) } /// Open a new `PostingsSerializer` for the given segment - pub fn open(segment: &mut Segment) -> Result { + pub fn open(segment: &mut Segment) -> Result { use SegmentComponent::{TERMS, POSTINGS, POSITIONS}; - PostingsSerializer::new(segment.open_write(TERMS)?, - segment.open_write(POSTINGS)?, - segment.open_write(POSITIONS)?, - 
segment.schema()) + InvertedIndexSerializer::new( + CompositeWrite::wrap(segment.open_write(TERMS)?), + CompositeWrite::wrap(segment.open_write(POSTINGS)?), + CompositeWrite::wrap(segment.open_write(POSITIONS)?), + segment.schema(), + ) } /// Must be called before starting pushing terms of /// a given field. /// /// Loads the indexing options for the given field. - pub fn new_field(&mut self, field: Field) { + pub fn new_field(&mut self, field: Field) -> io::Result { let field_entry: &FieldEntry = self.schema.get_field_entry(field); - self.text_indexing_options = match *field_entry.field_type() { - FieldType::Str(ref text_options) => text_options.get_indexing_options(), - FieldType::U64(ref int_options) | - FieldType::I64(ref int_options) => { - if int_options.is_indexed() { - TextIndexingOptions::Unindexed - } else { - TextIndexingOptions::Untokenized - } + let term_dictionary_write = self.terms_write.for_field(field); + let postings_write = self.postings_write.for_field(field); + let positions_write = self.positions_write.for_field(field); + FieldSerializer::new( + field_entry.field_type().clone(), + term_dictionary_write, + postings_write, + positions_write, + ) + } + + /// Closes the serializer. + pub fn close(self) -> io::Result<()> { + self.terms_write.close()?; + self.postings_write.close()?; + self.positions_write.close()?; + Ok(()) + } +} + + +/// The field serializer is in charge of +/// the serialization of a specific field. +pub struct FieldSerializer<'a> { + term_dictionary_builder: TermDictionaryBuilderImpl<&'a mut CountingWriter>, + postings_serializer: PostingsSerializer<&'a mut CountingWriter>, + positions_serializer_opt: Option>>, + current_term_info: TermInfo, + term_open: bool, +} + + +impl<'a> FieldSerializer<'a> { + fn new( + field_type: FieldType, + term_dictionary_write: &'a mut CountingWriter, + postings_write: &'a mut CountingWriter, + positions_write: &'a mut CountingWriter, + ) -> io::Result> { + + let (term_freq_enabled, position_enabled): (bool, bool) = match field_type { + FieldType::Str(ref text_options) => { + let text_indexing_options = text_options.get_indexing_options(); + ( + text_indexing_options.is_termfreq_enabled(), + text_indexing_options.is_position_enabled(), + ) } + _ => (false, false), }; + let term_dictionary_builder = + TermDictionaryBuilderImpl::new(term_dictionary_write, field_type)?; + let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled); + let positions_serializer_opt = if position_enabled { + Some(PositionSerializer::new(positions_write)) + } else { + None + }; + + Ok(FieldSerializer { + term_dictionary_builder: term_dictionary_builder, + postings_serializer: postings_serializer, + positions_serializer_opt: positions_serializer_opt, + current_term_info: TermInfo::default(), + term_open: false, + }) + } + + fn current_term_info(&self) -> TermInfo { + let (filepos, offset) = self.positions_serializer_opt + .as_ref() + .map(|positions_serializer| positions_serializer.addr()) + .unwrap_or((0u32, 0u8)); + TermInfo { + doc_freq: 0, + postings_offset: self.postings_serializer.addr(), + positions_offset: filepos, + positions_inner_offset: offset, + } } /// Starts the postings for a new term. @@ -124,70 +175,16 @@ impl PostingsSerializer { /// to the lexicographical order. /// * doc_freq - return the number of document containing the term. 
pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> { - if self.term_open { - panic!("Called new_term, while the previous term was not closed."); - } + assert!( + !self.term_open, + "Called new_term, while the previous term was not closed." + ); self.term_open = true; - self.doc_ids.clear(); - self.last_doc_id_encoded = 0; - self.term_freqs.clear(); - self.position_deltas.clear(); - self.current_term_info = TermInfo { - doc_freq: 0, - postings_offset: self.postings_write.written_bytes() as u32, - positions_offset: self.positions_write.written_bytes() as u32, - }; - self.terms_fst_builder.insert_key(term) + self.postings_serializer.clear(); + self.current_term_info = self.current_term_info(); + self.term_dictionary_builder.insert_key(term) } - /// Finish the serialization for this term postings. - /// - /// If the current block is incomplete, it need to be encoded - /// using `VInt` encoding. - pub fn close_term(&mut self) -> io::Result<()> { - if self.term_open { - - self.terms_fst_builder - .insert_value(&self.current_term_info)?; - - if !self.doc_ids.is_empty() { - // we have doc ids waiting to be written - // this happens when the number of doc ids is - // not a perfect multiple of our block size. - // - // In that case, the remaining part is encoded - // using variable int encoding. - { - let block_encoded = - self.block_encoder - .compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded); - self.postings_write.write_all(block_encoded)?; - self.doc_ids.clear(); - } - // ... Idem for term frequencies - if self.text_indexing_options.is_termfreq_enabled() { - let block_encoded = self.block_encoder - .compress_vint_unsorted(&self.term_freqs[..]); - self.postings_write.write_all(block_encoded)?; - self.term_freqs.clear(); - } - } - // On the other hand, positions are entirely buffered until the - // end of the term, at which point they are compressed and written. - if self.text_indexing_options.is_position_enabled() { - let posdelta_len = VInt(self.position_deltas.len() as u64); - posdelta_len.serialize(&mut self.positions_write)?; - let positions_encoded: &[u8] = self.positions_encoder - .compress_unsorted(&self.position_deltas[..]); - self.positions_write.write_all(positions_encoded)?; - self.position_deltas.clear(); - } - self.term_open = false; - } - Ok(()) - } - - /// Serialize the information that a document contains the current term, /// its term frequency, and the position deltas. /// @@ -197,32 +194,93 @@ impl PostingsSerializer { /// /// Term frequencies and positions may be ignored by the serializer depending /// on the configuration of the field in the `Schema`. - pub fn write_doc(&mut self, - doc_id: DocId, - term_freq: u32, - position_deltas: &[u32]) - -> io::Result<()> { + pub fn write_doc( + &mut self, + doc_id: DocId, + term_freq: u32, + position_deltas: &[u32], + ) -> io::Result<()> { self.current_term_info.doc_freq += 1; + self.postings_serializer.write_doc(doc_id, term_freq)?; + if let Some(ref mut positions_serializer) = self.positions_serializer_opt.as_mut() { + positions_serializer.write(position_deltas)?; + } + Ok(()) + } + + /// Finish the serialization for this term postings. + /// + /// If the current block is incomplete, it need to be encoded + /// using `VInt` encoding. + pub fn close_term(&mut self) -> io::Result<()> { + if self.term_open { + self.term_dictionary_builder.insert_value( + &self.current_term_info, + )?; + self.postings_serializer.close_term()?; + self.term_open = false; + } + Ok(()) + } + + + /// Closes the current current field. 
+ pub fn close(mut self) -> io::Result<()> { + self.close_term()?; + if let Some(positions_serializer) = self.positions_serializer_opt { + positions_serializer.close()?; + } + self.postings_serializer.close()?; + self.term_dictionary_builder.finish()?; + Ok(()) + } +} + + +struct PostingsSerializer { + postings_write: CountingWriter, + last_doc_id_encoded: u32, + + block_encoder: BlockEncoder, + doc_ids: Vec, + term_freqs: Vec, + + termfreq_enabled: bool, +} + +impl PostingsSerializer { + fn new(write: W, termfreq_enabled: bool) -> PostingsSerializer { + PostingsSerializer { + postings_write: CountingWriter::wrap(write), + + block_encoder: BlockEncoder::new(), + doc_ids: vec![], + term_freqs: vec![], + + last_doc_id_encoded: 0u32, + termfreq_enabled: termfreq_enabled, + } + } + + fn write_doc(&mut self, doc_id: DocId, term_freq: u32) -> io::Result<()> { self.doc_ids.push(doc_id); - if self.text_indexing_options.is_termfreq_enabled() { + if self.termfreq_enabled { self.term_freqs.push(term_freq as u32); } - if self.text_indexing_options.is_position_enabled() { - self.position_deltas.extend_from_slice(position_deltas); - } - if self.doc_ids.len() == NUM_DOCS_PER_BLOCK { + if self.doc_ids.len() == COMPRESSION_BLOCK_SIZE { { // encode the doc ids - let block_encoded: &[u8] = - self.block_encoder - .compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded); + let block_encoded: &[u8] = self.block_encoder.compress_block_sorted( + &self.doc_ids, + self.last_doc_id_encoded, + ); self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1]; self.postings_write.write_all(block_encoded)?; } - if self.text_indexing_options.is_termfreq_enabled() { + if self.termfreq_enabled { // encode the term_freqs - let block_encoded: &[u8] = self.block_encoder - .compress_block_unsorted(&self.term_freqs); + let block_encoded: &[u8] = + self.block_encoder.compress_block_unsorted(&self.term_freqs); self.postings_write.write_all(block_encoded)?; self.term_freqs.clear(); } @@ -231,12 +289,93 @@ impl PostingsSerializer { Ok(()) } - /// Closes the serializer. - pub fn close(mut self) -> io::Result<()> { - try!(self.close_term()); - try!(self.terms_fst_builder.finish()); - try!(self.postings_write.flush()); - try!(self.positions_write.flush()); + fn close_term(&mut self) -> io::Result<()> { + if !self.doc_ids.is_empty() { + // we have doc ids waiting to be written + // this happens when the number of doc ids is + // not a perfect multiple of our block size. + // + // In that case, the remaining part is encoded + // using variable int encoding. + { + let block_encoded = self.block_encoder.compress_vint_sorted( + &self.doc_ids, + self.last_doc_id_encoded, + ); + self.postings_write.write_all(block_encoded)?; + self.doc_ids.clear(); + } + // ... Idem for term frequencies + if self.termfreq_enabled { + let block_encoded = self.block_encoder.compress_vint_unsorted( + &self.term_freqs[..], + ); + self.postings_write.write_all(block_encoded)?; + self.term_freqs.clear(); + } + } Ok(()) } + + fn close(mut self) -> io::Result<()> { + self.postings_write.flush() + } + + + fn addr(&self) -> u32 { + self.postings_write.written_bytes() as u32 + } + + fn clear(&mut self) { + self.doc_ids.clear(); + self.term_freqs.clear(); + self.last_doc_id_encoded = 0; + } +} + +struct PositionSerializer { + buffer: Vec, + write: CountingWriter, // See if we can offset the original counting writer. 
+ block_encoder: BlockEncoder, +} + +impl PositionSerializer { + fn new(write: W) -> PositionSerializer { + PositionSerializer { + buffer: Vec::with_capacity(COMPRESSION_BLOCK_SIZE), + write: CountingWriter::wrap(write), + block_encoder: BlockEncoder::new(), + } + } + + fn addr(&self) -> (u32, u8) { + (self.write.written_bytes() as u32, self.buffer.len() as u8) + } + + fn write_block(&mut self) -> io::Result<()> { + assert_eq!(self.buffer.len(), COMPRESSION_BLOCK_SIZE); + let block_compressed: &[u8] = self.block_encoder.compress_block_unsorted(&self.buffer); + self.write.write_all(block_compressed)?; + self.buffer.clear(); + Ok(()) + } + + fn write(&mut self, mut vals: &[u32]) -> io::Result<()> { + let mut buffer_len = self.buffer.len(); + while vals.len() + buffer_len >= COMPRESSION_BLOCK_SIZE { + let len_to_completion = COMPRESSION_BLOCK_SIZE - buffer_len; + self.buffer.extend_from_slice(&vals[..len_to_completion]); + self.write_block()?; + vals = &vals[len_to_completion..]; + buffer_len = self.buffer.len(); + } + self.buffer.extend_from_slice(&vals); + Ok(()) + } + + fn close(mut self) -> io::Result<()> { + self.buffer.resize(COMPRESSION_BLOCK_SIZE, 0u32); + self.write_block()?; + self.write.flush() + } } diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs index d639e9afb..375f73202 100644 --- a/src/postings/term_info.rs +++ b/src/postings/term_info.rs @@ -12,7 +12,7 @@ use std::io; /// * `postings_offset` : an offset in the `.idx` file /// addressing the start of the posting list associated /// to this term. -#[derive(Debug,Default,Ord,PartialOrd,Eq,PartialEq,Clone)] +#[derive(Debug, Default, Ord, PartialOrd, Eq, PartialEq, Clone)] pub struct TermInfo { /// Number of documents in the segment containing the term pub doc_freq: u32, @@ -20,6 +20,8 @@ pub struct TermInfo { pub postings_offset: u32, /// Offset within the position (`.pos`) file. pub positions_offset: u32, + /// Offset within the position block. 
+ pub positions_inner_offset: u8, } @@ -27,17 +29,20 @@ impl BinarySerializable for TermInfo { fn serialize(&self, writer: &mut W) -> io::Result<()> { self.doc_freq.serialize(writer)?; self.postings_offset.serialize(writer)?; - self.positions_offset.serialize(writer) + self.positions_offset.serialize(writer)?; + self.positions_inner_offset.serialize(writer) } fn deserialize(reader: &mut R) -> io::Result { - let doc_freq = try!(u32::deserialize(reader)); - let postings_offset = try!(u32::deserialize(reader)); - let positions_offset = try!(u32::deserialize(reader)); + let doc_freq = u32::deserialize(reader)?; + let postings_offset = u32::deserialize(reader)?; + let positions_offset = u32::deserialize(reader)?; + let positions_inner_offset = u8::deserialize(reader)?; Ok(TermInfo { - doc_freq: doc_freq, - postings_offset: postings_offset, - positions_offset: positions_offset, - }) + doc_freq: doc_freq, + postings_offset: postings_offset, + positions_offset: positions_offset, + positions_inner_offset: positions_inner_offset, + }) } } diff --git a/src/query/boolean_query/boolean_query.rs b/src/query/boolean_query/boolean_query.rs index b471da320..ba9f93b19 100644 --- a/src/query/boolean_query/boolean_query.rs +++ b/src/query/boolean_query/boolean_query.rs @@ -37,10 +37,12 @@ impl Query for BooleanQuery { } fn weight(&self, searcher: &Searcher) -> Result> { - let sub_weights = try!(self.subqueries - .iter() - .map(|&(ref _occur, ref subquery)| subquery.weight(searcher)) - .collect()); + let sub_weights = try!( + self.subqueries + .iter() + .map(|&(ref _occur, ref subquery)| subquery.weight(searcher)) + .collect() + ); let occurs: Vec = self.subqueries .iter() .map(|&(ref occur, ref _subquery)| *occur) @@ -57,10 +59,9 @@ impl BooleanQuery { let occur_term_queries: Vec<(Occur, Box)> = terms .into_iter() .map(|term| { - let term_query: Box = box TermQuery::new(term, - SegmentPostingsOption::Freq); - (Occur::Should, term_query) - }) + let term_query: Box = box TermQuery::new(term, SegmentPostingsOption::Freq); + (Occur::Should, term_query) + }) .collect(); BooleanQuery::from(occur_term_queries) } diff --git a/src/query/boolean_query/boolean_scorer.rs b/src/query/boolean_query/boolean_scorer.rs index 595f54219..723e4a92d 100644 --- a/src/query/boolean_query/boolean_scorer.rs +++ b/src/query/boolean_query/boolean_scorer.rs @@ -55,11 +55,11 @@ impl BooleanScorer { .map(|posting| posting.doc()) .enumerate() .map(|(ord, doc)| { - HeapItem { - doc: doc, - ord: ord as u32, - } - }) + HeapItem { + doc: doc, + ord: ord as u32, + } + }) .collect(); BooleanScorer { scorers: non_empty_scorers, diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index 04f22595c..0ff49cbde 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -22,11 +22,12 @@ impl BooleanWeight { impl Weight for BooleanWeight { fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result> { - let sub_scorers: Vec> = - try!(self.weights - .iter() - .map(|weight| weight.scorer(reader)) - .collect()); + let sub_scorers: Vec> = try!( + self.weights + .iter() + .map(|weight| weight.scorer(reader)) + .collect() + ); let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter); Ok(box boolean_scorer) } diff --git a/src/query/boolean_query/mod.rs b/src/query/boolean_query/mod.rs index 01ef9e824..73f659a03 100644 --- a/src/query/boolean_query/mod.rs +++ b/src/query/boolean_query/mod.rs @@ -64,8 +64,10 @@ mod tests { } let make_term_query = |text: 
&str| { - let term_query = TermQuery::new(Term::from_field_text(text_field, text), - SegmentPostingsOption::NoFreq); + let term_query = TermQuery::new( + Term::from_field_text(text_field, text), + SegmentPostingsOption::NoFreq, + ); let query: Box = box term_query; query }; @@ -87,19 +89,25 @@ mod tests { assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]); } { - let boolean_query = BooleanQuery::from(vec![(Occur::Should, make_term_query("a")), - (Occur::Should, make_term_query("b"))]); + let boolean_query = BooleanQuery::from(vec![ + (Occur::Should, make_term_query("a")), + (Occur::Should, make_term_query("b")), + ]); assert_eq!(matching_docs(&boolean_query), vec![0, 1, 2, 3]); } { - let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")), - (Occur::Should, make_term_query("b"))]); + let boolean_query = BooleanQuery::from(vec![ + (Occur::Must, make_term_query("a")), + (Occur::Should, make_term_query("b")), + ]); assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]); } { - let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")), - (Occur::Should, make_term_query("b")), - (Occur::MustNot, make_term_query("d"))]); + let boolean_query = BooleanQuery::from(vec![ + (Occur::Must, make_term_query("a")), + (Occur::Should, make_term_query("b")), + (Occur::MustNot, make_term_query("d")), + ]); assert_eq!(matching_docs(&boolean_query), vec![0, 1]); } { diff --git a/src/query/phrase_query/mod.rs b/src/query/phrase_query/mod.rs index 4ad89a3b2..8adc4728b 100644 --- a/src/query/phrase_query/mod.rs +++ b/src/query/phrase_query/mod.rs @@ -61,9 +61,9 @@ mod tests { .map(|text| Term::from_field_text(text_field, text)) .collect(); let phrase_query = PhraseQuery::from(terms); - searcher - .search(&phrase_query, &mut test_collector) - .expect("search should succeed"); + searcher.search(&phrase_query, &mut test_collector).expect( + "search should succeed", + ); test_collector.docs() }; diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index 1726340d1..d9c887afb 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -5,12 +5,12 @@ use postings::Postings; use postings::IntersectionDocSet; use DocId; -pub struct PhraseScorer<'a> { - pub intersection_docset: IntersectionDocSet>, +pub struct PhraseScorer { + pub intersection_docset: IntersectionDocSet, } -impl<'a> PhraseScorer<'a> { +impl PhraseScorer { fn phrase_match(&self) -> bool { let mut positions_arr: Vec<&[u32]> = self.intersection_docset .docsets() @@ -54,7 +54,7 @@ impl<'a> PhraseScorer<'a> { } } -impl<'a> DocSet for PhraseScorer<'a> { +impl DocSet for PhraseScorer { fn advance(&mut self) -> bool { while self.intersection_docset.advance() { if self.phrase_match() { @@ -74,7 +74,7 @@ impl<'a> DocSet for PhraseScorer<'a> { } -impl<'a> Scorer for PhraseScorer<'a> { +impl Scorer for PhraseScorer { fn score(&self) -> f32 { 1f32 } diff --git a/src/query/phrase_query/phrase_weight.rs b/src/query/phrase_query/phrase_weight.rs index a171b4160..1a85342b9 100644 --- a/src/query/phrase_query/phrase_weight.rs +++ b/src/query/phrase_query/phrase_weight.rs @@ -22,14 +22,17 @@ impl Weight for PhraseWeight { fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result> { let mut term_postings_list = Vec::new(); for term in &self.phrase_terms { + let inverted_index = reader.inverted_index(term.field()); let term_postings_option = - reader.read_postings(term, SegmentPostingsOption::FreqAndPositions); + 
inverted_index.read_postings(term, SegmentPostingsOption::FreqAndPositions); if let Some(term_postings) = term_postings_option { term_postings_list.push(term_postings); } else { return Ok(box EmptyScorer); } } - Ok(box PhraseScorer { intersection_docset: IntersectionDocSet::from(term_postings_list) }) + Ok(box PhraseScorer { + intersection_docset: IntersectionDocSet::from(term_postings_list), + }) } } diff --git a/src/query/query.rs b/src/query/query.rs index a0c1f409d..c531cc266 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -61,10 +61,8 @@ pub trait Query: fmt::Debug { /// - iterate throw the matched documents and push them to the collector. /// fn search(&self, searcher: &Searcher, collector: &mut Collector) -> Result { - let mut timer_tree = TimerTree::default(); let weight = try!(self.weight(searcher)); - { let mut search_timer = timer_tree.open("search"); for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() { diff --git a/src/query/query_parser/query_grammar.rs b/src/query/query_parser/query_grammar.rs index 08f167b25..8fa2a3c11 100644 --- a/src/query/query_parser/query_grammar.rs +++ b/src/query/query_parser/query_grammar.rs @@ -3,7 +3,8 @@ use combine::char::*; use super::user_input_ast::*; fn literal(input: I) -> ParseResult - where I: Stream +where + I: Stream, { let term_val = || { let word = many1(satisfy(|c: char| c.is_alphanumeric())); @@ -11,27 +12,29 @@ fn literal(input: I) -> ParseResult phrase.or(word) }; - let negative_numbers = (char('-'), many1(satisfy(|c: char| c.is_numeric()))) - .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2)); + let negative_numbers = (char('-'), many1(satisfy(|c: char| c.is_numeric()))).map( + |(s1, s2): (char, String)| format!("{}{}", s1, s2), + ); - let field = (letter(), many(satisfy(|c: char| c.is_alphanumeric() || c == '_'))) - .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2)); + let field = ( + letter(), + many(satisfy(|c: char| c.is_alphanumeric() || c == '_')), + ).map(|(s1, s2): (char, String)| format!("{}{}", s1, s2)); let term_val_with_field = negative_numbers.or(term_val()); let term_query = (field, char(':'), term_val_with_field).map(|(field_name, _, phrase)| { - UserInputLiteral { - field_name: - Some(field_name), - phrase: phrase, - } - }); + UserInputLiteral { + field_name: Some(field_name), + phrase: phrase, + } + }); let term_default_field = term_val().map(|phrase| { - UserInputLiteral { - field_name: None, - phrase: phrase, - } - }); + UserInputLiteral { + field_name: None, + phrase: phrase, + } + }); try(term_query) .or(term_default_field) .map(UserInputAST::from) @@ -40,25 +43,29 @@ fn literal(input: I) -> ParseResult fn leaf(input: I) -> ParseResult - where I: Stream +where + I: Stream, { (char('-'), parser(literal)) .map(|(_, expr)| UserInputAST::Not(box expr)) - .or((char('+'), parser(literal)).map(|(_, expr)| UserInputAST::Must(box expr))) + .or((char('+'), parser(literal)).map(|(_, expr)| { + UserInputAST::Must(box expr) + })) .or(parser(literal)) .parse_stream(input) } pub fn parse_to_ast(input: I) -> ParseResult - where I: Stream +where + I: Stream, { sep_by(parser(leaf), spaces()) .map(|subqueries: Vec| if subqueries.len() == 1 { - subqueries.into_iter().next().unwrap() - } else { - UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect()) - }) + subqueries.into_iter().next().unwrap() + } else { + UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect()) + }) .parse_stream(input) } diff --git a/src/query/query_parser/query_parser.rs 
b/src/query/query_parser/query_parser.rs index 0b6b43efe..5beb42745 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -117,20 +117,22 @@ impl QueryParser { /// Parse the user query into an AST. fn parse_query_to_logical_ast(&self, query: &str) -> Result { - let (user_input_ast, _remaining) = parse_to_ast(query) - .map_err(|_| QueryParserError::SyntaxError)?; + let (user_input_ast, _remaining) = parse_to_ast(query).map_err( + |_| QueryParserError::SyntaxError, + )?; self.compute_logical_ast(user_input_ast) } fn resolve_field_name(&self, field_name: &str) -> Result { - self.schema - .get_field(field_name) - .ok_or_else(|| QueryParserError::FieldDoesNotExist(String::from(field_name))) + self.schema.get_field(field_name).ok_or_else(|| { + QueryParserError::FieldDoesNotExist(String::from(field_name)) + }) } - fn compute_logical_ast(&self, - user_input_ast: UserInputAST) - -> Result { + fn compute_logical_ast( + &self, + user_input_ast: UserInputAST, + ) -> Result { let (occur, ast) = self.compute_logical_ast_with_occur(user_input_ast)?; if occur == Occur::MustNot { return Err(QueryParserError::AllButQueryForbidden); @@ -138,10 +140,11 @@ impl QueryParser { Ok(ast) } - fn compute_logical_ast_for_leaf(&self, - field: Field, - phrase: &str) - -> Result, QueryParserError> { + fn compute_logical_ast_for_leaf( + &self, + field: Field, + phrase: &str, + ) -> Result, QueryParserError> { let field_entry = self.schema.get_field_entry(field); let field_type = field_entry.field_type(); @@ -174,7 +177,9 @@ impl QueryParser { if terms.is_empty() { Ok(None) } else if terms.len() == 1 { - Ok(Some(LogicalLiteral::Term(terms.into_iter().next().unwrap()))) + Ok(Some( + LogicalLiteral::Term(terms.into_iter().next().unwrap()), + )) } else { Ok(Some(LogicalLiteral::Phrase(terms))) } @@ -191,18 +196,24 @@ impl QueryParser { } } - fn compute_logical_ast_with_occur(&self, - user_input_ast: UserInputAST) - -> Result<(Occur, LogicalAST), QueryParserError> { + fn compute_logical_ast_with_occur( + &self, + user_input_ast: UserInputAST, + ) -> Result<(Occur, LogicalAST), QueryParserError> { match user_input_ast { UserInputAST::Clause(sub_queries) => { let default_occur = self.default_occur(); - let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!(sub_queries.into_iter() - .map(|sub_query| self.compute_logical_ast_with_occur(*sub_query)) - .map(|res| { - res.map(|(occur, sub_ast)| (compose_occur(default_occur, occur), sub_ast)) - }) - .collect()); + let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!( + sub_queries + .into_iter() + .map(|sub_query| self.compute_logical_ast_with_occur(*sub_query)) + .map(|res| { + res.map(|(occur, sub_ast)| { + (compose_occur(default_occur, occur), sub_ast) + }) + }) + .collect() + ); Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries))) } UserInputAST::Not(subquery) => { @@ -320,9 +331,10 @@ mod test { } - fn parse_query_to_logical_ast(query: &str, - default_conjunction: bool) - -> Result { + fn parse_query_to_logical_ast( + query: &str, + default_conjunction: bool, + ) -> Result { let mut query_parser = make_query_parser(); if default_conjunction { query_parser.set_conjunction_by_default(); @@ -330,9 +342,11 @@ mod test { query_parser.parse_query_to_logical_ast(query) } - fn test_parse_query_to_logical_ast_helper(query: &str, - expected: &str, - default_conjunction: bool) { + fn test_parse_query_to_logical_ast_helper( + query: &str, + expected: &str, + default_conjunction: bool, + ) { let query = 
parse_query_to_logical_ast(query, default_conjunction).unwrap(); let query_str = format!("{:?}", query); assert_eq!(query_str, expected); @@ -358,21 +372,29 @@ mod test { } }; - assert_eq!(is_not_indexed_err("notindexed_text:titi"), - Some(String::from("notindexed_text"))); - assert_eq!(is_not_indexed_err("notindexed_u64:23424"), - Some(String::from("notindexed_u64"))); - assert_eq!(is_not_indexed_err("notindexed_i64:-234324"), - Some(String::from("notindexed_i64"))); + assert_eq!( + is_not_indexed_err("notindexed_text:titi"), + Some(String::from("notindexed_text")) + ); + assert_eq!( + is_not_indexed_err("notindexed_u64:23424"), + Some(String::from("notindexed_u64")) + ); + assert_eq!( + is_not_indexed_err("notindexed_i64:-234324"), + Some(String::from("notindexed_i64")) + ); } #[test] pub fn test_parse_query_untokenized() { - test_parse_query_to_logical_ast_helper("nottokenized:\"wordone wordtwo\"", - "Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, \ + test_parse_query_to_logical_ast_helper( + "nottokenized:\"wordone wordtwo\"", + "Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, \ 101, 32, 119, 111, 114, 100, 116, 119, 111])", - false); + false, + ); } #[test] @@ -381,82 +403,115 @@ mod test { assert!(query_parser.parse_query("signed:2324").is_ok()); assert!(query_parser.parse_query("signed:\"22\"").is_ok()); assert!(query_parser.parse_query("signed:\"-2234\"").is_ok()); - assert!(query_parser - .parse_query("signed:\"-9999999999999\"") - .is_ok()); + assert!( + query_parser + .parse_query("signed:\"-9999999999999\"") + .is_ok() + ); assert!(query_parser.parse_query("signed:\"a\"").is_err()); assert!(query_parser.parse_query("signed:\"2a\"").is_err()); - assert!(query_parser - .parse_query("signed:\"18446744073709551615\"") - .is_err()); + assert!( + query_parser + .parse_query("signed:\"18446744073709551615\"") + .is_err() + ); assert!(query_parser.parse_query("unsigned:\"2\"").is_ok()); assert!(query_parser.parse_query("unsigned:\"-2\"").is_err()); - assert!(query_parser - .parse_query("unsigned:\"18446744073709551615\"") - .is_ok()); - test_parse_query_to_logical_ast_helper("unsigned:2324", - "Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])", - false); + assert!( + query_parser + .parse_query("unsigned:\"18446744073709551615\"") + .is_ok() + ); + test_parse_query_to_logical_ast_helper( + "unsigned:2324", + "Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])", + false, + ); - test_parse_query_to_logical_ast_helper("signed:-2324", - &format!("{:?}", - Term::from_field_i64(Field(2u32), -2324)), - false); + test_parse_query_to_logical_ast_helper( + "signed:-2324", + &format!("{:?}", Term::from_field_i64(Field(2u32), -2324)), + false, + ); } #[test] pub fn test_parse_query_to_ast_disjunction() { - test_parse_query_to_logical_ast_helper("title:toto", - "Term([0, 0, 0, 0, 116, 111, 116, 111])", - false); - test_parse_query_to_logical_ast_helper("+title:toto", - "Term([0, 0, 0, 0, 116, 111, 116, 111])", - false); - test_parse_query_to_logical_ast_helper("+title:toto -titi", - "(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \ + test_parse_query_to_logical_ast_helper( + "title:toto", + "Term([0, 0, 0, 0, 116, 111, 116, 111])", + false, + ); + test_parse_query_to_logical_ast_helper( + "+title:toto", + "Term([0, 0, 0, 0, 116, 111, 116, 111])", + false, + ); + test_parse_query_to_logical_ast_helper( + "+title:toto -titi", + "(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \ -(Term([0, 0, 0, 0, 116, 105, 116, 105]) \ Term([0, 0, 0, 1, 116, 105, 116, 105])))", - false); - 
assert_eq!(parse_query_to_logical_ast("-title:toto", false) - .err() - .unwrap(), - QueryParserError::AllButQueryForbidden); - test_parse_query_to_logical_ast_helper("title:a b", - "(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) \ + false, + ); + assert_eq!( + parse_query_to_logical_ast("-title:toto", false) + .err() + .unwrap(), + QueryParserError::AllButQueryForbidden + ); + test_parse_query_to_logical_ast_helper( + "title:a b", + "(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) \ Term([0, 0, 0, 1, 98])))", - false); - test_parse_query_to_logical_ast_helper("title:\"a b\"", - "\"[Term([0, 0, 0, 0, 97]), \ + false, + ); + test_parse_query_to_logical_ast_helper( + "title:\"a b\"", + "\"[Term([0, 0, 0, 0, 97]), \ Term([0, 0, 0, 0, 98])]\"", - false); + false, + ); } #[test] pub fn test_parse_query_to_ast_conjunction() { - test_parse_query_to_logical_ast_helper("title:toto", - "Term([0, 0, 0, 0, 116, 111, 116, 111])", - true); - test_parse_query_to_logical_ast_helper("+title:toto", - "Term([0, 0, 0, 0, 116, 111, 116, 111])", - true); - test_parse_query_to_logical_ast_helper("+title:toto -titi", - "(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \ + test_parse_query_to_logical_ast_helper( + "title:toto", + "Term([0, 0, 0, 0, 116, 111, 116, 111])", + true, + ); + test_parse_query_to_logical_ast_helper( + "+title:toto", + "Term([0, 0, 0, 0, 116, 111, 116, 111])", + true, + ); + test_parse_query_to_logical_ast_helper( + "+title:toto -titi", + "(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \ -(Term([0, 0, 0, 0, 116, 105, 116, 105]) \ Term([0, 0, 0, 1, 116, 105, 116, 105])))", - true); - assert_eq!(parse_query_to_logical_ast("-title:toto", true) - .err() - .unwrap(), - QueryParserError::AllButQueryForbidden); - test_parse_query_to_logical_ast_helper("title:a b", - "(+Term([0, 0, 0, 0, 97]) \ + true, + ); + assert_eq!( + parse_query_to_logical_ast("-title:toto", true) + .err() + .unwrap(), + QueryParserError::AllButQueryForbidden + ); + test_parse_query_to_logical_ast_helper( + "title:a b", + "(+Term([0, 0, 0, 0, 97]) \ +(Term([0, 0, 0, 0, 98]) \ Term([0, 0, 0, 1, 98])))", - true); - test_parse_query_to_logical_ast_helper("title:\"a b\"", - "\"[Term([0, 0, 0, 0, 97]), \ + true, + ); + test_parse_query_to_logical_ast_helper( + "title:\"a b\"", + "\"[Term([0, 0, 0, 0, 97]), \ Term([0, 0, 0, 0, 98])]\"", - true); + true, + ); } } diff --git a/src/query/term_query/mod.rs b/src/query/term_query/mod.rs index 9670e73e2..bbc751c5e 100644 --- a/src/query/term_query/mod.rs +++ b/src/query/term_query/mod.rs @@ -44,8 +44,10 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); - let term_query = TermQuery::new(Term::from_field_text(text_field, "a"), - SegmentPostingsOption::NoFreq); + let term_query = TermQuery::new( + Term::from_field_text(text_field, "a"), + SegmentPostingsOption::NoFreq, + ); let term_weight = term_query.weight(&searcher).unwrap(); let segment_reader = searcher.segment_reader(0); let mut term_scorer = term_weight.scorer(segment_reader).unwrap(); diff --git a/src/query/term_query/term_scorer.rs b/src/query/term_query/term_scorer.rs index 73ea46b4b..95787a030 100644 --- a/src/query/term_query/term_scorer.rs +++ b/src/query/term_query/term_scorer.rs @@ -7,7 +7,8 @@ use postings::Postings; use fastfield::FastFieldReader; pub struct TermScorer - where TPostings: Postings +where + TPostings: Postings, { pub idf: Score, pub fieldnorm_reader_opt: Option, @@ -15,7 +16,8 @@ pub struct TermScorer } impl TermScorer - where TPostings: Postings +where + TPostings: Postings, { pub fn 
postings(&self) -> &TPostings { &self.postings @@ -23,7 +25,8 @@ impl TermScorer } impl DocSet for TermScorer - where TPostings: Postings +where + TPostings: Postings, { fn advance(&mut self) -> bool { self.postings.advance() @@ -40,7 +43,8 @@ impl DocSet for TermScorer } impl Scorer for TermScorer - where TPostings: Postings +where + TPostings: Postings, { fn score(&self) -> Score { let doc = self.postings.doc(); diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index e781ebdbd..d837a63fd 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -27,24 +27,28 @@ impl TermWeight { 1.0 + (self.num_docs as f32 / (self.doc_freq as f32 + 1.0)).ln() } - pub fn specialized_scorer<'a>(&'a self, - reader: &'a SegmentReader) - -> Result>> { + /// If the field is not found, returns an empty `DocSet`. + pub fn specialized_scorer( + &self, + reader: &SegmentReader, + ) -> Result> { let field = self.term.field(); + let inverted_index = reader.inverted_index(field); let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field); - Ok(reader - .read_postings(&self.term, self.segment_postings_options) - .map(|segment_postings| { - TermScorer { - idf: self.idf(), - fieldnorm_reader_opt: fieldnorm_reader_opt, - postings: segment_postings, - } - }) - .unwrap_or(TermScorer { - idf: 1f32, - fieldnorm_reader_opt: None, - postings: SegmentPostings::empty(), - })) + let postings_opt: Option = + inverted_index.read_postings(&self.term, self.segment_postings_options); + if let Some(segment_postings) = postings_opt { + Ok(TermScorer { + idf: self.idf(), + fieldnorm_reader_opt: fieldnorm_reader_opt, + postings: segment_postings, + }) + } else { + Ok(TermScorer { + idf: 1f32, + fieldnorm_reader_opt: None, + postings: SegmentPostings::empty(), + }) + } } } diff --git a/src/schema/field.rs b/src/schema/field.rs index 9df8e149b..b7ecc3737 100644 --- a/src/schema/field.rs +++ b/src/schema/field.rs @@ -10,7 +10,7 @@ use common::BinarySerializable; /// /// Because the field id is a `u8`, tantivy can only have at most `255` fields. /// Value 255 is reserved. 
-#[derive(Copy, Clone, Debug, PartialEq,PartialOrd,Eq,Ord,Hash, Serialize, Deserialize)] +#[derive(Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash, Serialize, Deserialize)] pub struct Field(pub u32); impl BinarySerializable for Field { diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index 883dc49ff..7487ff7c1 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -89,7 +89,8 @@ impl FieldEntry { impl Serialize for FieldEntry { fn serialize(&self, serializer: S) -> Result - where S: Serializer + where + S: Serializer, { let mut s = serializer.serialize_struct("field_entry", 3)?; s.serialize_field("name", &self.name)?; @@ -115,7 +116,8 @@ impl Serialize for FieldEntry { impl<'de> Deserialize<'de> for FieldEntry { fn deserialize(deserializer: D) -> Result - where D: Deserializer<'de> + where + D: Deserializer<'de>, { #[derive(Deserialize)] #[serde(field_identifier, rename_all = "lowercase")] @@ -137,7 +139,8 @@ impl<'de> Deserialize<'de> for FieldEntry { } fn visit_map(self, mut map: V) -> Result - where V: MapAccess<'de> + where + V: MapAccess<'de>, { let mut name = None; let mut ty = None; @@ -187,13 +190,14 @@ impl<'de> Deserialize<'de> for FieldEntry { let name = name.ok_or_else(|| de::Error::missing_field("name"))?; ty.ok_or_else(|| de::Error::missing_field("ty"))?; - let field_type = field_type - .ok_or_else(|| de::Error::missing_field("options"))?; + let field_type = field_type.ok_or_else( + || de::Error::missing_field("options"), + )?; Ok(FieldEntry { - name: name, - field_type: field_type, - }) + name: name, + field_type: field_type, + }) } } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 7a494c9e4..f31c6e9da 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -80,8 +80,9 @@ impl FieldType { FieldType::Str(_) => Ok(Value::Str(field_text.clone())), FieldType::U64(_) | FieldType::I64(_) => { - Err(ValueParsingError::TypeError(format!("Expected an integer, got {:?}", - json))) + Err(ValueParsingError::TypeError( + format!("Expected an integer, got {:?}", json), + )) } } } @@ -110,9 +111,11 @@ impl FieldType { } } _ => { - let msg = format!("Json value not supported error {:?}. Expected {:?}", - json, - self); + let msg = format!( + "Json value not supported error {:?}. Expected {:?}", + json, + self + ); Err(ValueParsingError::TypeError(msg)) } } diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 7c5f480dc..93f50ff48 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -105,9 +105,9 @@ impl SchemaBuilder { /// This will consume your `SchemaBuilder` pub fn build(self) -> Schema { Schema(Arc::new(InnerSchema { - fields: self.fields, - fields_map: self.fields_map, - })) + fields: self.fields, + fields_map: self.fields_map, + })) } } @@ -206,15 +206,14 @@ impl Schema { /// Build a document object from a json-object. 
pub fn parse_document(&self, doc_json: &str) -> Result { - let json_obj: JsonObject = serde_json::from_str(doc_json) - .map_err(|_| { - let doc_json_sample: String = if doc_json.len() < 20 { - String::from(doc_json) - } else { - format!("{:?}...", &doc_json[0..20]) - }; - DocParsingError::NotJSON(doc_json_sample) - })?; + let json_obj: JsonObject = serde_json::from_str(doc_json).map_err(|_| { + let doc_json_sample: String = if doc_json.len() < 20 { + String::from(doc_json) + } else { + format!("{:?}...", &doc_json[0..20]) + }; + DocParsingError::NotJSON(doc_json_sample) + })?; let mut doc = Document::default(); for (field_name, json_value) in json_obj.iter() { @@ -225,18 +224,15 @@ impl Schema { match *json_value { JsonValue::Array(ref json_items) => { for json_item in json_items { - let value = try!(field_type - .value_from_json(json_item) - .map_err(|e| { - DocParsingError::ValueError(field_name.clone(), e) - })); + let value = + try!(field_type.value_from_json(json_item).map_err(|e| { + DocParsingError::ValueError(field_name.clone(), e) + })); doc.add(FieldValue::new(field, value)); } } _ => { - let value = try!(field_type - .value_from_json(json_value) - .map_err(|e| { + let value = try!(field_type.value_from_json(json_value).map_err(|e| { DocParsingError::ValueError(field_name.clone(), e) })); doc.add(FieldValue::new(field, value)); @@ -259,7 +255,8 @@ impl fmt::Debug for Schema { impl Serialize for Schema { fn serialize(&self, serializer: S) -> Result - where S: Serializer + where + S: Serializer, { let mut seq = serializer.serialize_seq(Some(self.0.fields.len()))?; for e in &self.0.fields { @@ -271,7 +268,8 @@ impl Serialize for Schema { impl<'de> Deserialize<'de> for Schema { fn deserialize(deserializer: D) -> Result - where D: Deserializer<'de> + where + D: Deserializer<'de>, { struct SchemaVisitor; @@ -283,7 +281,8 @@ impl<'de> Deserialize<'de> for Schema { } fn visit_seq(self, mut seq: A) -> Result - where A: SeqAccess<'de> + where + A: SeqAccess<'de>, { let mut schema = SchemaBuilder { fields: Vec::with_capacity(seq.size_hint().unwrap_or(0)), @@ -430,12 +429,14 @@ mod tests { } { let doc = schema - .parse_document(r#"{ + .parse_document( + r#"{ "title": "my title", "author": "fulmicoton", "count": 4, "popularity": 10 - }"#) + }"#, + ) .unwrap(); assert_eq!(doc.get_first(title_field).unwrap().text(), "my title"); assert_eq!(doc.get_first(author_field).unwrap().text(), "fulmicoton"); @@ -443,13 +444,15 @@ mod tests { assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10); } { - let json_err = schema.parse_document(r#"{ + let json_err = schema.parse_document( + r#"{ "title": "my title", "author": "fulmicoton", "count": 4, "popularity": 10, "jambon": "bayonne" - }"#); + }"#, + ); match json_err { Err(DocParsingError::NoSuchFieldInSchema(field_name)) => { assert_eq!(field_name, "jambon"); @@ -460,13 +463,15 @@ mod tests { } } { - let json_err = schema.parse_document(r#"{ + let json_err = schema.parse_document( + r#"{ "title": "my title", "author": "fulmicoton", "count": "5", "popularity": "10", "jambon": "bayonne" - }"#); + }"#, + ); match json_err { Err(DocParsingError::ValueError(_, ValueParsingError::TypeError(_))) => { assert!(true); @@ -477,12 +482,14 @@ mod tests { } } { - let json_err = schema.parse_document(r#"{ + let json_err = schema.parse_document( + r#"{ "title": "my title", "author": "fulmicoton", "count": -5, "popularity": 10 - }"#); + }"#, + ); match json_err { Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => { assert!(true); 
@@ -493,12 +500,14 @@ mod tests { } } { - let json_err = schema.parse_document(r#"{ + let json_err = schema.parse_document( + r#"{ "title": "my title", "author": "fulmicoton", "count": 9223372036854775808, "popularity": 10 - }"#); + }"#, + ); match json_err { Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => { panic!("expected 9223372036854775808 to fit into u64, but it didn't"); @@ -509,12 +518,14 @@ mod tests { } } { - let json_err = schema.parse_document(r#"{ + let json_err = schema.parse_document( + r#"{ "title": "my title", "author": "fulmicoton", "count": 50, "popularity": 9223372036854775808 - }"#); + }"#, + ); match json_err { Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => { assert!(true); @@ -525,11 +536,13 @@ mod tests { } } { - let json_err = schema.parse_document(r#"{ + let json_err = schema.parse_document( + r#"{ "title": "my title", "author": "fulmicoton", "count": 50, - }"#); + }"#, + ); match json_err { Err(NotJSON(_)) => { assert!(true); diff --git a/src/schema/term.rs b/src/schema/term.rs index f66144b07..197f4975a 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -13,7 +13,9 @@ const INT_TERM_LEN: usize = 4 + 8; /// /// It actually wraps a `Vec`. #[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)] -pub struct Term>(B) where B: AsRef<[u8]>; +pub struct Term>(B) +where + B: AsRef<[u8]>; impl Term { /// Builds a term given a field, and a u64-value @@ -109,7 +111,8 @@ impl Term { } impl Term - where B: AsRef<[u8]> +where + B: AsRef<[u8]>, { /// Wraps a source of data pub fn wrap(data: B) -> Term { @@ -166,7 +169,8 @@ impl Term } impl AsRef<[u8]> for Term - where B: AsRef<[u8]> +where + B: AsRef<[u8]>, { fn as_ref(&self) -> &[u8] { self.0.as_ref() diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index 472bd3e1e..ddcd9948e 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -2,7 +2,7 @@ use std::ops::BitOr; /// Define how a text field should be handled by tantivy. -#[derive(Clone,Debug,PartialEq,Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct TextOptions { indexing: TextIndexingOptions, stored: bool, @@ -45,10 +45,10 @@ impl Default for TextOptions { /// Describe how a field should be indexed -#[derive(Clone,Copy,Debug,PartialEq,PartialOrd,Eq,Hash, Serialize, Deserialize)] +#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Hash, Serialize, Deserialize)] pub enum TextIndexingOptions { /// Unindexed fields will not generate any postings. They will not be searchable either. - #[serde(rename="unindexed")] + #[serde(rename = "unindexed")] Unindexed, /// Untokenized means that the field text will not be split into tokens before being indexed. /// A field with the value "Hello world", will have the document suscribe to one single @@ -56,23 +56,23 @@ pub enum TextIndexingOptions { /// /// It will **not** be searchable if the user enter "hello" for instance. /// This can be useful for tags, or ids for instance. - #[serde(rename="untokenized")] + #[serde(rename = "untokenized")] Untokenized, /// TokenizedNoFreq will tokenize the field value, and append the document doc id /// to the posting lists associated to all of the tokens. /// The frequence of appearance of the term in the document however will be lost. /// The term frequency used in the TfIdf formula will always be 1. 
- #[serde(rename="tokenize")] + #[serde(rename = "tokenize")] TokenizedNoFreq, /// TokenizedWithFreq will tokenize the field value, and encode /// both the docid and the term frequency in the posting lists associated to all - #[serde(rename="freq")] + #[serde(rename = "freq")] TokenizedWithFreq, /// Like TokenizedWithFreq, but also encodes the positions of the /// terms in a separate file. This option is required for phrase queries. /// Don't use this if you are certain you won't need it, the term positions file /// can be very big. - #[serde(rename="position")] + #[serde(rename = "position")] TokenizedWithFreqAndPosition, } diff --git a/src/schema/value.rs b/src/schema/value.rs index ad24688ee..828822a8e 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -16,7 +16,8 @@ pub enum Value { impl Serialize for Value { fn serialize(&self, serializer: S) -> Result - where S: Serializer + where + S: Serializer, { match *self { Value::Str(ref v) => serializer.serialize_str(v), @@ -28,7 +29,8 @@ impl Serialize for Value { impl<'de> Deserialize<'de> for Value { fn deserialize(deserializer: D) -> Result - where D: Deserializer<'de> + where + D: Deserializer<'de>, { struct ValueVisitor; @@ -162,9 +164,13 @@ mod binary_serialize { Ok(Value::I64(value)) } _ => { - Err(io::Error::new(io::ErrorKind::InvalidData, - format!("No field type is associated with code {:?}", - type_code))) + Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "No field type is associated with code {:?}", + type_code + ), + )) } } } diff --git a/src/store/mod.rs b/src/store/mod.rs index 59e0558d1..46138d556 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -54,17 +54,19 @@ mod tests { fn write_lorem_ipsum_store(writer: WritePtr, num_docs: usize) -> Schema { let mut schema_builder = SchemaBuilder::default(); let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored()); - let field_title = schema_builder - .add_text_field("title", TextOptions::default().set_stored()); + let field_title = + schema_builder.add_text_field("title", TextOptions::default().set_stored()); let schema = schema_builder.build(); - let lorem = String::from("Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \ + let lorem = String::from( + "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \ do eiusmod tempor incididunt ut labore et dolore magna aliqua. \ Ut enim ad minim veniam, quis nostrud exercitation ullamco \ laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \ dolor in reprehenderit in voluptate velit esse cillum dolore eu \ fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non \ proident, sunt in culpa qui officia deserunt mollit anim id est \ - laborum."); + laborum.", + ); { let mut store_writer = StoreWriter::new(writer); for i in 0..num_docs { @@ -96,8 +98,10 @@ mod tests { let store_source = directory.open_read(path).unwrap(); let store = StoreReader::from_source(store_source); for i in 0..1_000 { - assert_eq!(*store.get(i).unwrap().get_first(field_title).unwrap().text(), - format!("Doc {}", i)); + assert_eq!( + *store.get(i).unwrap().get_first(field_title).unwrap().text(), + format!("Doc {}", i) + ); } } @@ -106,9 +110,9 @@ mod tests { let mut directory = MmapDirectory::create_from_tempdir().unwrap(); let path = Path::new("store"); b.iter(|| { - write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000); - directory.delete(path).unwrap(); - }); + write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000); + directory.delete(path).unwrap(); + }); } diff --git a/src/store/reader.rs b/src/store/reader.rs index 05781a583..72f9b2da7 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -49,7 +49,7 @@ impl StoreReader { let mut cursor = &total_buffer[block_offset..]; let block_length = u32::deserialize(&mut cursor).unwrap(); let block_array: &[u8] = &total_buffer[(block_offset + 4 as usize).. - (block_offset + 4 + block_length as usize)]; + (block_offset + 4 + block_length as usize)]; let mut lz4_decoder = try!(lz4::Decoder::new(block_array)); *self.current_block_offset.borrow_mut() = usize::max_value(); try!(lz4_decoder.read_to_end(&mut current_block_mut).map(|_| ())); @@ -94,5 +94,9 @@ fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId) let offset = u64::deserialize(&mut serialized_offset_buf).unwrap(); let offset = offset as usize; let max_doc = u32::deserialize(&mut serialized_offset_buf).unwrap(); - (data.slice(0, offset), data.slice(offset, footer_offset), max_doc) + ( + data.slice(0, offset), + data.slice(offset, footer_offset), + max_doc, + ) } diff --git a/src/store/writer.rs b/src/store/writer.rs index 28befa7af..2b7aacb19 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -49,12 +49,15 @@ impl StoreWriter { /// pub fn store<'a>(&mut self, field_values: &[&'a FieldValue]) -> io::Result<()> { self.intermediary_buffer.clear(); - try!((field_values.len() as u32).serialize(&mut self.intermediary_buffer)); + try!((field_values.len() as u32).serialize( + &mut self.intermediary_buffer, + )); for field_value in field_values { try!((*field_value).serialize(&mut self.intermediary_buffer)); } - (self.intermediary_buffer.len() as u32) - .serialize(&mut self.current_block)?; + (self.intermediary_buffer.len() as u32).serialize( + &mut self.current_block, + )?; self.current_block.write_all(&self.intermediary_buffer[..])?; self.doc += 1; if self.current_block.len() > BLOCK_SIZE { @@ -66,16 +69,22 @@ impl StoreWriter { fn write_and_compress_block(&mut self) -> io::Result<()> { self.intermediary_buffer.clear(); { - let mut encoder = try!(lz4::EncoderBuilder::new().build(&mut self.intermediary_buffer)); + let mut encoder = try!(lz4::EncoderBuilder::new().build( + &mut self.intermediary_buffer, + )); try!(encoder.write_all(&self.current_block)); let (_, encoder_result) = encoder.finish(); try!(encoder_result); } - (self.intermediary_buffer.len() as u32) - .serialize(&mut self.writer)?; + (self.intermediary_buffer.len() as u32).serialize( + &mut self.writer, + )?; self.writer.write_all(&self.intermediary_buffer)?; - self.offset_index_writer - .insert(self.doc, 
&(self.writer.written_bytes() as u64))?; + self.offset_index_writer.insert( + self.doc, + &(self.writer.written_bytes() as + u64), + )?; self.current_block.clear(); Ok(()) } @@ -90,8 +99,7 @@ impl StoreWriter { try!(self.write_and_compress_block()); } let header_offset: u64 = self.writer.written_bytes() as u64; - try!(self.offset_index_writer - .write(&mut self.writer)); + try!(self.offset_index_writer.write(&mut self.writer)); try!(header_offset.serialize(&mut self.writer)); try!(self.doc.serialize(&mut self.writer)); self.writer.flush() diff --git a/src/termdict/fstdict/streamer.rs b/src/termdict/fstdict/streamer.rs index 053942bf9..1d90fe9c1 100644 --- a/src/termdict/fstdict/streamer.rs +++ b/src/termdict/fstdict/streamer.rs @@ -1,23 +1,17 @@ use fst::{IntoStreamer, Streamer}; use fst::map::{StreamBuilder, Stream}; -use common::BinarySerializable; +use postings::TermInfo; use super::TermDictionaryImpl; use termdict::{TermStreamerBuilder, TermStreamer}; /// See [`TermStreamerBuilder`](./trait.TermStreamerBuilder.html) -pub struct TermStreamerBuilderImpl<'a, V> - where V: 'a + BinarySerializable + Default -{ - fst_map: &'a TermDictionaryImpl, +pub struct TermStreamerBuilderImpl<'a> { + fst_map: &'a TermDictionaryImpl, stream_builder: StreamBuilder<'a>, } -impl<'a, V> TermStreamerBuilderImpl<'a, V> - where V: 'a + BinarySerializable + Default -{ - pub(crate) fn new(fst_map: &'a TermDictionaryImpl, - stream_builder: StreamBuilder<'a>) - -> Self { +impl<'a> TermStreamerBuilderImpl<'a> { + pub(crate) fn new(fst_map: &'a TermDictionaryImpl, stream_builder: StreamBuilder<'a>) -> Self { TermStreamerBuilderImpl { fst_map: fst_map, stream_builder: stream_builder, @@ -25,10 +19,8 @@ impl<'a, V> TermStreamerBuilderImpl<'a, V> } } -impl<'a, V> TermStreamerBuilder for TermStreamerBuilderImpl<'a, V> - where V: 'a + BinarySerializable + Default -{ - type Streamer = TermStreamerImpl<'a, V>; +impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> { + type Streamer = TermStreamerImpl<'a>; fn ge>(mut self, bound: T) -> Self { self.stream_builder = self.stream_builder.ge(bound); @@ -56,35 +48,30 @@ impl<'a, V> TermStreamerBuilder for TermStreamerBuilderImpl<'a, V> stream: self.stream_builder.into_stream(), offset: 0u64, current_key: Vec::with_capacity(100), - current_value: V::default(), + current_value: TermInfo::default(), } } } /// See [`TermStreamer`](./trait.TermStreamer.html) -pub struct TermStreamerImpl<'a, V> - where V: 'a + BinarySerializable + Default -{ - fst_map: &'a TermDictionaryImpl, +pub struct TermStreamerImpl<'a> { + fst_map: &'a TermDictionaryImpl, stream: Stream<'a>, offset: u64, current_key: Vec, - current_value: V, + current_value: TermInfo, } -impl<'a, V> TermStreamer for TermStreamerImpl<'a, V> - where V: BinarySerializable + Default -{ +impl<'a> TermStreamer for TermStreamerImpl<'a> { fn advance(&mut self) -> bool { if let Some((term, offset)) = self.stream.next() { self.current_key.clear(); self.current_key.extend_from_slice(term); self.offset = offset; - self.current_value = - self.fst_map - .read_value(self.offset) - .expect("Fst data is corrupted. Failed to deserialize a value."); + self.current_value = self.fst_map.read_value(self.offset).expect( + "Fst data is corrupted. 
Failed to deserialize a value.", + ); true } else { false @@ -95,7 +82,7 @@ impl<'a, V> TermStreamer for TermStreamerImpl<'a, V> &self.current_key } - fn value(&self) -> &V { + fn value(&self) -> &TermInfo { &self.current_value } } diff --git a/src/termdict/fstdict/termdict.rs b/src/termdict/fstdict/termdict.rs index 78edaf203..ce608113b 100644 --- a/src/termdict/fstdict/termdict.rs +++ b/src/termdict/fstdict/termdict.rs @@ -3,7 +3,7 @@ use fst; use fst::raw::Fst; use directory::ReadOnlySource; use common::BinarySerializable; -use std::marker::PhantomData; +use schema::FieldType; use postings::TermInfo; use termdict::{TermDictionary, TermDictionaryBuilder}; use super::{TermStreamerImpl, TermStreamerBuilderImpl}; @@ -13,18 +13,14 @@ fn convert_fst_error(e: fst::Error) -> io::Error { } /// See [`TermDictionaryBuilder`](./trait.TermDictionaryBuilder.html) -pub struct TermDictionaryBuilderImpl - where W: Write, - V: BinarySerializable + Default -{ +pub struct TermDictionaryBuilderImpl { fst_builder: fst::MapBuilder, data: Vec, - _phantom_: PhantomData, } -impl TermDictionaryBuilderImpl - where W: Write, - V: BinarySerializable + Default +impl TermDictionaryBuilderImpl +where + W: Write, { /// # Warning /// Horribly dangerous internal API @@ -43,26 +39,25 @@ impl TermDictionaryBuilderImpl /// # Warning /// /// Horribly dangerous internal API. See `.insert_key(...)`. - pub(crate) fn insert_value(&mut self, value: &V) -> io::Result<()> { + pub(crate) fn insert_value(&mut self, value: &TermInfo) -> io::Result<()> { value.serialize(&mut self.data)?; Ok(()) } } -impl TermDictionaryBuilder for TermDictionaryBuilderImpl - where W: Write, - V: BinarySerializable + Default +impl TermDictionaryBuilder for TermDictionaryBuilderImpl +where + W: Write, { - fn new(w: W) -> io::Result { + fn new(w: W, _field_type: FieldType) -> io::Result { let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?; Ok(TermDictionaryBuilderImpl { - fst_builder: fst_builder, - data: Vec::new(), - _phantom_: PhantomData, - }) + fst_builder: fst_builder, + data: Vec::new(), + }) } - fn insert>(&mut self, key_ref: K, value: &V) -> io::Result<()> { + fn insert>(&mut self, key_ref: K, value: &TermInfo) -> io::Result<()> { let key = key_ref.as_ref(); self.fst_builder .insert(key, self.data.len() as u64) @@ -81,73 +76,65 @@ impl TermDictionaryBuilder for TermDictionaryBuilderImpl } } -fn open_fst_index(source: ReadOnlySource) -> io::Result { +fn open_fst_index(source: ReadOnlySource) -> fst::Map { let fst = match source { ReadOnlySource::Anonymous(data) => { - Fst::from_shared_bytes(data.data, data.start, data.len) - .map_err(convert_fst_error)? + Fst::from_shared_bytes(data.data, data.start, data.len).expect("FST data is corrupted") } ReadOnlySource::Mmap(mmap_readonly) => { - Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)? 
+ Fst::from_mmap(mmap_readonly).expect("FST data is corrupted") } }; - Ok(fst::Map::from(fst)) + fst::Map::from(fst) } /// See [`TermDictionary`](./trait.TermDictionary.html) -pub struct TermDictionaryImpl - where V: BinarySerializable + Default -{ +pub struct TermDictionaryImpl { fst_index: fst::Map, values_mmap: ReadOnlySource, - _phantom_: PhantomData, } -impl TermDictionaryImpl - where V: BinarySerializable + Default -{ +impl TermDictionaryImpl { /// Deserialize and returns the value at address `offset` - pub(crate) fn read_value(&self, offset: u64) -> io::Result { + pub(crate) fn read_value(&self, offset: u64) -> io::Result { let buffer = self.values_mmap.as_slice(); let mut cursor = &buffer[(offset as usize)..]; - V::deserialize(&mut cursor) + TermInfo::deserialize(&mut cursor) } } -impl<'a, V> TermDictionary<'a, V> for TermDictionaryImpl - where V: BinarySerializable + Default + 'a -{ - type Streamer = TermStreamerImpl<'a, V>; +impl<'a> TermDictionary<'a> for TermDictionaryImpl { + type Streamer = TermStreamerImpl<'a>; - type StreamBuilder = TermStreamerBuilderImpl<'a, V>; + type StreamBuilder = TermStreamerBuilderImpl<'a>; - fn from_source(source: ReadOnlySource) -> io::Result { + fn from_source(source: ReadOnlySource) -> Self { let total_len = source.len(); let length_offset = total_len - 4; let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..]; - let footer_size = u32::deserialize(&mut split_len_buffer)? as usize; + let footer_size = u32::deserialize(&mut split_len_buffer).expect( + "Deserializing 4 bytes should always work", + ) as usize; let split_len = length_offset - footer_size; let fst_source = source.slice(0, split_len); let values_source = source.slice(split_len, length_offset); - let fst_index = open_fst_index(fst_source)?; - Ok(TermDictionaryImpl { - fst_index: fst_index, - values_mmap: values_source, - _phantom_: PhantomData, - }) + let fst_index = open_fst_index(fst_source); + TermDictionaryImpl { + fst_index: fst_index, + values_mmap: values_source, + } } - fn get>(&self, key: K) -> Option { - self.fst_index - .get(key) - .map(|offset| { - self.read_value(offset) - .expect("The fst is corrupted. Failed to deserialize a value.") - }) + fn get>(&self, key: K) -> Option { + self.fst_index.get(key).map(|offset| { + self.read_value(offset).expect( + "The fst is corrupted. 
Failed to deserialize a value.", + ) + }) } - fn range(&self) -> TermStreamerBuilderImpl { + fn range(&self) -> TermStreamerBuilderImpl { TermStreamerBuilderImpl::new(self, self.fst_index.range()) } } diff --git a/src/termdict/merger.rs b/src/termdict/merger.rs index 4689e0673..517f9589a 100644 --- a/src/termdict/merger.rs +++ b/src/termdict/merger.rs @@ -1,42 +1,30 @@ use std::collections::BinaryHeap; -use core::SegmentReader; use termdict::TermStreamerImpl; -use common::BinarySerializable; -use postings::TermInfo; use std::cmp::Ordering; use termdict::TermStreamer; -use termdict::TermDictionary; use schema::Term; -pub struct HeapItem<'a, V> - where V: 'a + BinarySerializable + Default -{ - pub streamer: TermStreamerImpl<'a, V>, +pub struct HeapItem<'a> { + pub streamer: TermStreamerImpl<'a>, pub segment_ord: usize, } -impl<'a, V> PartialEq for HeapItem<'a, V> - where V: 'a + BinarySerializable + Default -{ +impl<'a> PartialEq for HeapItem<'a> { fn eq(&self, other: &Self) -> bool { self.segment_ord == other.segment_ord } } -impl<'a, V> Eq for HeapItem<'a, V> where V: 'a + BinarySerializable + Default {} +impl<'a> Eq for HeapItem<'a> {} -impl<'a, V> PartialOrd for HeapItem<'a, V> - where V: 'a + BinarySerializable + Default -{ - fn partial_cmp(&self, other: &HeapItem<'a, V>) -> Option { +impl<'a> PartialOrd for HeapItem<'a> { + fn partial_cmp(&self, other: &HeapItem<'a>) -> Option { Some(self.cmp(other)) } } -impl<'a, V> Ord for HeapItem<'a, V> - where V: 'a + BinarySerializable + Default -{ - fn cmp(&self, other: &HeapItem<'a, V>) -> Ordering { +impl<'a> Ord for HeapItem<'a> { + fn cmp(&self, other: &HeapItem<'a>) -> Ordering { (&other.streamer.key(), &other.segment_ord).cmp(&(&self.streamer.key(), &self.segment_ord)) } } @@ -48,28 +36,27 @@ impl<'a, V> Ord for HeapItem<'a, V> /// - the term /// - a slice with the ordinal of the segments containing /// the terms. -pub struct TermMerger<'a, V> - where V: 'a + BinarySerializable + Default -{ - heap: BinaryHeap>, - current_streamers: Vec>, +pub struct TermMerger<'a> { + heap: BinaryHeap>, + current_streamers: Vec>, } -impl<'a, V> TermMerger<'a, V> - where V: 'a + BinarySerializable + Default -{ - fn new(streams: Vec>) -> TermMerger<'a, V> { +impl<'a> TermMerger<'a> { + /// Stream of merged term dictionary + /// + /// + pub fn new(streams: Vec>) -> TermMerger<'a> { TermMerger { heap: BinaryHeap::new(), current_streamers: streams .into_iter() .enumerate() .map(|(ord, streamer)| { - HeapItem { - streamer: streamer, - segment_ord: ord, - } - }) + HeapItem { + streamer: streamer, + segment_ord: ord, + } + }) .collect(), } } @@ -125,7 +112,7 @@ impl<'a, V> TermMerger<'a, V> /// This method may be called /// iff advance() has been called before /// and "true" was returned. - pub fn current_kvs(&self) -> &[HeapItem<'a, V>] { + pub fn current_kvs(&self) -> &[HeapItem<'a>] { &self.current_streamers[..] } @@ -139,14 +126,3 @@ impl<'a, V> TermMerger<'a, V> } } } - - - -impl<'a> From<&'a [SegmentReader]> for TermMerger<'a, TermInfo> { - fn from(segment_readers: &'a [SegmentReader]) -> TermMerger<'a, TermInfo> { - TermMerger::new(segment_readers - .iter() - .map(|reader| reader.terms().stream()) - .collect()) - } -} diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index c4786f539..9150b8f85 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -1,36 +1,10 @@ /*! The term dictionary is one of the key datastructure of -tantivy. It associates sorted `terms` to their respective -posting list. +tantivy. 
It associates sorted `terms` to a `TermInfo` struct +that serves as an address in their respective posting list. -The term dictionary makes it possible to iterate through -the keys in a sorted manner. - -# Example - -``` -extern crate tantivy; -use tantivy::termdict::*; -use tantivy::directory::ReadOnlySource; - -# fn main() { -# run().expect("Test failed"); -# } -# fn run() -> tantivy::Result<()> { -let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec!())?; - -// keys have to be insert in order. -term_dictionary_builder.insert("apple", &1u32)?; -term_dictionary_builder.insert("grape", &2u32)?; -term_dictionary_builder.insert("pear", &3u32)?; -let buffer: Vec = term_dictionary_builder.finish()?; - -let source = ReadOnlySource::from(buffer); -let term_dictionary = TermDictionaryImpl::from_source(source)?; - -assert_eq!(term_dictionary.get("grape"), Some(2u32)); -# Ok(()) -# } +The term dictionary API makes it possible to iterate through +a range of keys in a sorted manner. ``` @@ -74,48 +48,45 @@ followed by a streaming through at most `1024` elements in the term `stream`. */ -use schema::{Field, Term}; -use common::BinarySerializable; +use schema::{Field, Term, FieldType}; use directory::ReadOnlySource; - +use postings::TermInfo; pub use self::merger::TermMerger; - -#[cfg(not(feature="streamdict"))] +#[cfg(not(feature = "streamdict"))] mod fstdict; -#[cfg(not(feature="streamdict"))] +#[cfg(not(feature = "streamdict"))] pub use self::fstdict::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl, TermStreamerBuilderImpl}; -#[cfg(feature="streamdict")] +#[cfg(feature = "streamdict")] mod streamdict; -#[cfg(feature="streamdict")] +#[cfg(feature = "streamdict")] pub use self::streamdict::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl, TermStreamerBuilderImpl}; - mod merger; use std::io; /// Dictionary associating sorted `&[u8]` to values -pub trait TermDictionary<'a, V> - where V: BinarySerializable + Default + 'a, - Self: Sized +pub trait TermDictionary<'a> +where + Self: Sized, { /// Streamer type associated to the term dictionary - type Streamer: TermStreamer + 'a; + type Streamer: TermStreamer + 'a; /// StreamerBuilder type associated to the term dictionary - type StreamBuilder: TermStreamerBuilder + 'a; + type StreamBuilder: TermStreamerBuilder + 'a; /// Opens a `TermDictionary` given a data source. - fn from_source(source: ReadOnlySource) -> io::Result; + fn from_source(source: ReadOnlySource) -> Self; /// Lookups the value corresponding to the key. - fn get>(&self, target_key: K) -> Option; + fn get>(&self, target_key: K) -> Option; /// Returns a range builder, to stream all of the terms /// within an interval. @@ -140,17 +111,17 @@ pub trait TermDictionary<'a, V> /// Builder for the new term dictionary. /// /// Inserting must be done in the order of the `keys`. -pub trait TermDictionaryBuilder: Sized - where W: io::Write, - V: BinarySerializable + Default +pub trait TermDictionaryBuilder: Sized +where + W: io::Write, { /// Creates a new `TermDictionaryBuilder` - fn new(write: W) -> io::Result; + fn new(write: W, field_type: FieldType) -> io::Result; /// Inserts a `(key, value)` pair in the term dictionary. /// /// *Keys have to be inserted in order.* - fn insert>(&mut self, key: K, value: &V) -> io::Result<()>; + fn insert>(&mut self, key: K, value: &TermInfo) -> io::Result<()>; /// Finalize writing the builder, and returns the underlying /// `Write` object. 
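With the value type now fixed to `TermInfo`, the builder/dictionary round trip looks roughly like the crate-internal sketch below. It is modeled on `test_term_dictionary_simple` further down in this patch; the concrete `TermInfo` values and the `round_trip` helper name are illustrative only, not part of the change.

```rust
use std::io;

use directory::ReadOnlySource;
use postings::TermInfo;
use schema::{FieldType, TEXT};
use termdict::{TermDictionary, TermDictionaryBuilder, TermStreamer};
use termdict::{TermDictionaryBuilderImpl, TermDictionaryImpl};

fn round_trip() -> io::Result<()> {
    // `new` now takes the field type, so the streamdict builder knows
    // whether positions are stored alongside the postings.
    let mut builder = TermDictionaryBuilderImpl::new(vec![], FieldType::Str(TEXT))?;

    // Keys still have to be inserted in sorted order; every value is a `TermInfo`.
    let apple = TermInfo {
        doc_freq: 12,
        postings_offset: 0,
        positions_offset: 0,
        positions_inner_offset: 0,
    };
    let grape = TermInfo {
        doc_freq: 3,
        postings_offset: 117,
        positions_offset: 54,
        positions_inner_offset: 0,
    };
    builder.insert("apple".as_bytes(), &apple)?;
    builder.insert("grape".as_bytes(), &grape)?;
    let buffer: Vec<u8> = builder.finish()?;

    // `from_source` no longer returns an `io::Result` in this patch.
    let dict: TermDictionaryImpl = TermDictionaryImpl::from_source(ReadOnlySource::from(buffer));
    assert_eq!(dict.get("grape").map(|info| info.doc_freq), Some(3));

    // Streaming yields `(Term<&[u8]>, &TermInfo)` pairs, sorted by key.
    let mut stream = dict.stream();
    while let Some((term, info)) = stream.next() {
        let _ = (term.as_ref(), info.doc_freq);
    }
    Ok(())
}
```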
@@ -160,7 +131,7 @@ pub trait TermDictionaryBuilder: Sized /// `TermStreamer` acts as a cursor over a range of terms of a segment. /// Terms are guaranteed to be sorted. -pub trait TermStreamer: Sized { +pub trait TermStreamer: Sized { /// Advance position the stream on the next item. /// Before the first call to `.advance()`, the stream /// is an unitialized state. @@ -187,10 +158,10 @@ pub trait TermStreamer: Sized { /// /// Calling `.value()` before the first call to `.advance()` returns /// `V::default()`. - fn value(&self) -> &V; + fn value(&self) -> &TermInfo; /// Return the next `(key, value)` pair. - fn next(&mut self) -> Option<(Term<&[u8]>, &V)> { + fn next(&mut self) -> Option<(Term<&[u8]>, &TermInfo)> { if self.advance() { Some((Term::wrap(self.key()), self.value())) } else { @@ -202,11 +173,9 @@ pub trait TermStreamer: Sized { /// `TermStreamerBuilder` is an helper object used to define /// a range of terms that should be streamed. -pub trait TermStreamerBuilder - where V: BinarySerializable + Default -{ +pub trait TermStreamerBuilder { /// Associated `TermStreamer` type that this builder is building. - type Streamer: TermStreamer; + type Streamer: TermStreamer; /// Limit the range to terms greater or equal to the bound fn ge>(self, bound: T) -> Self; @@ -231,60 +200,70 @@ mod tests { use super::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl}; use directory::{RAMDirectory, Directory, ReadOnlySource}; use std::path::PathBuf; - use schema::{Term, SchemaBuilder, Document, TEXT}; + use schema::{FieldType, Term, SchemaBuilder, Document, TEXT}; use core::Index; use std::str; use termdict::TermStreamer; use termdict::TermStreamerBuilder; use termdict::TermDictionary; use termdict::TermDictionaryBuilder; + use postings::TermInfo; + const BLOCK_SIZE: usize = 1_500; + fn make_term_info(val: u32) -> TermInfo { + TermInfo { + doc_freq: val, + positions_offset: val * 2u32, + postings_offset: val * 3u32, + positions_inner_offset: 5u8, + } + } + #[test] - fn test_term_dictionary() { + fn test_term_dictionary_simple() { let mut directory = RAMDirectory::create(); let path = PathBuf::from("TermDictionary"); { let write = directory.open_write(&path).unwrap(); - let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(write).unwrap(); - term_dictionary_builder - .insert("abc".as_bytes(), &34u32) + let field_type = FieldType::Str(TEXT); + let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(write, field_type) .unwrap(); term_dictionary_builder - .insert("abcd".as_bytes(), &346u32) + .insert("abc".as_bytes(), &make_term_info(34u32)) + .unwrap(); + term_dictionary_builder + .insert("abcd".as_bytes(), &make_term_info(346u32)) .unwrap(); term_dictionary_builder.finish().unwrap(); } let source = directory.open_read(&path).unwrap(); - let term_dict: TermDictionaryImpl = TermDictionaryImpl::from_source(source).unwrap(); - assert_eq!(term_dict.get("abc"), Some(34u32)); - assert_eq!(term_dict.get("abcd"), Some(346u32)); + let term_dict: TermDictionaryImpl = TermDictionaryImpl::from_source(source); + assert_eq!(term_dict.get("abc").unwrap().doc_freq, 34u32); + assert_eq!(term_dict.get("abcd").unwrap().doc_freq, 346u32); let mut stream = term_dict.stream(); { { let (k, v) = stream.next().unwrap(); assert_eq!(k.as_ref(), "abc".as_bytes()); - assert_eq!(v, &34u32); + assert_eq!(v.doc_freq, 34u32); } assert_eq!(stream.key(), "abc".as_bytes()); - assert_eq!(*stream.value(), 34u32); + assert_eq!(stream.value().doc_freq, 34u32); } { { let (k, v) = stream.next().unwrap(); 
assert_eq!(k.as_slice(), "abcd".as_bytes()); - assert_eq!(v, &346u32); + assert_eq!(v.doc_freq, 346u32); } assert_eq!(stream.key(), "abcd".as_bytes()); - assert_eq!(*stream.value(), 346u32); + assert_eq!(stream.value().doc_freq, 346u32); } assert!(!stream.advance()); } - - - #[test] fn test_term_iterator() { let mut schema_builder = SchemaBuilder::default(); @@ -319,7 +298,9 @@ mod tests { } index.load_searchers().unwrap(); let searcher = index.searcher(); - let mut term_it = searcher.terms(); + + let field_searcher = searcher.field(text_field); + let mut term_it = field_searcher.terms(); let mut term_string = String::new(); while term_it.advance() { let term = Term::from_bytes(term_it.key()); @@ -334,23 +315,26 @@ mod tests { let ids: Vec<_> = (0u32..10_000u32) .map(|i| (format!("doc{:0>6}", i), i)) .collect(); + let field_type = FieldType::Str(TEXT); let buffer: Vec = { - let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![]).unwrap(); + let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type) + .unwrap(); for &(ref id, ref i) in &ids { - term_dictionary_builder.insert(id.as_bytes(), i).unwrap(); + term_dictionary_builder + .insert(id.as_bytes(), &make_term_info(*i)) + .unwrap(); } term_dictionary_builder.finish().unwrap() }; let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source) - .unwrap(); + let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source); { let mut streamer = term_dictionary.stream(); let mut i = 0; while let Some((streamer_k, streamer_v)) = streamer.next() { let &(ref key, ref v) = &ids[i]; assert_eq!(streamer_k.as_ref(), key.as_bytes()); - assert_eq!(streamer_v, v); + assert_eq!(streamer_v, &make_term_info(*v)); i += 1; } } @@ -359,23 +343,59 @@ mod tests { term_dictionary.get(key.as_bytes()); } + + #[test] + fn test_stream_high_range_prefix_suffix() { + let field_type = FieldType::Str(TEXT); + let buffer: Vec = { + let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type) + .unwrap(); + // term requires more than 16bits + term_dictionary_builder + .insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1)) + .unwrap(); + term_dictionary_builder + .insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2)) + .unwrap(); + term_dictionary_builder + .insert("abr", &make_term_info(2)) + .unwrap(); + term_dictionary_builder.finish().unwrap() + }; + let source = ReadOnlySource::from(buffer); + let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source); + let mut kv_stream = term_dictionary.stream(); + assert!(kv_stream.advance()); + assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes()); + assert_eq!(kv_stream.value(), &make_term_info(1)); + assert!(kv_stream.advance()); + assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxyz".as_bytes()); + assert_eq!(kv_stream.value(), &make_term_info(2)); + assert!(kv_stream.advance()); + assert_eq!(kv_stream.key(), "abr".as_bytes()); + assert!(!kv_stream.advance()); + } + #[test] fn test_stream_range() { - let ids: Vec<_> = (0u32..50_000u32) + let ids: Vec<_> = (0u32..10_000u32) .map(|i| (format!("doc{:0>6}", i), i)) .collect(); + let field_type = FieldType::Str(TEXT); let buffer: Vec = { - let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![]).unwrap(); + let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type) + .unwrap(); for &(ref id, ref i) in &ids { - 
term_dictionary_builder.insert(id.as_bytes(), i).unwrap(); + term_dictionary_builder + .insert(id.as_bytes(), &make_term_info(*i)) + .unwrap(); } term_dictionary_builder.finish().unwrap() }; let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source) - .unwrap(); + let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source); { for i in (0..20).chain(6000..8_000) { let &(ref target_key, _) = &ids[i]; @@ -387,7 +407,8 @@ mod tests { let (streamer_k, streamer_v) = streamer.next().unwrap(); let &(ref key, ref v) = &ids[i + j]; assert_eq!(str::from_utf8(streamer_k.as_ref()).unwrap(), key); - assert_eq!(streamer_v, v); + assert_eq!(streamer_v.doc_freq, *v); + assert_eq!(streamer_v, &make_term_info(*v)); } } } @@ -403,7 +424,7 @@ mod tests { let (streamer_k, streamer_v) = streamer.next().unwrap(); let &(ref key, ref v) = &ids[i + j + 1]; assert_eq!(streamer_k.as_ref(), key.as_bytes()); - assert_eq!(streamer_v, v); + assert_eq!(streamer_v.doc_freq, *v); } } } @@ -430,45 +451,56 @@ mod tests { #[test] fn test_stream_range_boundaries() { + let field_type = FieldType::Str(TEXT); let buffer: Vec = { - let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![]).unwrap(); + let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type) + .unwrap(); for i in 0u8..10u8 { let number_arr = [i; 1]; - term_dictionary_builder.insert(&number_arr, &i).unwrap(); + term_dictionary_builder + .insert(&number_arr, &make_term_info(i as u32)) + .unwrap(); } term_dictionary_builder.finish().unwrap() }; let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source) - .unwrap(); + let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source); - let value_list = |mut streamer: TermStreamerImpl| { - let mut res: Vec = vec![]; - while let Some((_, &v)) = streamer.next() { - res.push(v); + let value_list = |mut streamer: TermStreamerImpl| { + let mut res: Vec = vec![]; + while let Some((_, ref v)) = streamer.next() { + res.push(v.doc_freq); } res }; { let range = term_dictionary.range().ge([2u8]).into_stream(); - assert_eq!(value_list(range), - vec![2u8, 3u8, 4u8, 5u8, 6u8, 7u8, 8u8, 9u8]); + assert_eq!( + value_list(range), + vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32] + ); } { let range = term_dictionary.range().gt([2u8]).into_stream(); - assert_eq!(value_list(range), vec![3u8, 4u8, 5u8, 6u8, 7u8, 8u8, 9u8]); + assert_eq!( + value_list(range), + vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32] + ); } { let range = term_dictionary.range().lt([6u8]).into_stream(); - assert_eq!(value_list(range), vec![0u8, 1u8, 2u8, 3u8, 4u8, 5u8]); + assert_eq!(value_list(range), vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]); } { let range = term_dictionary.range().le([6u8]).into_stream(); - assert_eq!(value_list(range), vec![0u8, 1u8, 2u8, 3u8, 4u8, 5u8, 6u8]); + assert_eq!( + value_list(range), + vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32] + ); } { let range = term_dictionary.range().ge([0u8]).lt([5u8]).into_stream(); - assert_eq!(value_list(range), vec![0u8, 1u8, 2u8, 3u8, 4u8]); + assert_eq!(value_list(range), vec![0u32, 1u32, 2u32, 3u32, 4u32]); } } diff --git a/src/termdict/streamdict/delta_encoder.rs b/src/termdict/streamdict/delta_encoder.rs new file mode 100644 index 000000000..5ba466203 --- /dev/null +++ b/src/termdict/streamdict/delta_encoder.rs @@ -0,0 +1,175 @@ +use postings::TermInfo; +use super::CheckPoint; +use 
std::mem; +use common::BinarySerializable; + +/// Returns the len of the longest +/// common prefix of `s1` and `s2`. +/// +/// ie: the greatest `L` such that +/// for all `0 <= i < L`, `s1[i] == s2[i]` +fn common_prefix_len(s1: &[u8], s2: &[u8]) -> usize { + s1.iter() + .zip(s2.iter()) + .take_while(|&(a, b)| a == b) + .count() +} + + +#[derive(Default)] +pub struct TermDeltaEncoder { + last_term: Vec, + prefix_len: usize, +} + +impl TermDeltaEncoder { + pub fn encode<'a>(&mut self, term: &'a [u8]) { + self.prefix_len = common_prefix_len(term, &self.last_term); + self.last_term.truncate(self.prefix_len); + self.last_term.extend_from_slice(&term[self.prefix_len..]); + } + + pub fn term(&self) -> &[u8] { + &self.last_term[..] + } + + pub fn prefix_suffix(&mut self) -> (usize, &[u8]) { + (self.prefix_len, &self.last_term[self.prefix_len..]) + } +} + +#[derive(Default)] +pub struct TermDeltaDecoder { + term: Vec, +} + +impl TermDeltaDecoder { + pub fn with_previous_term(term: Vec) -> TermDeltaDecoder { + TermDeltaDecoder { term: Vec::from(term) } + } + + #[inline(always)] + pub fn decode<'a>(&mut self, code: u8, mut cursor: &'a [u8]) -> &'a [u8] { + let (prefix_len, suffix_len): (usize, usize) = if (code & 1u8) == 1u8 { + let b = cursor[0]; + cursor = &cursor[1..]; + let prefix_len = (b & 15u8) as usize; + let suffix_len = (b >> 4u8) as usize; + (prefix_len, suffix_len) + } else { + let prefix_len = u32::deserialize(&mut cursor).unwrap(); + let suffix_len = u32::deserialize(&mut cursor).unwrap(); + (prefix_len as usize, suffix_len as usize) + }; + unsafe { self.term.set_len(prefix_len) }; + self.term.extend_from_slice(&(*cursor)[..suffix_len]); + &cursor[suffix_len..] + } + + pub fn term(&self) -> &[u8] { + &self.term[..] + } +} + +#[derive(Default)] +pub struct DeltaTermInfo { + pub doc_freq: u32, + pub delta_postings_offset: u32, + pub delta_positions_offset: u32, + pub positions_inner_offset: u8, +} + +pub struct TermInfoDeltaEncoder { + term_info: TermInfo, + pub has_positions: bool, +} + +impl TermInfoDeltaEncoder { + pub fn new(has_positions: bool) -> Self { + TermInfoDeltaEncoder { + term_info: TermInfo::default(), + has_positions: has_positions, + } + } + + pub fn term_info(&self) -> &TermInfo { + &self.term_info + } + + pub fn encode(&mut self, term_info: TermInfo) -> DeltaTermInfo { + let mut delta_term_info = DeltaTermInfo { + doc_freq: term_info.doc_freq, + delta_postings_offset: term_info.postings_offset - self.term_info.postings_offset, + delta_positions_offset: 0, + positions_inner_offset: 0, + }; + if self.has_positions { + delta_term_info.delta_positions_offset = term_info.positions_offset - + self.term_info.positions_offset; + delta_term_info.positions_inner_offset = term_info.positions_inner_offset; + } + mem::replace(&mut self.term_info, term_info); + delta_term_info + } +} + + +pub struct TermInfoDeltaDecoder { + term_info: TermInfo, + has_positions: bool, +} + + +#[inline(always)] +pub fn make_mask(num_bytes: usize) -> u32 { + const MASK: [u32; 4] = [0xffu32, 0xffffu32, 0xffffffu32, 0xffffffffu32]; + *unsafe { MASK.get_unchecked(num_bytes.wrapping_sub(1) as usize) } +} + +impl TermInfoDeltaDecoder { + pub fn from_term_info(term_info: TermInfo, has_positions: bool) -> TermInfoDeltaDecoder { + TermInfoDeltaDecoder { + term_info: term_info, + has_positions: has_positions, + } + } + + pub fn from_checkpoint(checkpoint: &CheckPoint, has_positions: bool) -> TermInfoDeltaDecoder { + TermInfoDeltaDecoder { + term_info: TermInfo { + doc_freq: 0u32, + postings_offset: 
checkpoint.postings_offset, + positions_offset: checkpoint.positions_offset, + positions_inner_offset: 0u8, + }, + has_positions: has_positions, + } + } + + #[inline(always)] + pub fn decode<'a>(&mut self, code: u8, mut cursor: &'a [u8]) -> &'a [u8] { + let num_bytes_docfreq: usize = ((code >> 1) & 3) as usize + 1; + let num_bytes_postings_offset: usize = ((code >> 3) & 3) as usize + 1; + let mut v: u64 = unsafe { *(cursor.as_ptr() as *const u64) }; + let doc_freq: u32 = (v as u32) & make_mask(num_bytes_docfreq); + v >>= (num_bytes_docfreq as u64) * 8u64; + let delta_postings_offset: u32 = (v as u32) & make_mask(num_bytes_postings_offset); + cursor = &cursor[num_bytes_docfreq + num_bytes_postings_offset..]; + self.term_info.doc_freq = doc_freq; + self.term_info.postings_offset += delta_postings_offset; + if self.has_positions { + let num_bytes_positions_offset = ((code >> 5) & 3) as usize + 1; + let delta_positions_offset: u32 = unsafe { *(cursor.as_ptr() as *const u32) } & + make_mask(num_bytes_positions_offset); + self.term_info.positions_offset += delta_positions_offset; + self.term_info.positions_inner_offset = cursor[num_bytes_positions_offset]; + &cursor[num_bytes_positions_offset + 1..] + } else { + cursor + } + } + + pub fn term_info(&self) -> &TermInfo { + &self.term_info + } +} diff --git a/src/termdict/streamdict/mod.rs b/src/termdict/streamdict/mod.rs index 90b719dda..176f63377 100644 --- a/src/termdict/streamdict/mod.rs +++ b/src/termdict/streamdict/mod.rs @@ -1,8 +1,42 @@ +use std::io::{self, Write, Read}; +use common::BinarySerializable; mod termdict; mod streamer; +mod delta_encoder; + + +pub use self::delta_encoder::{TermDeltaEncoder, TermDeltaDecoder}; +pub use self::delta_encoder::{TermInfoDeltaEncoder, TermInfoDeltaDecoder, DeltaTermInfo}; pub use self::termdict::TermDictionaryImpl; pub use self::termdict::TermDictionaryBuilderImpl; pub use self::streamer::TermStreamerImpl; pub use self::streamer::TermStreamerBuilderImpl; + +#[derive(Debug)] +pub struct CheckPoint { + pub stream_offset: u32, + pub postings_offset: u32, + pub positions_offset: u32, +} + +impl BinarySerializable for CheckPoint { + fn serialize(&self, writer: &mut W) -> io::Result<()> { + self.stream_offset.serialize(writer)?; + self.postings_offset.serialize(writer)?; + self.positions_offset.serialize(writer)?; + Ok(()) + } + + fn deserialize(reader: &mut R) -> io::Result { + let stream_offset = u32::deserialize(reader)?; + let postings_offset = u32::deserialize(reader)?; + let positions_offset = u32::deserialize(reader)?; + Ok(CheckPoint { + stream_offset: stream_offset, + postings_offset: postings_offset, + positions_offset: positions_offset, + }) + } +} diff --git a/src/termdict/streamdict/streamer.rs b/src/termdict/streamdict/streamer.rs index 5de91a343..22f687da1 100644 --- a/src/termdict/streamdict/streamer.rs +++ b/src/termdict/streamdict/streamer.rs @@ -1,47 +1,54 @@ #![allow(should_implement_trait)] use std::cmp::max; -use common::BinarySerializable; use super::TermDictionaryImpl; use termdict::{TermStreamerBuilder, TermStreamer}; +use postings::TermInfo; +use super::delta_encoder::{TermInfoDeltaDecoder, TermDeltaDecoder}; -pub(crate) fn stream_before<'a, V>(term_dictionary: &'a TermDictionaryImpl, - target_key: &[u8]) - -> TermStreamerImpl<'a, V> - where V: 'a + BinarySerializable + Default -{ - let (prev_key, offset) = term_dictionary.strictly_previous_key(target_key.as_ref()); - let offset: usize = offset as usize; + +fn stream_before<'a>( + term_dictionary: &'a TermDictionaryImpl, + 
target_key: &[u8], + has_positions: bool, +) -> TermStreamerImpl<'a> { + + let (prev_key, checkpoint) = term_dictionary.strictly_previous_key(target_key.as_ref()); + let stream_data: &'a [u8] = &term_dictionary.stream_data()[checkpoint.stream_offset as usize..]; TermStreamerImpl { - cursor: &term_dictionary.stream_data()[offset..], - current_key: Vec::from(prev_key), - current_value: V::default(), + cursor: stream_data, + term_delta_decoder: TermDeltaDecoder::with_previous_term(prev_key), + term_info_decoder: TermInfoDeltaDecoder::from_checkpoint(&checkpoint, has_positions), } } + /// See [`TermStreamerBuilder`](./trait.TermStreamerBuilder.html) -pub struct TermStreamerBuilderImpl<'a, V> - where V: 'a + BinarySerializable + Default -{ - term_dictionary: &'a TermDictionaryImpl, +pub struct TermStreamerBuilderImpl<'a> { + term_dictionary: &'a TermDictionaryImpl, origin: usize, offset_from: usize, offset_to: usize, current_key: Vec, + term_info: TermInfo, + has_positions: bool, } -impl<'a, V> TermStreamerBuilder for TermStreamerBuilderImpl<'a, V> - where V: 'a + BinarySerializable + Default -{ - type Streamer = TermStreamerImpl<'a, V>; +impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> { + type Streamer = TermStreamerImpl<'a>; /// Limit the range to terms greater or equal to the bound fn ge>(mut self, bound: T) -> Self { let target_key = bound.as_ref(); - let streamer = stream_before(self.term_dictionary, target_key.as_ref()); + let streamer = stream_before( + self.term_dictionary, + target_key.as_ref(), + self.has_positions, + ); let smaller_than = |k: &[u8]| k.lt(target_key); - let (offset_before, current_key) = get_offset(smaller_than, streamer); + let (offset_before, current_key, term_info) = get_offset(smaller_than, streamer); self.current_key = current_key; + self.term_info = term_info; self.offset_from = offset_before - self.origin; self } @@ -49,10 +56,15 @@ impl<'a, V> TermStreamerBuilder for TermStreamerBuilderImpl<'a, V> /// Limit the range to terms strictly greater than the bound fn gt>(mut self, bound: T) -> Self { let target_key = bound.as_ref(); - let streamer = stream_before(self.term_dictionary, target_key.as_ref()); + let streamer = stream_before( + self.term_dictionary, + target_key.as_ref(), + self.has_positions, + ); let smaller_than = |k: &[u8]| k.le(target_key); - let (offset_before, current_key) = get_offset(smaller_than, streamer); + let (offset_before, current_key, term_info) = get_offset(smaller_than, streamer); self.current_key = current_key; + self.term_info = term_info; self.offset_from = offset_before - self.origin; self } @@ -60,9 +72,13 @@ impl<'a, V> TermStreamerBuilder for TermStreamerBuilderImpl<'a, V> /// Limit the range to terms lesser or equal to the bound fn lt>(mut self, bound: T) -> Self { let target_key = bound.as_ref(); - let streamer = stream_before(self.term_dictionary, target_key.as_ref()); + let streamer = stream_before( + self.term_dictionary, + target_key.as_ref(), + self.has_positions, + ); let smaller_than = |k: &[u8]| k.lt(target_key); - let (offset_before, _) = get_offset(smaller_than, streamer); + let (offset_before, _, _) = get_offset(smaller_than, streamer); self.offset_to = offset_before - self.origin; self } @@ -70,9 +86,13 @@ impl<'a, V> TermStreamerBuilder for TermStreamerBuilderImpl<'a, V> /// Limit the range to terms lesser or equal to the bound fn le>(mut self, bound: T) -> Self { let target_key = bound.as_ref(); - let streamer = stream_before(self.term_dictionary, target_key.as_ref()); + let streamer = stream_before( 
+ self.term_dictionary, + target_key.as_ref(), + self.has_positions, + ); let smaller_than = |k: &[u8]| k.le(target_key); - let (offset_before, _) = get_offset(smaller_than, streamer); + let (offset_before, _, _) = get_offset(smaller_than, streamer); self.offset_to = offset_before - self.origin; self } @@ -82,10 +102,13 @@ impl<'a, V> TermStreamerBuilder for TermStreamerBuilderImpl<'a, V> let data: &[u8] = self.term_dictionary.stream_data(); let start = self.offset_from; let stop = max(self.offset_to, start); + let term_delta_decoder = TermDeltaDecoder::with_previous_term(self.current_key); + let term_info_decoder = + TermInfoDeltaDecoder::from_term_info(self.term_info, self.has_positions); TermStreamerImpl { cursor: &data[start..stop], - current_key: self.current_key, - current_value: V::default(), + term_delta_decoder: term_delta_decoder, + term_info_decoder: term_info_decoder, } } } @@ -93,100 +116,77 @@ impl<'a, V> TermStreamerBuilder for TermStreamerBuilderImpl<'a, V> /// Returns offset information for the first /// key in the stream matching a given predicate. /// -/// returns (start offset, the data required to load the value) -fn get_offset<'a, V, P: Fn(&[u8]) -> bool>(predicate: P, - mut streamer: TermStreamerImpl) - -> (usize, Vec) - where V: 'a + BinarySerializable + Default -{ +/// returns +/// - the block start +/// - the index within this block +/// - the term_buffer state to initialize the block) +fn get_offset<'a, P: Fn(&[u8]) -> bool>( + predicate: P, + mut streamer: TermStreamerImpl<'a>, +) -> (usize, Vec, TermInfo) { let mut prev: &[u8] = streamer.cursor; - let mut prev_data: Vec = streamer.current_key.clone(); + let mut term_info = streamer.value().clone(); + let mut prev_data: Vec = Vec::from(streamer.term_delta_decoder.term()); - while let Some((iter_key, _)) = streamer.next() { + while let Some((iter_key, iter_term_info)) = streamer.next() { if !predicate(iter_key.as_ref()) { - return (prev.as_ptr() as usize, prev_data); + return (prev.as_ptr() as usize, prev_data, term_info); } prev = streamer.cursor; prev_data.clear(); prev_data.extend_from_slice(iter_key.as_ref()); + term_info = iter_term_info.clone(); } - (prev.as_ptr() as usize, prev_data) + (prev.as_ptr() as usize, prev_data, term_info) } -impl<'a, V> TermStreamerBuilderImpl<'a, V> - where V: 'a + BinarySerializable + Default -{ - pub(crate) fn new(term_dictionary: &'a TermDictionaryImpl) -> Self { +impl<'a> TermStreamerBuilderImpl<'a> { + pub(crate) fn new(term_dictionary: &'a TermDictionaryImpl, has_positions: bool) -> Self { let data = term_dictionary.stream_data(); let origin = data.as_ptr() as usize; TermStreamerBuilderImpl { term_dictionary: term_dictionary, + term_info: TermInfo::default(), origin: origin, offset_from: 0, offset_to: data.len(), current_key: Vec::with_capacity(300), + has_positions: has_positions, } } } + + /// See [`TermStreamer`](./trait.TermStreamer.html) -pub struct TermStreamerImpl<'a, V> - where V: 'a + BinarySerializable + Default -{ +pub struct TermStreamerImpl<'a> { cursor: &'a [u8], - current_key: Vec, - current_value: V, + term_delta_decoder: TermDeltaDecoder, + term_info_decoder: TermInfoDeltaDecoder, } -impl<'a, V: BinarySerializable> TermStreamerImpl<'a, V> - where V: 'a + BinarySerializable + Default -{ - pub(crate) fn extract_value(self) -> V { - self.current_value - } -} -fn deserialize_vint(data: &mut &[u8]) -> u64 { - let mut res = 0; - let mut shift = 0; - for i in 0.. 
{ - let b = data[i]; - res |= ((b % 128u8) as u64) << shift; - if b & 128u8 != 0u8 { - *data = &data[(i + 1)..]; - break; - } - shift += 7; - } - res -} -impl<'a, V> TermStreamer for TermStreamerImpl<'a, V> - where V: BinarySerializable + Default -{ +impl<'a> TermStreamer for TermStreamerImpl<'a> { fn advance(&mut self) -> bool { if self.cursor.is_empty() { return false; } - let common_length: usize = deserialize_vint(&mut self.cursor) as usize; - self.current_key.truncate(common_length); - let added_length: usize = deserialize_vint(&mut self.cursor) as usize; - self.current_key.extend(&self.cursor[..added_length]); - - self.cursor = &self.cursor[added_length..]; - self.current_value = - V::deserialize(&mut self.cursor) - .expect("Term dictionary corrupted. Failed to deserialize a value"); + let mut cursor: &[u8] = &self.cursor; + let code: u8 = cursor[0]; + cursor = self.term_delta_decoder.decode(code, &cursor[1..]); + cursor = self.term_info_decoder.decode(code, cursor); + self.cursor = cursor; true } fn key(&self) -> &[u8] { - &self.current_key + self.term_delta_decoder.term() } - fn value(&self) -> &V { - &self.current_value + fn value(&self) -> &TermInfo { + &self.term_info_decoder.term_info() } } diff --git a/src/termdict/streamdict/termdict.rs b/src/termdict/streamdict/termdict.rs index 5759ce1e2..f0f7c618f 100644 --- a/src/termdict/streamdict/termdict.rs +++ b/src/termdict/streamdict/termdict.rs @@ -1,46 +1,54 @@ #![allow(should_implement_trait)] use std::io::{self, Write}; +use super::CheckPoint; use fst; + use fst::raw::Fst; -use common::VInt; use directory::ReadOnlySource; use common::BinarySerializable; -use std::marker::PhantomData; use common::CountingWriter; -use std::cmp::Ordering; use postings::TermInfo; +use schema::FieldType; +use super::{TermDeltaEncoder, TermInfoDeltaEncoder, DeltaTermInfo}; use fst::raw::Node; -use super::streamer::stream_before; use termdict::{TermDictionary, TermDictionaryBuilder, TermStreamer}; use super::{TermStreamerImpl, TermStreamerBuilderImpl}; +use termdict::TermStreamerBuilder; +use std::mem::transmute; -const BLOCK_SIZE: usize = 1024; +const PADDING_SIZE: usize = 4; +const INDEX_INTERVAL: usize = 1024; fn convert_fst_error(e: fst::Error) -> io::Error { io::Error::new(io::ErrorKind::Other, e) } -/// See [`TermDictionaryBuilder`](./trait.TermDictionaryBuilder.html) -pub struct TermDictionaryBuilderImpl - where W: Write, - V: BinarySerializable + Default -{ - write: CountingWriter, - block_index: fst::MapBuilder>, - last_key: Vec, - len: usize, - _phantom_: PhantomData, +fn has_positions(field_type: &FieldType) -> bool { + match *field_type { + FieldType::Str(ref text_options) => { + let indexing_options = text_options.get_indexing_options(); + if indexing_options.is_position_enabled() { + true + } else { + false + } + } + _ => false, + } } -fn common_prefix_length(left: &[u8], right: &[u8]) -> usize { - left.iter() - .cloned() - .zip(right.iter().cloned()) - .take_while(|&(b1, b2)| b1 == b2) - .count() +/// See [`TermDictionaryBuilder`](./trait.TermDictionaryBuilder.html) +pub struct TermDictionaryBuilderImpl { + write: CountingWriter, + term_delta_encoder: TermDeltaEncoder, + term_info_encoder: TermInfoDeltaEncoder, + block_index: fst::MapBuilder>, + checkpoints: Vec, + len: usize, } + fn fill_last<'a>(fst: &'a Fst, mut node: Node<'a>, buffer: &mut Vec) { while let Some(transition) = node.transitions().last() { buffer.push(transition.inp); @@ -48,14 +56,32 @@ fn fill_last<'a>(fst: &'a Fst, mut node: Node<'a>, buffer: &mut Vec) { } } -impl 
TermDictionaryBuilderImpl - where W: Write, - V: BinarySerializable + Default +impl TermDictionaryBuilderImpl +where + W: Write, { fn add_index_entry(&mut self) { + let stream_offset = self.write.written_bytes() as u32; + let term_info = self.term_info_encoder.term_info(); + let postings_offset = term_info.postings_offset as u32; + let positions_offset = term_info.positions_offset as u32; + let checkpoint = CheckPoint { + stream_offset: stream_offset, + postings_offset: postings_offset, + positions_offset: positions_offset, + }; self.block_index - .insert(&self.last_key, self.write.written_bytes() as u64) - .unwrap(); + .insert( + &self.term_delta_encoder.term(), + self.checkpoints.len() as u64, + ) + .expect( + "Serializing fst on a Vec should never fail. \ + Where your terms not in order maybe?", + ); + checkpoint.serialize(&mut self.checkpoints).expect( + "Serializing checkpoint on a Vec should never fail.", + ); } /// # Warning @@ -66,59 +92,131 @@ impl TermDictionaryBuilderImpl /// /// Prefer using `.insert(key, value)` pub(crate) fn insert_key(&mut self, key: &[u8]) -> io::Result<()> { - if self.len % BLOCK_SIZE == 0 { + if self.len % INDEX_INTERVAL == 0 { self.add_index_entry(); } - self.len += 1; - let common_len = common_prefix_length(key, &self.last_key); - VInt(common_len as u64).serialize(&mut self.write)?; - self.last_key.truncate(common_len); - self.last_key.extend_from_slice(&key[common_len..]); - VInt((key.len() - common_len) as u64) - .serialize(&mut self.write)?; - self.write.write_all(&key[common_len..])?; + self.term_delta_encoder.encode(key); Ok(()) } - pub(crate) fn insert_value(&mut self, value: &V) -> io::Result<()> { - value.serialize(&mut self.write)?; + pub(crate) fn insert_value(&mut self, term_info: &TermInfo) -> io::Result<()> { + let delta_term_info = self.term_info_encoder.encode(term_info.clone()); + let (prefix_len, suffix) = self.term_delta_encoder.prefix_suffix(); + write_term_kv( + prefix_len, + suffix, + &delta_term_info, + self.term_info_encoder.has_positions, + &mut self.write, + )?; + self.len += 1; Ok(()) } } -impl TermDictionaryBuilder for TermDictionaryBuilderImpl - where W: Write, - V: BinarySerializable + Default +fn num_bytes_required(mut n: u32) -> u8 { + for i in 1u8..5u8 { + if n < 256u32 { + return i; + } else { + n /= 256; + } + } + 0u8 +} + +fn write_term_kv( + prefix_len: usize, + suffix: &[u8], + delta_term_info: &DeltaTermInfo, + has_positions: bool, + write: &mut W, +) -> io::Result<()> { + let suffix_len = suffix.len(); + let mut code = 0u8; + let num_bytes_docfreq = num_bytes_required(delta_term_info.doc_freq); + let num_bytes_postings_offset = num_bytes_required(delta_term_info.delta_postings_offset); + let num_bytes_positions_offset = num_bytes_required(delta_term_info.delta_positions_offset); + code |= (num_bytes_docfreq - 1) << 1u8; + code |= (num_bytes_postings_offset - 1) << 3u8; + code |= (num_bytes_positions_offset - 1) << 5u8; + if (prefix_len < 16) && (suffix_len < 16) { + code |= 1u8; + write.write_all( + &[ + code, + (prefix_len as u8) | ((suffix_len as u8) << 4u8), + ], + )?; + } else { + write.write_all(&[code])?; + (prefix_len as u32).serialize(write)?; + (suffix_len as u32).serialize(write)?; + } + write.write_all(suffix)?; + { + let bytes: [u8; 4] = unsafe { transmute(delta_term_info.doc_freq) }; + write.write_all(&bytes[0..num_bytes_docfreq as usize])?; + } + { + let bytes: [u8; 4] = unsafe { transmute(delta_term_info.delta_postings_offset) }; + write.write_all( + &bytes[0..num_bytes_postings_offset as usize], + 
)?; + } + if has_positions { + let bytes: [u8; 4] = unsafe { transmute(delta_term_info.delta_positions_offset) }; + write.write_all( + &bytes[0..num_bytes_positions_offset as usize], + )?; + write.write_all(&[delta_term_info.positions_inner_offset])?; + } + Ok(()) + +} + +impl TermDictionaryBuilder for TermDictionaryBuilderImpl +where + W: Write, { /// Creates a new `TermDictionaryBuilder` - fn new(write: W) -> io::Result { - let buffer: Vec = vec![]; + fn new(mut write: W, field_type: FieldType) -> io::Result { + let has_positions = has_positions(&field_type); + let has_positions_code = if has_positions { 255u8 } else { 0u8 }; + write.write_all(&[has_positions_code])?; Ok(TermDictionaryBuilderImpl { - write: CountingWriter::wrap(write), - block_index: fst::MapBuilder::new(buffer).expect("This cannot fail"), - last_key: Vec::with_capacity(128), - len: 0, - _phantom_: PhantomData, - }) + write: CountingWriter::wrap(write), + term_delta_encoder: TermDeltaEncoder::default(), + term_info_encoder: TermInfoDeltaEncoder::new(has_positions), + block_index: fst::MapBuilder::new(vec![]).expect("This cannot fail"), + checkpoints: vec![], + len: 0, + }) } /// Inserts a `(key, value)` pair in the term dictionary. /// /// *Keys have to be inserted in order.* - fn insert>(&mut self, key_ref: K, value: &V) -> io::Result<()> { + fn insert>(&mut self, key_ref: K, value: &TermInfo) -> io::Result<()> { let key = key_ref.as_ref(); self.insert_key(key)?; - self.insert_value(value) + self.insert_value(value)?; + Ok(()) } /// Finalize writing the builder, and returns the underlying /// `Write` object. fn finish(mut self) -> io::Result { self.add_index_entry(); - let (mut w, split_len) = self.write.finish()?; + self.write.write_all(&[0u8; PADDING_SIZE])?; + let fst_addr = self.write.written_bytes(); let fst_write = self.block_index.into_inner().map_err(convert_fst_error)?; - w.write_all(&fst_write)?; - (split_len as u64).serialize(&mut w)?; + self.write.write_all(&fst_write)?; + let check_points_addr = self.write.written_bytes(); + let (mut w, _) = self.write.finish()?; + w.write_all(&self.checkpoints)?; + (fst_addr as u64).serialize(&mut w)?; + (check_points_addr as u64).serialize(&mut w)?; w.flush()?; Ok(w) } @@ -126,34 +224,37 @@ impl TermDictionaryBuilder for TermDictionaryBuilderImpl fn open_fst_index(source: ReadOnlySource) -> io::Result { - Ok(fst::Map::from(match source { - ReadOnlySource::Anonymous(data) => { - try!(Fst::from_shared_bytes(data.data, data.start, data.len) - .map_err(convert_fst_error)) - } - ReadOnlySource::Mmap(mmap_readonly) => { - try!(Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)) - } - })) + use self::ReadOnlySource::*; + let fst_result = match source { + Anonymous(data) => Fst::from_shared_bytes(data.data, data.start, data.len), + Mmap(mmap_readonly) => Fst::from_mmap(mmap_readonly), + }; + let fst = fst_result.map_err(convert_fst_error)?; + Ok(fst::Map::from(fst)) } /// See [`TermDictionary`](./trait.TermDictionary.html) -pub struct TermDictionaryImpl - where V: BinarySerializable + Default -{ +pub struct TermDictionaryImpl { stream_data: ReadOnlySource, fst_index: fst::Map, - _phantom_: PhantomData, + checkpoints_data: ReadOnlySource, + has_positions: bool, } -impl TermDictionaryImpl - where V: BinarySerializable + Default -{ +impl TermDictionaryImpl { pub(crate) fn stream_data(&self) -> &[u8] { self.stream_data.as_slice() } - pub(crate) fn strictly_previous_key(&self, key: &[u8]) -> (Vec, u64) { + pub(crate) fn strictly_previous_key(&self, key: &[u8]) -> (Vec, 
CheckPoint) { + let (term, checkpoint_offset) = self.strictly_previous_key_checkpoint_offset(key); + let mut checkpoint_data = &self.checkpoints_data.as_slice()[checkpoint_offset..]; + let checkpoint = + CheckPoint::deserialize(&mut checkpoint_data).expect("Checkpoint data is corrupted"); + (term, checkpoint) + } + + fn strictly_previous_key_checkpoint_offset(&self, key: &[u8]) -> (Vec, usize) { let fst_map = &self.fst_index; let fst = fst_map.as_fst(); let mut node = fst.root(); @@ -186,12 +287,12 @@ impl TermDictionaryImpl result.push(last_transition.inp); let fork_node = fst.node(last_transition.addr); fill_last(fst, fork_node, &mut result); - let val = fst_map.get(&result).unwrap(); + let val = fst_map.get(&result).expect("Fst data corrupted") as usize; return (result, val); } else if cur_node.is_final() { // the previous key is a prefix let result_buffer = Vec::from(&key[..i]); - let val = fst_map.get(&result_buffer).unwrap(); + let val = fst_map.get(&result_buffer).expect("Fst data corrupted") as usize; return (result_buffer, val); } } @@ -200,51 +301,70 @@ impl TermDictionaryImpl } -impl<'a, V> TermDictionary<'a, V> for TermDictionaryImpl - where V: BinarySerializable + Default + 'a -{ - type Streamer = TermStreamerImpl<'a, V>; - type StreamBuilder = TermStreamerBuilderImpl<'a, V>; +impl<'a> TermDictionary<'a> for TermDictionaryImpl { + type Streamer = TermStreamerImpl<'a>; + + type StreamBuilder = TermStreamerBuilderImpl<'a>; /// Opens a `TermDictionary` given a data source. - fn from_source(source: ReadOnlySource) -> io::Result { - let total_len = source.len(); - let length_offset = total_len - 8; - let split_len: usize = { - let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..]; - u64::deserialize(&mut split_len_buffer)? as usize - }; - let stream_data = source.slice(0, split_len); - let fst_data = source.slice(split_len, length_offset); - let fst_index = open_fst_index(fst_data)?; + fn from_source(mut source: ReadOnlySource) -> Self { + let has_positions = source.slice(0, 1)[0] == 255u8; + source = source.slice_from(1); - Ok(TermDictionaryImpl { - stream_data: stream_data, - fst_index: fst_index, - _phantom_: PhantomData, - }) + let total_len = source.len(); + let (body, footer) = source.split(total_len - 16); + + let mut footer_buffer: &[u8] = footer.as_slice(); + let fst_addr = u64::deserialize(&mut footer_buffer).expect( + "deserializing 8 byte should never fail", + ) as usize; + let checkpoints_addr = u64::deserialize(&mut footer_buffer).expect( + "deserializing 8 byte should never fail", + ) as usize; + + let stream_data = body.slice(0, fst_addr - PADDING_SIZE); + let fst_data = body.slice(fst_addr, checkpoints_addr); + let checkpoints_data = body.slice_from(checkpoints_addr); + + let fst_index = open_fst_index(fst_data).expect("Index FST data corrupted"); + + TermDictionaryImpl { + has_positions: has_positions, + stream_data: stream_data, + checkpoints_data: checkpoints_data, + fst_index: fst_index, + } } /// Lookups the value corresponding to the key. 
- fn get>(&self, target_key: K) -> Option { - let mut streamer = stream_before(self, target_key.as_ref()); - while streamer.advance() { - let position = streamer.key().cmp(target_key.as_ref()); - match position { - Ordering::Less => {} - Ordering::Equal => return Some(streamer.extract_value()), - Ordering::Greater => { - return None; - } - } + fn get>(&self, target_key: K) -> Option { + let mut streamer = self.range().ge(&target_key).into_stream(); + if streamer.advance() && streamer.key() == target_key.as_ref() { + Some(streamer.value().clone()) + } else { + None } - None } /// Returns a range builder, to stream all of the terms /// within an interval. fn range(&'a self) -> Self::StreamBuilder { - Self::StreamBuilder::new(self) + Self::StreamBuilder::new(self, self.has_positions) + } +} + + +#[cfg(test)] +mod tests { + use super::num_bytes_required; + + #[test] + fn test_num_bytes_required() { + assert_eq!(num_bytes_required(0), 1); + assert_eq!(num_bytes_required(1), 1); + assert_eq!(num_bytes_required(255), 1); + assert_eq!(num_bytes_required(256), 2); + assert_eq!(num_bytes_required(u32::max_value()), 4); } }
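For readers following the new streamdict byte layout: the sketch below re-derives the one-byte header that `write_term_kv` emits for a single entry. It copies `num_bytes_required` verbatim; the concrete term lengths, deltas, and the `main` wrapper are made up for illustration.

```rust
/// Copy of `num_bytes_required` from `src/termdict/streamdict/termdict.rs`:
/// the number of little-endian bytes needed to represent `n`.
fn num_bytes_required(mut n: u32) -> u8 {
    for i in 1u8..5u8 {
        if n < 256u32 {
            return i;
        } else {
            n /= 256;
        }
    }
    0u8
}

fn main() {
    // Entry following the previous term "abcde" with new term "abcdef":
    // shared prefix of 5 bytes, 1-byte suffix "f".
    let (prefix_len, suffix): (usize, &[u8]) = (5, b"f");
    let (doc_freq, delta_postings_offset, delta_positions_offset) = (3u32, 1_024u32, 70_000u32);

    // Header layout used by `write_term_kv` / decoded by the delta decoders:
    //   bit 0      -> 1 if prefix_len and suffix_len both fit in 4 bits
    //   bits 1..=2 -> num_bytes(doc_freq) - 1
    //   bits 3..=4 -> num_bytes(delta_postings_offset) - 1
    //   bits 5..=6 -> num_bytes(delta_positions_offset) - 1
    let mut code = 0u8;
    code |= (num_bytes_required(doc_freq) - 1) << 1; // 1 byte  -> 00
    code |= (num_bytes_required(delta_postings_offset) - 1) << 3; // 2 bytes -> 01
    code |= (num_bytes_required(delta_positions_offset) - 1) << 5; // 3 bytes -> 10
    if prefix_len < 16 && suffix.len() < 16 {
        // Both lengths are packed into one extra byte: prefix in the low
        // nibble, suffix in the high nibble.
        code |= 1;
    }
    assert_eq!(code, 0b0100_1001);
    // After the header (and length byte) come the suffix bytes, then the
    // doc_freq / delta offsets truncated to the advertised byte widths,
    // and finally `positions_inner_offset` when positions are stored.
    println!("header byte = {:#010b}", code);
}
```

On the read side, `TermInfoDeltaDecoder::decode` pulls these truncated integers back out with a single unaligned load masked by `make_mask`, which is presumably why the builder writes `PADDING_SIZE` zero bytes between the stream and the fst block.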