From 09e27740e2611df32b483046cc1c900daa1189c6 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 14 Jun 2017 18:28:30 +0900 Subject: [PATCH 01/29] Added fill_buffer in DocSet --- src/common/bitpacker.rs | 2 +- src/fastfield/reader.rs | 11 ++++------- src/postings/docset.rs | 12 ++++++++++++ src/query/query.rs | 2 -- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/common/bitpacker.rs b/src/common/bitpacker.rs index 49ea9f9e6..7d7aeb23c 100644 --- a/src/common/bitpacker.rs +++ b/src/common/bitpacker.rs @@ -15,7 +15,7 @@ use std::ops::Deref; /// reasons, we want to ensure that a value spawns over at most 8 bytes /// of aligns bytes. /// -/// Spawning over 9 bytes is possible for instance, if we do +/// Spanning over 9 bytes is possible for instance, if we do /// bitpacking with an amplitude of 63 bits. /// In this case, the second int will start on bit /// 63 (which belongs to byte 7) and ends at byte 15; diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index aae1dd797..06190b111 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -1,20 +1,17 @@ use std::io; use std::collections::HashMap; use directory::ReadOnlySource; -use common::BinarySerializable; +use common::{self, BinarySerializable}; +use common::bitpacker::{compute_num_bits, BitUnpacker}; use DocId; use schema::{Field, SchemaBuilder}; use std::path::Path; use schema::FAST; use directory::{WritePtr, RAMDirectory, Directory}; -use fastfield::FastFieldSerializer; -use fastfield::FastFieldsWriter; -use common::bitpacker::compute_num_bits; -use common::bitpacker::BitUnpacker; +use fastfield::{FastFieldSerializer, FastFieldsWriter}; use schema::FieldType; use error::ResultExt; use std::mem; -use common; use owning_ref::OwningRef; /// Trait for accessing a fastfield. @@ -212,7 +209,7 @@ impl FastFieldReader for I64FastFieldReader { let output_u64: &mut [u64] = unsafe { mem::transmute(output) }; self.underlying.get_range(start, output_u64); for mut_val in output_u64.iter_mut() { - *mut_val ^= 1 << 63; + *mut_val = common::u64_to_i64(*mut_val as u64) as u64; } } diff --git a/src/postings/docset.rs b/src/postings/docset.rs index ea4211a5f..22fa3d9f3 100644 --- a/src/postings/docset.rs +++ b/src/postings/docset.rs @@ -52,6 +52,18 @@ pub trait DocSet { } } + fn fill_buffer(&mut self, buffer: &mut [DocId]) -> bool { + for buffer_val in buffer.iter_mut() { + if self.advance() { + *buffer_val = self.doc(); + } + else { + return false; + } + } + return true; + } + /// Returns the current document fn doc(&self) -> DocId; diff --git a/src/query/query.rs b/src/query/query.rs index f091442a8..683281dc6 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -58,10 +58,8 @@ pub trait Query: fmt::Debug { /// - iterate throw the matched documents and push them to the collector. 
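As a usage sketch of the `fill_buffer` default method added to `DocSet` above (the `drain` helper below is hypothetical and not part of the patch):

use postings::DocSet;

// Hypothetical helper: drain a doc set in fixed-size chunks of 128 doc ids.
fn drain<D: DocSet>(docset: &mut D) {
    let mut buffer = [0u32; 128];
    // As defined above, `fill_buffer` returns true as long as the whole buffer
    // could be filled; false means the doc set was exhausted mid-buffer (this
    // version does not report how many of the entries are valid).
    while docset.fill_buffer(&mut buffer) {
        // process a full buffer of 128 doc ids here
    }
}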
/// fn search(&self, searcher: &Searcher, collector: &mut Collector) -> Result { - let mut timer_tree = TimerTree::default(); let weight = try!(self.weight(searcher)); - { let mut search_timer = timer_tree.open("search"); for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() { From 4fe96483bcef95d3d5f3cdc387c32bb1d3c3783e Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 14 Jun 2017 23:32:58 +0900 Subject: [PATCH 02/29] fill_buffer --- src/postings/docset.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/postings/docset.rs b/src/postings/docset.rs index 22fa3d9f3..219a85dcb 100644 --- a/src/postings/docset.rs +++ b/src/postings/docset.rs @@ -52,16 +52,16 @@ pub trait DocSet { } } - fn fill_buffer(&mut self, buffer: &mut [DocId]) -> bool { - for buffer_val in buffer.iter_mut() { + fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize { + for (i, buffer_val) in buffer.iter_mut().enumerate() { if self.advance() { *buffer_val = self.doc(); } else { - return false; + return i; } } - return true; + return buffer.len(); } /// Returns the current document From aff7e64d4ed718b4c012a261f74d0b6a6df419b7 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 4 Aug 2017 10:28:59 +0900 Subject: [PATCH 03/29] test --- Cargo.toml | 2 +- src/compression/composite.rs | 170 ------------------ src/compression/mod.rs | 4 - .../pack/compression_pack_nosimd.rs | 2 +- src/core/segment_reader.rs | 2 +- src/indexer/merger.rs | 28 +-- src/postings/freq_handler.rs | 59 +++--- src/postings/postings.rs | 20 +++ src/postings/segment_postings.rs | 5 + src/postings/serializer.rs | 85 ++++++--- src/postings/term_info.rs | 21 ++- src/postings/vec_postings.rs | 5 + 12 files changed, 141 insertions(+), 262 deletions(-) delete mode 100644 src/compression/composite.rs diff --git a/Cargo.toml b/Cargo.toml index 952dc55d7..845f1d31d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tantivy" -version = "0.4.3" +version = "0.5.0-dev" authors = ["Paul Masurel "] build = "build.rs" license = "MIT" diff --git a/src/compression/composite.rs b/src/compression/composite.rs deleted file mode 100644 index c363860ee..000000000 --- a/src/compression/composite.rs +++ /dev/null @@ -1,170 +0,0 @@ -use super::{BlockEncoder, BlockDecoder}; -use super::NUM_DOCS_PER_BLOCK; -use compression::{VIntEncoder, VIntDecoder}; - -pub struct CompositeEncoder { - block_encoder: BlockEncoder, - output: Vec, -} - -impl CompositeEncoder { - pub fn new() -> CompositeEncoder { - CompositeEncoder { - block_encoder: BlockEncoder::new(), - output: Vec::with_capacity(500_000), - } - } - - pub fn compress_sorted(&mut self, vals: &[u32]) -> &[u8] { - self.output.clear(); - let num_blocks = vals.len() / NUM_DOCS_PER_BLOCK; - let mut offset = 0u32; - for i in 0..num_blocks { - let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK..(i + 1) * NUM_DOCS_PER_BLOCK]; - let block_compressed = self.block_encoder.compress_block_sorted(vals_slice, offset); - offset = vals_slice[NUM_DOCS_PER_BLOCK - 1]; - self.output.extend_from_slice(block_compressed); - } - let vint_compressed = - self.block_encoder - .compress_vint_sorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..], offset); - self.output.extend_from_slice(vint_compressed); - &self.output - } - - pub fn compress_unsorted(&mut self, vals: &[u32]) -> &[u8] { - self.output.clear(); - let num_blocks = vals.len() / NUM_DOCS_PER_BLOCK; - for i in 0..num_blocks { - let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK..(i + 1) * NUM_DOCS_PER_BLOCK]; - let 
block_compressed = self.block_encoder.compress_block_unsorted(vals_slice); - self.output.extend_from_slice(block_compressed); - } - let vint_compressed = self.block_encoder - .compress_vint_unsorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..]); - self.output.extend_from_slice(vint_compressed); - &self.output - } -} - - -pub struct CompositeDecoder { - block_decoder: BlockDecoder, - vals: Vec, -} - - -impl CompositeDecoder { - pub fn new() -> CompositeDecoder { - CompositeDecoder { - block_decoder: BlockDecoder::new(), - vals: Vec::with_capacity(500_000), - } - } - - pub fn uncompress_sorted(&mut self, - mut compressed_data: &[u8], - uncompressed_len: usize) - -> &[u32] { - if uncompressed_len > self.vals.capacity() { - let extra_capacity = uncompressed_len - self.vals.capacity(); - self.vals.reserve(extra_capacity); - } - let mut offset = 0u32; - self.vals.clear(); - let num_blocks = uncompressed_len / NUM_DOCS_PER_BLOCK; - for _ in 0..num_blocks { - compressed_data = self.block_decoder - .uncompress_block_sorted(compressed_data, offset); - offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1); - self.vals - .extend_from_slice(self.block_decoder.output_array()); - } - self.block_decoder - .uncompress_vint_sorted(compressed_data, - offset, - uncompressed_len % NUM_DOCS_PER_BLOCK); - self.vals - .extend_from_slice(self.block_decoder.output_array()); - &self.vals - } - - pub fn uncompress_unsorted(&mut self, - mut compressed_data: &[u8], - uncompressed_len: usize) - -> &[u32] { - self.vals.clear(); - let num_blocks = uncompressed_len / NUM_DOCS_PER_BLOCK; - for _ in 0..num_blocks { - compressed_data = self.block_decoder - .uncompress_block_unsorted(compressed_data); - self.vals - .extend_from_slice(self.block_decoder.output_array()); - } - self.block_decoder - .uncompress_vint_unsorted(compressed_data, uncompressed_len % NUM_DOCS_PER_BLOCK); - self.vals - .extend_from_slice(self.block_decoder.output_array()); - &self.vals - } -} - -impl Into> for CompositeDecoder { - fn into(self) -> Vec { - self.vals - } -} - - -#[cfg(test)] -pub mod tests { - - use test::Bencher; - use super::*; - use tests; - - #[test] - fn test_composite_unsorted() { - let data = tests::generate_array(10_000, 0.1); - let mut encoder = CompositeEncoder::new(); - let compressed = encoder.compress_unsorted(&data); - assert!(compressed.len() <= 19_794); - let mut decoder = CompositeDecoder::new(); - let result = decoder.uncompress_unsorted(&compressed, data.len()); - for i in 0..data.len() { - assert_eq!(data[i], result[i]); - } - } - - #[test] - fn test_composite_sorted() { - let data = tests::generate_array(10_000, 0.1); - let mut encoder = CompositeEncoder::new(); - let compressed = encoder.compress_sorted(&data); - assert!(compressed.len() <= 7_826); - let mut decoder = CompositeDecoder::new(); - let result = decoder.uncompress_sorted(&compressed, data.len()); - for i in 0..data.len() { - assert_eq!(data[i], result[i]); - } - } - - - const BENCH_NUM_INTS: usize = 99_968; - - #[bench] - fn bench_compress(b: &mut Bencher) { - let mut encoder = CompositeEncoder::new(); - let data = tests::generate_array(BENCH_NUM_INTS, 0.1); - b.iter(|| { encoder.compress_sorted(&data); }); - } - - #[bench] - fn bench_uncompress(b: &mut Bencher) { - let mut encoder = CompositeEncoder::new(); - let data = tests::generate_array(BENCH_NUM_INTS, 0.1); - let compressed = encoder.compress_sorted(&data); - let mut decoder = CompositeDecoder::new(); - b.iter(|| { decoder.uncompress_sorted(compressed, BENCH_NUM_INTS); }); - } -} diff --git 
a/src/compression/mod.rs b/src/compression/mod.rs index 0c3df4b2f..8a44c24b7 100644 --- a/src/compression/mod.rs +++ b/src/compression/mod.rs @@ -1,10 +1,6 @@ #![allow(dead_code)] -mod composite; -pub use self::composite::{CompositeEncoder, CompositeDecoder}; - - #[cfg(not(feature="simdcompression"))] mod pack { mod compression_pack_nosimd; diff --git a/src/compression/pack/compression_pack_nosimd.rs b/src/compression/pack/compression_pack_nosimd.rs index 4086688d2..1c15567e4 100644 --- a/src/compression/pack/compression_pack_nosimd.rs +++ b/src/compression/pack/compression_pack_nosimd.rs @@ -18,7 +18,7 @@ pub fn compress_sorted(vals: &mut [u32], mut output: &mut [u8], offset: u32) -> local_offset = val; } } - let num_bits = compute_num_bits(max_delta); + let num_bits = compute_num_bits(max_delta as u64); output.write_all(&[num_bits]).unwrap(); let mut bit_packer = BitPacker::new(num_bits as usize); for val in vals { diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index ed155e56b..95c53a6eb 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -240,7 +240,7 @@ impl SegmentReader { SegmentPostingsOption::FreqAndPositions => { let offset = term_info.positions_offset as usize; let offseted_position_data = &self.positions_data[offset..]; - FreqHandler::new_with_freq_and_position(offseted_position_data) + FreqHandler::new_with_freq_and_position(offseted_position_data, term_info.positions_inner_offset) } }; BlockSegmentPostings::from_data(term_info.doc_freq as usize, postings_data, freq_handler) diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index b71774059..86f0a0e78 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -28,29 +28,6 @@ pub struct IndexMerger { } -struct DeltaPositionComputer { - buffer: Vec, -} - -impl DeltaPositionComputer { - fn new() -> DeltaPositionComputer { - DeltaPositionComputer { buffer: vec![0u32; 512] } - } - - fn compute_delta_positions(&mut self, positions: &[u32]) -> &[u32] { - if positions.len() > self.buffer.len() { - self.buffer.resize(positions.len(), 0u32); - } - let mut last_pos = 0u32; - for (i, position) in positions.iter().cloned().enumerate() { - self.buffer[i] = position - last_pos; - last_pos = position; - } - &self.buffer[..positions.len()] - } -} - - fn compute_min_max_val(u64_reader: &U64FastFieldReader, max_doc: DocId, delete_bitset: &DeleteBitSet) @@ -193,7 +170,6 @@ impl IndexMerger { fn write_postings(&self, serializer: &mut PostingsSerializer) -> Result<()> { let mut merged_terms = TermMerger::from(&self.readers[..]); - let mut delta_position_computer = DeltaPositionComputer::new(); let mut max_doc = 0; @@ -294,9 +270,7 @@ impl IndexMerger { old_to_new_doc_id[segment_postings.doc() as usize] { // we make sure to only write the term iff // there is at least one document. 
- let delta_positions: &[u32] = - delta_position_computer - .compute_delta_positions(segment_postings.positions()); + let delta_positions: &[u32] = segment_postings.delta_positions(); let term_freq = segment_postings.term_freq(); serializer .write_doc(remapped_doc_id, term_freq, delta_positions)?; diff --git a/src/postings/freq_handler.rs b/src/postings/freq_handler.rs index f1e3f256c..9bc6fb49e 100644 --- a/src/postings/freq_handler.rs +++ b/src/postings/freq_handler.rs @@ -1,37 +1,26 @@ use compression::BlockDecoder; -use common::VInt; -use common::BinarySerializable; -use compression::{CompositeDecoder, VIntDecoder}; +use compression::VIntDecoder; use postings::SegmentPostingsOption; use compression::NUM_DOCS_PER_BLOCK; - +use std::cell::UnsafeCell; /// `FreqHandler` is in charge of decompressing /// frequencies and/or positions. pub struct FreqHandler { freq_decoder: BlockDecoder, - positions: Vec, + positions: UnsafeCell>, option: SegmentPostingsOption, positions_offsets: [usize; NUM_DOCS_PER_BLOCK + 1], } -fn read_positions(data: &[u8]) -> Vec { - let mut composite_reader = CompositeDecoder::new(); - let mut readable: &[u8] = data; - let uncompressed_len = VInt::deserialize(&mut readable).unwrap().0 as usize; - composite_reader.uncompress_unsorted(readable, uncompressed_len); - composite_reader.into() -} - - impl FreqHandler { /// Returns a `FreqHandler` that just decodes `DocId`s. pub fn new_without_freq() -> FreqHandler { FreqHandler { freq_decoder: BlockDecoder::with_val(1u32), - positions: Vec::new(), + positions: UnsafeCell::new(Vec::with_capacity(0)), option: SegmentPostingsOption::NoFreq, positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1], } @@ -41,23 +30,23 @@ impl FreqHandler { pub fn new_with_freq() -> FreqHandler { FreqHandler { freq_decoder: BlockDecoder::new(), - positions: Vec::new(), + positions: UnsafeCell::new(Vec::with_capacity(0)), option: SegmentPostingsOption::Freq, positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1], } } /// Returns a `FreqHandler` that decodes `DocId`s, term frequencies, and term positions. - pub fn new_with_freq_and_position(position_data: &[u8]) -> FreqHandler { - let positions = read_positions(position_data); + pub fn new_with_freq_and_position(position_data: &[u8], within_block_offset: u8) -> FreqHandler { FreqHandler { freq_decoder: BlockDecoder::new(), - positions: positions, + positions: UnsafeCell::new(Vec::with_capacity(NUM_DOCS_PER_BLOCK)), option: SegmentPostingsOption::FreqAndPositions, positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1], } } + /* fn fill_positions_offset(&mut self) { let mut cur_position: usize = self.positions_offsets[NUM_DOCS_PER_BLOCK]; let mut i: usize = 0; @@ -75,7 +64,7 @@ impl FreqHandler { self.positions_offsets[i] = cur_position; last_cur_position = cur_position; } - } + }*/ /// Accessor to term frequency @@ -91,11 +80,31 @@ impl FreqHandler { /// idx is the offset of the current doc in the block. /// It takes value between 0 and 128. pub fn positions(&self, idx: usize) -> &[u32] { - let start = self.positions_offsets[idx]; - let stop = self.positions_offsets[idx + 1]; - &self.positions[start..stop] + //unsafe { &self.positions.get() } + println!("fix positions"); + self.delta_positions(idx) + } + /// Accessor to the delta positions. + /// Delta positions is simply the difference between + /// two consecutive positions. + /// The first delta position is the first position of the + /// term in the document. 
+ /// + /// For instance, if positions are `[7,13,17]` + /// then delta positions `[7, 6, 4]` + /// + /// idx is the offset of the current doc in the docid/freq block. + /// It takes value between 0 and 128. + pub fn delta_positions(&self, idx: usize) -> &[u32] { + let freq = self.freq(idx); + let positions: &mut Vec = unsafe { &mut *self.positions.get() }; + positions.resize(freq as usize, 0u32); + &positions[..] + } + + /// Decompresses a complete frequency block pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] { match self.option { @@ -103,7 +112,7 @@ impl FreqHandler { SegmentPostingsOption::Freq => self.freq_decoder.uncompress_block_unsorted(data), SegmentPostingsOption::FreqAndPositions => { let remaining: &'a [u8] = self.freq_decoder.uncompress_block_unsorted(data); - self.fill_positions_offset(); + // self.fill_positions_offset(); remaining } } @@ -118,7 +127,7 @@ impl FreqHandler { } SegmentPostingsOption::FreqAndPositions => { self.freq_decoder.uncompress_vint_unsorted(data, num_els); - self.fill_positions_offset(); + // self.fill_positions_offset(); } } } diff --git a/src/postings/postings.rs b/src/postings/postings.rs index 52f16198a..29538e0d2 100644 --- a/src/postings/postings.rs +++ b/src/postings/postings.rs @@ -17,6 +17,16 @@ pub trait Postings: DocSet { /// Returns the list of positions of the term, expressed as a list of /// token ordinals. fn positions(&self) -> &[u32]; + /// Return the list of delta positions. + /// + /// Delta positions is simply the difference between + /// two consecutive positions. + /// The first delta position is the first position of the + /// term in the document. + /// + /// For instance, if positions are `[7,13,17]` + /// then delta positions `[7, 6, 4]` + fn delta_positions(&self) -> &[u32]; } impl Postings for Box { @@ -29,6 +39,11 @@ impl Postings for Box { let unboxed: &TPostings = self.borrow(); unboxed.positions() } + + fn delta_positions(&self) -> &[u32] { + let unboxed: &TPostings = self.borrow(); + unboxed.delta_positions() + } } impl<'a, TPostings: Postings> Postings for &'a mut TPostings { @@ -41,4 +56,9 @@ impl<'a, TPostings: Postings> Postings for &'a mut TPostings { let unref: &TPostings = *self; unref.positions() } + + fn delta_positions(&self) -> &[u32] { + let unref: &TPostings = *self; + unref.delta_positions() + } } diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index ff283f24f..d89b2aec0 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -179,6 +179,11 @@ impl<'a> Postings for SegmentPostings<'a> { fn positions(&self) -> &[u32] { self.block_cursor.freq_handler().positions(self.cur) } + + fn delta_positions(&self) -> &[u32] { + self.block_cursor.freq_handler().delta_positions(self.cur) + } + } /// `BlockSegmentPostings` is a cursor iterating over blocks diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 8c6c4c1c9..34ba47382 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -7,7 +7,7 @@ use schema::FieldType; use schema::Schema; use schema::TextIndexingOptions; use directory::WritePtr; -use compression::{NUM_DOCS_PER_BLOCK, BlockEncoder, CompositeEncoder}; +use compression::{NUM_DOCS_PER_BLOCK, BlockEncoder}; use DocId; use core::Segment; use std::io::{self, Write}; @@ -16,6 +16,7 @@ use common::VInt; use common::BinarySerializable; use common::CountingWriter; use termdict::TermDictionaryBuilder; +use datastruct::{SkipList, SkipListBuilder}; /// `PostingsSerializer` is in charge of 
serializing @@ -52,19 +53,64 @@ use termdict::TermDictionaryBuilder; pub struct PostingsSerializer { terms_fst_builder: TermDictionaryBuilderImpl, postings_write: CountingWriter, - positions_write: CountingWriter, last_doc_id_encoded: u32, - positions_encoder: CompositeEncoder, + positions_writer: PositionWriter, block_encoder: BlockEncoder, doc_ids: Vec, term_freqs: Vec, - position_deltas: Vec, schema: Schema, text_indexing_options: TextIndexingOptions, term_open: bool, current_term_info: TermInfo, } +struct PositionWriter { + buffer: Vec, + write: CountingWriter, + block_encoder: BlockEncoder, +} + +impl PositionWriter { + fn new(write: WritePtr) -> PositionWriter { + PositionWriter { + buffer: Vec::with_capacity(NUM_DOCS_PER_BLOCK), + write: CountingWriter::wrap(write), + block_encoder: BlockEncoder::new(), + } + } + + fn addr(&self) -> (u32, u8) { + (self.write.written_bytes() as u32, self.buffer.len() as u8) + } + + fn write_block(&mut self) -> io::Result<()> { + assert_eq!(self.buffer.len(), NUM_DOCS_PER_BLOCK); + let block_compressed: &[u8] = self.block_encoder.compress_block_unsorted(&self.buffer); + self.write.write_all(block_compressed)?; + self.buffer.clear(); + Ok(()) + } + + fn write(&mut self, mut vals: &[u32]) -> io::Result<()> { + let mut buffer_len = self.buffer.len(); + while vals.len() + buffer_len >= NUM_DOCS_PER_BLOCK { + let len_to_completion = NUM_DOCS_PER_BLOCK - buffer_len; + self.buffer.extend_from_slice(&vals[..len_to_completion]); + self.write_block()?; + vals = &vals[len_to_completion..]; + buffer_len = self.buffer.len(); + } + self.buffer.extend_from_slice(&vals); + Ok(()) + } + + fn close(mut self) -> io::Result<()> { + self.buffer.resize(NUM_DOCS_PER_BLOCK, 0u32); + self.write_block()?; + self.write.flush() + } +} + impl PostingsSerializer { /// Open a new `PostingsSerializer` for the given segment pub fn new(terms_write: WritePtr, @@ -72,17 +118,15 @@ impl PostingsSerializer { positions_write: WritePtr, schema: Schema) -> Result { - let terms_fst_builder = try!(TermDictionaryBuilderImpl::new(terms_write)); + let terms_fst_builder = TermDictionaryBuilderImpl::new(terms_write)?; Ok(PostingsSerializer { terms_fst_builder: terms_fst_builder, postings_write: CountingWriter::wrap(postings_write), - positions_write: CountingWriter::wrap(positions_write), + positions_writer: PositionWriter::new(positions_write), last_doc_id_encoded: 0u32, - positions_encoder: CompositeEncoder::new(), block_encoder: BlockEncoder::new(), doc_ids: Vec::new(), term_freqs: Vec::new(), - position_deltas: Vec::new(), schema: schema, text_indexing_options: TextIndexingOptions::Unindexed, term_open: false, @@ -131,11 +175,12 @@ impl PostingsSerializer { self.doc_ids.clear(); self.last_doc_id_encoded = 0; self.term_freqs.clear(); - self.position_deltas.clear(); + let (filepos, offset) = self.positions_writer.addr(); self.current_term_info = TermInfo { doc_freq: 0, postings_offset: self.postings_write.written_bytes() as u32, - positions_offset: self.positions_write.written_bytes() as u32, + positions_offset: filepos, + positions_inner_offset: offset, }; self.terms_fst_builder.insert_key(term) } @@ -172,16 +217,6 @@ impl PostingsSerializer { self.term_freqs.clear(); } } - // On the other hand, positions are entirely buffered until the - // end of the term, at which point they are compressed and written. 
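To make the new position addressing concrete, a small illustrative calculation with made-up numbers (assuming 128-integer blocks, i.e. `NUM_DOCS_PER_BLOCK`):

fn positions_addr_example() {
    // Suppose a new term starts after 300 position integers have been fed to
    // the PositionWriter: two complete blocks were compressed and flushed, and
    // 44 integers are still sitting in the in-memory buffer.
    let ints_written_so_far = 300;
    let flushed_blocks = ints_written_so_far / 128; // blocks already written out
    let inner_offset = ints_written_so_far % 128;   // ints inside the open block
    assert_eq!((flushed_blocks, inner_offset), (2, 44));
    // `addr()` reports (bytes written for the flushed blocks, 44); the pair is
    // stored in `TermInfo` as (positions_offset, positions_inner_offset), so a
    // reader can seek to that byte offset and skip 44 integers before decoding
    // this term's position deltas.
}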
- if self.text_indexing_options.is_position_enabled() { - let posdelta_len = VInt(self.position_deltas.len() as u64); - posdelta_len.serialize(&mut self.positions_write)?; - let positions_encoded: &[u8] = self.positions_encoder - .compress_unsorted(&self.position_deltas[..]); - self.positions_write.write_all(positions_encoded)?; - self.position_deltas.clear(); - } self.term_open = false; } Ok(()) @@ -208,7 +243,7 @@ impl PostingsSerializer { self.term_freqs.push(term_freq as u32); } if self.text_indexing_options.is_position_enabled() { - self.position_deltas.extend_from_slice(position_deltas); + self.positions_writer.write(position_deltas)?; } if self.doc_ids.len() == NUM_DOCS_PER_BLOCK { { @@ -233,10 +268,10 @@ impl PostingsSerializer { /// Closes the serializer. pub fn close(mut self) -> io::Result<()> { - try!(self.close_term()); - try!(self.terms_fst_builder.finish()); - try!(self.postings_write.flush()); - try!(self.positions_write.flush()); + self.close_term()?; + self.terms_fst_builder.finish()?; + self.postings_write.flush()?; + self.positions_writer.close()?; Ok(()) } } diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs index d639e9afb..51ae7083a 100644 --- a/src/postings/term_info.rs +++ b/src/postings/term_info.rs @@ -20,6 +20,8 @@ pub struct TermInfo { pub postings_offset: u32, /// Offset within the position (`.pos`) file. pub positions_offset: u32, + /// Offset within the position block. + pub positions_inner_offset: u8, } @@ -27,17 +29,20 @@ impl BinarySerializable for TermInfo { fn serialize(&self, writer: &mut W) -> io::Result<()> { self.doc_freq.serialize(writer)?; self.postings_offset.serialize(writer)?; - self.positions_offset.serialize(writer) + self.positions_offset.serialize(writer)?; + self.positions_inner_offset.serialize(writer) } fn deserialize(reader: &mut R) -> io::Result { - let doc_freq = try!(u32::deserialize(reader)); - let postings_offset = try!(u32::deserialize(reader)); - let positions_offset = try!(u32::deserialize(reader)); + let doc_freq = u32::deserialize(reader)?; + let postings_offset = u32::deserialize(reader)?; + let positions_offset = u32::deserialize(reader)?; + let positions_inner_offset = u8::deserialize(reader)?; Ok(TermInfo { - doc_freq: doc_freq, - postings_offset: postings_offset, - positions_offset: positions_offset, - }) + doc_freq: doc_freq, + postings_offset: postings_offset, + positions_offset: positions_offset, + positions_inner_offset: positions_inner_offset, + }) } } diff --git a/src/postings/vec_postings.rs b/src/postings/vec_postings.rs index 8c9512fb1..eb47933b4 100644 --- a/src/postings/vec_postings.rs +++ b/src/postings/vec_postings.rs @@ -54,6 +54,11 @@ impl Postings for VecPostings { fn positions(&self) -> &[u32] { &EMPTY_ARRAY } + + fn delta_positions(&self) -> &[u32] { + &EMPTY_ARRAY + } + } #[cfg(test)] From efb910f4e84a77a6f21e098cf07e343cbc6a7a5a Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 5 Aug 2017 12:28:57 +0900 Subject: [PATCH 04/29] Added CompressedIntStream --- cpp/simdcomp_wrapper.c | 5 + src/compression/mod.rs | 6 +- src/compression/pack/compression_pack_simd.rs | 6 + src/compression/stream.rs | 110 ++++++++++++++++++ src/postings/freq_handler.rs | 4 - 5 files changed, 126 insertions(+), 5 deletions(-) create mode 100644 src/compression/stream.rs diff --git a/cpp/simdcomp_wrapper.c b/cpp/simdcomp_wrapper.c index 4530e3f3b..1ffff9778 100644 --- a/cpp/simdcomp_wrapper.c +++ b/cpp/simdcomp_wrapper.c @@ -40,3 +40,8 @@ size_t uncompress_unsorted( simdunpack((__m128i *)compressed_data, 
output, b); return 1 + b * sizeof(__m128i); } + + +size_t compressedbytes(const uint32_t length, const uint8_t num_bits) { + return simdpack_compressedbytes((int)length, (uint32_t)num_bits); +} diff --git a/src/compression/mod.rs b/src/compression/mod.rs index 8a44c24b7..8384c65eb 100644 --- a/src/compression/mod.rs +++ b/src/compression/mod.rs @@ -1,6 +1,10 @@ #![allow(dead_code)] +mod stream; + +pub use self::stream::CompressedIntStream; + #[cfg(not(feature="simdcompression"))] mod pack { mod compression_pack_nosimd; @@ -13,7 +17,7 @@ mod pack { pub use self::compression_pack_simd::*; } -pub use self::pack::{BlockEncoder, BlockDecoder}; +pub use self::pack::{BlockEncoder, BlockDecoder, compressedbytes}; #[cfg( any(not(feature="simdcompression"), target_env="msvc") )] mod vint { diff --git a/src/compression/pack/compression_pack_simd.rs b/src/compression/pack/compression_pack_simd.rs index 78cf58c37..ba3518521 100644 --- a/src/compression/pack/compression_pack_simd.rs +++ b/src/compression/pack/compression_pack_simd.rs @@ -16,9 +16,15 @@ mod simdcomp { pub fn compress_unsorted(data: *const u32, output: *mut u8) -> size_t; pub fn uncompress_unsorted(compressed_data: *const u8, output: *mut u32) -> size_t; + + pub fn compressedbytes(length: u32, num_bits: u8) -> size_t; } } +pub fn compressedbytes(length: u32, num_bits: u8) -> usize { + unsafe { simdcomp::compressedbytes(length, num_bits) } +} + fn compress_sorted(vals: &[u32], output: &mut [u8], offset: u32) -> usize { unsafe { simdcomp::compress_sorted(vals.as_ptr(), output.as_mut_ptr(), offset) } } diff --git a/src/compression/stream.rs b/src/compression/stream.rs new file mode 100644 index 000000000..274310b77 --- /dev/null +++ b/src/compression/stream.rs @@ -0,0 +1,110 @@ +use compression::BlockDecoder; +use compression::NUM_DOCS_PER_BLOCK; +use compression::compressedbytes; + +pub struct CompressedIntStream<'a> { + buffer: &'a [u8], + block_decoder: BlockDecoder, + inner_offset: usize, +} + +impl<'a> CompressedIntStream<'a> { + fn wrap(buffer: &'a [u8]) -> CompressedIntStream<'a> { + CompressedIntStream { + buffer: buffer, + block_decoder: BlockDecoder::new(), + inner_offset: NUM_DOCS_PER_BLOCK, + } + } + + fn read(&mut self, mut output: &mut [u32]) { + let mut num_els: usize = output.len(); + let mut start: usize = 0; + loop { + let available = NUM_DOCS_PER_BLOCK - self.inner_offset; + if num_els >= available { + if available > 0 { + let uncompressed_block = &self.block_decoder.output_array()[self.inner_offset..]; + &mut output[start..start + available].clone_from_slice(uncompressed_block); + } + num_els -= available; + start += available; + self.buffer = self.block_decoder.uncompress_block_unsorted(self.buffer); + self.inner_offset = 0; + } + else { + let uncompressed_block = &self.block_decoder.output_array()[self.inner_offset..self.inner_offset + num_els]; + &output[start..start + num_els].clone_from_slice(uncompressed_block); + self.inner_offset += num_els; + break; + } + } + } + + fn skip(&mut self, mut skip_len: usize) { + let available = NUM_DOCS_PER_BLOCK - self.inner_offset; + if available >= skip_len { + self.inner_offset += skip_len; + } + else { + skip_len -= available; + // entirely skip decompressing some blocks. 
+ while skip_len >= NUM_DOCS_PER_BLOCK { + skip_len -= NUM_DOCS_PER_BLOCK; + let num_bits: u8 = self.buffer[0]; + let block_len = compressedbytes(128, num_bits); + self.buffer = &self.buffer[1 + block_len..]; + } + self.buffer = self.block_decoder.uncompress_block_unsorted(self.buffer); + self.inner_offset = skip_len; + } + } +} + + +#[cfg(test)] +pub mod tests { + + use super::CompressedIntStream; + use tests; + use compression::compressedbytes; + use compression::NUM_DOCS_PER_BLOCK; + use compression::BlockEncoder; + + fn create_stream_buffer() -> Vec { + let mut buffer: Vec = vec!(); + let mut encoder = BlockEncoder::new(); + let vals: Vec = (0u32..1_025u32).collect(); + for chunk in vals.chunks(NUM_DOCS_PER_BLOCK) { + let compressed_block = encoder.compress_block_unsorted(chunk); + let num_bits = compressed_block[0]; + assert_eq!(compressedbytes(128, num_bits) + 1, compressed_block.len()); + buffer.extend_from_slice(compressed_block); + } + buffer + } + + #[test] + fn test_compressed_int_stream() { + let buffer = create_stream_buffer(); + let mut stream = CompressedIntStream::wrap(&buffer[..]); + let mut block: [u32; NUM_DOCS_PER_BLOCK] = [0u32; NUM_DOCS_PER_BLOCK]; + + stream.read(&mut block[0..2]); + assert_eq!(block[0], 0); + assert_eq!(block[1], 1); + stream.skip(5); + stream.read(&mut block[0..3]); + assert_eq!(block[0], 7); + assert_eq!(block[1], 8); + assert_eq!(block[2], 9); + stream.skip(500); + stream.read(&mut block[0..3]); + assert_eq!(block[0], 510); + assert_eq!(block[1], 511); + assert_eq!(block[2], 512); + stream.skip(511); + stream.read(&mut block[..1]); + assert_eq!(block[0], 1024); + } +} diff --git a/src/postings/freq_handler.rs b/src/postings/freq_handler.rs index 9bc6fb49e..acc91ce9a 100644 --- a/src/postings/freq_handler.rs +++ b/src/postings/freq_handler.rs @@ -10,7 +10,6 @@ pub struct FreqHandler { freq_decoder: BlockDecoder, positions: UnsafeCell>, option: SegmentPostingsOption, - positions_offsets: [usize; NUM_DOCS_PER_BLOCK + 1], } @@ -22,7 +21,6 @@ impl FreqHandler { freq_decoder: BlockDecoder::with_val(1u32), positions: UnsafeCell::new(Vec::with_capacity(0)), option: SegmentPostingsOption::NoFreq, - positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1], } } @@ -32,7 +30,6 @@ impl FreqHandler { freq_decoder: BlockDecoder::new(), positions: UnsafeCell::new(Vec::with_capacity(0)), option: SegmentPostingsOption::Freq, - positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1], } } @@ -42,7 +39,6 @@ impl FreqHandler { freq_decoder: BlockDecoder::new(), positions: UnsafeCell::new(Vec::with_capacity(NUM_DOCS_PER_BLOCK)), option: SegmentPostingsOption::FreqAndPositions, - positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1], } } From 63b35dd87be1c0a38afbcab357c10e6016f0a762 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 5 Aug 2017 18:09:19 +0900 Subject: [PATCH 05/29] removing freq handler. 
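Before the `FreqHandler` removal below, a sketch of how the `CompressedIntStream` added in the previous patch is meant to be consumed (the function and its numbers are illustrative only):

use compression::{CompressedIntStream, NUM_DOCS_PER_BLOCK};

fn read_some_position_deltas(position_data: &[u8]) {
    let mut stream = CompressedIntStream::wrap(position_data);
    let mut deltas = [0u32; NUM_DOCS_PER_BLOCK];
    stream.read(&mut deltas[..5]); // decode the 5 deltas of the first document
    stream.skip(200);              // step over positions that are not needed
    stream.read(&mut deltas[..3]); // resume decoding further down the stream
}

Skipping is cheap because a full block of 128 integers packed with num_bits bits per value occupies 1 + 16 * num_bits bytes (one byte storing num_bits, then 128 * num_bits / 8 payload bytes), so whole blocks can be stepped over without decompressing them.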
--- src/compression/stream.rs | 6 +- src/core/segment_reader.rs | 34 +++++-- src/postings/freq_handler.rs | 130 ------------------------ src/postings/segment_postings.rs | 86 ++++++++++------ src/postings/segment_postings_option.rs | 16 +++ 5 files changed, 99 insertions(+), 173 deletions(-) delete mode 100644 src/postings/freq_handler.rs diff --git a/src/compression/stream.rs b/src/compression/stream.rs index 274310b77..b3bbc8716 100644 --- a/src/compression/stream.rs +++ b/src/compression/stream.rs @@ -9,7 +9,7 @@ pub struct CompressedIntStream<'a> { } impl<'a> CompressedIntStream<'a> { - fn wrap(buffer: &'a [u8]) -> CompressedIntStream<'a> { + pub fn wrap(buffer: &'a [u8]) -> CompressedIntStream<'a> { CompressedIntStream { buffer: buffer, block_decoder: BlockDecoder::new(), @@ -17,7 +17,7 @@ impl<'a> CompressedIntStream<'a> { } } - fn read(&mut self, mut output: &mut [u32]) { + pub fn read(&mut self, mut output: &mut [u32]) { let mut num_els: usize = output.len(); let mut start: usize = 0; loop { @@ -41,7 +41,7 @@ impl<'a> CompressedIntStream<'a> { } } - fn skip(&mut self, mut skip_len: usize) { + pub fn skip(&mut self, mut skip_len: usize) { let available = NUM_DOCS_PER_BLOCK - self.inner_offset; if available >= skip_len { self.inner_offset += skip_len; diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 95c53a6eb..ff66273e2 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -4,6 +4,7 @@ use core::SegmentId; use core::SegmentComponent; use schema::Term; use common::HasLen; +use compression::CompressedIntStream; use core::SegmentMeta; use fastfield::{self, FastFieldNotAvailableError}; use fastfield::DeleteBitSet; @@ -220,7 +221,23 @@ impl SegmentReader { -> SegmentPostings { let block_postings = self.read_block_postings_from_terminfo(term_info, option); let delete_bitset = self.delete_bitset.clone(); - SegmentPostings::from_block_postings(block_postings, delete_bitset) + let position_stream = { + if option.has_positions() { + let position_offset = term_info.positions_offset; + let positions_data = &self.positions_data[position_offset as usize..]; + let mut stream = CompressedIntStream::wrap(positions_data); + stream.skip(term_info.positions_inner_offset as usize); + Some(stream) + } + else { + None + } + }; + SegmentPostings::from_block_postings( + block_postings, + delete_bitset, + position_stream + ) } @@ -234,16 +251,11 @@ impl SegmentReader { -> BlockSegmentPostings { let offset = term_info.postings_offset as usize; let postings_data = &self.postings_data[offset..]; - let freq_handler = match option { - SegmentPostingsOption::NoFreq => FreqHandler::new_without_freq(), - SegmentPostingsOption::Freq => FreqHandler::new_with_freq(), - SegmentPostingsOption::FreqAndPositions => { - let offset = term_info.positions_offset as usize; - let offseted_position_data = &self.positions_data[offset..]; - FreqHandler::new_with_freq_and_position(offseted_position_data, term_info.positions_inner_offset) - } - }; - BlockSegmentPostings::from_data(term_info.doc_freq as usize, postings_data, freq_handler) + let has_freq = option.has_freq(); + BlockSegmentPostings::from_data( + term_info.doc_freq as usize, + postings_data, + has_freq) } diff --git a/src/postings/freq_handler.rs b/src/postings/freq_handler.rs deleted file mode 100644 index acc91ce9a..000000000 --- a/src/postings/freq_handler.rs +++ /dev/null @@ -1,130 +0,0 @@ -use compression::BlockDecoder; -use compression::VIntDecoder; -use postings::SegmentPostingsOption; -use 
compression::NUM_DOCS_PER_BLOCK; -use std::cell::UnsafeCell; - -/// `FreqHandler` is in charge of decompressing -/// frequencies and/or positions. -pub struct FreqHandler { - freq_decoder: BlockDecoder, - positions: UnsafeCell>, - option: SegmentPostingsOption, -} - - - -impl FreqHandler { - /// Returns a `FreqHandler` that just decodes `DocId`s. - pub fn new_without_freq() -> FreqHandler { - FreqHandler { - freq_decoder: BlockDecoder::with_val(1u32), - positions: UnsafeCell::new(Vec::with_capacity(0)), - option: SegmentPostingsOption::NoFreq, - } - } - - /// Returns a `FreqHandler` that decodes `DocId`s and term frequencies. - pub fn new_with_freq() -> FreqHandler { - FreqHandler { - freq_decoder: BlockDecoder::new(), - positions: UnsafeCell::new(Vec::with_capacity(0)), - option: SegmentPostingsOption::Freq, - } - } - - /// Returns a `FreqHandler` that decodes `DocId`s, term frequencies, and term positions. - pub fn new_with_freq_and_position(position_data: &[u8], within_block_offset: u8) -> FreqHandler { - FreqHandler { - freq_decoder: BlockDecoder::new(), - positions: UnsafeCell::new(Vec::with_capacity(NUM_DOCS_PER_BLOCK)), - option: SegmentPostingsOption::FreqAndPositions, - } - } - - /* - fn fill_positions_offset(&mut self) { - let mut cur_position: usize = self.positions_offsets[NUM_DOCS_PER_BLOCK]; - let mut i: usize = 0; - self.positions_offsets[i] = cur_position; - let mut last_cur_position = cur_position; - for &doc_freq in self.freq_decoder.output_array() { - i += 1; - let mut cumulated_pos = 0u32; - // this next loop decodes delta positions into normal positions. - for j in last_cur_position..(last_cur_position + (doc_freq as usize)) { - cumulated_pos += self.positions[j]; - self.positions[j] = cumulated_pos; - } - cur_position += doc_freq as usize; - self.positions_offsets[i] = cur_position; - last_cur_position = cur_position; - } - }*/ - - - /// Accessor to term frequency - /// - /// idx is the offset of the current doc in the block. - /// It takes value between 0 and 128. - pub fn freq(&self, idx: usize) -> u32 { - self.freq_decoder.output(idx) - } - - /// Accessor to the positions - /// - /// idx is the offset of the current doc in the block. - /// It takes value between 0 and 128. - pub fn positions(&self, idx: usize) -> &[u32] { - //unsafe { &self.positions.get() } - println!("fix positions"); - self.delta_positions(idx) - - } - - /// Accessor to the delta positions. - /// Delta positions is simply the difference between - /// two consecutive positions. - /// The first delta position is the first position of the - /// term in the document. - /// - /// For instance, if positions are `[7,13,17]` - /// then delta positions `[7, 6, 4]` - /// - /// idx is the offset of the current doc in the docid/freq block. - /// It takes value between 0 and 128. - pub fn delta_positions(&self, idx: usize) -> &[u32] { - let freq = self.freq(idx); - let positions: &mut Vec = unsafe { &mut *self.positions.get() }; - positions.resize(freq as usize, 0u32); - &positions[..] 
- } - - - /// Decompresses a complete frequency block - pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] { - match self.option { - SegmentPostingsOption::NoFreq => data, - SegmentPostingsOption::Freq => self.freq_decoder.uncompress_block_unsorted(data), - SegmentPostingsOption::FreqAndPositions => { - let remaining: &'a [u8] = self.freq_decoder.uncompress_block_unsorted(data); - // self.fill_positions_offset(); - remaining - } - } - } - - /// Decompresses an incomplete frequency block - pub fn read_freq_vint(&mut self, data: &[u8], num_els: usize) { - match self.option { - SegmentPostingsOption::NoFreq => {} - SegmentPostingsOption::Freq => { - self.freq_decoder.uncompress_vint_unsorted(data, num_els); - } - SegmentPostingsOption::FreqAndPositions => { - self.freq_decoder.uncompress_vint_unsorted(data, num_els); - // self.fill_positions_offset(); - } - } - } -} diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index d89b2aec0..553c50f70 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -1,9 +1,9 @@ -use compression::{NUM_DOCS_PER_BLOCK, BlockDecoder, VIntDecoder}; +use compression::{NUM_DOCS_PER_BLOCK, BlockDecoder, VIntDecoder, CompressedIntStream}; use DocId; -use postings::{Postings, FreqHandler, DocSet, HasLen, SkipResult}; +use postings::{Postings, DocSet, HasLen, SkipResult}; use std::cmp; -use fastfield::DeleteBitSet; use fst::Streamer; +use fastfield::DeleteBitSet; const EMPTY_DATA: [u8; 0] = [0u8; 0]; @@ -18,6 +18,7 @@ pub struct SegmentPostings<'a> { block_cursor: BlockSegmentPostings<'a>, cur: usize, delete_bitset: DeleteBitSet, + positions_stream: Option>, } impl<'a> SegmentPostings<'a> { @@ -28,12 +29,14 @@ impl<'a> SegmentPostings<'a> { /// * `freq_handler` - the freq handler is in charge of decoding /// frequencies and/or positions pub fn from_block_postings(segment_block_postings: BlockSegmentPostings<'a>, - delete_bitset: DeleteBitSet) + delete_bitset: DeleteBitSet, + positions_stream: Option>) -> SegmentPostings<'a> { SegmentPostings { block_cursor: segment_block_postings, cur: NUM_DOCS_PER_BLOCK, // cursor within the block delete_bitset: delete_bitset, + positions_stream: positions_stream } } @@ -44,6 +47,7 @@ impl<'a> SegmentPostings<'a> { block_cursor: empty_block_cursor, delete_bitset: DeleteBitSet::empty(), cur: NUM_DOCS_PER_BLOCK, + positions_stream: None, } } } @@ -159,7 +163,7 @@ impl<'a> DocSet for SegmentPostings<'a> { #[inline] fn doc(&self) -> DocId { let docs = self.block_cursor.docs(); - assert!(self.cur < docs.len(), + debug_assert!(self.cur < docs.len(), "Have you forgotten to call `.advance()` at least once before calling .doc()."); docs[self.cur] } @@ -173,15 +177,15 @@ impl<'a> HasLen for SegmentPostings<'a> { impl<'a> Postings for SegmentPostings<'a> { fn term_freq(&self) -> u32 { - self.block_cursor.freq_handler().freq(self.cur) + self.block_cursor.freq(self.cur) } fn positions(&self) -> &[u32] { - self.block_cursor.freq_handler().positions(self.cur) + unimplemented!(); } fn delta_positions(&self) -> &[u32] { - self.block_cursor.freq_handler().delta_positions(self.cur) + unimplemented!(); } } @@ -194,27 +198,33 @@ impl<'a> Postings for SegmentPostings<'a> { /// While it is useful for some very specific high-performance /// use cases, you should prefer using `SegmentPostings` for most usage. 
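A short sketch of the block-oriented access pattern this type offers (the `count_docs` helper is hypothetical):

use postings::BlockSegmentPostings;

// Hypothetical helper: iterate block by block rather than document by document.
fn count_docs(block_postings: &mut BlockSegmentPostings) -> usize {
    let mut count = 0;
    // `advance()` decodes the next block and returns false once none is left;
    // `docs()` then exposes the decoded doc ids of the current block.
    while block_postings.advance() {
        count += block_postings.docs().len();
    }
    count
}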
pub struct BlockSegmentPostings<'a> { - block_decoder: BlockDecoder, + doc_decoder: BlockDecoder, + freq_decoder: BlockDecoder, + has_freq: bool, + doc_freq: usize, doc_offset: DocId, num_binpacked_blocks: usize, num_vint_docs: usize, remaining_data: &'a [u8], - freq_handler: FreqHandler, } impl<'a> BlockSegmentPostings<'a> { pub(crate) fn from_data(doc_freq: usize, data: &'a [u8], - freq_handler: FreqHandler) + has_freq: bool) -> BlockSegmentPostings<'a> { let num_binpacked_blocks: usize = (doc_freq as usize) / NUM_DOCS_PER_BLOCK; let num_vint_docs = (doc_freq as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks; BlockSegmentPostings { num_binpacked_blocks: num_binpacked_blocks, num_vint_docs: num_vint_docs, - block_decoder: BlockDecoder::new(), - freq_handler: freq_handler, + + doc_decoder: BlockDecoder::new(), + freq_decoder: BlockDecoder::with_val(1), + + has_freq: has_freq, + remaining_data: data, doc_offset: 0, doc_freq: doc_freq, @@ -255,7 +265,22 @@ impl<'a> BlockSegmentPostings<'a> { /// returned by `.docs()` is empty. #[inline] pub fn docs(&self) -> &[DocId] { - self.block_decoder.output_array() + self.doc_decoder.output_array() + } + + #[inline] + pub fn doc(&self, idx: usize) -> u32 { + self.doc_decoder.output(idx) + } + + #[inline] + pub fn freqs(&self) -> &[u32] { + self.freq_decoder.output_array() + } + + #[inline] + pub fn freq(&self, idx: usize) -> u32 { + self.freq_decoder.output(idx) } /// Returns the length of the current block. @@ -265,13 +290,7 @@ impl<'a> BlockSegmentPostings<'a> { /// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1` #[inline] fn block_len(&self) -> usize { - self.block_decoder.output_len - } - - - /// Returns a reference to the frequency handler. - pub fn freq_handler(&self) -> &FreqHandler { - &self.freq_handler + self.doc_decoder.output_len } /// Advance to the next block. @@ -279,21 +298,27 @@ impl<'a> BlockSegmentPostings<'a> { /// Returns false iff there was no remaining blocks. pub fn advance(&mut self) -> bool { if self.num_binpacked_blocks > 0 { + // TODO could self.doc_offset be just a local variable? self.remaining_data = - self.block_decoder + self.doc_decoder .uncompress_block_sorted(self.remaining_data, self.doc_offset); - self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data); - self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1); + if self.has_freq { + self.remaining_data = self.freq_decoder.uncompress_block_unsorted(self.remaining_data); + } + // it will be used as the next offset. 
+ self.doc_offset = self.doc_decoder.output(NUM_DOCS_PER_BLOCK - 1); self.num_binpacked_blocks -= 1; true } else if self.num_vint_docs > 0 { self.remaining_data = - self.block_decoder + self.doc_decoder .uncompress_vint_sorted(self.remaining_data, self.doc_offset, self.num_vint_docs); - self.freq_handler - .read_freq_vint(self.remaining_data, self.num_vint_docs); + if self.has_freq { + self.freq_decoder + .uncompress_vint_unsorted(self.remaining_data, self.num_vint_docs); + } self.num_vint_docs = 0; true } else { @@ -306,8 +331,11 @@ impl<'a> BlockSegmentPostings<'a> { BlockSegmentPostings { num_binpacked_blocks: 0, num_vint_docs: 0, - block_decoder: BlockDecoder::new(), - freq_handler: FreqHandler::new_without_freq(), + + doc_decoder: BlockDecoder::new(), + freq_decoder: BlockDecoder::new(), + has_freq: false, + remaining_data: &EMPTY_DATA, doc_offset: 0, doc_freq: 0, diff --git a/src/postings/segment_postings_option.rs b/src/postings/segment_postings_option.rs index 51a07bb0b..1f87d2e41 100644 --- a/src/postings/segment_postings_option.rs +++ b/src/postings/segment_postings_option.rs @@ -16,6 +16,22 @@ pub enum SegmentPostingsOption { FreqAndPositions, } +impl SegmentPostingsOption { + pub fn has_freq(&self) -> bool { + match *self { + SegmentPostingsOption::NoFreq => false, + _ => true, + } + } + + pub fn has_positions(&self) -> bool { + match *self { + SegmentPostingsOption::FreqAndPositions => true, + _ => false, + } + } +} + #[cfg(test)] mod tests { From 236fa74767c8dd0feb3bb2a4d01c346eb6dafdbb Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 5 Aug 2017 23:17:35 +0900 Subject: [PATCH 06/29] Positions almost working. --- src/core/segment_reader.rs | 5 +- src/postings/mod.rs | 13 ++-- src/postings/segment_postings.rs | 101 +++++++++++++++++++++++++--- src/query/term_query/term_weight.rs | 6 +- 4 files changed, 103 insertions(+), 22 deletions(-) diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index ff66273e2..56247e942 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -24,7 +24,6 @@ use postings::SegmentPostingsOption; use postings::{SegmentPostings, BlockSegmentPostings}; use fastfield::{FastFieldsReader, FastFieldReader, U64FastFieldReader}; use schema::Schema; -use postings::FreqHandler; @@ -198,10 +197,10 @@ impl SegmentReader { /// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a /// `TextIndexingOptions` that does not index position will return a `SegmentPostings` /// with `DocId`s and frequencies. 
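An illustrative call site for `read_postings` (the surrounding `scan_term` function is hypothetical; only calls visible in this file are assumed):

use core::SegmentReader;
use schema::Term;
use postings::{DocSet, Postings, SegmentPostingsOption};

fn scan_term(reader: &SegmentReader, term: &Term) {
    // Ask for the richest option; the reader silently degrades to whatever
    // was actually indexed for this field.
    if let Some(mut postings) =
           reader.read_postings(term, SegmentPostingsOption::FreqAndPositions) {
        while postings.advance() {
            let _doc = postings.doc();
            let _term_freq = postings.term_freq();
            // `positions()` only carries real positions if they were indexed.
        }
    }
}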
- pub fn read_postings(&self, + pub fn read_postings<'a>(&'a self, term: &Term, option: SegmentPostingsOption) - -> Option { + -> Option> { let field = term.field(); let field_entry = self.schema.get_field_entry(field); let term_info = get!(self.get_term_info(term)); diff --git a/src/postings/mod.rs b/src/postings/mod.rs index ed8a6998f..21cfa6777 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -12,7 +12,6 @@ mod term_info; mod vec_postings; mod segment_postings; mod intersection; -mod freq_handler; mod docset; mod segment_postings_option; @@ -28,7 +27,6 @@ pub use self::vec_postings::VecPostings; pub use self::segment_postings::{SegmentPostings, BlockSegmentPostings}; pub use self::intersection::IntersectionDocSet; -pub use self::freq_handler::FreqHandler; pub use self::segment_postings_option::SegmentPostingsOption; pub use common::HasLen; @@ -63,18 +61,18 @@ mod tests { let mut posting_serializer = PostingsSerializer::open(&mut segment).unwrap(); posting_serializer.new_field(text_field); posting_serializer.new_term("abc".as_bytes()).unwrap(); - for doc_id in 0u32..3u32 { - let positions = vec![1, 2, 3, 2]; - posting_serializer.write_doc(doc_id, 2, &positions).unwrap(); + for doc_id in 0u32..120u32 { + let delta_positions = vec![1, 2, 3, 2]; + posting_serializer.write_doc(doc_id, 2, &delta_positions).unwrap(); } posting_serializer.close_term().unwrap(); posting_serializer.close().unwrap(); let read = segment.open_read(SegmentComponent::POSITIONS).unwrap(); - assert!(read.len() <= 16); + assert!(read.len() <= 140); } #[test] - pub fn test_position_and_fieldnorm() { + pub fn test_position_and_fieldnorm1() { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); @@ -144,6 +142,7 @@ mod tests { assert_eq!(postings_a.doc(), 0); assert_eq!(postings_a.term_freq(), 6); assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]); + assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]); assert!(postings_a.advance()); assert_eq!(postings_a.doc(), 1u32); assert_eq!(postings_a.term_freq(), 1); diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 553c50f70..ab4805d5e 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -4,9 +4,65 @@ use postings::{Postings, DocSet, HasLen, SkipResult}; use std::cmp; use fst::Streamer; use fastfield::DeleteBitSet; - +use std::cell::UnsafeCell; const EMPTY_DATA: [u8; 0] = [0u8; 0]; +const EMPTY_POSITIONS: [u32; 0] = [0u32; 0]; + +struct PositionComputer<'a> { + // store the amount of position int + // before reading positions. + // + // if none, position are already loaded in + // the positions vec. 
+ position_to_skip: Option, + + delta_positions: Vec, + positions: Vec, + positions_stream: CompressedIntStream<'a>, +} + +impl<'a> PositionComputer<'a> { + + pub fn new(positions_stream: CompressedIntStream<'a>) -> PositionComputer<'a> { + PositionComputer { + position_to_skip: None, + positions: vec!(), + delta_positions: vec!(), + positions_stream: positions_stream, + } + } + + pub fn add_skip(&mut self, num_skip: usize) { + self.position_to_skip = Some( + self.position_to_skip + .map(|prev_skip| prev_skip + num_skip) + .unwrap_or(0) + ); + } + + pub fn positions(&mut self, term_freq: usize) -> &[u32] { + self.delta_positions(term_freq); + &self.positions[..term_freq] + } + + pub fn delta_positions(&mut self, term_freq: usize) -> &[u32] { + if let Some(num_skip) = self.position_to_skip { + self.delta_positions.resize(term_freq, 0u32); + self.positions_stream.skip(num_skip); + self.positions_stream.read(&mut self.delta_positions[..term_freq]); + self.positions.resize(term_freq, 0u32); + let mut cum = 0u32; + for i in 0..term_freq as usize { + cum += self.delta_positions[i]; + self.positions[i] = cum; + } + self.position_to_skip = None; + } + &self.delta_positions[..term_freq] + } +} + /// `SegmentPostings` represents the inverted list or postings associated to @@ -18,9 +74,11 @@ pub struct SegmentPostings<'a> { block_cursor: BlockSegmentPostings<'a>, cur: usize, delete_bitset: DeleteBitSet, - positions_stream: Option>, + + position_computer: Option>>, } + impl<'a> SegmentPostings<'a> { /// Reads a Segment postings from an &[u8] /// @@ -30,24 +88,27 @@ impl<'a> SegmentPostings<'a> { /// frequencies and/or positions pub fn from_block_postings(segment_block_postings: BlockSegmentPostings<'a>, delete_bitset: DeleteBitSet, - positions_stream: Option>) + positions_stream_opt: Option>) -> SegmentPostings<'a> { + let position_computer = positions_stream_opt.map(|stream| { + UnsafeCell::new(PositionComputer::new(stream)) + }); SegmentPostings { block_cursor: segment_block_postings, cur: NUM_DOCS_PER_BLOCK, // cursor within the block delete_bitset: delete_bitset, - positions_stream: positions_stream + position_computer: position_computer, } } /// Returns an empty segment postings object - pub fn empty() -> SegmentPostings<'static> { + pub fn empty() -> SegmentPostings<'a> { let empty_block_cursor = BlockSegmentPostings::empty(); SegmentPostings { block_cursor: empty_block_cursor, delete_bitset: DeleteBitSet::empty(), cur: NUM_DOCS_PER_BLOCK, - positions_stream: None, + position_computer: None, } } } @@ -58,7 +119,9 @@ impl<'a> DocSet for SegmentPostings<'a> { // next needs to be called a first time to point to the correct element. 
#[inline] fn advance(&mut self) -> bool { + let mut pos_to_skip = 0u32; loop { + pos_to_skip += self.term_freq(); self.cur += 1; if self.cur >= self.block_cursor.block_len() { self.cur = 0; @@ -68,6 +131,11 @@ impl<'a> DocSet for SegmentPostings<'a> { } } if !self.delete_bitset.is_deleted(self.doc()) { + if let Some(ref mut position_computer) = self.position_computer.as_mut() { + unsafe { + (*position_computer.get()).add_skip(pos_to_skip as usize); + } + } return true; } } @@ -181,11 +249,26 @@ impl<'a> Postings for SegmentPostings<'a> { } fn positions(&self) -> &[u32] { - unimplemented!(); + let term_freq = self.term_freq(); + let position_computer_ptr: *mut PositionComputer = self.position_computer + .as_ref() + .expect("Segment reader does not have positions.") + .get(); + unsafe { + (&mut *position_computer_ptr).positions(term_freq as usize) + } } fn delta_positions(&self) -> &[u32] { - unimplemented!(); + let term_freq = self.term_freq(); + self.position_computer + .as_ref() + .map(|position_computer| { + unsafe { + (&mut *position_computer.get()).delta_positions(term_freq as usize) + } + }) + .unwrap_or(&EMPTY_POSITIONS[..]) } } @@ -333,7 +416,7 @@ impl<'a> BlockSegmentPostings<'a> { num_vint_docs: 0, doc_decoder: BlockDecoder::new(), - freq_decoder: BlockDecoder::new(), + freq_decoder: BlockDecoder::with_val(1), has_freq: false, remaining_data: &EMPTY_DATA, diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index e781ebdbd..99bfa7d47 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -27,13 +27,13 @@ impl TermWeight { 1.0 + (self.num_docs as f32 / (self.doc_freq as f32 + 1.0)).ln() } - pub fn specialized_scorer<'a>(&'a self, + pub fn specialized_scorer<'a>(&self, reader: &'a SegmentReader) -> Result>> { let field = self.term.field(); let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field); - Ok(reader - .read_postings(&self.term, self.segment_postings_options) + let postings: Option> = reader.read_postings(&self.term, self.segment_postings_options); + Ok(postings .map(|segment_postings| { TermScorer { idf: self.idf(), From d1f61a50c1f1598c0698adc1c55dfd661efddca4 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 6 Aug 2017 16:03:07 +0900 Subject: [PATCH 07/29] issue/207 Lazily decompressing positions. 
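The scheme introduced in the previous patch defers all position work: advancing over documents only accumulates how many position integers the skipped documents account for, and the `CompressedIntStream` is skipped, read and prefix-summed only when `positions()` is actually called. A minimal, self-contained sketch of that final decoding step (the helper name is made up):

fn positions_from_deltas(delta_positions: &[u32]) -> Vec<u32> {
    let mut cum = 0u32;
    delta_positions
        .iter()
        .map(|&delta| {
            cum += delta;
            cum
        })
        .collect()
}

// e.g. the delta positions [7, 6, 4] decode back into the absolute positions
// [7, 13, 17] used as the example in the doc comments of this series.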
--- cpp/simdcomp_wrapper.c | 5 -- src/compression/mod.rs | 20 ++++- src/compression/pack/compression_pack_simd.rs | 7 +- src/compression/stream.rs | 11 ++- src/core/segment_reader.rs | 4 +- src/indexer/merger.rs | 29 ++++++- src/postings/postings.rs | 20 ----- src/postings/segment_postings.rs | 84 +++++++++++-------- src/postings/segment_postings_option.rs | 5 ++ src/postings/serializer.rs | 3 - src/postings/vec_postings.rs | 5 -- 11 files changed, 110 insertions(+), 83 deletions(-) diff --git a/cpp/simdcomp_wrapper.c b/cpp/simdcomp_wrapper.c index 1ffff9778..4530e3f3b 100644 --- a/cpp/simdcomp_wrapper.c +++ b/cpp/simdcomp_wrapper.c @@ -40,8 +40,3 @@ size_t uncompress_unsorted( simdunpack((__m128i *)compressed_data, output, b); return 1 + b * sizeof(__m128i); } - - -size_t compressedbytes(const uint32_t length, const uint8_t num_bits) { - return simdpack_compressedbytes((int)length, (uint32_t)num_bits); -} diff --git a/src/compression/mod.rs b/src/compression/mod.rs index 8384c65eb..d8540892b 100644 --- a/src/compression/mod.rs +++ b/src/compression/mod.rs @@ -17,7 +17,7 @@ mod pack { pub use self::compression_pack_simd::*; } -pub use self::pack::{BlockEncoder, BlockDecoder, compressedbytes}; +pub use self::pack::{BlockEncoder, BlockDecoder}; #[cfg( any(not(feature="simdcompression"), target_env="msvc") )] mod vint { @@ -31,6 +31,10 @@ mod vint { pub use self::compression_vint_simd::*; } +/// Returns the size in bytes of a compressed block, given num_bits. +pub fn compressed_block_size(num_bits: u8) -> usize { + 1 + (num_bits as usize) * 16 +} pub trait VIntEncoder { fn compress_vint_sorted(&mut self, input: &[u32], offset: u32) -> &[u8]; @@ -87,6 +91,7 @@ pub mod tests { use super::*; use tests; use test::Bencher; + use std::iter; #[test] fn test_encode_sorted_block() { @@ -194,6 +199,19 @@ pub mod tests { b.iter(|| { decoder.uncompress_block_sorted(compressed, 0u32); }); } + #[test] + fn test_all_docs_compression_numbits() { + for num_bits in 0..33 { + let mut data: Vec = iter::repeat(0u32).take(128).collect(); + if num_bits > 0 { + data[0] = 1 << (num_bits - 1); + } + let mut encoder = BlockEncoder::new(); + let compressed = encoder.compress_block_unsorted(&data); + assert_eq!(compressed[0] as usize, num_bits); + assert_eq!(compressed.len(), compressed_block_size(compressed[0])); + } + } const NUM_INTS_BENCH_VINT: usize = 10; diff --git a/src/compression/pack/compression_pack_simd.rs b/src/compression/pack/compression_pack_simd.rs index ba3518521..6842e0cc2 100644 --- a/src/compression/pack/compression_pack_simd.rs +++ b/src/compression/pack/compression_pack_simd.rs @@ -16,15 +16,9 @@ mod simdcomp { pub fn compress_unsorted(data: *const u32, output: *mut u8) -> size_t; pub fn uncompress_unsorted(compressed_data: *const u8, output: *mut u32) -> size_t; - - pub fn compressedbytes(length: u32, num_bits: u8) -> size_t; } } -pub fn compressedbytes(length: u32, num_bits: u8) -> usize { - unsafe { simdcomp::compressedbytes(length, num_bits) } -} - fn compress_sorted(vals: &[u32], output: &mut [u8], offset: u32) -> usize { unsafe { simdcomp::compress_sorted(vals.as_ptr(), output.as_mut_ptr(), offset) } } @@ -123,4 +117,5 @@ mod tests { let compressed = encoder.compress_block_sorted(&data, 0u32); assert_eq!(compressed.len(), 17); } + } diff --git a/src/compression/stream.rs b/src/compression/stream.rs index b3bbc8716..735eb7bef 100644 --- a/src/compression/stream.rs +++ b/src/compression/stream.rs @@ -1,6 +1,6 @@ use compression::BlockDecoder; use compression::NUM_DOCS_PER_BLOCK; -use 
compression::compressedbytes; +use compression::compressed_block_size; pub struct CompressedIntStream<'a> { buffer: &'a [u8], @@ -52,8 +52,8 @@ impl<'a> CompressedIntStream<'a> { while skip_len >= NUM_DOCS_PER_BLOCK { skip_len -= NUM_DOCS_PER_BLOCK; let num_bits: u8 = self.buffer[0]; - let block_len = compressedbytes(128, num_bits); - self.buffer = &self.buffer[1 + block_len..]; + let block_len = compressed_block_size(num_bits); + self.buffer = &self.buffer[block_len..]; } self.buffer = self.block_decoder.uncompress_block_unsorted(self.buffer); self.inner_offset = skip_len; @@ -66,8 +66,7 @@ impl<'a> CompressedIntStream<'a> { pub mod tests { use super::CompressedIntStream; - use tests; - use compression::compressedbytes; + use compression::compressed_block_size; use compression::NUM_DOCS_PER_BLOCK; use compression::BlockEncoder; @@ -78,7 +77,7 @@ pub mod tests { for chunk in vals.chunks(NUM_DOCS_PER_BLOCK) { let compressed_block = encoder.compress_block_unsorted(chunk); let num_bits = compressed_block[0]; - assert_eq!(compressedbytes(128, num_bits) + 1, compressed_block.len()); + assert_eq!(compressed_block_size(num_bits), compressed_block.len()); buffer.extend_from_slice(compressed_block); } buffer diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 56247e942..619888228 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -197,10 +197,10 @@ impl SegmentReader { /// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a /// `TextIndexingOptions` that does not index position will return a `SegmentPostings` /// with `DocId`s and frequencies. - pub fn read_postings<'a>(&'a self, + pub fn read_postings(&self, term: &Term, option: SegmentPostingsOption) - -> Option> { + -> Option { let field = term.field(); let field_entry = self.schema.get_field_entry(field); let term_info = get!(self.get_term_info(term)); diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 86f0a0e78..75f329186 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -61,6 +61,31 @@ fn extract_fast_field_reader(segment_reader: &SegmentReader, segment_reader.fast_fields_reader().open_reader(field) } +struct DeltaComputer { + buffer: Vec, +} + +impl DeltaComputer { + fn new() -> DeltaComputer { + DeltaComputer { buffer: vec![0u32; 512] } + } + + fn compute_delta(&mut self, positions: &[u32]) -> &[u32] { + if positions.len() > self.buffer.len() { + self.buffer.resize(positions.len(), 0u32); + } + let mut last_pos = 0u32; + let num_positions = positions.len(); + for i in 0..num_positions { + let cur_pos = positions[i]; + self.buffer[i] = cur_pos - last_pos; + last_pos = cur_pos; + } + &self.buffer[..positions.len()] + } +} + + impl IndexMerger { pub fn open(schema: Schema, segments: &[Segment]) -> Result { let mut readers = vec![]; @@ -169,6 +194,7 @@ impl IndexMerger { fn write_postings(&self, serializer: &mut PostingsSerializer) -> Result<()> { + let mut delta_computer = DeltaComputer::new(); let mut merged_terms = TermMerger::from(&self.readers[..]); let mut max_doc = 0; @@ -270,8 +296,9 @@ impl IndexMerger { old_to_new_doc_id[segment_postings.doc() as usize] { // we make sure to only write the term iff // there is at least one document. 
- let delta_positions: &[u32] = segment_postings.delta_positions(); + let positions: &[u32] = segment_postings.positions(); let term_freq = segment_postings.term_freq(); + let delta_positions = delta_computer.compute_delta(positions); serializer .write_doc(remapped_doc_id, term_freq, delta_positions)?; } diff --git a/src/postings/postings.rs b/src/postings/postings.rs index 29538e0d2..52f16198a 100644 --- a/src/postings/postings.rs +++ b/src/postings/postings.rs @@ -17,16 +17,6 @@ pub trait Postings: DocSet { /// Returns the list of positions of the term, expressed as a list of /// token ordinals. fn positions(&self) -> &[u32]; - /// Return the list of delta positions. - /// - /// Delta positions is simply the difference between - /// two consecutive positions. - /// The first delta position is the first position of the - /// term in the document. - /// - /// For instance, if positions are `[7,13,17]` - /// then delta positions `[7, 6, 4]` - fn delta_positions(&self) -> &[u32]; } impl Postings for Box { @@ -39,11 +29,6 @@ impl Postings for Box { let unboxed: &TPostings = self.borrow(); unboxed.positions() } - - fn delta_positions(&self) -> &[u32] { - let unboxed: &TPostings = self.borrow(); - unboxed.delta_positions() - } } impl<'a, TPostings: Postings> Postings for &'a mut TPostings { @@ -56,9 +41,4 @@ impl<'a, TPostings: Postings> Postings for &'a mut TPostings { let unref: &TPostings = *self; unref.positions() } - - fn delta_positions(&self) -> &[u32] { - let unref: &TPostings = *self; - unref.delta_positions() - } } diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index ab4805d5e..26810edf4 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -16,8 +16,6 @@ struct PositionComputer<'a> { // if none, position are already loaded in // the positions vec. 
position_to_skip: Option, - - delta_positions: Vec, positions: Vec, positions_stream: CompressedIntStream<'a>, } @@ -28,7 +26,6 @@ impl<'a> PositionComputer<'a> { PositionComputer { position_to_skip: None, positions: vec!(), - delta_positions: vec!(), positions_stream: positions_stream, } } @@ -42,24 +39,21 @@ impl<'a> PositionComputer<'a> { } pub fn positions(&mut self, term_freq: usize) -> &[u32] { - self.delta_positions(term_freq); - &self.positions[..term_freq] - } - - pub fn delta_positions(&mut self, term_freq: usize) -> &[u32] { if let Some(num_skip) = self.position_to_skip { - self.delta_positions.resize(term_freq, 0u32); - self.positions_stream.skip(num_skip); - self.positions_stream.read(&mut self.delta_positions[..term_freq]); + self.positions.resize(term_freq, 0u32); + + self.positions_stream.skip(num_skip); + self.positions_stream.read(&mut self.positions[..term_freq]); + let mut cum = 0u32; for i in 0..term_freq as usize { - cum += self.delta_positions[i]; + cum += self.positions[i]; self.positions[i] = cum; } self.position_to_skip = None; } - &self.delta_positions[..term_freq] + &self.positions[..term_freq] } } @@ -74,7 +68,6 @@ pub struct SegmentPostings<'a> { block_cursor: BlockSegmentPostings<'a>, cur: usize, delete_bitset: DeleteBitSet, - position_computer: Option>>, } @@ -111,6 +104,16 @@ impl<'a> SegmentPostings<'a> { position_computer: None, } } + + + fn position_add_skipusize>(&self, num_skips_fn: F) { + if let Some(ref position_computer) = self.position_computer.as_ref() { + let num_skips = num_skips_fn(); + unsafe { + (*position_computer.get()).add_skip(num_skips); + } + } + } } @@ -119,9 +122,7 @@ impl<'a> DocSet for SegmentPostings<'a> { // next needs to be called a first time to point to the correct element. #[inline] fn advance(&mut self) -> bool { - let mut pos_to_skip = 0u32; loop { - pos_to_skip += self.term_freq(); self.cur += 1; if self.cur >= self.block_cursor.block_len() { self.cur = 0; @@ -130,12 +131,8 @@ impl<'a> DocSet for SegmentPostings<'a> { return false; } } + self.position_add_skip(|| { self.term_freq() as usize }); if !self.delete_bitset.is_deleted(self.doc()) { - if let Some(ref mut position_computer) = self.position_computer.as_mut() { - unsafe { - (*position_computer.get()).add_skip(pos_to_skip as usize); - } - } return true; } } @@ -147,6 +144,10 @@ impl<'a> DocSet for SegmentPostings<'a> { return SkipResult::End; } + // in the following, thanks to the call to advance above, + // we know that the position is not loaded and we need + // to skip every doc_freq we cross. + // skip blocks until one that might contain the target loop { // check if we need to go to the next block @@ -155,13 +156,26 @@ impl<'a> DocSet for SegmentPostings<'a> { (block_docs[self.cur], block_docs[block_docs.len() - 1]) }; if target > last_doc_in_block { + + // we add skip for the current term independantly, + // so that position_add_skip will decide if it should + // just set itself to Some(0) or effectively + // add the term freq. + //let num_skips: u32 = ; + self.position_add_skip(|| { + let freqs_skipped = &self.block_cursor.freqs()[self.cur..]; + let sum_freq: u32 = freqs_skipped.iter().cloned().sum(); + sum_freq as usize + }); + if !self.block_cursor.advance() { return SkipResult::End; } + self.cur = 0; } else { if target < current_doc { - // We've overpassed the target after the first `advance` call + // We've passed the target after the first `advance` call // or we're at the beginning of a block. 
// Either way, we're on the first `DocId` greater than `target` return SkipResult::OverStep; @@ -207,6 +221,13 @@ impl<'a> DocSet for SegmentPostings<'a> { // `doc` is now >= `target` let doc = block_docs[start]; + + self.position_add_skip(|| { + let freqs_skipped = &self.block_cursor.freqs()[self.cur..start]; + let sum_freqs: u32 = freqs_skipped.iter().sum(); + sum_freqs as usize + }); + self.cur = start; if !self.delete_bitset.is_deleted(doc) { @@ -228,6 +249,7 @@ impl<'a> DocSet for SegmentPostings<'a> { self.len() } + /// Return the current document's `DocId`. #[inline] fn doc(&self) -> DocId { let docs = self.block_cursor.docs(); @@ -249,28 +271,19 @@ impl<'a> Postings for SegmentPostings<'a> { } fn positions(&self) -> &[u32] { - let term_freq = self.term_freq(); - let position_computer_ptr: *mut PositionComputer = self.position_computer - .as_ref() - .expect("Segment reader does not have positions.") - .get(); - unsafe { - (&mut *position_computer_ptr).positions(term_freq as usize) - } - } - - fn delta_positions(&self) -> &[u32] { let term_freq = self.term_freq(); self.position_computer .as_ref() .map(|position_computer| { unsafe { - (&mut *position_computer.get()).delta_positions(term_freq as usize) + (&mut *position_computer.get()).positions(term_freq as usize) } }) .unwrap_or(&EMPTY_POSITIONS[..]) } + + } /// `BlockSegmentPostings` is a cursor iterating over blocks @@ -351,16 +364,19 @@ impl<'a> BlockSegmentPostings<'a> { self.doc_decoder.output_array() } + /// Return the document at index `idx` of the block. #[inline] pub fn doc(&self, idx: usize) -> u32 { self.doc_decoder.output(idx) } + /// Return the array of `term freq` in the block. #[inline] pub fn freqs(&self) -> &[u32] { self.freq_decoder.output_array() } + /// Return the frequency at index `idx` of the block. #[inline] pub fn freq(&self, idx: usize) -> u32 { self.freq_decoder.output(idx) diff --git a/src/postings/segment_postings_option.rs b/src/postings/segment_postings_option.rs index 1f87d2e41..2aba4ec8e 100644 --- a/src/postings/segment_postings_option.rs +++ b/src/postings/segment_postings_option.rs @@ -17,6 +17,9 @@ pub enum SegmentPostingsOption { } impl SegmentPostingsOption { + + /// Returns true iff this option includes encoding + /// term frequencies. pub fn has_freq(&self) -> bool { match *self { SegmentPostingsOption::NoFreq => false, @@ -24,6 +27,8 @@ impl SegmentPostingsOption { } } + /// Returns true iff this option include encoding + /// term positions. 
pub fn has_positions(&self) -> bool { match *self { SegmentPostingsOption::FreqAndPositions => true, diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 34ba47382..5c5e93a7d 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -12,11 +12,8 @@ use DocId; use core::Segment; use std::io::{self, Write}; use compression::VIntEncoder; -use common::VInt; -use common::BinarySerializable; use common::CountingWriter; use termdict::TermDictionaryBuilder; -use datastruct::{SkipList, SkipListBuilder}; /// `PostingsSerializer` is in charge of serializing diff --git a/src/postings/vec_postings.rs b/src/postings/vec_postings.rs index eb47933b4..8c9512fb1 100644 --- a/src/postings/vec_postings.rs +++ b/src/postings/vec_postings.rs @@ -54,11 +54,6 @@ impl Postings for VecPostings { fn positions(&self) -> &[u32] { &EMPTY_ARRAY } - - fn delta_positions(&self) -> &[u32] { - &EMPTY_ARRAY - } - } #[cfg(test)] From 1e89f86267c9680aee4291f8256d4782344fcd84 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 7 Aug 2017 19:16:49 +0900 Subject: [PATCH 08/29] blop --- src/indexer/merger.rs | 4 +- src/indexer/segment_serializer.rs | 8 +- src/postings/mod.rs | 4 +- src/postings/postings_writer.rs | 8 +- src/postings/recorder.rs | 10 +- src/postings/serializer.rs | 233 ++++++++++++++++++------------ 6 files changed, 159 insertions(+), 108 deletions(-) diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 75f329186..6318a17c9 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -5,7 +5,7 @@ use DocId; use core::SerializableSegment; use schema::FieldValue; use indexer::SegmentSerializer; -use postings::PostingsSerializer; +use postings::InvertedIndexSerializer; use fastfield::U64FastFieldReader; use itertools::Itertools; use postings::Postings; @@ -192,7 +192,7 @@ impl IndexMerger { Ok(()) } - fn write_postings(&self, serializer: &mut PostingsSerializer) -> Result<()> { + fn write_postings(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> { let mut delta_computer = DeltaComputer::new(); let mut merged_terms = TermMerger::from(&self.readers[..]); diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs index b75663927..35d10ef8d 100644 --- a/src/indexer/segment_serializer.rs +++ b/src/indexer/segment_serializer.rs @@ -4,7 +4,7 @@ use core::Segment; use core::SegmentComponent; use fastfield::FastFieldSerializer; use store::StoreWriter; -use postings::PostingsSerializer; +use postings::InvertedIndexSerializer; /// Segment serializer is in charge of laying out on disk @@ -13,7 +13,7 @@ pub struct SegmentSerializer { store_writer: StoreWriter, fast_field_serializer: FastFieldSerializer, fieldnorms_serializer: FastFieldSerializer, - postings_serializer: PostingsSerializer, + postings_serializer: InvertedIndexSerializer, } impl SegmentSerializer { @@ -27,7 +27,7 @@ impl SegmentSerializer { let fieldnorms_write = try!(segment.open_write(SegmentComponent::FIELDNORMS)); let fieldnorms_serializer = try!(FastFieldSerializer::new(fieldnorms_write)); - let postings_serializer = try!(PostingsSerializer::open(segment)); + let postings_serializer = try!(InvertedIndexSerializer::open(segment)); Ok(SegmentSerializer { postings_serializer: postings_serializer, store_writer: StoreWriter::new(store_write), @@ -37,7 +37,7 @@ impl SegmentSerializer { } /// Accessor to the `PostingsSerializer`. 
- pub fn get_postings_serializer(&mut self) -> &mut PostingsSerializer { + pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer { &mut self.postings_serializer } diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 21cfa6777..06e893646 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -17,7 +17,7 @@ mod segment_postings_option; pub use self::docset::{SkipResult, DocSet}; use self::recorder::{Recorder, NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder}; -pub use self::serializer::PostingsSerializer; +pub use self::serializer::InvertedIndexSerializer; pub(crate) use self::postings_writer::MultiFieldPostingsWriter; pub use self::term_info::TermInfo; pub use self::postings::Postings; @@ -58,7 +58,7 @@ mod tests { let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut segment = index.new_segment(); - let mut posting_serializer = PostingsSerializer::open(&mut segment).unwrap(); + let mut posting_serializer = InvertedIndexSerializer::open(&mut segment).unwrap(); posting_serializer.new_field(text_field); posting_serializer.new_term("abc".as_bytes()).unwrap(); for doc_id in 0u32..120u32 { diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 67a8f9c5e..0a995a889 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -1,7 +1,7 @@ use DocId; use schema::Term; use schema::FieldValue; -use postings::PostingsSerializer; +use postings::InvertedIndexSerializer; use std::io; use postings::Recorder; use analyzer::SimpleTokenizer; @@ -78,7 +78,7 @@ impl<'a> MultiFieldPostingsWriter<'a> { /// It pushes all term, one field at a time, towards the /// postings serializer. #[allow(needless_range_loop)] - pub fn serialize(&self, serializer: &mut PostingsSerializer) -> Result<()> { + pub fn serialize(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> { let mut term_offsets: Vec<(&[u8], u32)> = self.term_index.iter().collect(); term_offsets.sort_by_key(|&(k, _v)| k); @@ -138,7 +138,7 @@ pub trait PostingsWriter { fn serialize(&self, field: Field, term_addrs: &[(&[u8], u32)], - serializer: &mut PostingsSerializer, + serializer: &mut InvertedIndexSerializer, heap: &Heap) -> io::Result<()>; @@ -216,7 +216,7 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<' fn serialize(&self, field: Field, term_addrs: &[(&[u8], u32)], - serializer: &mut PostingsSerializer, + serializer: &mut InvertedIndexSerializer, heap: &Heap) -> io::Result<()> { serializer.new_field(field); diff --git a/src/postings/recorder.rs b/src/postings/recorder.rs index c340d13fd..d7f91d35c 100644 --- a/src/postings/recorder.rs +++ b/src/postings/recorder.rs @@ -1,6 +1,6 @@ use DocId; use std::io; -use postings::PostingsSerializer; +use postings::InvertedIndexSerializer; use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable}; const EMPTY_ARRAY: [u32; 0] = [0u32; 0]; @@ -29,7 +29,7 @@ pub trait Recorder: HeapAllocable { /// Pushes the postings information to the serializer. 
fn serialize(&self, self_addr: u32, - serializer: &mut PostingsSerializer, + serializer: &mut InvertedIndexSerializer, heap: &Heap) -> io::Result<()>; } @@ -66,7 +66,7 @@ impl Recorder for NothingRecorder { fn serialize(&self, self_addr: u32, - serializer: &mut PostingsSerializer, + serializer: &mut InvertedIndexSerializer, heap: &Heap) -> io::Result<()> { for doc in self.stack.iter(self_addr, heap) { @@ -118,7 +118,7 @@ impl Recorder for TermFrequencyRecorder { fn serialize(&self, self_addr: u32, - serializer: &mut PostingsSerializer, + serializer: &mut InvertedIndexSerializer, heap: &Heap) -> io::Result<()> { // the last document has not been closed... @@ -173,7 +173,7 @@ impl Recorder for TFAndPositionRecorder { fn serialize(&self, self_addr: u32, - serializer: &mut PostingsSerializer, + serializer: &mut InvertedIndexSerializer, heap: &Heap) -> io::Result<()> { let mut doc_positions = Vec::with_capacity(100); diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 5c5e93a7d..c4b9c1146 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -47,29 +47,125 @@ use termdict::TermDictionaryBuilder; /// /// A description of the serialization format is /// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html). -pub struct PostingsSerializer { +pub struct InvertedIndexSerializer { terms_fst_builder: TermDictionaryBuilderImpl, + postings_serializer: PostingsSerializer, + positions_serializer: PositionSerializer, + schema: Schema, + + term_open: bool, + text_indexing_options: TextIndexingOptions, + + current_term_info: TermInfo, + +} + +struct PostingsSerializer { postings_write: CountingWriter, last_doc_id_encoded: u32, - positions_writer: PositionWriter, + block_encoder: BlockEncoder, doc_ids: Vec, term_freqs: Vec, - schema: Schema, - text_indexing_options: TextIndexingOptions, - term_open: bool, - current_term_info: TermInfo, + + termfreq_enabled: bool, } -struct PositionWriter { +impl PostingsSerializer { + fn new(write: WritePtr) -> PostingsSerializer { + PostingsSerializer { + postings_write: CountingWriter::wrap(write), + + block_encoder: BlockEncoder::new(), + doc_ids: vec!(), + term_freqs: vec!(), + + last_doc_id_encoded: 0u32, + termfreq_enabled: false, + } + } + + fn write_doc(&mut self, doc_id: DocId, term_freq: u32) -> io::Result<()> { + self.doc_ids.push(doc_id); + if self.termfreq_enabled { + self.term_freqs.push(term_freq as u32); + } + if self.doc_ids.len() == NUM_DOCS_PER_BLOCK { + { + // encode the doc ids + let block_encoded: &[u8] = + self.block_encoder + .compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded); + self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1]; + self.postings_write.write_all(block_encoded)?; + } + if self.termfreq_enabled { + // encode the term_freqs + let block_encoded: &[u8] = self.block_encoder + .compress_block_unsorted(&self.term_freqs); + self.postings_write.write_all(block_encoded)?; + self.term_freqs.clear(); + } + self.doc_ids.clear(); + } + Ok(()) + } + + fn set_termfreq_enabled(&mut self, termfreq_enabled: bool) { + self.termfreq_enabled = termfreq_enabled; + } + + fn close_term(&mut self) -> io::Result<()> { + if !self.doc_ids.is_empty() { + // we have doc ids waiting to be written + // this happens when the number of doc ids is + // not a perfect multiple of our block size. + // + // In that case, the remaining part is encoded + // using variable int encoding. 
+ { + let block_encoded = + self.block_encoder + .compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded); + self.postings_write.write_all(block_encoded)?; + self.doc_ids.clear(); + } + // ... Idem for term frequencies + if self.termfreq_enabled { + let block_encoded = self.block_encoder + .compress_vint_unsorted(&self.term_freqs[..]); + self.postings_write.write_all(block_encoded)?; + self.term_freqs.clear(); + } + } + Ok(()) + } + + fn close(mut self) -> io::Result<()> { + self.postings_write.flush() + } + + + fn addr(&self) -> u32 { + self.postings_write.written_bytes() as u32 + } + + fn clear(&mut self) { + self.doc_ids.clear(); + self.term_freqs.clear(); + self.last_doc_id_encoded = 0; + } +} + +struct PositionSerializer { buffer: Vec, write: CountingWriter, block_encoder: BlockEncoder, } -impl PositionWriter { - fn new(write: WritePtr) -> PositionWriter { - PositionWriter { +impl PositionSerializer { + fn new(write: WritePtr) -> PositionSerializer { + PositionSerializer { buffer: Vec::with_capacity(NUM_DOCS_PER_BLOCK), write: CountingWriter::wrap(write), block_encoder: BlockEncoder::new(), @@ -108,37 +204,33 @@ impl PositionWriter { } } -impl PostingsSerializer { +impl InvertedIndexSerializer { /// Open a new `PostingsSerializer` for the given segment pub fn new(terms_write: WritePtr, postings_write: WritePtr, positions_write: WritePtr, schema: Schema) - -> Result { + -> Result { let terms_fst_builder = TermDictionaryBuilderImpl::new(terms_write)?; - Ok(PostingsSerializer { - terms_fst_builder: terms_fst_builder, - postings_write: CountingWriter::wrap(postings_write), - positions_writer: PositionWriter::new(positions_write), - last_doc_id_encoded: 0u32, - block_encoder: BlockEncoder::new(), - doc_ids: Vec::new(), - term_freqs: Vec::new(), - schema: schema, - text_indexing_options: TextIndexingOptions::Unindexed, - term_open: false, - current_term_info: TermInfo::default(), - }) + Ok(InvertedIndexSerializer { + terms_fst_builder: terms_fst_builder, + positions_serializer: PositionSerializer::new(positions_write), + postings_serializer: PostingsSerializer::new(postings_write), + schema: schema, + term_open: false, + current_term_info: TermInfo::default(), + text_indexing_options: TextIndexingOptions::Untokenized, + }) } /// Open a new `PostingsSerializer` for the given segment - pub fn open(segment: &mut Segment) -> Result { + pub fn open(segment: &mut Segment) -> Result { use SegmentComponent::{TERMS, POSTINGS, POSITIONS}; - PostingsSerializer::new(segment.open_write(TERMS)?, - segment.open_write(POSTINGS)?, - segment.open_write(POSITIONS)?, - segment.schema()) + InvertedIndexSerializer::new(segment.open_write(TERMS)?, + segment.open_write(POSTINGS)?, + segment.open_write(POSITIONS)?, + segment.schema()) } /// Must be called before starting pushing terms of @@ -158,6 +250,17 @@ impl PostingsSerializer { } } }; + self.postings_serializer.set_termfreq_enabled(self.text_indexing_options.is_termfreq_enabled()); + } + + fn current_term_info(&self) -> TermInfo { + let (filepos, offset) = self.positions_serializer.addr(); + TermInfo { + doc_freq: 0, + postings_offset: self.postings_serializer.addr(), + positions_offset: filepos, + positions_inner_offset: offset, + } } /// Starts the postings for a new term. 
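// Illustration (not part of this commit): the new `PostingsSerializer` above
// buffers doc ids and term frequencies, flushes a compressed block every 128
// documents, and lets `close_term` encode the incomplete remainder with VInt.
// The sketch below models that layout decision with invented helpers; the
// delta handling mirrors `compress_block_sorted` / `compress_vint_sorted`, but
// the varint here is a generic LEB128, not necessarily the crate's exact
// byte format.
const BLOCK_LEN: usize = 128; // mirrors NUM_DOCS_PER_BLOCK

fn vint(mut val: u32, out: &mut Vec<u8>) {
    // 7 bits per byte, high bit set when more bytes follow.
    loop {
        let byte = (val & 127) as u8;
        val >>= 7;
        if val == 0 {
            out.push(byte);
            return;
        }
        out.push(byte | 128);
    }
}

// Splits a sorted posting list into full blocks of `BLOCK_LEN` delta-encoded
// doc ids (the real serializer then bit-packs each block) plus a varint tail
// for the remaining documents.
fn layout_postings(doc_ids: &[u32]) -> (Vec<Vec<u32>>, Vec<u8>) {
    let full_len = (doc_ids.len() / BLOCK_LEN) * BLOCK_LEN;
    let mut last = 0u32;
    let mut blocks = Vec::new();
    for block in doc_ids[..full_len].chunks(BLOCK_LEN) {
        let deltas: Vec<u32> = block
            .iter()
            .map(|&doc| {
                let delta = doc - last;
                last = doc;
                delta
            })
            .collect();
        blocks.push(deltas);
    }
    let mut tail = Vec::new();
    for &doc in &doc_ids[full_len..] {
        vint(doc - last, &mut tail);
        last = doc;
    }
    (blocks, tail)
}

fn main() {
    let docs: Vec<u32> = (0..300u32).map(|i| i * 7).collect();
    let (blocks, tail) = layout_postings(&docs);
    assert_eq!(blocks.len(), 2); // 256 docs go into two full blocks
    assert!(!tail.is_empty());   // the 44 remaining docs are varint-encoded
}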
@@ -169,16 +272,8 @@ impl PostingsSerializer { panic!("Called new_term, while the previous term was not closed."); } self.term_open = true; - self.doc_ids.clear(); - self.last_doc_id_encoded = 0; - self.term_freqs.clear(); - let (filepos, offset) = self.positions_writer.addr(); - self.current_term_info = TermInfo { - doc_freq: 0, - postings_offset: self.postings_write.written_bytes() as u32, - positions_offset: filepos, - positions_inner_offset: offset, - }; + self.postings_serializer.clear(); + self.current_term_info = self.current_term_info(); self.terms_fst_builder.insert_key(term) } @@ -188,32 +283,8 @@ impl PostingsSerializer { /// using `VInt` encoding. pub fn close_term(&mut self) -> io::Result<()> { if self.term_open { - - self.terms_fst_builder - .insert_value(&self.current_term_info)?; - - if !self.doc_ids.is_empty() { - // we have doc ids waiting to be written - // this happens when the number of doc ids is - // not a perfect multiple of our block size. - // - // In that case, the remaining part is encoded - // using variable int encoding. - { - let block_encoded = - self.block_encoder - .compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded); - self.postings_write.write_all(block_encoded)?; - self.doc_ids.clear(); - } - // ... Idem for term frequencies - if self.text_indexing_options.is_termfreq_enabled() { - let block_encoded = self.block_encoder - .compress_vint_unsorted(&self.term_freqs[..]); - self.postings_write.write_all(block_encoded)?; - self.term_freqs.clear(); - } - } + self.terms_fst_builder.insert_value(&self.current_term_info)?; + self.postings_serializer.close_term()?; self.term_open = false; } Ok(()) @@ -235,31 +306,11 @@ impl PostingsSerializer { position_deltas: &[u32]) -> io::Result<()> { self.current_term_info.doc_freq += 1; - self.doc_ids.push(doc_id); - if self.text_indexing_options.is_termfreq_enabled() { - self.term_freqs.push(term_freq as u32); - } + self.postings_serializer.write_doc(doc_id, term_freq)?; if self.text_indexing_options.is_position_enabled() { - self.positions_writer.write(position_deltas)?; - } - if self.doc_ids.len() == NUM_DOCS_PER_BLOCK { - { - // encode the doc ids - let block_encoded: &[u8] = - self.block_encoder - .compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded); - self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1]; - self.postings_write.write_all(block_encoded)?; - } - if self.text_indexing_options.is_termfreq_enabled() { - // encode the term_freqs - let block_encoded: &[u8] = self.block_encoder - .compress_block_unsorted(&self.term_freqs); - self.postings_write.write_all(block_encoded)?; - self.term_freqs.clear(); - } - self.doc_ids.clear(); + self.positions_serializer.write(position_deltas)?; } + Ok(()) } @@ -267,8 +318,8 @@ impl PostingsSerializer { pub fn close(mut self) -> io::Result<()> { self.close_term()?; self.terms_fst_builder.finish()?; - self.postings_write.flush()?; - self.positions_writer.close()?; + self.postings_serializer.close()?; + self.positions_serializer.close()?; Ok(()) } } From 8f377b92d009df66c2e9e954f8628f7cc5689454 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 11 Aug 2017 18:11:32 +0900 Subject: [PATCH 09/29] introducing a field serializer --- src/indexer/merger.rs | 187 ++++++++++---------- src/postings/mod.rs | 17 +- src/postings/postings_writer.rs | 19 +- src/postings/recorder.rs | 14 +- src/postings/serializer.rs | 299 +++++++++++++++++++------------- 5 files changed, 298 insertions(+), 238 deletions(-) diff --git a/src/indexer/merger.rs 
b/src/indexer/merger.rs index 6318a17c9..468d867e7 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -19,7 +19,6 @@ use store::StoreWriter; use std::cmp::{min, max}; use schema::Term; use termdict::TermStreamer; -use postings::SegmentPostingsOption; pub struct IndexMerger { schema: Schema, @@ -215,103 +214,115 @@ impl IndexMerger { merged_doc_id_map.push(segment_local_map); } - let mut last_field: Option = None; + // Create the total list of doc ids + // by stacking the doc ids from the different segment. + // + // In the new segments, the doc id from the different + // segment are stacked so that : + // - Segment 0's doc ids become doc id [0, seg.max_doc] + // - Segment 1's doc ids become [seg0.max_doc, seg0.max_doc + seg.max_doc] + // - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, + // seg0.max_doc + seg1.max_doc + seg2.max_doc] + // ... + if !merged_terms.advance() { + return Ok(()); + } - let mut segment_postings_option = SegmentPostingsOption::FreqAndPositions; + let mut current_field = Term::wrap(merged_terms.key()).field(); - while merged_terms.advance() { + loop { + // this loop processes all fields. + let mut field_serializer = serializer.new_field(current_field); - // Create the total list of doc ids - // by stacking the doc ids from the different segment. - // - // In the new segments, the doc id from the different - // segment are stacked so that : - // - Segment 0's doc ids become doc id [0, seg.max_doc] - // - Segment 1's doc ids become [seg0.max_doc, seg0.max_doc + seg.max_doc] - // - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, - // seg0.max_doc + seg1.max_doc + seg2.max_doc] - // ... - let term = Term::wrap(merged_terms.key()); - let current_field = term.field(); - - if last_field != Some(current_field) { - // we reached a new field. - let field_entry = self.schema.get_field_entry(current_field); - // ... set segment postings option the new field. - segment_postings_option = field_entry - .field_type() - .get_segment_postings_option() - .expect("Encountered a field that is not supposed to be + // we reached a new field. + let field_entry = self.schema.get_field_entry(current_field); + // ... set segment postings option the new field. + let segment_postings_option = field_entry + .field_type() + .get_segment_postings_option() + .expect("Encountered a field that is not supposed to be indexed. Have you modified the schema?"); - last_field = Some(current_field); + loop { + // this loops processes a field. + { + let term = Term::wrap(merged_terms.key()); - // it is perfectly safe to call `.new_field` - // even if there is no postings associated. 
- serializer.new_field(current_field); - } + // Let's compute the list of non-empty posting lists + let segment_postings: Vec<_> = merged_terms + .current_kvs() + .iter() + .flat_map(|heap_item| { + let segment_ord = heap_item.segment_ord; + let term_info = heap_item.streamer.value(); + let segment_reader = &self.readers[heap_item.segment_ord]; + let mut segment_postings = + segment_reader + .read_postings_from_terminfo(term_info, segment_postings_option); + if segment_postings.advance() { + Some((segment_ord, segment_postings)) + } else { + None + } + }) + .collect(); - // Let's compute the list of non-empty posting lists - let segment_postings: Vec<_> = merged_terms - .current_kvs() - .iter() - .flat_map(|heap_item| { - let segment_ord = heap_item.segment_ord; - let term_info = heap_item.streamer.value(); - let segment_reader = &self.readers[heap_item.segment_ord]; - let mut segment_postings = - segment_reader - .read_postings_from_terminfo(term_info, segment_postings_option); - if segment_postings.advance() { - Some((segment_ord, segment_postings)) - } else { - None + // At this point, `segment_postings` contains the posting list + // of all of the segments containing the given term. + // + // These segments are non-empty and advance has already been called. + + if !segment_postings.is_empty() { + // If not, the `term` will be entirely removed. + + // We know that there is at least one document containing + // the term, so we add it. + field_serializer.new_term(term.as_ref())?; + + // We can now serialize this postings, by pushing each document to the + // postings serializer. + + for (segment_ord, mut segment_postings) in segment_postings { + let old_to_new_doc_id = &merged_doc_id_map[segment_ord]; + loop { + // `.advance()` has been called once before the loop. + // Hence we cannot use a `while segment_postings.advance()` loop. + if let Some(remapped_doc_id) = + old_to_new_doc_id[segment_postings.doc() as usize] { + // we make sure to only write the term iff + // there is at least one document. + let positions: &[u32] = segment_postings.positions(); + let term_freq = segment_postings.term_freq(); + let delta_positions = delta_computer.compute_delta(positions); + field_serializer + .write_doc(remapped_doc_id, term_freq, delta_positions)?; + } + if !segment_postings.advance() { + break; + } + } + } + + // closing the term. + field_serializer.close_term()?; } - }) - .collect(); - // At this point, `segment_postings` contains the posting list - // of all of the segments containing the given term. - // - // These segments are non-empty and advance has already been called. + } - if segment_postings.is_empty() { - // by continuing here, the `term` will be entirely removed. - continue; - } - // We know that there is at least one document containing - // the term, so we add it. - serializer.new_term(term.as_ref())?; + if !merged_terms.advance() { + return Ok(()) + } - // We can now serialize this postings, by pushing each document to the - // postings serializer. - - for (segment_ord, mut segment_postings) in segment_postings { - let old_to_new_doc_id = &merged_doc_id_map[segment_ord]; - loop { - // `.advance()` has been called once before the loop. - // Hence we cannot use a `while segment_postings.advance()` loop. - if let Some(remapped_doc_id) = - old_to_new_doc_id[segment_postings.doc() as usize] { - // we make sure to only write the term iff - // there is at least one document. 
- let positions: &[u32] = segment_postings.positions(); - let term_freq = segment_postings.term_freq(); - let delta_positions = delta_computer.compute_delta(positions); - serializer - .write_doc(remapped_doc_id, term_freq, delta_positions)?; - } - if !segment_postings.advance() { + { + let next_term_field = Term::wrap(merged_terms.key()).field(); + if next_term_field != current_field { + current_field = next_term_field; break; } } } - - // closing the term. - serializer.close_term()?; } - Ok(()) } fn write_storable_fields(&self, store_writer: &mut StoreWriter) -> Result<()> { @@ -319,9 +330,9 @@ impl IndexMerger { let store_reader = reader.get_store_reader(); for doc_id in 0..reader.max_doc() { if !reader.is_deleted(doc_id) { - let doc = try!(store_reader.get(doc_id)); + let doc = store_reader.get(doc_id)?; let field_values: Vec<&FieldValue> = doc.field_values().iter().collect(); - try!(store_writer.store(&field_values)); + store_writer.store(&field_values)?; } } } @@ -331,11 +342,11 @@ impl IndexMerger { impl SerializableSegment for IndexMerger { fn write(&self, mut serializer: SegmentSerializer) -> Result { - try!(self.write_postings(serializer.get_postings_serializer())); - try!(self.write_fieldnorms(serializer.get_fieldnorms_serializer())); - try!(self.write_fast_fields(serializer.get_fast_field_serializer())); - try!(self.write_storable_fields(serializer.get_store_writer())); - try!(serializer.close()); + self.write_postings(serializer.get_postings_serializer())?; + self.write_fieldnorms(serializer.get_fieldnorms_serializer())?; + self.write_fast_fields(serializer.get_fast_field_serializer())?; + self.write_storable_fields(serializer.get_store_writer())?; + serializer.close()?; Ok(self.max_doc) } } diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 06e893646..fd78cbded 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -17,8 +17,9 @@ mod segment_postings_option; pub use self::docset::{SkipResult, DocSet}; use self::recorder::{Recorder, NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder}; -pub use self::serializer::InvertedIndexSerializer; +pub use self::serializer::{InvertedIndexSerializer, FieldSerializer}; pub(crate) use self::postings_writer::MultiFieldPostingsWriter; + pub use self::term_info::TermInfo; pub use self::postings::Postings; @@ -59,13 +60,15 @@ mod tests { let index = Index::create_in_ram(schema); let mut segment = index.new_segment(); let mut posting_serializer = InvertedIndexSerializer::open(&mut segment).unwrap(); - posting_serializer.new_field(text_field); - posting_serializer.new_term("abc".as_bytes()).unwrap(); - for doc_id in 0u32..120u32 { - let delta_positions = vec![1, 2, 3, 2]; - posting_serializer.write_doc(doc_id, 2, &delta_positions).unwrap(); + { + let mut field_serializer = posting_serializer.new_field(text_field); + field_serializer.new_term("abc".as_bytes()).unwrap(); + for doc_id in 0u32..120u32 { + let delta_positions = vec![1, 2, 3, 2]; + field_serializer.write_doc(doc_id, 2, &delta_positions).unwrap(); + } + field_serializer.close_term().unwrap(); } - posting_serializer.close_term().unwrap(); posting_serializer.close().unwrap(); let read = segment.open_read(SegmentComponent::POSITIONS).unwrap(); assert!(read.len() <= 140); diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 0a995a889..813073b4c 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -1,7 +1,7 @@ use DocId; use schema::Term; use schema::FieldValue; -use postings::InvertedIndexSerializer; 
+use postings::{InvertedIndexSerializer, FieldSerializer}; use std::io; use postings::Recorder; use analyzer::SimpleTokenizer; @@ -101,8 +101,8 @@ impl<'a> MultiFieldPostingsWriter<'a> { let (field, start) = offsets[i]; let (_, stop) = offsets[i + 1]; let postings_writer = &self.per_field_postings_writers[field.0 as usize]; - postings_writer - .serialize(field, &term_offsets[start..stop], serializer, self.heap)?; + let field_serializer = serializer.new_field(field); + postings_writer.serialize(&term_offsets[start..stop], field_serializer, self.heap)?; } Ok(()) } @@ -136,9 +136,8 @@ pub trait PostingsWriter { /// Serializes the postings on disk. /// The actual serialization format is handled by the `PostingsSerializer`. fn serialize(&self, - field: Field, term_addrs: &[(&[u8], u32)], - serializer: &mut InvertedIndexSerializer, + serializer: FieldSerializer, heap: &Heap) -> io::Result<()>; @@ -214,17 +213,15 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<' } fn serialize(&self, - field: Field, term_addrs: &[(&[u8], u32)], - serializer: &mut InvertedIndexSerializer, + mut serializer: FieldSerializer, heap: &Heap) -> io::Result<()> { - serializer.new_field(field); for &(term_bytes, addr) in term_addrs { let recorder: &mut Rec = self.heap.get_mut_ref(addr); - try!(serializer.new_term(term_bytes)); - try!(recorder.serialize(addr, serializer, heap)); - try!(serializer.close_term()); + serializer.new_term(term_bytes)?; + recorder.serialize(addr, &mut serializer, heap)?; + serializer.close_term()?; } Ok(()) } diff --git a/src/postings/recorder.rs b/src/postings/recorder.rs index d7f91d35c..dde85d66c 100644 --- a/src/postings/recorder.rs +++ b/src/postings/recorder.rs @@ -1,6 +1,6 @@ use DocId; use std::io; -use postings::InvertedIndexSerializer; +use postings::FieldSerializer; use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable}; const EMPTY_ARRAY: [u32; 0] = [0u32; 0]; @@ -29,7 +29,7 @@ pub trait Recorder: HeapAllocable { /// Pushes the postings information to the serializer. fn serialize(&self, self_addr: u32, - serializer: &mut InvertedIndexSerializer, + serializer: &mut FieldSerializer, heap: &Heap) -> io::Result<()>; } @@ -66,11 +66,11 @@ impl Recorder for NothingRecorder { fn serialize(&self, self_addr: u32, - serializer: &mut InvertedIndexSerializer, + serializer: &mut FieldSerializer, heap: &Heap) -> io::Result<()> { for doc in self.stack.iter(self_addr, heap) { - try!(serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)); + serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?; } Ok(()) } @@ -118,7 +118,7 @@ impl Recorder for TermFrequencyRecorder { fn serialize(&self, self_addr: u32, - serializer: &mut InvertedIndexSerializer, + serializer: &mut FieldSerializer, heap: &Heap) -> io::Result<()> { // the last document has not been closed... 
@@ -173,7 +173,7 @@ impl Recorder for TFAndPositionRecorder { fn serialize(&self, self_addr: u32, - serializer: &mut InvertedIndexSerializer, + serializer: &mut FieldSerializer, heap: &Heap) -> io::Result<()> { let mut doc_positions = Vec::with_capacity(100); @@ -189,7 +189,7 @@ impl Recorder for TFAndPositionRecorder { prev_position = position; } } - try!(serializer.write_doc(doc, doc_positions.len() as u32, &doc_positions)); + serializer.write_doc(doc, doc_positions.len() as u32, &doc_positions)?; } Ok(()) } diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index c4b9c1146..c3f5f101e 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -52,14 +52,182 @@ pub struct InvertedIndexSerializer { postings_serializer: PostingsSerializer, positions_serializer: PositionSerializer, schema: Schema, - - term_open: bool, - text_indexing_options: TextIndexingOptions, - - current_term_info: TermInfo, - } + +impl InvertedIndexSerializer { + /// Open a new `PostingsSerializer` for the given segment + fn new(terms_write: WritePtr, + postings_write: WritePtr, + positions_write: WritePtr, + schema: Schema) + -> Result { + let terms_fst_builder = TermDictionaryBuilderImpl::new(terms_write)?; + Ok(InvertedIndexSerializer { + terms_fst_builder: terms_fst_builder, + positions_serializer: PositionSerializer::new(positions_write), + postings_serializer: PostingsSerializer::new(postings_write), + schema: schema, + }) + } + + + /// Open a new `PostingsSerializer` for the given segment + pub fn open(segment: &mut Segment) -> Result { + use SegmentComponent::{TERMS, POSTINGS, POSITIONS}; + InvertedIndexSerializer::new(segment.open_write(TERMS)?, + segment.open_write(POSTINGS)?, + segment.open_write(POSITIONS)?, + segment.schema()) + } + + /// Must be called before starting pushing terms of + /// a given field. + /// + /// Loads the indexing options for the given field. + pub fn new_field(&mut self, field: Field) -> FieldSerializer { + let field_entry: &FieldEntry = self.schema.get_field_entry(field); + let text_indexing_options = match *field_entry.field_type() { + FieldType::Str(ref text_options) => text_options.get_indexing_options(), + FieldType::U64(ref int_options) | + FieldType::I64(ref int_options) => { + if int_options.is_indexed() { + TextIndexingOptions::Unindexed + } else { + TextIndexingOptions::Untokenized + } + } + }; + FieldSerializer::new( + text_indexing_options, + &mut self.terms_fst_builder, + &mut self.postings_serializer, + &mut self.positions_serializer, + ) + } + + /// Closes the serializer. 
+ pub fn close(self) -> io::Result<()> { + self.terms_fst_builder.finish()?; + self.postings_serializer.close()?; + self.positions_serializer.close()?; + Ok(()) + } +} + + +/* +let field_entry: &FieldEntry = self.schema.get_field_entry(field); +self.text_indexing_options = match *field_entry.field_type() { + FieldType::Str(ref text_options) => text_options.get_indexing_options(), + FieldType::U64(ref int_options) | + FieldType::I64(ref int_options) => { + if int_options.is_indexed() { + TextIndexingOptions::Unindexed + } else { + TextIndexingOptions::Untokenized + } + } +}; +self.postings_serializer.set_termfreq_enabled(self.text_indexing_options.is_termfreq_enabled()); + + */ + +pub struct FieldSerializer<'a> { + text_indexing_options: TextIndexingOptions, + terms_fst_builder: &'a mut TermDictionaryBuilderImpl, + postings_serializer: &'a mut PostingsSerializer, + positions_serializer: &'a mut PositionSerializer, + current_term_info: TermInfo, + term_open: bool, +} + + +impl<'a> FieldSerializer<'a> { + + fn new( + text_indexing_options: TextIndexingOptions, + terms_fst_builder: &'a mut TermDictionaryBuilderImpl, + postings_serializer: &'a mut PostingsSerializer, + positions_serializer: &'a mut PositionSerializer + ) -> FieldSerializer<'a> { + + postings_serializer.set_termfreq_enabled(text_indexing_options.is_termfreq_enabled()); + + FieldSerializer { + text_indexing_options: text_indexing_options, + terms_fst_builder: terms_fst_builder, + postings_serializer: postings_serializer, + positions_serializer: positions_serializer, + current_term_info: TermInfo::default(), + term_open: false, + } + } + + fn current_term_info(&self) -> TermInfo { + let (filepos, offset) = self.positions_serializer.addr(); + TermInfo { + doc_freq: 0, + postings_offset: self.postings_serializer.addr(), + positions_offset: filepos, + positions_inner_offset: offset, + } + } + + /// Starts the postings for a new term. + /// * term - the term. It needs to come after the previous term according + /// to the lexicographical order. + /// * doc_freq - return the number of document containing the term. + pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> { + if self.term_open { + panic!("Called new_term, while the previous term was not closed."); + } + self.term_open = true; + self.postings_serializer.clear(); + self.current_term_info = self.current_term_info(); + self.terms_fst_builder.insert_key(term) + } + + /// Serialize the information that a document contains the current term, + /// its term frequency, and the position deltas. + /// + /// At this point, the positions are already `delta-encoded`. + /// For instance, if the positions are `2, 3, 17`, + /// `position_deltas` is `2, 1, 14` + /// + /// Term frequencies and positions may be ignored by the serializer depending + /// on the configuration of the field in the `Schema`. + pub fn write_doc(&mut self, + doc_id: DocId, + term_freq: u32, + position_deltas: &[u32]) + -> io::Result<()> { + self.current_term_info.doc_freq += 1; + self.postings_serializer.write_doc(doc_id, term_freq)?; + if self.text_indexing_options.is_position_enabled() { + self.positions_serializer.write(position_deltas)?; + } + Ok(()) + } + + /// Finish the serialization for this term postings. + /// + /// If the current block is incomplete, it need to be encoded + /// using `VInt` encoding. 
+ pub fn close_term(&mut self) -> io::Result<()> { + if self.term_open { + self.terms_fst_builder.insert_value(&self.current_term_info)?; + self.postings_serializer.close_term()?; + self.term_open = false; + } + Ok(()) + } +} + +// TODO is the last term always closed? + + + struct PostingsSerializer { postings_write: CountingWriter, last_doc_id_encoded: u32, @@ -204,122 +372,3 @@ impl PositionSerializer { } } -impl InvertedIndexSerializer { - /// Open a new `PostingsSerializer` for the given segment - pub fn new(terms_write: WritePtr, - postings_write: WritePtr, - positions_write: WritePtr, - schema: Schema) - -> Result { - let terms_fst_builder = TermDictionaryBuilderImpl::new(terms_write)?; - Ok(InvertedIndexSerializer { - terms_fst_builder: terms_fst_builder, - positions_serializer: PositionSerializer::new(positions_write), - postings_serializer: PostingsSerializer::new(postings_write), - schema: schema, - term_open: false, - current_term_info: TermInfo::default(), - text_indexing_options: TextIndexingOptions::Untokenized, - }) - } - - - /// Open a new `PostingsSerializer` for the given segment - pub fn open(segment: &mut Segment) -> Result { - use SegmentComponent::{TERMS, POSTINGS, POSITIONS}; - InvertedIndexSerializer::new(segment.open_write(TERMS)?, - segment.open_write(POSTINGS)?, - segment.open_write(POSITIONS)?, - segment.schema()) - } - - /// Must be called before starting pushing terms of - /// a given field. - /// - /// Loads the indexing options for the given field. - pub fn new_field(&mut self, field: Field) { - let field_entry: &FieldEntry = self.schema.get_field_entry(field); - self.text_indexing_options = match *field_entry.field_type() { - FieldType::Str(ref text_options) => text_options.get_indexing_options(), - FieldType::U64(ref int_options) | - FieldType::I64(ref int_options) => { - if int_options.is_indexed() { - TextIndexingOptions::Unindexed - } else { - TextIndexingOptions::Untokenized - } - } - }; - self.postings_serializer.set_termfreq_enabled(self.text_indexing_options.is_termfreq_enabled()); - } - - fn current_term_info(&self) -> TermInfo { - let (filepos, offset) = self.positions_serializer.addr(); - TermInfo { - doc_freq: 0, - postings_offset: self.postings_serializer.addr(), - positions_offset: filepos, - positions_inner_offset: offset, - } - } - - /// Starts the postings for a new term. - /// * term - the term. It needs to come after the previous term according - /// to the lexicographical order. - /// * doc_freq - return the number of document containing the term. - pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> { - if self.term_open { - panic!("Called new_term, while the previous term was not closed."); - } - self.term_open = true; - self.postings_serializer.clear(); - self.current_term_info = self.current_term_info(); - self.terms_fst_builder.insert_key(term) - } - - /// Finish the serialization for this term postings. - /// - /// If the current block is incomplete, it need to be encoded - /// using `VInt` encoding. - pub fn close_term(&mut self) -> io::Result<()> { - if self.term_open { - self.terms_fst_builder.insert_value(&self.current_term_info)?; - self.postings_serializer.close_term()?; - self.term_open = false; - } - Ok(()) - } - - - /// Serialize the information that a document contains the current term, - /// its term frequency, and the position deltas. - /// - /// At this point, the positions are already `delta-encoded`. 
- /// For instance, if the positions are `2, 3, 17`, - /// `position_deltas` is `2, 1, 14` - /// - /// Term frequencies and positions may be ignored by the serializer depending - /// on the configuration of the field in the `Schema`. - pub fn write_doc(&mut self, - doc_id: DocId, - term_freq: u32, - position_deltas: &[u32]) - -> io::Result<()> { - self.current_term_info.doc_freq += 1; - self.postings_serializer.write_doc(doc_id, term_freq)?; - if self.text_indexing_options.is_position_enabled() { - self.positions_serializer.write(position_deltas)?; - } - - Ok(()) - } - - /// Closes the serializer. - pub fn close(mut self) -> io::Result<()> { - self.close_term()?; - self.terms_fst_builder.finish()?; - self.postings_serializer.close()?; - self.positions_serializer.close()?; - Ok(()) - } -} From f9203228be25e6d6fa340e4a9fd2142f0427ea06 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 12 Aug 2017 18:45:59 +0900 Subject: [PATCH 10/29] Using composite file in fast field. --- src/common/composite_file.rs | 159 +++++++++++++++++++++++++++++++++++ src/common/mod.rs | 3 + src/fastfield/mod.rs | 10 +-- src/fastfield/reader.rs | 40 ++------- src/fastfield/serializer.rs | 98 ++++++++++----------- src/fastfield/writer.rs | 7 +- src/indexer/merger.rs | 7 +- 7 files changed, 226 insertions(+), 98 deletions(-) create mode 100644 src/common/composite_file.rs diff --git a/src/common/composite_file.rs b/src/common/composite_file.rs new file mode 100644 index 000000000..bea35f9fa --- /dev/null +++ b/src/common/composite_file.rs @@ -0,0 +1,159 @@ +use std::io::Write; +use common::CountingWriter; +use std::collections::HashMap; +use schema::Field; +use common::VInt; +use std::io; +use directory::ReadOnlySource; +use common::BinarySerializable; + +pub struct CompositeWrite { + write: CountingWriter, + offsets: HashMap, +} + +impl CompositeWrite { + pub fn wrap(w: W) -> CompositeWrite { + CompositeWrite { + write: CountingWriter::wrap(w), + offsets: HashMap::new(), + } + } + + pub fn for_field(&mut self, field: Field) -> &mut CountingWriter { + let offset = self.write.written_bytes(); + assert!(!self.offsets.contains_key(&field)); + self.offsets.insert(field, offset); + &mut self.write + } + + pub fn close(&mut self) -> io::Result<()> { + let footer_offset = self.write.written_bytes(); + VInt(self.offsets.len() as u64).serialize(&mut self.write)?; + + let mut offset_fields: Vec<_> = self.offsets.iter() + .map(|(field, offset)| (offset, field)) + .collect(); + + offset_fields.sort(); + + let mut prev_offset = 0; + for (offset, field) in offset_fields { + VInt( (offset - prev_offset) as u64).serialize(&mut self.write)?; + field.serialize(&mut self.write)?; + prev_offset = *offset; + } + + let footer_len = (self.write.written_bytes() - footer_offset) as u32; + footer_len.serialize(&mut self.write)?; + self.write.flush()?; + Ok(()) + } +} + + +pub struct CompositeFile { + data: ReadOnlySource, + offsets_index: HashMap, +} + +impl CompositeFile { + pub fn open(data: ReadOnlySource) -> io::Result { + let end = data.len(); + let footer_len_data = data.slice(end - 4, end); + let footer_len = u32::deserialize(&mut footer_len_data.as_slice())? 
as usize; + + let footer_start = end - 4 - footer_len; + let footer_data = data.slice(footer_start, footer_start + footer_len); + let mut footer_buffer = footer_data.as_slice(); + let num_fields = VInt::deserialize(&mut footer_buffer)?.0 as usize; + + let mut fields = vec!(); + let mut offsets = vec!(); + + let mut field_index = HashMap::new(); + + let mut offset = 0; + for _ in 0..num_fields { + offset += VInt::deserialize(&mut footer_buffer)?.0 as usize; + let field = Field::deserialize(&mut footer_buffer)?; + offsets.push(offset); + fields.push(field); + } + offsets.push(footer_start); + for i in 0..num_fields { + let field = fields[i]; + let start_offset = offsets[i]; + let end_offset = offsets[i+1]; + field_index.insert(field, (start_offset, end_offset)); + } + + Ok(CompositeFile { + data: data.slice(0, footer_start), + offsets_index: field_index, + }) + } + + pub fn open_read(&self, field: Field) -> Option { + self.offsets_index + .get(&field) + .map(|&(from, to)| { + self.data.slice(from, to) + }) + } +} + + +#[cfg(test)] +mod test { + + use std::io::Write; + use super::{CompositeWrite, CompositeFile}; + use directory::{RAMDirectory, Directory}; + use schema::Field; + use common::VInt; + use common::BinarySerializable; + use std::path::Path; + + #[test] + fn test_composite_file() { + let path = Path::new("test_path"); + let mut directory = RAMDirectory::create(); + { + let w = directory.open_write(path).unwrap(); + let mut composite_write = CompositeWrite::wrap(w); + { + let mut write_0 = composite_write.for_field(Field(0u32)); + VInt(32431123u64).serialize(&mut write_0).unwrap(); + write_0.flush().unwrap(); + } + + { + let mut write_4 = composite_write.for_field(Field(4u32)); + VInt(2).serialize(&mut write_4).unwrap(); + write_4.flush().unwrap(); + } + composite_write.close().unwrap(); + } + { + let r = directory.open_read(path).unwrap(); + let composite_file = CompositeFile::open(r).unwrap(); + { + let file0 = composite_file.open_read(Field(0u32)).unwrap(); + let mut file0_buf = file0.as_slice(); + let payload_0 = VInt::deserialize(&mut file0_buf).unwrap().0; + assert_eq!(file0_buf.len(), 0); + assert_eq!(payload_0, 32431123u64); + } + { + let file4 = composite_file.open_read(Field(4u32)).unwrap(); + let mut file4_buf = file4.as_slice(); + let payload_4 = VInt::deserialize(&mut file4_buf).unwrap().0; + assert_eq!(file4_buf.len(), 0); + assert_eq!(payload_4, 2u64); + } + } + + } + +} \ No newline at end of file diff --git a/src/common/mod.rs b/src/common/mod.rs index 0af9d2417..e8c8763f1 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -2,8 +2,11 @@ mod serialize; mod timer; mod vint; mod counting_writer; +mod composite_file; pub mod bitpacker; + +pub(crate) use self::composite_file::{CompositeWrite, CompositeFile}; pub use self::serialize::BinarySerializable; pub use self::timer::Timing; pub use self::timer::TimerTree; diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 8b47d3a0e..31b241388 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -94,7 +94,7 @@ mod tests { } let source = directory.open_read(&path).unwrap(); { - assert_eq!(source.len(), 38 as usize); + assert_eq!(source.len(), 35 as usize); } { let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); @@ -128,7 +128,7 @@ mod tests { } let source = directory.open_read(&path).unwrap(); { - assert_eq!(source.len(), 63 as usize); + assert_eq!(source.len(), 60 as usize); } { let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); @@ -164,7 +164,7 @@ mod tests { 
} let source = directory.open_read(&path).unwrap(); { - assert_eq!(source.len(), 36 as usize); + assert_eq!(source.len(), 33 as usize); } { let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); @@ -197,7 +197,7 @@ mod tests { } let source = directory.open_read(&path).unwrap(); { - assert_eq!(source.len(), 80044 as usize); + assert_eq!(source.len(), 80041 as usize); } { let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); @@ -233,7 +233,7 @@ mod tests { } let source = directory.open_read(&path).unwrap(); { - assert_eq!(source.len(), 17711 as usize); + assert_eq!(source.len(), 17708 as usize); } { let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index aae1dd797..2ec8f66fc 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -1,6 +1,6 @@ use std::io; -use std::collections::HashMap; use directory::ReadOnlySource; +use common::CompositeFile; use common::BinarySerializable; use DocId; use schema::{Field, SchemaBuilder}; @@ -240,8 +240,7 @@ impl FastFieldReader for I64FastFieldReader { /// It contains a mapping that associated these fields to /// the proper slice in the fastfield reader file. pub struct FastFieldsReader { - source: ReadOnlySource, - field_offsets: HashMap, + composite_file: CompositeFile, } impl FastFieldsReader { @@ -251,31 +250,9 @@ impl FastFieldsReader { /// the list of the offset is read (as a footer of the /// data file). pub fn from_source(source: ReadOnlySource) -> io::Result { - let header_offset; - let field_offsets: Vec<(Field, u32)>; - { - let buffer = source.as_slice(); - { - let mut cursor = buffer; - header_offset = u32::deserialize(&mut cursor)?; - } - { - let mut cursor = &buffer[header_offset as usize..]; - field_offsets = Vec::deserialize(&mut cursor)?; - } - } - let mut end_offsets: Vec = field_offsets.iter().map(|&(_, offset)| offset).collect(); - end_offsets.push(header_offset); - let mut field_offsets_map: HashMap = HashMap::new(); - for (field_start_offsets, stop_offset) in - field_offsets.iter().zip(end_offsets.iter().skip(1)) { - let (field, start_offset) = *field_start_offsets; - field_offsets_map.insert(field, (start_offset, *stop_offset)); - } Ok(FastFieldsReader { - field_offsets: field_offsets_map, - source: source, - }) + composite_file: CompositeFile::open(source)?, + }) } /// Returns the u64 fast value reader if the field @@ -287,11 +264,8 @@ impl FastFieldsReader { /// # Panics /// May panic if the index is corrupted. pub fn open_reader(&self, field: Field) -> Option { - self.field_offsets - .get(&field) - .map(|&(start, stop)| { - let field_source = self.source.slice(start as usize, stop as usize); - FFReader::open(field_source) - }) + self.composite_file + .open_read(field) + .map(FFReader::open) } } diff --git a/src/fastfield/serializer.rs b/src/fastfield/serializer.rs index ef6ffedf9..590aee84a 100644 --- a/src/fastfield/serializer.rs +++ b/src/fastfield/serializer.rs @@ -3,7 +3,8 @@ use directory::WritePtr; use schema::Field; use common::bitpacker::{compute_num_bits, BitPacker}; use common::CountingWriter; -use std::io::{self, Write, Seek, SeekFrom}; +use common::CompositeWrite; +use std::io::{self, Write}; /// `FastFieldSerializer` is in charge of serializing /// fastfields on disk. 
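// Illustration (not from the patch): the reworked serializer below writes, for
// each fast field, the minimum value and the amplitude (max - min), then
// bit-packs every `value - min` using just enough bits for the amplitude. This
// sketch only models the header computation and the normalization; the names
// are invented, and the real code hands the normalized values to a `BitPacker`.
fn num_bits_required(amplitude: u64) -> u8 {
    (64 - amplitude.leading_zeros()) as u8
}

fn prepare_fast_field(vals: &[u64]) -> (u64, u64, u8, Vec<u64>) {
    let min = vals.iter().cloned().min().unwrap_or(0);
    let max = vals.iter().cloned().max().unwrap_or(0);
    let amplitude = max - min;
    let num_bits = num_bits_required(amplitude);
    let normalized: Vec<u64> = vals.iter().map(|&val| val - min).collect();
    (min, amplitude, num_bits, normalized)
}

fn main() {
    let (min, amplitude, num_bits, normalized) =
        prepare_fast_field(&[1000, 1003, 1001, 1002]);
    assert_eq!((min, amplitude, num_bits), (1000, 3, 2));
    assert_eq!(normalized, vec![0u64, 3, 1, 2]);
}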
@@ -26,27 +27,17 @@ use std::io::{self, Write, Seek, SeekFrom}; /// * `close_field()` /// * `close()` pub struct FastFieldSerializer { - write: CountingWriter, - fields: Vec<(Field, u32)>, - min_value: u64, - field_open: bool, - bit_packer: BitPacker, + composite_write: CompositeWrite, } - impl FastFieldSerializer { /// Constructor pub fn new(write: WritePtr) -> io::Result { // just making room for the pointer to header. - let mut counting_writer = CountingWriter::wrap(write); - 0u32.serialize(&mut counting_writer)?; + let composite_write = CompositeWrite::wrap(write); Ok(FastFieldSerializer { - write: counting_writer, - fields: Vec::new(), - min_value: 0, - field_open: false, - bit_packer: BitPacker::new(0), - }) + composite_write: composite_write, + }) } /// Start serializing a new u64 fast field @@ -54,23 +45,48 @@ impl FastFieldSerializer { field: Field, min_value: u64, max_value: u64) - -> io::Result<()> { - if self.field_open { - return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed")); - } - self.min_value = min_value; - self.field_open = true; - self.fields.push((field, self.write.written_bytes() as u32)); - let write = &mut self.write; + -> io::Result>> { + let field_write = self + .composite_write + .for_field(field); + FastSingleFieldSerializer::open( + field_write, + min_value, + max_value) + } + + + /// Closes the serializer + /// + /// After this call the data must be persistently save on disk. + pub fn close(mut self) -> io::Result<()> { + self.composite_write.close() + } +} + +pub struct FastSingleFieldSerializer<'a, W: Write + 'a> { + bit_packer: BitPacker, + write: &'a mut W, + min_value: u64, +} + +impl<'a, W: Write> FastSingleFieldSerializer<'a, W> { + + fn open(write: &'a mut W, + min_value: u64, + max_value: u64) -> io::Result> { min_value.serialize(write)?; let amplitude = max_value - min_value; amplitude.serialize(write)?; let num_bits = compute_num_bits(amplitude); - self.bit_packer = BitPacker::new(num_bits as usize); - Ok(()) + let bit_packer = BitPacker::new(num_bits as usize); + Ok(FastSingleFieldSerializer { + write: write, + bit_packer: bit_packer, + min_value: min_value, + }) } - /// Pushes a new value to the currently open u64 fast field. pub fn add_val(&mut self, val: u64) -> io::Result<()> { let val_to_write: u64 = val - self.min_value; @@ -78,33 +94,7 @@ impl FastFieldSerializer { Ok(()) } - /// Close the u64 fast field. - pub fn close_field(&mut self) -> io::Result<()> { - if !self.field_open { - return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed")); - } - self.field_open = false; - // adding some padding to make sure we - // can read the last elements with our u64 - // cursor - self.bit_packer.close(&mut self.write)?; - Ok(()) - } - - - /// Closes the serializer - /// - /// After this call the data must be persistently save on disk. 
- pub fn close(self) -> io::Result { - if self.field_open { - return Err(io::Error::new(io::ErrorKind::Other, "Last field not closed")); - } - let header_offset: usize = self.write.written_bytes() as usize; - let (mut write, written_size) = self.write.finish()?; - self.fields.serialize(&mut write)?; - write.seek(SeekFrom::Start(0))?; - (header_offset as u32).serialize(&mut write)?; - write.flush()?; - Ok(written_size) + pub fn close_field(mut self) -> io::Result<()> { + self.bit_packer.close(&mut self.write) } } diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 52b29972f..1427a7b36 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -208,13 +208,14 @@ impl IntFastFieldWriter { (self.val_min, self.val_max) }; - serializer.new_u64_fast_field(self.field, min, max)?; + + let mut single_field_serializer = serializer.new_u64_fast_field(self.field, min, max)?; let mut cursor = self.vals.as_slice(); while let Ok(VInt(val)) = VInt::deserialize(&mut cursor) { - serializer.add_val(val)?; + single_field_serializer.add_val(val)?; } - serializer.close_field() + single_field_serializer.close_field() } } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 468d867e7..f150f831a 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -175,18 +175,19 @@ impl IndexMerger { assert!(min_val <= max_val); - fast_field_serializer + + let mut fast_single_field_serializer = fast_field_serializer .new_u64_fast_field(field, min_val, max_val)?; for (max_doc, u64_reader, delete_bitset) in u64_readers { for doc_id in 0..max_doc { if !delete_bitset.is_deleted(doc_id) { let val = u64_reader.get(doc_id); - fast_field_serializer.add_val(val)?; + fast_single_field_serializer.add_val(val)?; } } } - fast_field_serializer.close_field()?; + fast_single_field_serializer.close_field()?; } Ok(()) } From 0eb3c872fd4abf5f811ab13e2c13aeca873cab3d Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 12 Aug 2017 19:33:56 +0900 Subject: [PATCH 11/29] Using composite file for all of the inverted index component --- src/indexer/merger.rs | 4 +- src/indexer/segment_serializer.rs | 1 - src/postings/mod.rs | 2 +- src/postings/postings_writer.rs | 2 +- src/postings/serializer.rs | 127 ++++++++++++++---------------- 5 files changed, 66 insertions(+), 70 deletions(-) diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index f150f831a..74cdd625a 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -233,7 +233,7 @@ impl IndexMerger { loop { // this loop processes all fields. - let mut field_serializer = serializer.new_field(current_field); + let mut field_serializer = serializer.new_field(current_field)?; // we reached a new field. let field_entry = self.schema.get_field_entry(current_field); @@ -312,6 +312,7 @@ impl IndexMerger { if !merged_terms.advance() { + field_serializer.close()?; return Ok(()) } @@ -319,6 +320,7 @@ impl IndexMerger { let next_term_field = Term::wrap(merged_terms.key()).field(); if next_term_field != current_field { current_field = next_term_field; + field_serializer.close()?; break; } } diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs index 35d10ef8d..76190bd9b 100644 --- a/src/indexer/segment_serializer.rs +++ b/src/indexer/segment_serializer.rs @@ -6,7 +6,6 @@ use fastfield::FastFieldSerializer; use store::StoreWriter; use postings::InvertedIndexSerializer; - /// Segment serializer is in charge of laying out on disk /// the data accumulated and sorted by the `SegmentWriter`. 
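The refactor above replaces the `field_open` / "previous field not closed" bookkeeping on FastFieldSerializer with a per-field sub-serializer that mutably borrows the underlying writer, so the borrow checker enforces that only one field is written at a time and that the field is closed before the next one starts. A generic sketch of that pattern follows; `Serializer`, `FieldWriter` and the u32 payload are illustrative stand-ins, not the tantivy types.

use std::io::{self, Write};

// Stand-ins for CompositeWrite / FastSingleFieldSerializer: the parent owns
// the destination, each field gets a short-lived serializer borrowing it.
struct Serializer<W: Write> {
    dst: W,
}

struct FieldWriter<'a, W: Write + 'a> {
    dst: &'a mut W,
}

impl<W: Write> Serializer<W> {
    fn new(dst: W) -> Serializer<W> {
        Serializer { dst }
    }

    // While the returned FieldWriter is alive, `self` is mutably borrowed,
    // so no other field can be started: the old `field_open` flag is gone.
    fn new_field(&mut self) -> FieldWriter<'_, W> {
        FieldWriter { dst: &mut self.dst }
    }
}

impl<'a, W: Write> FieldWriter<'a, W> {
    fn add_val(&mut self, val: u32) -> io::Result<()> {
        self.dst.write_all(&val.to_le_bytes())
    }

    // Consuming `self` plays the role of close_field(): the field cannot be
    // written to afterwards, and the parent serializer becomes usable again.
    fn close_field(self) -> io::Result<()> {
        Ok(())
    }
}

fn main() -> io::Result<()> {
    let mut serializer = Serializer::new(Vec::new());
    {
        let mut field = serializer.new_field();
        field.add_val(7)?;
        field.close_field()?;
    }
    let mut other = serializer.new_field();
    other.add_val(42)?;
    other.close_field()?;
    Ok(())
}
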
pub struct SegmentSerializer { diff --git a/src/postings/mod.rs b/src/postings/mod.rs index fd78cbded..d1a05bbb0 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -61,7 +61,7 @@ mod tests { let mut segment = index.new_segment(); let mut posting_serializer = InvertedIndexSerializer::open(&mut segment).unwrap(); { - let mut field_serializer = posting_serializer.new_field(text_field); + let mut field_serializer = posting_serializer.new_field(text_field).unwrap(); field_serializer.new_term("abc".as_bytes()).unwrap(); for doc_id in 0u32..120u32 { let delta_positions = vec![1, 2, 3, 2]; diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 813073b4c..a6306b141 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -101,7 +101,7 @@ impl<'a> MultiFieldPostingsWriter<'a> { let (field, start) = offsets[i]; let (_, stop) = offsets[i + 1]; let postings_writer = &self.per_field_postings_writers[field.0 as usize]; - let field_serializer = serializer.new_field(field); + let field_serializer = serializer.new_field(field)?; postings_writer.serialize(&term_offsets[start..stop], field_serializer, self.heap)?; } Ok(()) diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index c3f5f101e..087baed3d 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -13,6 +13,7 @@ use core::Segment; use std::io::{self, Write}; use compression::VIntEncoder; use common::CountingWriter; +use common::CompositeWrite; use termdict::TermDictionaryBuilder; @@ -48,25 +49,24 @@ use termdict::TermDictionaryBuilder; /// A description of the serialization format is /// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html). pub struct InvertedIndexSerializer { - terms_fst_builder: TermDictionaryBuilderImpl, - postings_serializer: PostingsSerializer, - positions_serializer: PositionSerializer, + terms_write: CompositeWrite, + postings_write: CompositeWrite, + positions_write: CompositeWrite, schema: Schema, } impl InvertedIndexSerializer { /// Open a new `PostingsSerializer` for the given segment - fn new(terms_write: WritePtr, - postings_write: WritePtr, - positions_write: WritePtr, - schema: Schema) + fn new(terms_write: CompositeWrite, + postings_write: CompositeWrite, + positions_write: CompositeWrite, + schema: Schema) -> Result { - let terms_fst_builder = TermDictionaryBuilderImpl::new(terms_write)?; Ok(InvertedIndexSerializer { - terms_fst_builder: terms_fst_builder, - positions_serializer: PositionSerializer::new(positions_write), - postings_serializer: PostingsSerializer::new(postings_write), + terms_write: terms_write, + postings_write: postings_write, + positions_write: positions_write, schema: schema, }) } @@ -75,17 +75,19 @@ impl InvertedIndexSerializer { /// Open a new `PostingsSerializer` for the given segment pub fn open(segment: &mut Segment) -> Result { use SegmentComponent::{TERMS, POSTINGS, POSITIONS}; - InvertedIndexSerializer::new(segment.open_write(TERMS)?, - segment.open_write(POSTINGS)?, - segment.open_write(POSITIONS)?, - segment.schema()) + InvertedIndexSerializer::new( + CompositeWrite::wrap( + segment.open_write(TERMS)?), + CompositeWrite::wrap(segment.open_write(POSTINGS)?), + CompositeWrite::wrap(segment.open_write(POSITIONS)?), + segment.schema()) } /// Must be called before starting pushing terms of /// a given field. /// /// Loads the indexing options for the given field. 
- pub fn new_field(&mut self, field: Field) -> FieldSerializer { + pub fn new_field(&mut self, field: Field) -> io::Result { let field_entry: &FieldEntry = self.schema.get_field_entry(field); let text_indexing_options = match *field_entry.field_type() { FieldType::Str(ref text_options) => text_options.get_indexing_options(), @@ -98,46 +100,32 @@ impl InvertedIndexSerializer { } } }; + let term_dictionary_write = self.terms_write.for_field(field); + let postings_write = self.postings_write.for_field(field); + let positions_write = self.positions_write.for_field(field); FieldSerializer::new( text_indexing_options, - &mut self.terms_fst_builder, - &mut self.postings_serializer, - &mut self.positions_serializer, + term_dictionary_write, + postings_write, + positions_write ) } /// Closes the serializer. - pub fn close(self) -> io::Result<()> { - self.terms_fst_builder.finish()?; - self.postings_serializer.close()?; - self.positions_serializer.close()?; + pub fn close(mut self) -> io::Result<()> { + self.terms_write.close()?; + self.postings_write.close()?; + self.positions_write.close()?; Ok(()) } } -/* -let field_entry: &FieldEntry = self.schema.get_field_entry(field); -self.text_indexing_options = match *field_entry.field_type() { - FieldType::Str(ref text_options) => text_options.get_indexing_options(), - FieldType::U64(ref int_options) | - FieldType::I64(ref int_options) => { - if int_options.is_indexed() { - TextIndexingOptions::Unindexed - } else { - TextIndexingOptions::Untokenized - } - } -}; -self.postings_serializer.set_termfreq_enabled(self.text_indexing_options.is_termfreq_enabled()); - - */ - pub struct FieldSerializer<'a> { text_indexing_options: TextIndexingOptions, - terms_fst_builder: &'a mut TermDictionaryBuilderImpl, - postings_serializer: &'a mut PostingsSerializer, - positions_serializer: &'a mut PositionSerializer, + term_dictionary_builder: TermDictionaryBuilderImpl<&'a mut CountingWriter, TermInfo>, + postings_serializer: PostingsSerializer<&'a mut CountingWriter>, + positions_serializer: PositionSerializer<&'a mut CountingWriter>, current_term_info: TermInfo, term_open: bool, } @@ -147,21 +135,24 @@ impl<'a> FieldSerializer<'a> { fn new( text_indexing_options: TextIndexingOptions, - terms_fst_builder: &'a mut TermDictionaryBuilderImpl, - postings_serializer: &'a mut PostingsSerializer, - positions_serializer: &'a mut PositionSerializer - ) -> FieldSerializer<'a> { + term_dictionary_write: &'a mut CountingWriter, + postings_write: &'a mut CountingWriter, + positions_write: &'a mut CountingWriter + ) -> io::Result> { - postings_serializer.set_termfreq_enabled(text_indexing_options.is_termfreq_enabled()); + let term_freq_enabled = text_indexing_options.is_termfreq_enabled(); + let term_dictionary_builder = TermDictionaryBuilderImpl::new(term_dictionary_write)?; + let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled); + let positions_serializer = PositionSerializer::new(positions_write); - FieldSerializer { + Ok(FieldSerializer { text_indexing_options: text_indexing_options, - terms_fst_builder: terms_fst_builder, + term_dictionary_builder: term_dictionary_builder, postings_serializer: postings_serializer, positions_serializer: positions_serializer, current_term_info: TermInfo::default(), term_open: false, - } + }) } fn current_term_info(&self) -> TermInfo { @@ -185,7 +176,7 @@ impl<'a> FieldSerializer<'a> { self.term_open = true; self.postings_serializer.clear(); self.current_term_info = self.current_term_info(); - 
self.terms_fst_builder.insert_key(term) + self.term_dictionary_builder.insert_key(term) } /// Serialize the information that a document contains the current term, @@ -216,20 +207,28 @@ impl<'a> FieldSerializer<'a> { /// using `VInt` encoding. pub fn close_term(&mut self) -> io::Result<()> { if self.term_open { - self.terms_fst_builder.insert_value(&self.current_term_info)?; + self.term_dictionary_builder.insert_value(&self.current_term_info)?; self.postings_serializer.close_term()?; self.term_open = false; } Ok(()) } + + pub fn close(mut self) -> io::Result<()> { + self.close_term()?; + self.positions_serializer.close()?; + self.postings_serializer.close()?; + self.term_dictionary_builder.finish()?; + Ok(()) + } } // TODO is the last term always closed? -struct PostingsSerializer { - postings_write: CountingWriter, +struct PostingsSerializer { + postings_write: CountingWriter, last_doc_id_encoded: u32, block_encoder: BlockEncoder, @@ -239,8 +238,8 @@ struct PostingsSerializer { termfreq_enabled: bool, } -impl PostingsSerializer { - fn new(write: WritePtr) -> PostingsSerializer { +impl PostingsSerializer { + fn new(write: W, termfreq_enabled: bool) -> PostingsSerializer { PostingsSerializer { postings_write: CountingWriter::wrap(write), @@ -249,7 +248,7 @@ impl PostingsSerializer { term_freqs: vec!(), last_doc_id_encoded: 0u32, - termfreq_enabled: false, + termfreq_enabled: termfreq_enabled, } } @@ -279,10 +278,6 @@ impl PostingsSerializer { Ok(()) } - fn set_termfreq_enabled(&mut self, termfreq_enabled: bool) { - self.termfreq_enabled = termfreq_enabled; - } - fn close_term(&mut self) -> io::Result<()> { if !self.doc_ids.is_empty() { // we have doc ids waiting to be written @@ -325,14 +320,14 @@ impl PostingsSerializer { } } -struct PositionSerializer { +struct PositionSerializer { buffer: Vec, - write: CountingWriter, + write: CountingWriter, // See if we can offset the original counting writer. 
block_encoder: BlockEncoder, } -impl PositionSerializer { - fn new(write: WritePtr) -> PositionSerializer { +impl PositionSerializer { + fn new(write: W) -> PositionSerializer { PositionSerializer { buffer: Vec::with_capacity(NUM_DOCS_PER_BLOCK), write: CountingWriter::wrap(write), From 413d0e17197cf56275af04ee8a15e908bdfbcf6f Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 13 Aug 2017 17:57:11 +0900 Subject: [PATCH 12/29] NOBUG test passing --- src/common/composite_file.rs | 9 + src/compression/mod.rs | 32 +-- src/compression/pack/compression_pack_simd.rs | 14 +- src/compression/stream.rs | 27 ++- src/compression/vint/compression_vint_simd.rs | 14 +- src/core/field_reader.rs | 149 ++++++++++++ src/core/mod.rs | 3 +- src/core/searcher.rs | 59 +++-- src/core/segment_reader.rs | 204 ++++++---------- src/directory/mod.rs | 2 +- src/directory/read_only_source.rs | 42 ++++ src/indexer/index_writer.rs | 3 +- src/indexer/merger.rs | 219 +++++++++--------- src/lib.rs | 30 ++- src/postings/mod.rs | 23 +- src/postings/postings_writer.rs | 11 +- src/postings/segment_postings.rs | 89 ++++--- src/query/phrase_query/phrase_scorer.rs | 10 +- src/query/phrase_query/phrase_weight.rs | 3 +- src/query/term_query/term_weight.rs | 10 +- src/termdict/merger.rs | 14 +- src/termdict/mod.rs | 8 +- 22 files changed, 592 insertions(+), 383 deletions(-) create mode 100644 src/core/field_reader.rs diff --git a/src/common/composite_file.rs b/src/common/composite_file.rs index bea35f9fa..b092f0bce 100644 --- a/src/common/composite_file.rs +++ b/src/common/composite_file.rs @@ -52,6 +52,7 @@ impl CompositeWrite { } +#[derive(Clone)] pub struct CompositeFile { data: ReadOnlySource, offsets_index: HashMap, @@ -94,6 +95,14 @@ impl CompositeFile { }) } + pub fn empty() -> CompositeFile { + CompositeFile { + offsets_index: HashMap::new(), + data: ReadOnlySource::empty(), + } + } + + pub fn open_read(&self, field: Field) -> Option { self.offsets_index .get(&field) diff --git a/src/compression/mod.rs b/src/compression/mod.rs index d8540892b..43096622c 100644 --- a/src/compression/mod.rs +++ b/src/compression/mod.rs @@ -46,11 +46,11 @@ pub trait VIntDecoder { compressed_data: &'a [u8], offset: u32, num_els: usize) - -> &'a [u8]; + -> usize; fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) - -> &'a [u8]; + -> usize; } impl VIntEncoder for BlockEncoder { @@ -68,7 +68,7 @@ impl VIntDecoder for BlockDecoder { compressed_data: &'a [u8], offset: u32, num_els: usize) - -> &'a [u8] { + -> usize { self.output_len = num_els; vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset) } @@ -76,7 +76,7 @@ impl VIntDecoder for BlockDecoder { fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) - -> &'a [u8] { + -> usize { self.output_len = num_els; vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els]) } @@ -100,8 +100,8 @@ pub mod tests { let compressed_data = encoder.compress_block_sorted(&vals, 0); let mut decoder = BlockDecoder::new(); { - let remaining_data = decoder.uncompress_block_sorted(compressed_data, 0); - assert_eq!(remaining_data.len(), 0); + let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 0); + assert_eq!(consumed_num_bytes, compressed_data.len()); } for i in 0..128 { assert_eq!(vals[i], decoder.output(i)); @@ -115,8 +115,8 @@ pub mod tests { let compressed_data = encoder.compress_block_sorted(&vals, 10); let mut decoder = BlockDecoder::new(); { - let remaining_data = 
decoder.uncompress_block_sorted(compressed_data, 10); - assert_eq!(remaining_data.len(), 0); + let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 10); + assert_eq!(consumed_num_bytes, compressed_data.len()); } for i in 0..128 { assert_eq!(vals[i], decoder.output(i)); @@ -134,9 +134,9 @@ pub mod tests { compressed.push(173u8); let mut decoder = BlockDecoder::new(); { - let remaining_data = decoder.uncompress_block_sorted(&compressed, 10); - assert_eq!(remaining_data.len(), 1); - assert_eq!(remaining_data[0], 173u8); + let consumed_num_bytes = decoder.uncompress_block_sorted(&compressed, 10); + assert_eq!(consumed_num_bytes, compressed.len() - 1); + assert_eq!(compressed[consumed_num_bytes], 173u8); } for i in 0..n { assert_eq!(vals[i], decoder.output(i)); @@ -154,9 +154,9 @@ pub mod tests { compressed.push(173u8); let mut decoder = BlockDecoder::new(); { - let remaining_data = decoder.uncompress_block_unsorted(&compressed); - assert_eq!(remaining_data.len(), 1); - assert_eq!(remaining_data[0], 173u8); + let consumed_num_bytes = decoder.uncompress_block_unsorted(&compressed); + assert_eq!(consumed_num_bytes + 1, compressed.len()); + assert_eq!(compressed[consumed_num_bytes], 173u8); } for i in 0..n { assert_eq!(vals[i], decoder.output(i)); @@ -174,9 +174,9 @@ pub mod tests { let encoded_data = encoder.compress_vint_sorted(&input, *offset); assert!(encoded_data.len() <= expected_length); let mut decoder = BlockDecoder::new(); - let remaining_data = + let consumed_num_bytes = decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len()); - assert_eq!(0, remaining_data.len()); + assert_eq!(consumed_num_bytes, encoded_data.len()); assert_eq!(input, decoder.output_array()); } } diff --git a/src/compression/pack/compression_pack_simd.rs b/src/compression/pack/compression_pack_simd.rs index 6842e0cc2..c430a728f 100644 --- a/src/compression/pack/compression_pack_simd.rs +++ b/src/compression/pack/compression_pack_simd.rs @@ -78,19 +78,19 @@ impl BlockDecoder { } } - pub fn uncompress_block_sorted<'a>(&mut self, - compressed_data: &'a [u8], - offset: u32) - -> &'a [u8] { + pub fn uncompress_block_sorted(&mut self, + compressed_data: &[u8], + offset: u32) + -> usize { let consumed_size = uncompress_sorted(compressed_data, &mut self.output, offset); self.output_len = NUM_DOCS_PER_BLOCK; - &compressed_data[consumed_size..] + consumed_size } - pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a [u8] { + pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize { let consumed_size = uncompress_unsorted(compressed_data, &mut self.output); self.output_len = NUM_DOCS_PER_BLOCK; - &compressed_data[consumed_size..] 
+ consumed_size } #[inline] diff --git a/src/compression/stream.rs b/src/compression/stream.rs index 735eb7bef..0af50ca5b 100644 --- a/src/compression/stream.rs +++ b/src/compression/stream.rs @@ -1,15 +1,16 @@ use compression::BlockDecoder; use compression::NUM_DOCS_PER_BLOCK; use compression::compressed_block_size; +use directory::SourceRead; -pub struct CompressedIntStream<'a> { - buffer: &'a [u8], +pub struct CompressedIntStream { + buffer: SourceRead, block_decoder: BlockDecoder, inner_offset: usize, } -impl<'a> CompressedIntStream<'a> { - pub fn wrap(buffer: &'a [u8]) -> CompressedIntStream<'a> { +impl CompressedIntStream { + pub fn wrap(buffer: SourceRead) -> CompressedIntStream { CompressedIntStream { buffer: buffer, block_decoder: BlockDecoder::new(), @@ -29,7 +30,8 @@ impl<'a> CompressedIntStream<'a> { } num_els -= available; start += available; - self.buffer = self.block_decoder.uncompress_block_unsorted(self.buffer); + let num_consumed_bytes = self.block_decoder.uncompress_block_unsorted(self.buffer.as_ref()); + self.buffer.advance(num_consumed_bytes); self.inner_offset = 0; } else { @@ -51,11 +53,12 @@ impl<'a> CompressedIntStream<'a> { // entirely skip decompressing some blocks. while skip_len >= NUM_DOCS_PER_BLOCK { skip_len -= NUM_DOCS_PER_BLOCK; - let num_bits: u8 = self.buffer[0]; + let num_bits: u8 = self.buffer.as_ref()[0]; let block_len = compressed_block_size(num_bits); - self.buffer = &self.buffer[block_len..]; + self.buffer.advance(block_len); } - self.buffer = self.block_decoder.uncompress_block_unsorted(self.buffer); + let num_consumed_bytes = self.block_decoder.uncompress_block_unsorted(self.buffer.as_ref()); + self.buffer.advance(num_consumed_bytes); self.inner_offset = skip_len; } } @@ -69,8 +72,9 @@ pub mod tests { use compression::compressed_block_size; use compression::NUM_DOCS_PER_BLOCK; use compression::BlockEncoder; + use directory::{SourceRead, ReadOnlySource}; - fn create_stream_buffer() -> Vec { + fn create_stream_buffer() -> ReadOnlySource { let mut buffer: Vec = vec!(); let mut encoder = BlockEncoder::new(); let vals: Vec = (0u32..1_025u32).collect(); @@ -80,13 +84,14 @@ pub mod tests { assert_eq!(compressed_block_size(num_bits), compressed_block.len()); buffer.extend_from_slice(compressed_block); } - buffer + ReadOnlySource::from(buffer) } #[test] fn test_compressed_int_stream() { let buffer = create_stream_buffer(); - let mut stream = CompressedIntStream::wrap(&buffer[..]); + let buffer_reader = SourceRead::from(buffer); + let mut stream = CompressedIntStream::wrap(buffer_reader); let mut block: [u32; NUM_DOCS_PER_BLOCK] = [0u32; NUM_DOCS_PER_BLOCK]; stream.read(&mut block[0..2]); diff --git a/src/compression/vint/compression_vint_simd.rs b/src/compression/vint/compression_vint_simd.rs index dbeca660c..f8e09536f 100644 --- a/src/compression/vint/compression_vint_simd.rs +++ b/src/compression/vint/compression_vint_simd.rs @@ -49,20 +49,18 @@ pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] { pub fn uncompress_sorted<'a>(compressed_data: &'a [u8], output: &mut [u32], offset: u32) - -> &'a [u8] { - let consumed_bytes = unsafe { + -> usize { + unsafe { streamvbyte::streamvbyte_delta_decode(compressed_data.as_ptr(), output.as_mut_ptr(), output.len() as u32, offset) - }; - &compressed_data[consumed_bytes..] 
+ } } #[inline(always)] -pub fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> &'a [u8] { - let consumed_bytes = unsafe { +pub fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> usize { + unsafe { streamvbyte::streamvbyte_decode(compressed_data.as_ptr(), output.as_mut_ptr(), output.len()) - }; - &compressed_data[consumed_bytes..] + } } diff --git a/src/core/field_reader.rs b/src/core/field_reader.rs new file mode 100644 index 000000000..bead5bb80 --- /dev/null +++ b/src/core/field_reader.rs @@ -0,0 +1,149 @@ +use directory::{SourceRead, ReadOnlySource}; +use termdict::{TermDictionary, TermDictionaryImpl}; +use std::io; +use postings::{SegmentPostings, BlockSegmentPostings}; +use postings::TermInfo; +use postings::SegmentPostingsOption; +use schema::Term; +use std::cmp; +use fastfield::DeleteBitSet; +use schema::Schema; +use compression::CompressedIntStream; + +pub struct FieldReader { + termdict: TermDictionaryImpl, + postings_source: ReadOnlySource, + positions_source: ReadOnlySource, + delete_bitset: DeleteBitSet, + schema: Schema, +} + +impl FieldReader { + + pub(crate) fn new( + termdict_source: ReadOnlySource, + postings_source: ReadOnlySource, + positions_source: ReadOnlySource, + delete_bitset: DeleteBitSet, + schema: Schema, + + ) -> io::Result { + + Ok(FieldReader { + termdict: TermDictionaryImpl::from_source(termdict_source)?, + postings_source: postings_source, + positions_source: positions_source, + delete_bitset: delete_bitset, + schema: schema, + }) + } + + /// Returns the term info associated with the term. + pub fn get_term_info(&self, term: &Term) -> Option { + self.termdict.get(term.as_slice()) + } + + + /// Return the term dictionary datastructure. + pub fn terms(&self) -> &TermDictionaryImpl { + &self.termdict + } + + /// Resets the block segment to another position of the postings + /// file. + /// + /// This is useful for enumerating through a list of terms, + /// and consuming the associated posting lists while avoiding + /// reallocating a `BlockSegmentPostings`. + /// + /// # Warning + /// + /// This does not reset the positions list. + pub fn reset_block_postings_from_terminfo(&self, + term_info: &TermInfo, + block_postings: &mut BlockSegmentPostings) { + let offset = term_info.postings_offset as usize; + let end_source = self.postings_source.len(); + let postings_slice = self.postings_source.slice(offset, end_source); + let postings_reader = SourceRead::from(postings_slice); + block_postings.reset(term_info.doc_freq as usize, postings_reader); + } + + + + /// Returns a block postings given a `term_info`. + /// This method is for an advanced usage only. + /// + /// Most user should prefer using `read_postings` instead. + pub fn read_block_postings_from_terminfo(&self, + term_info: &TermInfo, + option: SegmentPostingsOption) + -> BlockSegmentPostings { + let offset = term_info.postings_offset as usize; + let postings_data = self.postings_source.slice_from(offset); + let has_freq = option.has_freq(); + BlockSegmentPostings::from_data( + term_info.doc_freq as usize, + SourceRead::from(postings_data), + has_freq) + } + + /// Returns a posting object given a `term_info`. + /// This method is for an advanced usage only. + /// + /// Most user should prefer using `read_postings` instead. 
+ pub fn read_postings_from_terminfo(&self, + term_info: &TermInfo, + option: SegmentPostingsOption) + -> SegmentPostings { + let block_postings = self.read_block_postings_from_terminfo(term_info, option); + let delete_bitset = self.delete_bitset.clone(); + let position_stream = { + if option.has_positions() { + let position_offset = term_info.positions_offset; + let positions_reader = SourceRead::from(self.positions_source.slice_from(position_offset as usize)); + let mut stream = CompressedIntStream::wrap(positions_reader); + stream.skip(term_info.positions_inner_offset as usize); + Some(stream) + } + else { + None + } + }; + SegmentPostings::from_block_postings( + block_postings, + delete_bitset, + position_stream + ) + } + + /// Returns the segment postings associated with the term, and with the given option, + /// or `None` if the term has never been encountered and indexed. + /// + /// If the field was not indexed with the indexing options that cover + /// the requested options, the returned `SegmentPostings` the method does not fail + /// and returns a `SegmentPostings` with as much information as possible. + /// + /// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a + /// `TextIndexingOptions` that does not index position will return a `SegmentPostings` + /// with `DocId`s and frequencies. + pub fn read_postings(&self, + term: &Term, + option: SegmentPostingsOption) + -> Option { + let field = term.field(); + let field_entry = self.schema.get_field_entry(field); + let term_info = get!(self.get_term_info(term)); + let maximum_option = get!(field_entry.field_type().get_segment_postings_option()); + let best_effort_option = cmp::min(maximum_option, option); + Some(self.read_postings_from_terminfo(&term_info, best_effort_option)) + } + + /// Returns the number of documents containing the term. + pub fn doc_freq(&self, term: &Term) -> u32 { + match self.get_term_info(term) { + Some(term_info) => term_info.doc_freq, + None => 0, + } + } +} diff --git a/src/core/mod.rs b/src/core/mod.rs index dca8b5ccd..bba1447ef 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -7,7 +7,9 @@ mod segment; mod index_meta; mod pool; mod segment_meta; +mod field_reader; +pub use self::field_reader::FieldReader; pub use self::searcher::Searcher; pub use self::segment_component::SegmentComponent; pub use self::segment_id::SegmentId; @@ -18,7 +20,6 @@ pub use self::index::Index; pub use self::segment_meta::SegmentMeta; pub use self::index_meta::IndexMeta; - use std::path::PathBuf; lazy_static! { diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 6579698e2..d84ad22a3 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -6,9 +6,11 @@ use common::TimerTree; use query::Query; use DocId; use DocAddress; -use schema::Term; -use termdict::TermMerger; +use schema::{Term, Field}; +use termdict::{TermMerger, TermDictionary}; +use std::sync::Arc; use std::fmt; +use core::FieldReader; use postings::TermInfo; @@ -46,7 +48,12 @@ impl Searcher { pub fn doc_freq(&self, term: &Term) -> u32 { self.segment_readers .iter() - .map(|segment_reader| segment_reader.doc_freq(term)) + .map(|segment_reader| { + segment_reader + .field_reader(term.field()) + .unwrap() // TODO error handling + .doc_freq(term) + }) .fold(0u32, |acc, val| acc + val) } @@ -65,20 +72,46 @@ impl Searcher { query.search(self, collector) } - /// Returns a Stream over all of the sorted unique terms of - /// the searcher. - /// - /// This includes all of the fields from all of the segment_readers. 
- /// See [`TermIterator`](struct.TermIterator.html). - /// - /// # Warning - /// This API is very likely to change in the future. - pub fn terms(&self) -> TermMerger { - TermMerger::from(self.segment_readers()) + pub fn field(&self, field: Field) -> Result { + let field_readers = self.segment_readers + .iter() + .map(|segment_reader| { + segment_reader.field_reader(field) + }) + .collect::>>()?; + Ok(FieldSearcher::new(field_readers)) } } + +pub struct FieldSearcher { + field_readers: Vec>, +} + + +impl FieldSearcher { + + fn new(field_readers: Vec>) -> FieldSearcher { + FieldSearcher { + field_readers: field_readers, + } + } + + + /// Returns a Stream over all of the sorted unique terms of + /// for the given field. + pub fn terms(&self) -> TermMerger { + let term_streamers: Vec<_> = self.field_readers + .iter() + .map(|field_reader| { + field_reader.terms().stream() + }) + .collect(); + TermMerger::new(term_streamers) + } +} + impl From> for Searcher { fn from(segment_readers: Vec) -> Searcher { Searcher { segment_readers: segment_readers } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 619888228..336496018 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -2,26 +2,21 @@ use Result; use core::Segment; use core::SegmentId; use core::SegmentComponent; -use schema::Term; +use std::sync::RwLock; use common::HasLen; -use compression::CompressedIntStream; use core::SegmentMeta; use fastfield::{self, FastFieldNotAvailableError}; use fastfield::DeleteBitSet; use store::StoreReader; use schema::Document; -use directory::ReadOnlySource; use DocId; use std::str; -use termdict::TermDictionary; -use std::cmp; -use postings::TermInfo; -use termdict::TermDictionaryImpl; use std::sync::Arc; +use std::collections::HashMap; +use common::CompositeFile; use std::fmt; +use core::FieldReader; use schema::Field; -use postings::SegmentPostingsOption; -use postings::{SegmentPostings, BlockSegmentPostings}; use fastfield::{FastFieldsReader, FastFieldReader, U64FastFieldReader}; use schema::Schema; @@ -40,15 +35,19 @@ use schema::Schema; /// #[derive(Clone)] pub struct SegmentReader { + field_reader_cache: Arc>>>, + segment_id: SegmentId, segment_meta: SegmentMeta, - terms: Arc, - postings_data: ReadOnlySource, + + termdict_composite: CompositeFile, + postings_composite: CompositeFile, + positions_composite: CompositeFile, + store_reader: StoreReader, fast_fields_reader: Arc, fieldnorms_reader: Arc, delete_bitset: DeleteBitSet, - positions_data: ReadOnlySource, schema: Schema, } @@ -117,14 +116,6 @@ impl SegmentReader { self.fieldnorms_reader.open_reader(field) } - /// Returns the number of documents containing the term. - pub fn doc_freq(&self, term: &Term) -> u32 { - match self.get_term_info(term) { - Some(term_info) => term_info.doc_freq, - None => 0, - } - } - /// Accessor to the segment's `StoreReader`. pub fn get_store_reader(&self) -> &StoreReader { &self.store_reader @@ -133,13 +124,24 @@ impl SegmentReader { /// Open a new segment for reading. 
pub fn open(segment: Segment) -> Result { - let source = segment.open_read(SegmentComponent::TERMS)?; - let terms = TermDictionaryImpl::from_source(source)?; + let termdict_source = segment.open_read(SegmentComponent::TERMS)?; + let termdict_composite = CompositeFile::open(termdict_source)?; let store_source = segment.open_read(SegmentComponent::STORE)?; let store_reader = StoreReader::from_source(store_source); - let postings_shared_mmap = segment.open_read(SegmentComponent::POSTINGS)?; + let postings_source = segment.open_read(SegmentComponent::POSTINGS)?; + let postings_composite = CompositeFile::open(postings_source)?; + + let positions_composite = { + if let Ok(source) = segment.open_read(SegmentComponent::POSITIONS) { + CompositeFile::open(source)? + } + else { + CompositeFile::empty() + } + }; + let fast_field_data = segment.open_read(SegmentComponent::FASTFIELDS)?; let fast_fields_reader = FastFieldsReader::from_source(fast_field_data)?; @@ -147,9 +149,6 @@ impl SegmentReader { let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?; let fieldnorms_reader = FastFieldsReader::from_source(fieldnorms_data)?; - let positions_data = segment - .open_read(SegmentComponent::POSITIONS) - .unwrap_or_else(|_| ReadOnlySource::empty()); let delete_bitset = if segment.meta().has_deletes() { let delete_data = segment.open_read(SegmentComponent::DELETE)?; @@ -160,22 +159,53 @@ impl SegmentReader { let schema = segment.schema(); Ok(SegmentReader { - segment_meta: segment.meta().clone(), - postings_data: postings_shared_mmap, - terms: Arc::new(terms), - segment_id: segment.id(), - store_reader: store_reader, - fast_fields_reader: Arc::new(fast_fields_reader), - fieldnorms_reader: Arc::new(fieldnorms_reader), - delete_bitset: delete_bitset, - positions_data: positions_data, - schema: schema, - }) + field_reader_cache: Arc::new(RwLock::new(HashMap::new())), + segment_meta: segment.meta().clone(), + postings_composite: postings_composite, + termdict_composite: termdict_composite, + segment_id: segment.id(), + store_reader: store_reader, + fast_fields_reader: Arc::new(fast_fields_reader), + fieldnorms_reader: Arc::new(fieldnorms_reader), + delete_bitset: delete_bitset, + positions_composite: positions_composite, + schema: schema, + }) } - /// Return the term dictionary datastructure. - pub fn terms(&self) -> &TermDictionaryImpl { - &self.terms + pub fn field_reader(&self, field: Field) -> Result> { + if let Some(field_reader) = self.field_reader_cache.read() + .unwrap() // TODO + .get(&field) { + return Ok(field_reader.clone()); + } + + // TODO better error + let termdict_source = self.termdict_composite + .open_read(field) + .ok_or("Field not found")?; + + let postings_source = self.postings_composite + .open_read(field) + .ok_or("field not found")?; + + let positions_source = self.positions_composite + .open_read(field) + .ok_or("field not found")?; + + let field_reader = Arc::new(FieldReader::new( + termdict_source, + postings_source, + positions_source, + self.delete_bitset.clone(), + self.schema.clone(), + )?); + + self.field_reader_cache + .write() + .unwrap() // TODO + .insert(field, field_reader.clone()); + Ok(field_reader) } /// Returns the document (or to be accurate, its stored field) @@ -187,100 +217,6 @@ impl SegmentReader { } - /// Returns the segment postings associated with the term, and with the given option, - /// or `None` if the term has never been encountered and indexed. 
- /// - /// If the field was not indexed with the indexing options that cover - /// the requested options, the returned `SegmentPostings` the method does not fail - /// and returns a `SegmentPostings` with as much information as possible. - /// - /// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a - /// `TextIndexingOptions` that does not index position will return a `SegmentPostings` - /// with `DocId`s and frequencies. - pub fn read_postings(&self, - term: &Term, - option: SegmentPostingsOption) - -> Option { - let field = term.field(); - let field_entry = self.schema.get_field_entry(field); - let term_info = get!(self.get_term_info(term)); - let maximum_option = get!(field_entry.field_type().get_segment_postings_option()); - let best_effort_option = cmp::min(maximum_option, option); - Some(self.read_postings_from_terminfo(&term_info, best_effort_option)) - } - - - /// Returns a posting object given a `term_info`. - /// This method is for an advanced usage only. - /// - /// Most user should prefer using `read_postings` instead. - pub fn read_postings_from_terminfo(&self, - term_info: &TermInfo, - option: SegmentPostingsOption) - -> SegmentPostings { - let block_postings = self.read_block_postings_from_terminfo(term_info, option); - let delete_bitset = self.delete_bitset.clone(); - let position_stream = { - if option.has_positions() { - let position_offset = term_info.positions_offset; - let positions_data = &self.positions_data[position_offset as usize..]; - let mut stream = CompressedIntStream::wrap(positions_data); - stream.skip(term_info.positions_inner_offset as usize); - Some(stream) - } - else { - None - } - }; - SegmentPostings::from_block_postings( - block_postings, - delete_bitset, - position_stream - ) - } - - - /// Returns a block postings given a `term_info`. - /// This method is for an advanced usage only. - /// - /// Most user should prefer using `read_postings` instead. - pub fn read_block_postings_from_terminfo(&self, - term_info: &TermInfo, - option: SegmentPostingsOption) - -> BlockSegmentPostings { - let offset = term_info.postings_offset as usize; - let postings_data = &self.postings_data[offset..]; - let has_freq = option.has_freq(); - BlockSegmentPostings::from_data( - term_info.doc_freq as usize, - postings_data, - has_freq) - } - - - /// Resets the block segment to another position of the postings - /// file. - /// - /// This is useful for enumerating through a list of terms, - /// and consuming the associated posting lists while avoiding - /// reallocating a `BlockSegmentPostings`. - /// - /// # Warning - /// - /// This does not reset the positions list. - pub fn reset_block_postings_from_terminfo<'a>(&'a self, - term_info: &TermInfo, - block_postings: &mut BlockSegmentPostings<'a>) { - let offset = term_info.postings_offset as usize; - let postings_data: &'a [u8] = &self.postings_data[offset..]; - block_postings.reset(term_info.doc_freq as usize, postings_data); - } - - /// Returns the term info associated with the term. 
- pub fn get_term_info(&self, term: &Term) -> Option { - self.terms.get(term.as_slice()) - } - /// Returns the segment id pub fn segment_id(&self) -> SegmentId { self.segment_id diff --git a/src/directory/mod.rs b/src/directory/mod.rs index b107d78c5..cfdaee719 100644 --- a/src/directory/mod.rs +++ b/src/directory/mod.rs @@ -16,7 +16,7 @@ pub mod error; use std::io::{Write, Seek}; use std::io::BufWriter; -pub use self::read_only_source::ReadOnlySource; +pub use self::read_only_source::{SourceRead, ReadOnlySource}; pub use self::directory::Directory; pub use self::ram_directory::RAMDirectory; pub use self::mmap_directory::MmapDirectory; diff --git a/src/directory/read_only_source.rs b/src/directory/read_only_source.rs index d327f5a51..1fd0afc0f 100644 --- a/src/directory/read_only_source.rs +++ b/src/directory/read_only_source.rs @@ -2,6 +2,8 @@ use fst::raw::MmapReadOnly; use std::ops::Deref; use super::shared_vec_slice::SharedVecSlice; use common::HasLen; +use std::slice; +use std::io::{self, Read}; use stable_deref_trait::StableDeref; /// Read object that represents files in tantivy. @@ -62,6 +64,11 @@ impl ReadOnlySource { } } } + + pub fn slice_from(&self, from_offset: usize) -> ReadOnlySource { + let len = self.len(); + self.slice(from_offset, len) + } } impl HasLen for ReadOnlySource { @@ -82,3 +89,38 @@ impl From> for ReadOnlySource { ReadOnlySource::Anonymous(shared_data) } } + +pub struct SourceRead { + _data_owner: ReadOnlySource, + cursor: &'static [u8] +} + +impl SourceRead { + pub fn advance(&mut self, len: usize) { + self.cursor = &self.cursor[len..]; + } +} + +impl AsRef<[u8]> for SourceRead { + fn as_ref(&self) -> &[u8] { + self.cursor + } +} + +impl From for SourceRead { + fn from(source: ReadOnlySource) -> SourceRead { + let len = source.len(); + let slice_ptr = source.as_slice().as_ptr(); + let static_slice = unsafe { slice::from_raw_parts(slice_ptr, len) }; + SourceRead { + _data_owner: source, + cursor: static_slice, + } + } +} + +impl Read for SourceRead { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + self.cursor.read(buf) + } +} diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 565c3089e..1477fb50b 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -177,8 +177,9 @@ pub fn compute_deleted_bitset(delete_bitset: &mut BitSet, // Limit doc helps identify the first document // that may be affected by the delete operation. let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp); + let field_reader = segment_reader.field_reader(delete_op.term.field())?; if let Some(mut docset) = - segment_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) { + field_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) { while docset.advance() { let deleted_doc = docset.doc(); if deleted_doc < limit_doc { diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 74cdd625a..48aa695aa 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -17,6 +17,7 @@ use fastfield::FastFieldSerializer; use fastfield::FastFieldReader; use store::StoreWriter; use std::cmp::{min, max}; +use termdict::TermDictionary; use schema::Term; use termdict::TermStreamer; @@ -195,48 +196,62 @@ impl IndexMerger { fn write_postings(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> { let mut delta_computer = DeltaComputer::new(); - let mut merged_terms = TermMerger::from(&self.readers[..]); - let mut max_doc = 0; - - // map from segment doc ids to the resulting merged segment doc id. 
- let mut merged_doc_id_map: Vec>> = Vec::with_capacity(self.readers.len()); - - for reader in &self.readers { - let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize); - for doc_id in 0..reader.max_doc() { - if reader.is_deleted(doc_id) { - segment_local_map.push(None); - } else { - segment_local_map.push(Some(max_doc)); - max_doc += 1u32; - } + let mut indexed_fields = vec!(); + for (field_ord, field_entry) in self.schema.fields().iter().enumerate() { + // if field_entry + if field_entry.is_indexed() { + indexed_fields.push(Field(field_ord as u32)); } - merged_doc_id_map.push(segment_local_map); } - // Create the total list of doc ids - // by stacking the doc ids from the different segment. - // - // In the new segments, the doc id from the different - // segment are stacked so that : - // - Segment 0's doc ids become doc id [0, seg.max_doc] - // - Segment 1's doc ids become [seg0.max_doc, seg0.max_doc + seg.max_doc] - // - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, - // seg0.max_doc + seg1.max_doc + seg2.max_doc] - // ... - if !merged_terms.advance() { - return Ok(()); - } + for indexed_field in indexed_fields { - let mut current_field = Term::wrap(merged_terms.key()).field(); + let field_readers = self.readers + .iter() + .map(|reader| + reader.field_reader(indexed_field)) + .collect::>>()?; - loop { - // this loop processes all fields. - let mut field_serializer = serializer.new_field(current_field)?; + let field_term_streams = field_readers + .iter() + .map(|field_reader| field_reader.terms().stream() ) + .collect(); + + let mut merged_terms = TermMerger::new(field_term_streams); + let mut max_doc = 0; + + // map from segment doc ids to the resulting merged segment doc id. + let mut merged_doc_id_map: Vec>> = Vec::with_capacity(self.readers.len()); + + for reader in &self.readers { + let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize); + for doc_id in 0..reader.max_doc() { + if reader.is_deleted(doc_id) { + segment_local_map.push(None); + } else { + segment_local_map.push(Some(max_doc)); + max_doc += 1u32; + } + } + merged_doc_id_map.push(segment_local_map); + } + + // Create the total list of doc ids + // by stacking the doc ids from the different segment. + // + // In the new segments, the doc id from the different + // segment are stacked so that : + // - Segment 0's doc ids become doc id [0, seg.max_doc] + // - Segment 1's doc ids become [seg0.max_doc, seg0.max_doc + seg.max_doc] + // - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, + // seg0.max_doc + seg1.max_doc + seg2.max_doc] + // ... + + let mut field_serializer = serializer.new_field(indexed_field)?; + + let field_entry = self.schema.get_field_entry(indexed_field); - // we reached a new field. - let field_entry = self.schema.get_field_entry(current_field); // ... set segment postings option the new field. let segment_postings_option = field_entry .field_type() @@ -244,88 +259,78 @@ impl IndexMerger { .expect("Encountered a field that is not supposed to be indexed. Have you modified the schema?"); - loop { - // this loops processes a field. 
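The doc id remapping described in the comments above (stacking each segment's surviving doc ids and skipping deleted ones) boils down to a small computation. A minimal sketch, with plain Vec<bool> delete masks standing in for DeleteBitSet and the segment readers:

// For each segment, map every original doc id to Some(new_doc_id) if it
// survives the merge, or None if it was deleted. New ids are assigned by
// stacking the segments in order, as write_postings does above.
fn doc_id_mapping(delete_masks: &[Vec<bool>]) -> Vec<Vec<Option<u32>>> {
    let mut next_doc_id = 0u32;
    let mut mapping = Vec::with_capacity(delete_masks.len());
    for mask in delete_masks {
        let mut segment_map = Vec::with_capacity(mask.len());
        for &deleted in mask {
            if deleted {
                segment_map.push(None);
            } else {
                segment_map.push(Some(next_doc_id));
                next_doc_id += 1;
            }
        }
        mapping.push(segment_map);
    }
    mapping
}

fn main() {
    // Segment 0 has 3 docs (doc 1 deleted), segment 1 has 2 docs (none deleted).
    let mapping = doc_id_mapping(&[vec![false, true, false], vec![false, false]]);
    assert_eq!(mapping[0], vec![Some(0u32), None, Some(1)]);
    // Segment 1's docs are stacked after segment 0's surviving docs.
    assert_eq!(mapping[1], vec![Some(2u32), Some(3)]);
}
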
- { - let term = Term::wrap(merged_terms.key()); + while merged_terms.advance() { - // Let's compute the list of non-empty posting lists - let segment_postings: Vec<_> = merged_terms - .current_kvs() - .iter() - .flat_map(|heap_item| { - let segment_ord = heap_item.segment_ord; - let term_info = heap_item.streamer.value(); - let segment_reader = &self.readers[heap_item.segment_ord]; - let mut segment_postings = - segment_reader - .read_postings_from_terminfo(term_info, segment_postings_option); - if segment_postings.advance() { - Some((segment_ord, segment_postings)) - } else { - None + let term = Term::wrap(merged_terms.key()); + + // Let's compute the list of non-empty posting lists + let segment_postings: Vec<_> = merged_terms + .current_kvs() + .iter() + .flat_map(|heap_item| { + let segment_ord = heap_item.segment_ord; + let term_info = heap_item.streamer.value(); + let segment_reader = &self.readers[heap_item.segment_ord]; + let field_reader = segment_reader.field_reader(term.field()).unwrap(); // TODO fix unwrap + let mut segment_postings = field_reader + .read_postings_from_terminfo(term_info, segment_postings_option); + if segment_postings.advance() { + Some((segment_ord, segment_postings)) + } else { + None + } + }) + .collect(); + + // At this point, `segment_postings` contains the posting list + // of all of the segments containing the given term. + // + // These segments are non-empty and advance has already been called. + + if !segment_postings.is_empty() { + // If not, the `term` will be entirely removed. + + // We know that there is at least one document containing + // the term, so we add it. + field_serializer.new_term(term.as_ref())?; + + // We can now serialize this postings, by pushing each document to the + // postings serializer. + + for (segment_ord, mut segment_postings) in segment_postings { + let old_to_new_doc_id = &merged_doc_id_map[segment_ord]; + loop { + // `.advance()` has been called once before the loop. + // Hence we cannot use a `while segment_postings.advance()` loop. + if let Some(remapped_doc_id) = + old_to_new_doc_id[segment_postings.doc() as usize] { + // we make sure to only write the term iff + // there is at least one document. + let positions: &[u32] = segment_postings.positions(); + let term_freq = segment_postings.term_freq(); + let delta_positions = delta_computer.compute_delta(positions); + field_serializer + .write_doc(remapped_doc_id, term_freq, delta_positions)?; } - }) - .collect(); - - // At this point, `segment_postings` contains the posting list - // of all of the segments containing the given term. - // - // These segments are non-empty and advance has already been called. - - if !segment_postings.is_empty() { - // If not, the `term` will be entirely removed. - - // We know that there is at least one document containing - // the term, so we add it. - field_serializer.new_term(term.as_ref())?; - - // We can now serialize this postings, by pushing each document to the - // postings serializer. - - for (segment_ord, mut segment_postings) in segment_postings { - let old_to_new_doc_id = &merged_doc_id_map[segment_ord]; - loop { - // `.advance()` has been called once before the loop. - // Hence we cannot use a `while segment_postings.advance()` loop. - if let Some(remapped_doc_id) = - old_to_new_doc_id[segment_postings.doc() as usize] { - // we make sure to only write the term iff - // there is at least one document. 
- let positions: &[u32] = segment_postings.positions(); - let term_freq = segment_postings.term_freq(); - let delta_positions = delta_computer.compute_delta(positions); - field_serializer - .write_doc(remapped_doc_id, term_freq, delta_positions)?; - } - if !segment_postings.advance() { - break; - } + if !segment_postings.advance() { + break; } } - - // closing the term. - field_serializer.close_term()?; } + // closing the term. + field_serializer.close_term()?; } - - if !merged_terms.advance() { - field_serializer.close()?; - return Ok(()) - } - - { - let next_term_field = Term::wrap(merged_terms.key()).field(); - if next_term_field != current_field { - current_field = next_term_field; - field_serializer.close()?; - break; - } - } } + + field_serializer.close()?; + } + /* + + */ + Ok(()) } fn write_storable_fields(&self, store_writer: &mut StoreWriter) -> Result<()> { diff --git a/src/lib.rs b/src/lib.rs index d719badfb..c926b67b3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -390,15 +390,16 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); - assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none()); + let field_reader = reader.field_reader(text_field).unwrap(); + assert!(field_reader.read_postings(&term_abcd, FreqAndPositions).is_none()); { - let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap(); + let mut postings = field_reader.read_postings(&term_a, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 5); assert!(!postings.advance()); } { - let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap(); + let mut postings = field_reader.read_postings(&term_b, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); @@ -424,16 +425,17 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); + let field_reader = reader.field_reader(term_abcd.field()).unwrap(); - assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none()); + assert!(field_reader.read_postings(&term_abcd, FreqAndPositions).is_none()); { - let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap(); + let mut postings = field_reader.read_postings(&term_a, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 5); assert!(!postings.advance()); } { - let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap(); + let mut postings = field_reader.read_postings(&term_b, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); @@ -459,13 +461,14 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); - assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none()); + let field_reader = reader.field_reader(term_abcd.field()).unwrap(); + assert!(field_reader.read_postings(&term_abcd, FreqAndPositions).is_none()); { - let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap(); + let mut postings = field_reader.read_postings(&term_a, FreqAndPositions).unwrap(); assert!(!postings.advance()); } { - let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap(); + let mut postings = field_reader.read_postings(&term_b, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); @@ 
-473,7 +476,7 @@ mod tests { assert!(!postings.advance()); } { - let mut postings = reader.read_postings(&term_c, FreqAndPositions).unwrap(); + let mut postings = field_reader.read_postings(&term_c, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 4); assert!(!postings.advance()); @@ -497,6 +500,7 @@ mod tests { let term = Term::from_field_u64(field, 1u64); let mut postings = searcher .segment_reader(0) + .field_reader(term.field()).unwrap() .read_postings(&term, SegmentPostingsOption::NoFreq) .unwrap(); assert!(postings.advance()); @@ -520,6 +524,7 @@ mod tests { let term = Term::from_field_i64(value_field, negative_val); let mut postings = searcher .segment_reader(0) + .field_reader(term.field()).unwrap() .read_postings(&term, SegmentPostingsOption::NoFreq) .unwrap(); assert!(postings.advance()); @@ -582,10 +587,11 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); + let field_reader = reader.field_reader(text_field).unwrap(); let term_abcd = Term::from_field_text(text_field, "abcd"); - assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none()); + assert!(field_reader.read_postings(&term_abcd, FreqAndPositions).is_none()); let term_af = Term::from_field_text(text_field, "af"); - let mut postings = reader.read_postings(&term_af, FreqAndPositions).unwrap(); + let mut postings = field_reader.read_postings(&term_af, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 0); assert_eq!(postings.term_freq(), 3); diff --git a/src/postings/mod.rs b/src/postings/mod.rs index d1a05bbb0..05991a1d1 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -132,12 +132,14 @@ mod tests { { let term_a = Term::from_field_text(text_field, "abcdef"); assert!(segment_reader - .read_postings(&term_a, FreqAndPositions) - .is_none()); + .field_reader(term_a.field()).unwrap() + .read_postings(&term_a, FreqAndPositions) + .is_none()); } { let term_a = Term::from_field_text(text_field, "a"); let mut postings_a = segment_reader + .field_reader(term_a.field()).unwrap() .read_postings(&term_a, FreqAndPositions) .unwrap(); assert_eq!(postings_a.len(), 1000); @@ -160,6 +162,7 @@ mod tests { { let term_e = Term::from_field_text(text_field, "e"); let mut postings_e = segment_reader + .field_reader(term_e.field()).unwrap() .read_postings(&term_e, FreqAndPositions) .unwrap(); assert_eq!(postings_e.len(), 1000 - 2); @@ -247,6 +250,7 @@ mod tests { for i in 0..num_docs - 1 { for j in i + 1..num_docs { let mut segment_postings = segment_reader + .field_reader(term_2.field()).unwrap() .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); @@ -260,6 +264,7 @@ mod tests { { let mut segment_postings = segment_reader + .field_reader(term_2.field()).unwrap() .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); @@ -280,6 +285,7 @@ mod tests { // check that filtering works { let mut segment_postings = segment_reader + .field_reader(term_0.field()).unwrap() .read_postings(&term_0, SegmentPostingsOption::NoFreq) .unwrap(); @@ -289,6 +295,7 @@ mod tests { } let mut segment_postings = segment_reader + .field_reader(term_0.field()).unwrap() .read_postings(&term_0, SegmentPostingsOption::NoFreq) .unwrap(); @@ -313,6 +320,7 @@ mod tests { // make sure seeking still works for i in 0..num_docs { let mut segment_postings = segment_reader + .field_reader(term_2.field()).unwrap() .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); @@ -328,6 +336,7 @@ mod 
tests { // now try with a longer sequence { let mut segment_postings = segment_reader + .field_reader(term_2.field()).unwrap() .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); @@ -363,12 +372,14 @@ mod tests { // finally, check that it's empty { let mut segment_postings = segment_reader + .field_reader(term_2.field()).unwrap() .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); assert_eq!(segment_postings.skip_next(0), SkipResult::End); let mut segment_postings = segment_reader + .field_reader(term_2.field()).unwrap() .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); @@ -436,6 +447,7 @@ mod tests { b.iter(|| { let mut segment_postings = segment_reader + .field_reader(TERM_A.field()).unwrap() .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); while segment_postings.advance() {} @@ -448,15 +460,19 @@ mod tests { let segment_reader = searcher.segment_reader(0); b.iter(|| { let segment_postings_a = segment_reader + .field_reader(TERM_A.field()).unwrap() .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); let segment_postings_b = segment_reader + .field_reader(TERM_B.field()).unwrap() .read_postings(&*TERM_B, SegmentPostingsOption::NoFreq) .unwrap(); let segment_postings_c = segment_reader + .field_reader(TERM_C.field()).unwrap() .read_postings(&*TERM_C, SegmentPostingsOption::NoFreq) .unwrap(); let segment_postings_d = segment_reader + .field_reader(TERM_D.field()).unwrap() .read_postings(&*TERM_D, SegmentPostingsOption::NoFreq) .unwrap(); let mut intersection = IntersectionDocSet::from(vec![segment_postings_a, @@ -473,6 +489,7 @@ mod tests { let docs = tests::sample(segment_reader.num_docs(), p); let mut segment_postings = segment_reader + .field_reader(TERM_A.field()).unwrap() .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); @@ -489,6 +506,7 @@ mod tests { b.iter(|| { let mut segment_postings = segment_reader + .field_reader(TERM_A.field()).unwrap() .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); for doc in &existing_docs { @@ -526,6 +544,7 @@ mod tests { b.iter(|| { let n: u32 = test::black_box(17); let mut segment_postings = segment_reader + .field_reader(TERM_A.field()).unwrap() .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); let mut s = 0u32; diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index a6306b141..4e1f770c7 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -101,8 +101,9 @@ impl<'a> MultiFieldPostingsWriter<'a> { let (field, start) = offsets[i]; let (_, stop) = offsets[i + 1]; let postings_writer = &self.per_field_postings_writers[field.0 as usize]; - let field_serializer = serializer.new_field(field)?; - postings_writer.serialize(&term_offsets[start..stop], field_serializer, self.heap)?; + let mut field_serializer = serializer.new_field(field)?; + postings_writer.serialize(&term_offsets[start..stop], &mut field_serializer, self.heap)?; + field_serializer.close()?; } Ok(()) } @@ -137,7 +138,7 @@ pub trait PostingsWriter { /// The actual serialization format is handled by the `PostingsSerializer`. 
fn serialize(&self, term_addrs: &[(&[u8], u32)], - serializer: FieldSerializer, + serializer: &mut FieldSerializer, heap: &Heap) -> io::Result<()>; @@ -214,13 +215,13 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<' fn serialize(&self, term_addrs: &[(&[u8], u32)], - mut serializer: FieldSerializer, + serializer: &mut FieldSerializer, heap: &Heap) -> io::Result<()> { for &(term_bytes, addr) in term_addrs { let recorder: &mut Rec = self.heap.get_mut_ref(addr); serializer.new_term(term_bytes)?; - recorder.serialize(addr, &mut serializer, heap)?; + recorder.serialize(addr, serializer, heap)?; serializer.close_term()?; } Ok(()) diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 26810edf4..0866a5fe5 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -5,11 +5,15 @@ use std::cmp; use fst::Streamer; use fastfield::DeleteBitSet; use std::cell::UnsafeCell; +use directory::{SourceRead, ReadOnlySource}; + -const EMPTY_DATA: [u8; 0] = [0u8; 0]; const EMPTY_POSITIONS: [u32; 0] = [0u32; 0]; -struct PositionComputer<'a> { + + + +struct PositionComputer { // store the amount of position int // before reading positions. // @@ -17,12 +21,12 @@ struct PositionComputer<'a> { // the positions vec. position_to_skip: Option, positions: Vec, - positions_stream: CompressedIntStream<'a>, + positions_stream: CompressedIntStream, } -impl<'a> PositionComputer<'a> { +impl PositionComputer { - pub fn new(positions_stream: CompressedIntStream<'a>) -> PositionComputer<'a> { + pub fn new(positions_stream: CompressedIntStream) -> PositionComputer { PositionComputer { position_to_skip: None, positions: vec!(), @@ -64,25 +68,25 @@ impl<'a> PositionComputer<'a> { /// /// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded. /// Positions on the other hand, are optionally entirely decoded upfront. -pub struct SegmentPostings<'a> { - block_cursor: BlockSegmentPostings<'a>, +pub struct SegmentPostings { + block_cursor: BlockSegmentPostings, cur: usize, delete_bitset: DeleteBitSet, - position_computer: Option>>, + position_computer: Option>, } -impl<'a> SegmentPostings<'a> { +impl SegmentPostings { /// Reads a Segment postings from an &[u8] /// /// * `len` - number of document in the posting lists. /// * `data` - data array. The complete data is not necessarily used. /// * `freq_handler` - the freq handler is in charge of decoding /// frequencies and/or positions - pub fn from_block_postings(segment_block_postings: BlockSegmentPostings<'a>, + pub fn from_block_postings(segment_block_postings: BlockSegmentPostings, delete_bitset: DeleteBitSet, - positions_stream_opt: Option>) - -> SegmentPostings<'a> { + positions_stream_opt: Option) + -> SegmentPostings { let position_computer = positions_stream_opt.map(|stream| { UnsafeCell::new(PositionComputer::new(stream)) }); @@ -95,7 +99,7 @@ impl<'a> SegmentPostings<'a> { } /// Returns an empty segment postings object - pub fn empty() -> SegmentPostings<'a> { + pub fn empty() -> SegmentPostings { let empty_block_cursor = BlockSegmentPostings::empty(); SegmentPostings { block_cursor: empty_block_cursor, @@ -117,7 +121,7 @@ impl<'a> SegmentPostings<'a> { } -impl<'a> DocSet for SegmentPostings<'a> { +impl DocSet for SegmentPostings { // goes to the next element. // next needs to be called a first time to point to the correct element. 
#[inline] @@ -259,13 +263,13 @@ impl<'a> DocSet for SegmentPostings<'a> { } } -impl<'a> HasLen for SegmentPostings<'a> { +impl HasLen for SegmentPostings { fn len(&self) -> usize { self.block_cursor.doc_freq() } } -impl<'a> Postings for SegmentPostings<'a> { +impl Postings for SegmentPostings { fn term_freq(&self) -> u32 { self.block_cursor.freq(self.cur) } @@ -286,6 +290,7 @@ impl<'a> Postings for SegmentPostings<'a> { } + /// `BlockSegmentPostings` is a cursor iterating over blocks /// of documents. /// @@ -293,7 +298,7 @@ impl<'a> Postings for SegmentPostings<'a> { /// /// While it is useful for some very specific high-performance /// use cases, you should prefer using `SegmentPostings` for most usage. -pub struct BlockSegmentPostings<'a> { +pub struct BlockSegmentPostings { doc_decoder: BlockDecoder, freq_decoder: BlockDecoder, has_freq: bool, @@ -302,14 +307,14 @@ pub struct BlockSegmentPostings<'a> { doc_offset: DocId, num_binpacked_blocks: usize, num_vint_docs: usize, - remaining_data: &'a [u8], + remaining_data: SourceRead, } -impl<'a> BlockSegmentPostings<'a> { +impl BlockSegmentPostings { pub(crate) fn from_data(doc_freq: usize, - data: &'a [u8], + data: SourceRead, has_freq: bool) - -> BlockSegmentPostings<'a> { + -> BlockSegmentPostings { let num_binpacked_blocks: usize = (doc_freq as usize) / NUM_DOCS_PER_BLOCK; let num_vint_docs = (doc_freq as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks; BlockSegmentPostings { @@ -337,7 +342,7 @@ impl<'a> BlockSegmentPostings<'a> { // # Warning // // This does not reset the positions list. - pub(crate) fn reset(&mut self, doc_freq: usize, postings_data: &'a [u8]) { + pub(crate) fn reset(&mut self, doc_freq: usize, postings_data: SourceRead) { let num_binpacked_blocks: usize = doc_freq / NUM_DOCS_PER_BLOCK; let num_vint_docs = doc_freq & (NUM_DOCS_PER_BLOCK - 1); self.num_binpacked_blocks = num_binpacked_blocks; @@ -398,25 +403,30 @@ impl<'a> BlockSegmentPostings<'a> { pub fn advance(&mut self) -> bool { if self.num_binpacked_blocks > 0 { // TODO could self.doc_offset be just a local variable? - self.remaining_data = - self.doc_decoder - .uncompress_block_sorted(self.remaining_data, self.doc_offset); + + let num_consumed_bytes = self + .doc_decoder + .uncompress_block_sorted(self.remaining_data.as_ref(), self.doc_offset); + self.remaining_data.advance(num_consumed_bytes); + if self.has_freq { - self.remaining_data = self.freq_decoder.uncompress_block_unsorted(self.remaining_data); + let num_consumed_bytes = self.freq_decoder.uncompress_block_unsorted(self.remaining_data.as_ref()); + self.remaining_data.advance(num_consumed_bytes); } // it will be used as the next offset. 
self.doc_offset = self.doc_decoder.output(NUM_DOCS_PER_BLOCK - 1); self.num_binpacked_blocks -= 1; true } else if self.num_vint_docs > 0 { - self.remaining_data = + let num_compressed_bytes = self.doc_decoder - .uncompress_vint_sorted(self.remaining_data, + .uncompress_vint_sorted(self.remaining_data.as_ref(), self.doc_offset, self.num_vint_docs); + self.remaining_data.advance(num_compressed_bytes); if self.has_freq { self.freq_decoder - .uncompress_vint_unsorted(self.remaining_data, self.num_vint_docs); + .uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs); } self.num_vint_docs = 0; true @@ -426,7 +436,7 @@ impl<'a> BlockSegmentPostings<'a> { } /// Returns an empty segment postings object - pub fn empty() -> BlockSegmentPostings<'static> { + pub fn empty() -> BlockSegmentPostings { BlockSegmentPostings { num_binpacked_blocks: 0, num_vint_docs: 0, @@ -435,14 +445,14 @@ impl<'a> BlockSegmentPostings<'a> { freq_decoder: BlockDecoder::with_val(1), has_freq: false, - remaining_data: &EMPTY_DATA, + remaining_data: From::from(ReadOnlySource::empty()), doc_offset: 0, doc_freq: 0, } } } -impl<'a, 'b> Streamer<'b> for BlockSegmentPostings<'a> { +impl<'b> Streamer<'b> for BlockSegmentPostings { type Item = &'b [DocId]; fn next(&'b mut self) -> Option<&'b [DocId]> { @@ -498,10 +508,11 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let segment_reader = searcher.segment_reader(0); + let field_reader = segment_reader.field_reader(int_field).unwrap(); let term = Term::from_field_u64(int_field, 0u64); - let term_info = segment_reader.get_term_info(&term).unwrap(); + let term_info = field_reader.get_term_info(&term).unwrap(); let mut block_segments = - segment_reader + field_reader .read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq); let mut offset: u32 = 0u32; // checking that the block before calling advance is empty @@ -538,17 +549,19 @@ mod tests { let mut block_segments; { let term = Term::from_field_u64(int_field, 0u64); - let term_info = segment_reader.get_term_info(&term).unwrap(); + let field_reader = segment_reader.field_reader(int_field).unwrap(); + let term_info = field_reader.get_term_info(&term).unwrap(); block_segments = - segment_reader + field_reader .read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq); } assert!(block_segments.advance()); assert!(block_segments.docs() == &[0, 2, 4]); { let term = Term::from_field_u64(int_field, 1u64); - let term_info = segment_reader.get_term_info(&term).unwrap(); - segment_reader.reset_block_postings_from_terminfo(&term_info, &mut block_segments); + let field_reader = segment_reader.field_reader(int_field).unwrap(); + let term_info = field_reader.get_term_info(&term).unwrap(); + field_reader.reset_block_postings_from_terminfo(&term_info, &mut block_segments); } assert!(block_segments.advance()); assert!(block_segments.docs() == &[1, 3, 5]); diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index 1726340d1..d9c887afb 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -5,12 +5,12 @@ use postings::Postings; use postings::IntersectionDocSet; use DocId; -pub struct PhraseScorer<'a> { - pub intersection_docset: IntersectionDocSet>, +pub struct PhraseScorer { + pub intersection_docset: IntersectionDocSet, } -impl<'a> PhraseScorer<'a> { +impl PhraseScorer { fn phrase_match(&self) -> bool { let mut positions_arr: Vec<&[u32]> = self.intersection_docset .docsets() 
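
[Note] The segment-postings hunks above swap the borrowed `&'a [u8]` postings data for an owned `SourceRead`: each decode step now returns how many bytes it consumed and the cursor advances its own window, which is why the `'a` lifetime disappears from `SegmentPostings`, `BlockSegmentPostings`, and `PhraseScorer`. A minimal sketch of that owned-cursor pattern follows, using hypothetical `OwnedBytes` and `decode_block` stand-ins rather than tantivy's actual types:

```rust
/// Sketch of an owned byte cursor over a toy length-prefixed block format;
/// `SourceRead` plays this role in the patch above.
struct OwnedBytes {
    data: Vec<u8>,
    offset: usize,
}

impl OwnedBytes {
    fn as_slice(&self) -> &[u8] {
        &self.data[self.offset..]
    }

    /// Moves the window forward by the number of bytes a decoder consumed.
    fn advance(&mut self, num_bytes: usize) {
        self.offset += num_bytes;
    }
}

/// Toy decoder: reads one length-prefixed block and returns the decoded
/// bytes together with the number of input bytes it consumed.
fn decode_block(bytes: &[u8]) -> (Vec<u8>, usize) {
    let len = bytes[0] as usize;
    (bytes[1..1 + len].to_vec(), 1 + len)
}

fn next_block(cursor: &mut OwnedBytes) -> Option<Vec<u8>> {
    if cursor.as_slice().is_empty() {
        return None;
    }
    let (block, num_consumed_bytes) = decode_block(cursor.as_slice());
    // Because the cursor owns its data, no lifetime ties the postings
    // object to the buffer it was created from.
    cursor.advance(num_consumed_bytes);
    Some(block)
}

fn main() {
    // Two blocks: [1, 2] and [3].
    let mut cursor = OwnedBytes {
        data: vec![2, 1, 2, 1, 3],
        offset: 0,
    };
    while let Some(block) = next_block(&mut cursor) {
        println!("{:?}", block);
    }
}
```
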
@@ -54,7 +54,7 @@ impl<'a> PhraseScorer<'a> { } } -impl<'a> DocSet for PhraseScorer<'a> { +impl DocSet for PhraseScorer { fn advance(&mut self) -> bool { while self.intersection_docset.advance() { if self.phrase_match() { @@ -74,7 +74,7 @@ impl<'a> DocSet for PhraseScorer<'a> { } -impl<'a> Scorer for PhraseScorer<'a> { +impl Scorer for PhraseScorer { fn score(&self) -> f32 { 1f32 } diff --git a/src/query/phrase_query/phrase_weight.rs b/src/query/phrase_query/phrase_weight.rs index a171b4160..2e9efd463 100644 --- a/src/query/phrase_query/phrase_weight.rs +++ b/src/query/phrase_query/phrase_weight.rs @@ -22,8 +22,9 @@ impl Weight for PhraseWeight { fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result> { let mut term_postings_list = Vec::new(); for term in &self.phrase_terms { + let field_reader = reader.field_reader(term.field())?; let term_postings_option = - reader.read_postings(term, SegmentPostingsOption::FreqAndPositions); + field_reader.read_postings(term, SegmentPostingsOption::FreqAndPositions); if let Some(term_postings) = term_postings_option { term_postings_list.push(term_postings); } else { diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 99bfa7d47..65f56b054 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -27,12 +27,14 @@ impl TermWeight { 1.0 + (self.num_docs as f32 / (self.doc_freq as f32 + 1.0)).ln() } - pub fn specialized_scorer<'a>(&self, - reader: &'a SegmentReader) - -> Result>> { + pub fn specialized_scorer(&self, + reader: &SegmentReader) + -> Result> { let field = self.term.field(); + let field_reader = reader.field_reader(field)?; + // TODO move field reader too let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field); - let postings: Option> = reader.read_postings(&self.term, self.segment_postings_options); + let postings: Option = field_reader.read_postings(&self.term, self.segment_postings_options); Ok(postings .map(|segment_postings| { TermScorer { diff --git a/src/termdict/merger.rs b/src/termdict/merger.rs index 4689e0673..e7b4b392c 100644 --- a/src/termdict/merger.rs +++ b/src/termdict/merger.rs @@ -1,11 +1,8 @@ use std::collections::BinaryHeap; -use core::SegmentReader; use termdict::TermStreamerImpl; use common::BinarySerializable; -use postings::TermInfo; use std::cmp::Ordering; use termdict::TermStreamer; -use termdict::TermDictionary; use schema::Term; pub struct HeapItem<'a, V> @@ -58,7 +55,7 @@ pub struct TermMerger<'a, V> impl<'a, V> TermMerger<'a, V> where V: 'a + BinarySerializable + Default { - fn new(streams: Vec>) -> TermMerger<'a, V> { + pub fn new(streams: Vec>) -> TermMerger<'a, V> { TermMerger { heap: BinaryHeap::new(), current_streamers: streams @@ -141,12 +138,3 @@ impl<'a, V> TermMerger<'a, V> } - -impl<'a> From<&'a [SegmentReader]> for TermMerger<'a, TermInfo> { - fn from(segment_readers: &'a [SegmentReader]) -> TermMerger<'a, TermInfo> { - TermMerger::new(segment_readers - .iter() - .map(|reader| reader.terms().stream()) - .collect()) - } -} diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index c4786f539..f045eb1f7 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -282,9 +282,6 @@ mod tests { assert!(!stream.advance()); } - - - #[test] fn test_term_iterator() { let mut schema_builder = SchemaBuilder::default(); @@ -319,13 +316,16 @@ mod tests { } index.load_searchers().unwrap(); let searcher = index.searcher(); - let mut term_it = searcher.terms(); + + let field_searcher = searcher.field(text_field).unwrap(); + let mut 
term_it = field_searcher.terms(); let mut term_string = String::new(); while term_it.advance() { let term = Term::from_bytes(term_it.key()); term_string.push_str(term.text()); } assert_eq!(&*term_string, "abcdef"); + } From 8d466b8a7638c0cfe236d16dba84a8211590b9be Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 13 Aug 2017 18:39:45 +0900 Subject: [PATCH 13/29] half way through removing FastFieldsReader --- src/core/searcher.rs | 1 - src/core/segment_reader.rs | 40 ++++++++++++------------- src/fastfield/mod.rs | 45 +++++++++++++++++----------- src/fastfield/reader.rs | 60 +++++++++----------------------------- src/indexer/merger.rs | 4 ++- 5 files changed, 63 insertions(+), 87 deletions(-) diff --git a/src/core/searcher.rs b/src/core/searcher.rs index d84ad22a3..eeac26072 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -23,7 +23,6 @@ pub struct Searcher { segment_readers: Vec, } - impl Searcher { /// Fetches a document from tantivy's store given a `DocAddress`. /// diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 336496018..e3d203488 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -17,7 +17,7 @@ use common::CompositeFile; use std::fmt; use core::FieldReader; use schema::Field; -use fastfield::{FastFieldsReader, FastFieldReader, U64FastFieldReader}; +use fastfield::{FastFieldReader, U64FastFieldReader}; use schema::Schema; @@ -43,10 +43,10 @@ pub struct SegmentReader { termdict_composite: CompositeFile, postings_composite: CompositeFile, positions_composite: CompositeFile, + fast_fields_composite: CompositeFile, + fieldnorms_composite: CompositeFile, store_reader: StoreReader, - fast_fields_reader: Arc, - fieldnorms_reader: Arc, delete_bitset: DeleteBitSet, schema: Schema, } @@ -75,11 +75,6 @@ impl SegmentReader { self.delete_bitset.len() as DocId } - #[doc(hidden)] - pub fn fast_fields_reader(&self) -> &FastFieldsReader { - &*self.fast_fields_reader - } - /// Accessor to a segment's fast field reader given a field. /// /// Returns the u64 fast value reader if the field @@ -91,16 +86,17 @@ impl SegmentReader { /// # Panics /// May panic if the index is corrupted. pub fn get_fast_field_reader - (&self, - field: Field) - -> fastfield::Result { + (&self, field: Field) -> fastfield::Result { let field_entry = self.schema.get_field_entry(field); if !TFastFieldReader::is_enabled(field_entry.field_type()) { Err(FastFieldNotAvailableError::new(field_entry)) } else { - Ok(self.fast_fields_reader - .open_reader(field) - .expect("Fast field file corrupted.")) + self.fast_fields_composite + .open_read(field) + .ok_or_else(|| { + FastFieldNotAvailableError::new(field_entry) + }) + .map(TFastFieldReader::open) } } @@ -113,7 +109,9 @@ impl SegmentReader { /// They are simply stored as a fast field, serialized in /// the `.fieldnorm` file of the segment. pub fn get_fieldnorms_reader(&self, field: Field) -> Option { - self.fieldnorms_reader.open_reader(field) + self.fieldnorms_composite + .open_read(field) + .map(U64FastFieldReader::open) } /// Accessor to the segment's `StoreReader`. 
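
[Note] This patch routes fast fields and fieldnorms through the same per-field composite layout already used by `termdict_composite`, `postings_composite`, and `positions_composite`: one component file whose footer maps each field to a byte range, with the per-field reader built from the slice returned by `open_read`. A minimal sketch of that lookup follows, using a simplified `Composite` struct and plain `u32` field ids in place of tantivy's `CompositeFile`, `Field`, and `ReadOnlySource`:

```rust
use std::collections::HashMap;

/// Sketch of a composite container: every field's data lives in one buffer,
/// and an offset table (the "footer") records each field's byte range.
struct Composite {
    data: Vec<u8>,
    offsets: HashMap<u32, (usize, usize)>, // field id -> (start, stop)
}

impl Composite {
    /// Returns the slice holding one field's data, if that field was written.
    fn open_read(&self, field: u32) -> Option<&[u8]> {
        self.offsets
            .get(&field)
            .map(|&(start, stop)| &self.data[start..stop])
    }
}

fn main() {
    let composite = Composite {
        data: vec![13, 14, 2, 9, 9],
        offsets: [(0u32, (0usize, 3usize)), (1u32, (3, 5))]
            .iter()
            .cloned()
            .collect(),
    };
    // Mirrors `U64FastFieldReader::open(composite.open_read(field)?)`:
    // the field-specific reader is built from that field's slice alone.
    if let Some(field_data) = composite.open_read(0) {
        println!("field 0 bytes: {:?}", field_data);
    }
    // A missing field yields None, which the segment reader above maps
    // to a `FastFieldNotAvailableError`.
    assert!(composite.open_read(7).is_none());
}
```
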
@@ -143,11 +141,11 @@ impl SegmentReader { }; - let fast_field_data = segment.open_read(SegmentComponent::FASTFIELDS)?; - let fast_fields_reader = FastFieldsReader::from_source(fast_field_data)?; + let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?; + let fast_fields_composite = CompositeFile::open(fast_fields_data)?; let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?; - let fieldnorms_reader = FastFieldsReader::from_source(fieldnorms_data)?; + let fieldnorms_composite = CompositeFile::open(fieldnorms_data)?; let delete_bitset = if segment.meta().has_deletes() { @@ -161,12 +159,12 @@ impl SegmentReader { Ok(SegmentReader { field_reader_cache: Arc::new(RwLock::new(HashMap::new())), segment_meta: segment.meta().clone(), - postings_composite: postings_composite, termdict_composite: termdict_composite, + postings_composite: postings_composite, + fast_fields_composite: fast_fields_composite, + fieldnorms_composite: fieldnorms_composite, segment_id: segment.id(), store_reader: store_reader, - fast_fields_reader: Arc::new(fast_fields_reader), - fieldnorms_reader: Arc::new(fieldnorms_reader), delete_bitset: delete_bitset, positions_composite: positions_composite, schema: schema, diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 31b241388..cff0f151e 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -32,7 +32,7 @@ mod delete; pub use self::delete::write_delete_bitset; pub use self::delete::DeleteBitSet; pub use self::writer::{FastFieldsWriter, IntFastFieldWriter}; -pub use self::reader::{FastFieldsReader, U64FastFieldReader, I64FastFieldReader}; +pub use self::reader::{U64FastFieldReader, I64FastFieldReader}; pub use self::reader::FastFieldReader; pub use self::serializer::FastFieldSerializer; pub use self::error::{Result, FastFieldNotAvailableError}; @@ -51,6 +51,7 @@ mod tests { use fastfield::FastFieldReader; use rand::Rng; use rand::SeedableRng; + use common::CompositeFile; use rand::XorShiftRng; lazy_static! 
{ @@ -96,6 +97,8 @@ mod tests { { assert_eq!(source.len(), 35 as usize); } + // TODO uncomment + /* { let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); let fast_field_reader: U64FastFieldReader = @@ -104,6 +107,7 @@ mod tests { assert_eq!(fast_field_reader.get(1), 14u64); assert_eq!(fast_field_reader.get(2), 2u64); } + */ } #[test] @@ -131,9 +135,9 @@ mod tests { assert_eq!(source.len(), 60 as usize); } { - let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); + let fast_fields_composite = CompositeFile::open(source).unwrap(); let fast_field_reader: U64FastFieldReader = - fast_field_readers.open_reader(*FIELD).unwrap(); + U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap()); assert_eq!(fast_field_reader.get(0), 4u64); assert_eq!(fast_field_reader.get(1), 14_082_001u64); assert_eq!(fast_field_reader.get(2), 3_052u64); @@ -167,9 +171,9 @@ mod tests { assert_eq!(source.len(), 33 as usize); } { - let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); + let fast_fields_composite = CompositeFile::open(source).unwrap(); let fast_field_reader: U64FastFieldReader = - fast_field_readers.open_reader(*FIELD).unwrap(); + U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap()); for doc in 0..10_000 { assert_eq!(fast_field_reader.get(doc), 100_000u64); } @@ -200,9 +204,10 @@ mod tests { assert_eq!(source.len(), 80041 as usize); } { - let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); + let fast_fields_composite = CompositeFile::open(source).unwrap(); let fast_field_reader: U64FastFieldReader = - fast_field_readers.open_reader(*FIELD).unwrap(); + U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap()); + assert_eq!(fast_field_reader.get(0), 0u64); for doc in 1..10_001 { assert_eq!(fast_field_reader.get(doc), @@ -236,9 +241,10 @@ mod tests { assert_eq!(source.len(), 17708 as usize); } { - let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); + let fast_fields_composite = CompositeFile::open(source).unwrap(); let fast_field_reader: I64FastFieldReader = - fast_field_readers.open_reader(i64_field).unwrap(); + I64FastFieldReader::open(fast_fields_composite.open_read(i64_field).unwrap()); + assert_eq!(fast_field_reader.min_value(), -100i64); assert_eq!(fast_field_reader.max_value(), 9_999i64); for (doc, i) in (-100i64..10_000i64).enumerate() { @@ -272,9 +278,10 @@ mod tests { let source = directory.open_read(&path).unwrap(); { - let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); + + let fast_fields_composite = CompositeFile::open(source).unwrap(); let fast_field_reader: I64FastFieldReader = - fast_field_readers.open_reader(i64_field).unwrap(); + I64FastFieldReader::open(fast_fields_composite.open_read(i64_field).unwrap()); assert_eq!(fast_field_reader.get(0u32), 0i64); } } @@ -305,9 +312,10 @@ mod tests { } let source = directory.open_read(&path).unwrap(); { - let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); + let fast_fields_composite = CompositeFile::open(source).unwrap(); let fast_field_reader: U64FastFieldReader = - fast_field_readers.open_reader(*FIELD).unwrap(); + U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap()); + let mut a = 0u64; for _ in 0..n { assert_eq!(fast_field_reader.get(a as u32), permutation[a as usize]); @@ -359,9 +367,11 @@ mod tests { } let source = directory.open_read(&path).unwrap(); { - let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); 
+ let fast_fields_composite = CompositeFile::open(source).unwrap(); let fast_field_reader: U64FastFieldReader = - fast_field_readers.open_reader(*FIELD).unwrap(); + U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap()); + + b.iter(|| { let n = test::black_box(7000u32); let mut a = 0u64; @@ -390,9 +400,10 @@ mod tests { } let source = directory.open_read(&path).unwrap(); { - let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); + let fast_fields_composite = CompositeFile::open(source).unwrap(); let fast_field_reader: U64FastFieldReader = - fast_field_readers.open_reader(*FIELD).unwrap(); + U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap()); + b.iter(|| { let n = test::black_box(1000u32); let mut a = 0u32; diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 2ec8f66fc..45848293d 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -148,20 +148,24 @@ impl From> for U64FastFieldReader { fast_field_writers.serialize(&mut serializer).unwrap(); serializer.close().unwrap(); } + panic!("TODO fix me"); + /* directory .open_read(path) .chain_err(|| "Failed to open the file") .and_then(|source| { - FastFieldsReader::from_source(source) - .chain_err(|| "Failed to read the file.") - }) - .and_then(|ff_readers| { - ff_readers - .open_reader(field) - .ok_or_else(|| "Failed to find the requested field".into()) - }) - .expect("This should never happen, please report.") + CompositeFile::open(source) + .chain_err(|| "Failed to read the file.") + }) + .and_then(|composite_file| { + composite_file.open_read(field) + ff_readers + .open_reader(field) + .ok_or_else(|| "Failed to find the requested field".into()) + }) + .expect("This should never happen, please report.") + */ } } @@ -231,41 +235,3 @@ impl FastFieldReader for I64FastFieldReader { } } } - - - -/// The `FastFieldsReader` is the datastructure containing -/// all of the fast fields' data. -/// -/// It contains a mapping that associated these fields to -/// the proper slice in the fastfield reader file. -pub struct FastFieldsReader { - composite_file: CompositeFile, -} - -impl FastFieldsReader { - /// Opens a `FastFieldsReader` - /// - /// When opening the fast field reader, the - /// the list of the offset is read (as a footer of the - /// data file). - pub fn from_source(source: ReadOnlySource) -> io::Result { - Ok(FastFieldsReader { - composite_file: CompositeFile::open(source)?, - }) - } - - /// Returns the u64 fast value reader if the field - /// is a u64 field indexed as "fast". - /// - /// Return None if the field is not a u64 field - /// indexed with the fast option. - /// - /// # Panics - /// May panic if the index is corrupted. 
- pub fn open_reader(&self, field: Field) -> Option { - self.composite_file - .open_read(field) - .map(FFReader::open) - } -} diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 48aa695aa..2d2385cb9 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -58,7 +58,9 @@ fn extract_fieldnorm_reader(segment_reader: &SegmentReader, fn extract_fast_field_reader(segment_reader: &SegmentReader, field: Field) -> Option { - segment_reader.fast_fields_reader().open_reader(field) + segment_reader.get_fast_field_reader(field) + .ok() + } struct DeltaComputer { From 9cb7a0f6e6da027456814ddbeb41173ea3b5646e Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 13 Aug 2017 19:38:25 +0900 Subject: [PATCH 14/29] Unit tests passing --- src/core/field_reader.rs | 1 - src/fastfield/reader.rs | 32 +++++++++++--------------------- src/indexer/merger.rs | 6 +----- 3 files changed, 12 insertions(+), 27 deletions(-) diff --git a/src/core/field_reader.rs b/src/core/field_reader.rs index bead5bb80..eaf35514b 100644 --- a/src/core/field_reader.rs +++ b/src/core/field_reader.rs @@ -26,7 +26,6 @@ impl FieldReader { positions_source: ReadOnlySource, delete_bitset: DeleteBitSet, schema: Schema, - ) -> io::Result { Ok(FieldReader { diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 45848293d..b67510524 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -1,9 +1,7 @@ -use std::io; use directory::ReadOnlySource; -use common::CompositeFile; use common::BinarySerializable; use DocId; -use schema::{Field, SchemaBuilder}; +use schema::{SchemaBuilder}; use std::path::Path; use schema::FAST; use directory::{WritePtr, RAMDirectory, Directory}; @@ -12,8 +10,8 @@ use fastfield::FastFieldsWriter; use common::bitpacker::compute_num_bits; use common::bitpacker::BitUnpacker; use schema::FieldType; -use error::ResultExt; use std::mem; +use common::CompositeFile; use common; use owning_ref::OwningRef; @@ -148,24 +146,16 @@ impl From> for U64FastFieldReader { fast_field_writers.serialize(&mut serializer).unwrap(); serializer.close().unwrap(); } - panic!("TODO fix me"); - /* - directory - .open_read(path) - .chain_err(|| "Failed to open the file") - .and_then(|source| { - CompositeFile::open(source) - .chain_err(|| "Failed to read the file.") - }) - .and_then(|composite_file| { - composite_file.open_read(field) - ff_readers - .open_reader(field) - .ok_or_else(|| "Failed to find the requested field".into()) - }) - .expect("This should never happen, please report.") - */ + let source = directory + .open_read(path) + .expect("Failed to open the file"); + let composite_file = CompositeFile::open(source) + .expect("Failed to read the composite file"); + + let field_source = composite_file.open_read(field) + .expect("File component not found"); + U64FastFieldReader::open(field_source) } } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 2d2385cb9..87a2f1ed5 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -217,7 +217,7 @@ impl IndexMerger { let field_term_streams = field_readers .iter() - .map(|field_reader| field_reader.terms().stream() ) + .map(|field_reader| field_reader.terms().stream()) .collect(); let mut merged_terms = TermMerger::new(field_term_streams); @@ -298,7 +298,6 @@ impl IndexMerger { // We can now serialize this postings, by pushing each document to the // postings serializer. 
- for (segment_ord, mut segment_postings) in segment_postings { let old_to_new_doc_id = &merged_doc_id_map[segment_ord]; loop { @@ -329,9 +328,6 @@ impl IndexMerger { field_serializer.close()?; } - /* - - */ Ok(()) } From 38513014d5ff2b3d69a9a0006f697c7c7664f114 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 14 Aug 2017 23:35:09 +0900 Subject: [PATCH 15/29] Reenable unit test. Consuming CompositeWrite on Close. --- src/common/composite_file.rs | 2 +- src/fastfield/mod.rs | 8 +++----- src/fastfield/serializer.rs | 2 +- src/postings/serializer.rs | 5 +---- 4 files changed, 6 insertions(+), 11 deletions(-) diff --git a/src/common/composite_file.rs b/src/common/composite_file.rs index b092f0bce..39db0bbcb 100644 --- a/src/common/composite_file.rs +++ b/src/common/composite_file.rs @@ -27,7 +27,7 @@ impl CompositeWrite { &mut self.write } - pub fn close(&mut self) -> io::Result<()> { + pub fn close(mut self) -> io::Result<()> { let footer_offset = self.write.written_bytes(); VInt(self.offsets.len() as u64).serialize(&mut self.write)?; diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index cff0f151e..fc69f6ddf 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -97,17 +97,15 @@ mod tests { { assert_eq!(source.len(), 35 as usize); } - // TODO uncomment - /* { - let fast_field_readers = FastFieldsReader::from_source(source).unwrap(); + let composite_file = CompositeFile::open(source).unwrap(); + let field_source = composite_file.open_read(*FIELD).unwrap(); let fast_field_reader: U64FastFieldReader = - fast_field_readers.open_reader(*FIELD).unwrap(); + U64FastFieldReader::open(field_source); assert_eq!(fast_field_reader.get(0), 13u64); assert_eq!(fast_field_reader.get(1), 14u64); assert_eq!(fast_field_reader.get(2), 2u64); } - */ } #[test] diff --git a/src/fastfield/serializer.rs b/src/fastfield/serializer.rs index 590aee84a..62dd7cba1 100644 --- a/src/fastfield/serializer.rs +++ b/src/fastfield/serializer.rs @@ -59,7 +59,7 @@ impl FastFieldSerializer { /// Closes the serializer /// /// After this call the data must be persistently save on disk. - pub fn close(mut self) -> io::Result<()> { + pub fn close(self) -> io::Result<()> { self.composite_write.close() } } diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 087baed3d..76c1f1f6b 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -112,7 +112,7 @@ impl InvertedIndexSerializer { } /// Closes the serializer. - pub fn close(mut self) -> io::Result<()> { + pub fn close(self) -> io::Result<()> { self.terms_write.close()?; self.postings_write.close()?; self.positions_write.close()?; @@ -223,9 +223,6 @@ impl<'a> FieldSerializer<'a> { } } -// TODO is the last term always closed? 
- - struct PostingsSerializer { postings_write: CountingWriter, From eb5b2ffdcc927b2fb38d94d8b15b5b0618d39262 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 15 Aug 2017 13:57:22 +0900 Subject: [PATCH 16/29] Cleanups --- src/common/composite_file.rs | 5 ++++- src/common/counting_writer.rs | 2 +- src/core/searcher.rs | 3 +++ src/fastfield/mod.rs | 18 +++++++++--------- src/fastfield/reader.rs | 2 +- src/fastfield/serializer.rs | 4 +++- src/indexer/segment_serializer.rs | 4 ++-- src/postings/serializer.rs | 9 ++++----- src/termdict/mod.rs | 2 -- 9 files changed, 27 insertions(+), 22 deletions(-) diff --git a/src/common/composite_file.rs b/src/common/composite_file.rs index 39db0bbcb..60f9286df 100644 --- a/src/common/composite_file.rs +++ b/src/common/composite_file.rs @@ -3,11 +3,14 @@ use common::CountingWriter; use std::collections::HashMap; use schema::Field; use common::VInt; +use directory::WritePtr; use std::io; use directory::ReadOnlySource; use common::BinarySerializable; -pub struct CompositeWrite { + + +pub struct CompositeWrite { write: CountingWriter, offsets: HashMap, } diff --git a/src/common/counting_writer.rs b/src/common/counting_writer.rs index db13e368f..d9ea877d2 100644 --- a/src/common/counting_writer.rs +++ b/src/common/counting_writer.rs @@ -2,7 +2,7 @@ use std::io::Write; use std::io; -pub struct CountingWriter { +pub struct CountingWriter { underlying: W, written_bytes: usize, } diff --git a/src/core/searcher.rs b/src/core/searcher.rs index eeac26072..e8e54475a 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -71,6 +71,8 @@ impl Searcher { query.search(self, collector) } + + // This API may change in the future. pub fn field(&self, field: Field) -> Result { let field_readers = self.segment_readers .iter() @@ -84,6 +86,7 @@ impl Searcher { + pub struct FieldSearcher { field_readers: Vec>, } diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index fc69f6ddf..cce503f21 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -85,7 +85,7 @@ mod tests { let mut directory: RAMDirectory = RAMDirectory::create(); { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::new(write).unwrap(); + let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); add_single_field_doc(&mut fast_field_writers, *FIELD, 13u64); add_single_field_doc(&mut fast_field_writers, *FIELD, 14u64); @@ -114,7 +114,7 @@ mod tests { let mut directory: RAMDirectory = RAMDirectory::create(); { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::new(write).unwrap(); + let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); add_single_field_doc(&mut fast_field_writers, *FIELD, 4u64); add_single_field_doc(&mut fast_field_writers, *FIELD, 14_082_001u64); @@ -156,7 +156,7 @@ mod tests { { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::new(write).unwrap(); + let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); for _ in 0..10_000 { add_single_field_doc(&mut fast_field_writers, *FIELD, 100_000u64); @@ -185,7 +185,7 @@ mod tests { { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = 
FastFieldSerializer::new(write).unwrap(); + let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); // forcing the amplitude to be high add_single_field_doc(&mut fast_field_writers, *FIELD, 0u64); @@ -224,7 +224,7 @@ mod tests { let schema = schema_builder.build(); { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::new(write).unwrap(); + let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); for i in -100i64..10_000i64 { let mut doc = Document::default(); @@ -266,7 +266,7 @@ mod tests { { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::new(write).unwrap(); + let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); let doc = Document::default(); fast_field_writers.add_document(&doc); @@ -300,7 +300,7 @@ mod tests { let mut directory = RAMDirectory::create(); { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::new(write).unwrap(); + let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); for x in &permutation { add_single_field_doc(&mut fast_field_writers, *FIELD, *x); @@ -355,7 +355,7 @@ mod tests { let mut directory: RAMDirectory = RAMDirectory::create(); { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::new(write).unwrap(); + let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); for x in &permutation { add_single_field_doc(&mut fast_field_writers, *FIELD, *x); @@ -388,7 +388,7 @@ mod tests { let mut directory: RAMDirectory = RAMDirectory::create(); { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::new(write).unwrap(); + let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); for x in &permutation { add_single_field_doc(&mut fast_field_writers, *FIELD, *x); diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index b67510524..cdec175e2 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -137,7 +137,7 @@ impl From> for U64FastFieldReader { let mut directory: RAMDirectory = RAMDirectory::create(); { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::new(write).unwrap(); + let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); for val in vals { let mut fast_field_writer = fast_field_writers.get_field_writer(field).unwrap(); diff --git a/src/fastfield/serializer.rs b/src/fastfield/serializer.rs index 62dd7cba1..ce2184fde 100644 --- a/src/fastfield/serializer.rs +++ b/src/fastfield/serializer.rs @@ -31,8 +31,10 @@ pub struct FastFieldSerializer { } impl FastFieldSerializer { + + /// Constructor - pub fn new(write: WritePtr) -> io::Result { + pub fn from_write(write: WritePtr) -> io::Result { // just making room for the pointer to header. 
let composite_write = CompositeWrite::wrap(write); Ok(FastFieldSerializer { diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs index 76190bd9b..719c98c14 100644 --- a/src/indexer/segment_serializer.rs +++ b/src/indexer/segment_serializer.rs @@ -21,10 +21,10 @@ impl SegmentSerializer { let store_write = try!(segment.open_write(SegmentComponent::STORE)); let fast_field_write = try!(segment.open_write(SegmentComponent::FASTFIELDS)); - let fast_field_serializer = try!(FastFieldSerializer::new(fast_field_write)); + let fast_field_serializer = try!(FastFieldSerializer::from_write(fast_field_write)); let fieldnorms_write = try!(segment.open_write(SegmentComponent::FIELDNORMS)); - let fieldnorms_serializer = try!(FastFieldSerializer::new(fieldnorms_write)); + let fieldnorms_serializer = try!(FastFieldSerializer::from_write(fieldnorms_write)); let postings_serializer = try!(InvertedIndexSerializer::open(segment)); Ok(SegmentSerializer { diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 76c1f1f6b..410226e7a 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -76,11 +76,10 @@ impl InvertedIndexSerializer { pub fn open(segment: &mut Segment) -> Result { use SegmentComponent::{TERMS, POSTINGS, POSITIONS}; InvertedIndexSerializer::new( - CompositeWrite::wrap( - segment.open_write(TERMS)?), - CompositeWrite::wrap(segment.open_write(POSTINGS)?), - CompositeWrite::wrap(segment.open_write(POSITIONS)?), - segment.schema()) + CompositeWrite::wrap(segment.open_write(TERMS)?), + CompositeWrite::wrap(segment.open_write(POSTINGS)?), + CompositeWrite::wrap(segment.open_write(POSITIONS)?), + segment.schema()) } /// Must be called before starting pushing terms of diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index f045eb1f7..03a4ac0af 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -95,7 +95,6 @@ mod streamdict; pub use self::streamdict::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl, TermStreamerBuilderImpl}; - mod merger; use std::io; @@ -325,7 +324,6 @@ mod tests { term_string.push_str(term.text()); } assert_eq!(&*term_string, "abcdef"); - } From 2d70efb7b077c0c603c2b1760d9d06228d9a5ca8 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 15 Aug 2017 14:43:05 +0900 Subject: [PATCH 17/29] Removed trait boundary on termdict --- src/termdict/fstdict/streamer.rs | 6 ++---- src/termdict/fstdict/termdict.rs | 3 --- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/termdict/fstdict/streamer.rs b/src/termdict/fstdict/streamer.rs index 053942bf9..3e4dbe83f 100644 --- a/src/termdict/fstdict/streamer.rs +++ b/src/termdict/fstdict/streamer.rs @@ -5,8 +5,7 @@ use super::TermDictionaryImpl; use termdict::{TermStreamerBuilder, TermStreamer}; /// See [`TermStreamerBuilder`](./trait.TermStreamerBuilder.html) -pub struct TermStreamerBuilderImpl<'a, V> - where V: 'a + BinarySerializable + Default +pub struct TermStreamerBuilderImpl<'a, V: 'a> { fst_map: &'a TermDictionaryImpl, stream_builder: StreamBuilder<'a>, @@ -63,8 +62,7 @@ impl<'a, V> TermStreamerBuilder for TermStreamerBuilderImpl<'a, V> /// See [`TermStreamer`](./trait.TermStreamer.html) -pub struct TermStreamerImpl<'a, V> - where V: 'a + BinarySerializable + Default +pub struct TermStreamerImpl<'a, V: 'a> { fst_map: &'a TermDictionaryImpl, stream: Stream<'a>, diff --git a/src/termdict/fstdict/termdict.rs b/src/termdict/fstdict/termdict.rs index 78edaf203..1d29e0df6 100644 --- a/src/termdict/fstdict/termdict.rs +++ 
b/src/termdict/fstdict/termdict.rs @@ -14,8 +14,6 @@ fn convert_fst_error(e: fst::Error) -> io::Error { /// See [`TermDictionaryBuilder`](./trait.TermDictionaryBuilder.html) pub struct TermDictionaryBuilderImpl - where W: Write, - V: BinarySerializable + Default { fst_builder: fst::MapBuilder, data: Vec, @@ -96,7 +94,6 @@ fn open_fst_index(source: ReadOnlySource) -> io::Result { /// See [`TermDictionary`](./trait.TermDictionary.html) pub struct TermDictionaryImpl - where V: BinarySerializable + Default { fst_index: fst::Map, values_mmap: ReadOnlySource, From 744edb2c5c53fea0fc081071c105a9db81450841 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 16 Aug 2017 14:06:00 +0900 Subject: [PATCH 18/29] NOBUG Avoid serializing position offset when useless. Test passing --- Cargo.toml | 2 +- src/core/searcher.rs | 3 +- src/postings/serializer.rs | 60 ++++++------ src/termdict/fstdict/streamer.rs | 25 +++-- src/termdict/fstdict/termdict.rs | 36 +++---- src/termdict/merger.rs | 35 +++---- src/termdict/mod.rs | 145 ++++++++++++---------------- src/termdict/streamdict/mod.rs | 26 +++++ src/termdict/streamdict/streamer.rs | 86 ++++++++++------- src/termdict/streamdict/termdict.rs | 92 +++++++++++------- 10 files changed, 270 insertions(+), 240 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 845f1d31d..11711a088 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,7 +61,7 @@ debug-assertions = false [features] -default = ["simdcompression"] +default = ["simdcompression", "streamdict"] simdcompression = ["libc", "gcc"] streamdict = [] diff --git a/src/core/searcher.rs b/src/core/searcher.rs index e8e54475a..515b8fda6 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -11,7 +11,6 @@ use termdict::{TermMerger, TermDictionary}; use std::sync::Arc; use std::fmt; use core::FieldReader; -use postings::TermInfo; /// Holds a list of `SegmentReader`s ready for search. @@ -103,7 +102,7 @@ impl FieldSearcher { /// Returns a Stream over all of the sorted unique terms of /// for the given field. - pub fn terms(&self) -> TermMerger { + pub fn terms(&self) -> TermMerger { let term_streamers: Vec<_> = self.field_readers .iter() .map(|field_reader| { diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 410226e7a..caec58b1f 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -5,7 +5,6 @@ use schema::Field; use schema::FieldEntry; use schema::FieldType; use schema::Schema; -use schema::TextIndexingOptions; use directory::WritePtr; use compression::{NUM_DOCS_PER_BLOCK, BlockEncoder}; use DocId; @@ -88,22 +87,11 @@ impl InvertedIndexSerializer { /// Loads the indexing options for the given field. 
pub fn new_field(&mut self, field: Field) -> io::Result { let field_entry: &FieldEntry = self.schema.get_field_entry(field); - let text_indexing_options = match *field_entry.field_type() { - FieldType::Str(ref text_options) => text_options.get_indexing_options(), - FieldType::U64(ref int_options) | - FieldType::I64(ref int_options) => { - if int_options.is_indexed() { - TextIndexingOptions::Unindexed - } else { - TextIndexingOptions::Untokenized - } - } - }; let term_dictionary_write = self.terms_write.for_field(field); let postings_write = self.postings_write.for_field(field); let positions_write = self.positions_write.for_field(field); FieldSerializer::new( - text_indexing_options, + field_entry.field_type().clone(), term_dictionary_write, postings_write, positions_write @@ -121,10 +109,9 @@ impl InvertedIndexSerializer { pub struct FieldSerializer<'a> { - text_indexing_options: TextIndexingOptions, - term_dictionary_builder: TermDictionaryBuilderImpl<&'a mut CountingWriter, TermInfo>, + term_dictionary_builder: TermDictionaryBuilderImpl<&'a mut CountingWriter>, postings_serializer: PostingsSerializer<&'a mut CountingWriter>, - positions_serializer: PositionSerializer<&'a mut CountingWriter>, + positions_serializer_opt: Option>>, current_term_info: TermInfo, term_open: bool, } @@ -133,29 +120,46 @@ pub struct FieldSerializer<'a> { impl<'a> FieldSerializer<'a> { fn new( - text_indexing_options: TextIndexingOptions, + field_type: FieldType, term_dictionary_write: &'a mut CountingWriter, postings_write: &'a mut CountingWriter, - positions_write: &'a mut CountingWriter + positions_write: &'a mut CountingWriter, ) -> io::Result> { - let term_freq_enabled = text_indexing_options.is_termfreq_enabled(); - let term_dictionary_builder = TermDictionaryBuilderImpl::new(term_dictionary_write)?; + let (term_freq_enabled, position_enabled): (bool, bool) = + match field_type { + FieldType::Str(ref text_options) => { + let text_indexing_options = text_options.get_indexing_options(); + (text_indexing_options.is_termfreq_enabled(), text_indexing_options.is_position_enabled()) + }, + _ => { + (false, false) + } + }; + let term_dictionary_builder = TermDictionaryBuilderImpl::new(term_dictionary_write, field_type)?; let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled); - let positions_serializer = PositionSerializer::new(positions_write); + let positions_serializer_opt = + if position_enabled { + Some(PositionSerializer::new(positions_write)) + } + else { + None + }; Ok(FieldSerializer { - text_indexing_options: text_indexing_options, term_dictionary_builder: term_dictionary_builder, postings_serializer: postings_serializer, - positions_serializer: positions_serializer, + positions_serializer_opt: positions_serializer_opt, current_term_info: TermInfo::default(), term_open: false, }) } fn current_term_info(&self) -> TermInfo { - let (filepos, offset) = self.positions_serializer.addr(); + let (filepos, offset) = self.positions_serializer_opt + .as_ref() + .map(|positions_serializer| positions_serializer.addr()) + .unwrap_or((0u32, 0u8)); TermInfo { doc_freq: 0, postings_offset: self.postings_serializer.addr(), @@ -194,8 +198,8 @@ impl<'a> FieldSerializer<'a> { -> io::Result<()> { self.current_term_info.doc_freq += 1; self.postings_serializer.write_doc(doc_id, term_freq)?; - if self.text_indexing_options.is_position_enabled() { - self.positions_serializer.write(position_deltas)?; + if let Some(ref mut positions_serializer) = self.positions_serializer_opt.as_mut() { + 
positions_serializer.write(position_deltas)?; } Ok(()) } @@ -215,7 +219,9 @@ impl<'a> FieldSerializer<'a> { pub fn close(mut self) -> io::Result<()> { self.close_term()?; - self.positions_serializer.close()?; + if let Some(positions_serializer) = self.positions_serializer_opt { + positions_serializer.close()?; + } self.postings_serializer.close()?; self.term_dictionary_builder.finish()?; Ok(()) diff --git a/src/termdict/fstdict/streamer.rs b/src/termdict/fstdict/streamer.rs index 3e4dbe83f..823c0ba61 100644 --- a/src/termdict/fstdict/streamer.rs +++ b/src/termdict/fstdict/streamer.rs @@ -1,20 +1,21 @@ use fst::{IntoStreamer, Streamer}; use fst::map::{StreamBuilder, Stream}; use common::BinarySerializable; +use postings::TermInfo; use super::TermDictionaryImpl; use termdict::{TermStreamerBuilder, TermStreamer}; /// See [`TermStreamerBuilder`](./trait.TermStreamerBuilder.html) -pub struct TermStreamerBuilderImpl<'a, V: 'a> +pub struct TermStreamerBuilderImpl<'a> { - fst_map: &'a TermDictionaryImpl, + fst_map: &'a TermDictionaryImpl, stream_builder: StreamBuilder<'a>, } -impl<'a, V> TermStreamerBuilderImpl<'a, V> +impl<'a, V> TermStreamerBuilderImpl<'a> where V: 'a + BinarySerializable + Default { - pub(crate) fn new(fst_map: &'a TermDictionaryImpl, + pub(crate) fn new(fst_map: &'a TermDictionaryImpl, stream_builder: StreamBuilder<'a>) -> Self { TermStreamerBuilderImpl { @@ -24,10 +25,9 @@ impl<'a, V> TermStreamerBuilderImpl<'a, V> } } -impl<'a, V> TermStreamerBuilder for TermStreamerBuilderImpl<'a, V> - where V: 'a + BinarySerializable + Default +impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> { - type Streamer = TermStreamerImpl<'a, V>; + type Streamer = TermStreamerImpl<'a>; fn ge>(mut self, bound: T) -> Self { self.stream_builder = self.stream_builder.ge(bound); @@ -62,17 +62,16 @@ impl<'a, V> TermStreamerBuilder for TermStreamerBuilderImpl<'a, V> /// See [`TermStreamer`](./trait.TermStreamer.html) -pub struct TermStreamerImpl<'a, V: 'a> +pub struct TermStreamerImpl<'a> { - fst_map: &'a TermDictionaryImpl, + fst_map: &'a TermDictionaryImpl, stream: Stream<'a>, offset: u64, current_key: Vec, - current_value: V, + current_value: TermInfo, } -impl<'a, V> TermStreamer for TermStreamerImpl<'a, V> - where V: BinarySerializable + Default +impl<'a> TermStreamer for TermStreamerImpl<'a> { fn advance(&mut self) -> bool { if let Some((term, offset)) = self.stream.next() { @@ -93,7 +92,7 @@ impl<'a, V> TermStreamer for TermStreamerImpl<'a, V> &self.current_key } - fn value(&self) -> &V { + fn value(&self) -> &TermInfo { &self.current_value } } diff --git a/src/termdict/fstdict/termdict.rs b/src/termdict/fstdict/termdict.rs index 1d29e0df6..c9054323a 100644 --- a/src/termdict/fstdict/termdict.rs +++ b/src/termdict/fstdict/termdict.rs @@ -3,7 +3,7 @@ use fst; use fst::raw::Fst; use directory::ReadOnlySource; use common::BinarySerializable; -use std::marker::PhantomData; +use bincode; use postings::TermInfo; use termdict::{TermDictionary, TermDictionaryBuilder}; use super::{TermStreamerImpl, TermStreamerBuilderImpl}; @@ -13,16 +13,14 @@ fn convert_fst_error(e: fst::Error) -> io::Error { } /// See [`TermDictionaryBuilder`](./trait.TermDictionaryBuilder.html) -pub struct TermDictionaryBuilderImpl +pub struct TermDictionaryBuilderImpl { fst_builder: fst::MapBuilder, data: Vec, - _phantom_: PhantomData, } -impl TermDictionaryBuilderImpl - where W: Write, - V: BinarySerializable + Default +impl TermDictionaryBuilderImpl + where W: Write { /// # Warning /// Horribly dangerous internal API @@ 
-47,20 +45,18 @@ impl TermDictionaryBuilderImpl } } -impl TermDictionaryBuilder for TermDictionaryBuilderImpl - where W: Write, - V: BinarySerializable + Default +impl TermDictionaryBuilder for TermDictionaryBuilderImpl + where W: Write { - fn new(w: W) -> io::Result { + fn new(w: W, field_option: FieldOption) -> io::Result { let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?; Ok(TermDictionaryBuilderImpl { fst_builder: fst_builder, data: Vec::new(), - _phantom_: PhantomData, }) } - fn insert>(&mut self, key_ref: K, value: &V) -> io::Result<()> { + fn insert>(&mut self, key_ref: K, value: &TermInfo) -> io::Result<()> { let key = key_ref.as_ref(); self.fst_builder .insert(key, self.data.len() as u64) @@ -93,15 +89,13 @@ fn open_fst_index(source: ReadOnlySource) -> io::Result { } /// See [`TermDictionary`](./trait.TermDictionary.html) -pub struct TermDictionaryImpl +pub struct TermDictionaryImpl { fst_index: fst::Map, values_mmap: ReadOnlySource, - _phantom_: PhantomData, } -impl TermDictionaryImpl - where V: BinarySerializable + Default +impl TermDictionaryImpl { /// Deserialize and returns the value at address `offset` pub(crate) fn read_value(&self, offset: u64) -> io::Result { @@ -112,12 +106,11 @@ impl TermDictionaryImpl } -impl<'a, V> TermDictionary<'a, V> for TermDictionaryImpl - where V: BinarySerializable + Default + 'a +impl<'a> TermDictionary<'a> for TermDictionaryImpl { - type Streamer = TermStreamerImpl<'a, V>; + type Streamer = TermStreamerImpl<'a>; - type StreamBuilder = TermStreamerBuilderImpl<'a, V>; + type StreamBuilder = TermStreamerBuilderImpl<'a>; fn from_source(source: ReadOnlySource) -> io::Result { let total_len = source.len(); @@ -131,7 +124,6 @@ impl<'a, V> TermDictionary<'a, V> for TermDictionaryImpl Ok(TermDictionaryImpl { fst_index: fst_index, values_mmap: values_source, - _phantom_: PhantomData, }) } @@ -144,7 +136,7 @@ impl<'a, V> TermDictionary<'a, V> for TermDictionaryImpl }) } - fn range(&self) -> TermStreamerBuilderImpl { + fn range(&self) -> TermStreamerBuilderImpl { TermStreamerBuilderImpl::new(self, self.fst_index.range()) } } diff --git a/src/termdict/merger.rs b/src/termdict/merger.rs index e7b4b392c..4efdfd5d2 100644 --- a/src/termdict/merger.rs +++ b/src/termdict/merger.rs @@ -1,39 +1,34 @@ use std::collections::BinaryHeap; use termdict::TermStreamerImpl; -use common::BinarySerializable; use std::cmp::Ordering; use termdict::TermStreamer; use schema::Term; -pub struct HeapItem<'a, V> - where V: 'a + BinarySerializable + Default +pub struct HeapItem<'a> { - pub streamer: TermStreamerImpl<'a, V>, + pub streamer: TermStreamerImpl<'a>, pub segment_ord: usize, } -impl<'a, V> PartialEq for HeapItem<'a, V> - where V: 'a + BinarySerializable + Default +impl<'a> PartialEq for HeapItem<'a> { fn eq(&self, other: &Self) -> bool { self.segment_ord == other.segment_ord } } -impl<'a, V> Eq for HeapItem<'a, V> where V: 'a + BinarySerializable + Default {} +impl<'a> Eq for HeapItem<'a> {} -impl<'a, V> PartialOrd for HeapItem<'a, V> - where V: 'a + BinarySerializable + Default +impl<'a> PartialOrd for HeapItem<'a> { - fn partial_cmp(&self, other: &HeapItem<'a, V>) -> Option { + fn partial_cmp(&self, other: &HeapItem<'a>) -> Option { Some(self.cmp(other)) } } -impl<'a, V> Ord for HeapItem<'a, V> - where V: 'a + BinarySerializable + Default +impl<'a> Ord for HeapItem<'a> { - fn cmp(&self, other: &HeapItem<'a, V>) -> Ordering { + fn cmp(&self, other: &HeapItem<'a>) -> Ordering { (&other.streamer.key(), &other.segment_ord).cmp(&(&self.streamer.key(), 
&self.segment_ord)) } } @@ -45,17 +40,15 @@ impl<'a, V> Ord for HeapItem<'a, V> /// - the term /// - a slice with the ordinal of the segments containing /// the terms. -pub struct TermMerger<'a, V> - where V: 'a + BinarySerializable + Default +pub struct TermMerger<'a> { - heap: BinaryHeap>, - current_streamers: Vec>, + heap: BinaryHeap>, + current_streamers: Vec>, } -impl<'a, V> TermMerger<'a, V> - where V: 'a + BinarySerializable + Default +impl<'a> TermMerger<'a> { - pub fn new(streams: Vec>) -> TermMerger<'a, V> { + pub fn new(streams: Vec>) -> TermMerger<'a> { TermMerger { heap: BinaryHeap::new(), current_streamers: streams @@ -122,7 +115,7 @@ impl<'a, V> TermMerger<'a, V> /// This method may be called /// iff advance() has been called before /// and "true" was returned. - pub fn current_kvs(&self) -> &[HeapItem<'a, V>] { + pub fn current_kvs(&self) -> &[HeapItem<'a>] { &self.current_streamers[..] } diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index 03a4ac0af..73ed0a1b3 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -1,36 +1,10 @@ /*! The term dictionary is one of the key datastructure of -tantivy. It associates sorted `terms` to their respective -posting list. +tantivy. It associates sorted `terms` to a `TermInfo` struct +that serves as an address in their respective posting list. -The term dictionary makes it possible to iterate through -the keys in a sorted manner. - -# Example - -``` -extern crate tantivy; -use tantivy::termdict::*; -use tantivy::directory::ReadOnlySource; - -# fn main() { -# run().expect("Test failed"); -# } -# fn run() -> tantivy::Result<()> { -let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec!())?; - -// keys have to be insert in order. -term_dictionary_builder.insert("apple", &1u32)?; -term_dictionary_builder.insert("grape", &2u32)?; -term_dictionary_builder.insert("pear", &3u32)?; -let buffer: Vec = term_dictionary_builder.finish()?; - -let source = ReadOnlySource::from(buffer); -let term_dictionary = TermDictionaryImpl::from_source(source)?; - -assert_eq!(term_dictionary.get("grape"), Some(2u32)); -# Ok(()) -# } +The term dictionary API makes it possible to iterate through +a range of keys in a sorted manner. ``` @@ -74,14 +48,12 @@ followed by a streaming through at most `1024` elements in the term `stream`. */ -use schema::{Field, Term}; -use common::BinarySerializable; +use schema::{Field, Term, FieldType}; use directory::ReadOnlySource; - +use postings::TermInfo; pub use self::merger::TermMerger; - #[cfg(not(feature="streamdict"))] mod fstdict; #[cfg(not(feature="streamdict"))] @@ -100,21 +72,19 @@ use std::io; /// Dictionary associating sorted `&[u8]` to values -pub trait TermDictionary<'a, V> - where V: BinarySerializable + Default + 'a, - Self: Sized +pub trait TermDictionary<'a> where Self: Sized { /// Streamer type associated to the term dictionary - type Streamer: TermStreamer + 'a; + type Streamer: TermStreamer + 'a; /// StreamerBuilder type associated to the term dictionary - type StreamBuilder: TermStreamerBuilder + 'a; + type StreamBuilder: TermStreamerBuilder + 'a; /// Opens a `TermDictionary` given a data source. fn from_source(source: ReadOnlySource) -> io::Result; /// Lookups the value corresponding to the key. - fn get>(&self, target_key: K) -> Option; + fn get>(&self, target_key: K) -> Option; /// Returns a range builder, to stream all of the terms /// within an interval. @@ -139,17 +109,16 @@ pub trait TermDictionary<'a, V> /// Builder for the new term dictionary. 
/// /// Inserting must be done in the order of the `keys`. -pub trait TermDictionaryBuilder: Sized - where W: io::Write, - V: BinarySerializable + Default +pub trait TermDictionaryBuilder: Sized + where W: io::Write { /// Creates a new `TermDictionaryBuilder` - fn new(write: W) -> io::Result; + fn new(write: W, field_type: FieldType) -> io::Result; /// Inserts a `(key, value)` pair in the term dictionary. /// /// *Keys have to be inserted in order.* - fn insert>(&mut self, key: K, value: &V) -> io::Result<()>; + fn insert>(&mut self, key: K, value: &TermInfo) -> io::Result<()>; /// Finalize writing the builder, and returns the underlying /// `Write` object. @@ -159,7 +128,7 @@ pub trait TermDictionaryBuilder: Sized /// `TermStreamer` acts as a cursor over a range of terms of a segment. /// Terms are guaranteed to be sorted. -pub trait TermStreamer: Sized { +pub trait TermStreamer: Sized { /// Advance position the stream on the next item. /// Before the first call to `.advance()`, the stream /// is an unitialized state. @@ -186,10 +155,10 @@ pub trait TermStreamer: Sized { /// /// Calling `.value()` before the first call to `.advance()` returns /// `V::default()`. - fn value(&self) -> &V; + fn value(&self) -> &TermInfo; /// Return the next `(key, value)` pair. - fn next(&mut self) -> Option<(Term<&[u8]>, &V)> { + fn next(&mut self) -> Option<(Term<&[u8]>, &TermInfo)> { if self.advance() { Some((Term::wrap(self.key()), self.value())) } else { @@ -201,11 +170,10 @@ pub trait TermStreamer: Sized { /// `TermStreamerBuilder` is an helper object used to define /// a range of terms that should be streamed. -pub trait TermStreamerBuilder - where V: BinarySerializable + Default +pub trait TermStreamerBuilder { /// Associated `TermStreamer` type that this builder is building. 
- type Streamer: TermStreamer; + type Streamer: TermStreamer; /// Limit the range to terms greater or equal to the bound fn ge>(self, bound: T) -> Self; @@ -237,46 +205,56 @@ mod tests { use termdict::TermStreamerBuilder; use termdict::TermDictionary; use termdict::TermDictionaryBuilder; + use schema::{FieldType, TextOptions}; + use postings::TermInfo; + const BLOCK_SIZE: usize = 1_500; + fn make_term_info(val: u32) -> TermInfo { + let mut term_info = TermInfo::default(); + term_info.doc_freq = val; + term_info + } + #[test] fn test_term_dictionary() { let mut directory = RAMDirectory::create(); let path = PathBuf::from("TermDictionary"); { let write = directory.open_write(&path).unwrap(); - let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(write).unwrap(); + let field_type = FieldType::Str(TextOptions::default()); + let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(write, field_type).unwrap(); term_dictionary_builder - .insert("abc".as_bytes(), &34u32) + .insert("abc".as_bytes(), &make_term_info(34u32)) .unwrap(); term_dictionary_builder - .insert("abcd".as_bytes(), &346u32) + .insert("abcd".as_bytes(), &make_term_info(346u32)) .unwrap(); term_dictionary_builder.finish().unwrap(); } let source = directory.open_read(&path).unwrap(); - let term_dict: TermDictionaryImpl = TermDictionaryImpl::from_source(source).unwrap(); - assert_eq!(term_dict.get("abc"), Some(34u32)); - assert_eq!(term_dict.get("abcd"), Some(346u32)); + let term_dict: TermDictionaryImpl = TermDictionaryImpl::from_source(source).unwrap(); + assert_eq!(term_dict.get("abc").unwrap().doc_freq, 34u32); + assert_eq!(term_dict.get("abcd").unwrap().doc_freq, 346u32); let mut stream = term_dict.stream(); { { let (k, v) = stream.next().unwrap(); assert_eq!(k.as_ref(), "abc".as_bytes()); - assert_eq!(v, &34u32); + assert_eq!(v.doc_freq, 34u32); } assert_eq!(stream.key(), "abc".as_bytes()); - assert_eq!(*stream.value(), 34u32); + assert_eq!(stream.value().doc_freq, 34u32); } { { let (k, v) = stream.next().unwrap(); assert_eq!(k.as_slice(), "abcd".as_bytes()); - assert_eq!(v, &346u32); + assert_eq!(v.doc_freq, 346u32); } assert_eq!(stream.key(), "abcd".as_bytes()); - assert_eq!(*stream.value(), 346u32); + assert_eq!(stream.value().doc_freq, 346u32); } assert!(!stream.advance()); } @@ -332,15 +310,16 @@ mod tests { let ids: Vec<_> = (0u32..10_000u32) .map(|i| (format!("doc{:0>6}", i), i)) .collect(); + let field_type = FieldType::Str(TextOptions::default()); let buffer: Vec = { - let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![]).unwrap(); + let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap(); for &(ref id, ref i) in &ids { - term_dictionary_builder.insert(id.as_bytes(), i).unwrap(); + term_dictionary_builder.insert(id.as_bytes(), &make_term_info(*i)).unwrap(); } term_dictionary_builder.finish().unwrap() }; let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source) + let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source) .unwrap(); { let mut streamer = term_dictionary.stream(); @@ -348,7 +327,7 @@ mod tests { while let Some((streamer_k, streamer_v)) = streamer.next() { let &(ref key, ref v) = &ids[i]; assert_eq!(streamer_k.as_ref(), key.as_bytes()); - assert_eq!(streamer_v, v); + assert_eq!(streamer_v.doc_freq, *v); i += 1; } } @@ -362,17 +341,18 @@ mod tests { let ids: Vec<_> = (0u32..50_000u32) .map(|i| (format!("doc{:0>6}", i), i)) .collect(); + let 
field_type = FieldType::Str(TextOptions::default()); let buffer: Vec = { - let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![]).unwrap(); + let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap(); for &(ref id, ref i) in &ids { - term_dictionary_builder.insert(id.as_bytes(), i).unwrap(); + term_dictionary_builder.insert(id.as_bytes(), &make_term_info(*i)).unwrap(); } term_dictionary_builder.finish().unwrap() }; let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source) + let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source) .unwrap(); { for i in (0..20).chain(6000..8_000) { @@ -385,7 +365,7 @@ mod tests { let (streamer_k, streamer_v) = streamer.next().unwrap(); let &(ref key, ref v) = &ids[i + j]; assert_eq!(str::from_utf8(streamer_k.as_ref()).unwrap(), key); - assert_eq!(streamer_v, v); + assert_eq!(streamer_v.doc_freq, *v); } } } @@ -401,7 +381,7 @@ mod tests { let (streamer_k, streamer_v) = streamer.next().unwrap(); let &(ref key, ref v) = &ids[i + j + 1]; assert_eq!(streamer_k.as_ref(), key.as_bytes()); - assert_eq!(streamer_v, v); + assert_eq!(streamer_v.doc_freq, *v); } } } @@ -428,45 +408,46 @@ mod tests { #[test] fn test_stream_range_boundaries() { + let field_type = FieldType::Str(TextOptions::default()); let buffer: Vec = { - let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![]).unwrap(); + let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap(); for i in 0u8..10u8 { let number_arr = [i; 1]; - term_dictionary_builder.insert(&number_arr, &i).unwrap(); + term_dictionary_builder.insert(&number_arr, &make_term_info(i as u32)).unwrap(); } term_dictionary_builder.finish().unwrap() }; let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source) + let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source) .unwrap(); - let value_list = |mut streamer: TermStreamerImpl| { - let mut res: Vec = vec![]; - while let Some((_, &v)) = streamer.next() { - res.push(v); + let value_list = |mut streamer: TermStreamerImpl| { + let mut res: Vec = vec![]; + while let Some((_, ref v)) = streamer.next() { + res.push(v.doc_freq); } res }; { let range = term_dictionary.range().ge([2u8]).into_stream(); assert_eq!(value_list(range), - vec![2u8, 3u8, 4u8, 5u8, 6u8, 7u8, 8u8, 9u8]); + vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]); } { let range = term_dictionary.range().gt([2u8]).into_stream(); - assert_eq!(value_list(range), vec![3u8, 4u8, 5u8, 6u8, 7u8, 8u8, 9u8]); + assert_eq!(value_list(range), vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]); } { let range = term_dictionary.range().lt([6u8]).into_stream(); - assert_eq!(value_list(range), vec![0u8, 1u8, 2u8, 3u8, 4u8, 5u8]); + assert_eq!(value_list(range), vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]); } { let range = term_dictionary.range().le([6u8]).into_stream(); - assert_eq!(value_list(range), vec![0u8, 1u8, 2u8, 3u8, 4u8, 5u8, 6u8]); + assert_eq!(value_list(range), vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]); } { let range = term_dictionary.range().ge([0u8]).lt([5u8]).into_stream(); - assert_eq!(value_list(range), vec![0u8, 1u8, 2u8, 3u8, 4u8]); + assert_eq!(value_list(range), vec![0u32, 1u32, 2u32, 3u32, 4u32]); } } diff --git a/src/termdict/streamdict/mod.rs b/src/termdict/streamdict/mod.rs index 90b719dda..101d8e9fb 100644 --- 
a/src/termdict/streamdict/mod.rs +++ b/src/termdict/streamdict/mod.rs @@ -6,3 +6,29 @@ pub use self::termdict::TermDictionaryImpl; pub use self::termdict::TermDictionaryBuilderImpl; pub use self::streamer::TermStreamerImpl; pub use self::streamer::TermStreamerBuilderImpl; + +use schema::FieldType; + +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy)] +pub(crate) enum TermDeserializerOption { + StrNoPositions, + StrWithPositions, + U64, +} + +fn make_deserializer_options(field_type: &FieldType) -> TermDeserializerOption { + match *field_type { + FieldType::Str(ref text_options) => { + let indexing_options = text_options.get_indexing_options(); + if indexing_options.is_position_enabled() { + TermDeserializerOption::StrWithPositions + } + else { + TermDeserializerOption::StrNoPositions + } + } + _ => { + TermDeserializerOption::U64 + } + } +} \ No newline at end of file diff --git a/src/termdict/streamdict/streamer.rs b/src/termdict/streamdict/streamer.rs index 5de91a343..2f302c8fd 100644 --- a/src/termdict/streamdict/streamer.rs +++ b/src/termdict/streamdict/streamer.rs @@ -1,44 +1,47 @@ #![allow(should_implement_trait)] use std::cmp::max; -use common::BinarySerializable; use super::TermDictionaryImpl; use termdict::{TermStreamerBuilder, TermStreamer}; +use postings::TermInfo; +use super::TermDeserializerOption; -pub(crate) fn stream_before<'a, V>(term_dictionary: &'a TermDictionaryImpl, - target_key: &[u8]) - -> TermStreamerImpl<'a, V> - where V: 'a + BinarySerializable + Default + +pub(crate) fn stream_before<'a>(term_dictionary: &'a TermDictionaryImpl, + target_key: &[u8], + deserializer_option: TermDeserializerOption) + -> TermStreamerImpl<'a> { let (prev_key, offset) = term_dictionary.strictly_previous_key(target_key.as_ref()); let offset: usize = offset as usize; TermStreamerImpl { cursor: &term_dictionary.stream_data()[offset..], current_key: Vec::from(prev_key), - current_value: V::default(), + current_value: TermInfo::default(), + term_deserializer_option: deserializer_option, } } + /// See [`TermStreamerBuilder`](./trait.TermStreamerBuilder.html) -pub struct TermStreamerBuilderImpl<'a, V> - where V: 'a + BinarySerializable + Default +pub struct TermStreamerBuilderImpl<'a> { - term_dictionary: &'a TermDictionaryImpl, + term_dictionary: &'a TermDictionaryImpl, origin: usize, offset_from: usize, offset_to: usize, current_key: Vec, + deserializer_option: TermDeserializerOption, } -impl<'a, V> TermStreamerBuilder for TermStreamerBuilderImpl<'a, V> - where V: 'a + BinarySerializable + Default +impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> { - type Streamer = TermStreamerImpl<'a, V>; + type Streamer = TermStreamerImpl<'a>; /// Limit the range to terms greater or equal to the bound fn ge>(mut self, bound: T) -> Self { let target_key = bound.as_ref(); - let streamer = stream_before(self.term_dictionary, target_key.as_ref()); + let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.deserializer_option); let smaller_than = |k: &[u8]| k.lt(target_key); let (offset_before, current_key) = get_offset(smaller_than, streamer); self.current_key = current_key; @@ -49,7 +52,7 @@ impl<'a, V> TermStreamerBuilder for TermStreamerBuilderImpl<'a, V> /// Limit the range to terms strictly greater than the bound fn gt>(mut self, bound: T) -> Self { let target_key = bound.as_ref(); - let streamer = stream_before(self.term_dictionary, target_key.as_ref()); + let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.deserializer_option); let 
smaller_than = |k: &[u8]| k.le(target_key); let (offset_before, current_key) = get_offset(smaller_than, streamer); self.current_key = current_key; @@ -60,7 +63,7 @@ impl<'a, V> TermStreamerBuilder for TermStreamerBuilderImpl<'a, V> /// Limit the range to terms lesser or equal to the bound fn lt>(mut self, bound: T) -> Self { let target_key = bound.as_ref(); - let streamer = stream_before(self.term_dictionary, target_key.as_ref()); + let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.deserializer_option); let smaller_than = |k: &[u8]| k.lt(target_key); let (offset_before, _) = get_offset(smaller_than, streamer); self.offset_to = offset_before - self.origin; @@ -70,7 +73,7 @@ impl<'a, V> TermStreamerBuilder for TermStreamerBuilderImpl<'a, V> /// Limit the range to terms lesser or equal to the bound fn le>(mut self, bound: T) -> Self { let target_key = bound.as_ref(); - let streamer = stream_before(self.term_dictionary, target_key.as_ref()); + let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.deserializer_option); let smaller_than = |k: &[u8]| k.le(target_key); let (offset_before, _) = get_offset(smaller_than, streamer); self.offset_to = offset_before - self.origin; @@ -82,10 +85,12 @@ impl<'a, V> TermStreamerBuilder for TermStreamerBuilderImpl<'a, V> let data: &[u8] = self.term_dictionary.stream_data(); let start = self.offset_from; let stop = max(self.offset_to, start); + TermStreamerImpl { cursor: &data[start..stop], current_key: self.current_key, - current_value: V::default(), + current_value: TermInfo::default(), + term_deserializer_option: self.deserializer_option, } } } @@ -94,10 +99,9 @@ impl<'a, V> TermStreamerBuilder for TermStreamerBuilderImpl<'a, V> /// key in the stream matching a given predicate. 
/// /// returns (start offset, the data required to load the value) -fn get_offset<'a, V, P: Fn(&[u8]) -> bool>(predicate: P, - mut streamer: TermStreamerImpl) +fn get_offset<'a, P: Fn(&[u8]) -> bool>(predicate: P, + mut streamer: TermStreamerImpl<'a>) -> (usize, Vec) - where V: 'a + BinarySerializable + Default { let mut prev: &[u8] = streamer.cursor; @@ -114,10 +118,11 @@ fn get_offset<'a, V, P: Fn(&[u8]) -> bool>(predicate: P, (prev.as_ptr() as usize, prev_data) } -impl<'a, V> TermStreamerBuilderImpl<'a, V> - where V: 'a + BinarySerializable + Default +impl<'a> TermStreamerBuilderImpl<'a> { - pub(crate) fn new(term_dictionary: &'a TermDictionaryImpl) -> Self { + pub(crate) fn new( + term_dictionary: &'a TermDictionaryImpl, + deserializer_option: TermDeserializerOption) -> Self { let data = term_dictionary.stream_data(); let origin = data.as_ptr() as usize; TermStreamerBuilderImpl { @@ -126,26 +131,37 @@ impl<'a, V> TermStreamerBuilderImpl<'a, V> offset_from: 0, offset_to: data.len(), current_key: Vec::with_capacity(300), + deserializer_option: deserializer_option, } } } + + /// See [`TermStreamer`](./trait.TermStreamer.html) -pub struct TermStreamerImpl<'a, V> - where V: 'a + BinarySerializable + Default +pub struct TermStreamerImpl<'a> { cursor: &'a [u8], current_key: Vec, - current_value: V, + current_value: TermInfo, + term_deserializer_option: TermDeserializerOption, } - -impl<'a, V: BinarySerializable> TermStreamerImpl<'a, V> - where V: 'a + BinarySerializable + Default +impl<'a> TermStreamerImpl<'a> { - pub(crate) fn extract_value(self) -> V { + pub(crate) fn extract_value(self) -> TermInfo { self.current_value } + + fn deserialize_value(&mut self) { + self.current_value.doc_freq = deserialize_vint(&mut self.cursor) as u32; + self.current_value.postings_offset = deserialize_vint(&mut self.cursor) as u32; + if self.term_deserializer_option == TermDeserializerOption::StrWithPositions { + self.current_value.positions_offset = deserialize_vint(&mut self.cursor) as u32; + self.current_value.positions_inner_offset = self.cursor[0]; + self.cursor = &self.cursor[1..]; + } + } } fn deserialize_vint(data: &mut &[u8]) -> u64 { @@ -163,8 +179,7 @@ fn deserialize_vint(data: &mut &[u8]) -> u64 { res } -impl<'a, V> TermStreamer for TermStreamerImpl<'a, V> - where V: BinarySerializable + Default +impl<'a> TermStreamer for TermStreamerImpl<'a> { fn advance(&mut self) -> bool { if self.cursor.is_empty() { @@ -176,9 +191,7 @@ impl<'a, V> TermStreamer for TermStreamerImpl<'a, V> self.current_key.extend(&self.cursor[..added_length]); self.cursor = &self.cursor[added_length..]; - self.current_value = - V::deserialize(&mut self.cursor) - .expect("Term dictionary corrupted. 
Failed to deserialize a value"); + self.deserialize_value(); true } @@ -186,7 +199,8 @@ impl<'a, V> TermStreamer for TermStreamerImpl<'a, V> &self.current_key } - fn value(&self) -> &V { + fn value(&self) -> &TermInfo { &self.current_value } } + diff --git a/src/termdict/streamdict/termdict.rs b/src/termdict/streamdict/termdict.rs index 5759ce1e2..3a3b6101b 100644 --- a/src/termdict/streamdict/termdict.rs +++ b/src/termdict/streamdict/termdict.rs @@ -6,11 +6,14 @@ use fst::raw::Fst; use common::VInt; use directory::ReadOnlySource; use common::BinarySerializable; -use std::marker::PhantomData; use common::CountingWriter; +use bincode; use std::cmp::Ordering; use postings::TermInfo; +use schema::FieldType; use fst::raw::Node; +use super::make_deserializer_options; +use super::TermDeserializerOption; use super::streamer::stream_before; use termdict::{TermDictionary, TermDictionaryBuilder, TermStreamer}; use super::{TermStreamerImpl, TermStreamerBuilderImpl}; @@ -22,15 +25,13 @@ fn convert_fst_error(e: fst::Error) -> io::Error { } /// See [`TermDictionaryBuilder`](./trait.TermDictionaryBuilder.html) -pub struct TermDictionaryBuilderImpl - where W: Write, - V: BinarySerializable + Default +pub struct TermDictionaryBuilderImpl { write: CountingWriter, block_index: fst::MapBuilder>, last_key: Vec, len: usize, - _phantom_: PhantomData, + deserializer_options: TermDeserializerOption, } fn common_prefix_length(left: &[u8], right: &[u8]) -> usize { @@ -48,9 +49,8 @@ fn fill_last<'a>(fst: &'a Fst, mut node: Node<'a>, buffer: &mut Vec) { } } -impl TermDictionaryBuilderImpl - where W: Write, - V: BinarySerializable + Default +impl TermDictionaryBuilderImpl + where W: Write { fn add_index_entry(&mut self) { self.block_index @@ -80,32 +80,46 @@ impl TermDictionaryBuilderImpl Ok(()) } - pub(crate) fn insert_value(&mut self, value: &V) -> io::Result<()> { - value.serialize(&mut self.write)?; + pub(crate) fn insert_value(&mut self, value: &TermInfo) -> io::Result<()> { + + VInt(value.doc_freq as u64).serialize(&mut self.write)?; + VInt(value.postings_offset as u64).serialize(&mut self.write)?; + + if self.deserializer_options == TermDeserializerOption::StrWithPositions { + VInt(value.positions_offset as u64).serialize(&mut self.write)?; + self.write.write_all(&[value.positions_inner_offset])?; + } + Ok(()) } } -impl TermDictionaryBuilder for TermDictionaryBuilderImpl - where W: Write, - V: BinarySerializable + Default +impl TermDictionaryBuilder for TermDictionaryBuilderImpl + where W: Write { /// Creates a new `TermDictionaryBuilder` - fn new(write: W) -> io::Result { - let buffer: Vec = vec![]; + fn new(mut write: W, field_type: FieldType) -> io::Result { + let deserializer_options = make_deserializer_options(&field_type); + { + // serialize the field type. + let data: Vec = bincode::serialize(&deserializer_options, bincode::Bounded(256u64)) + .expect("Failed to serialize field type within 256 bytes. This should never be a problem."); + write.write_all(&[data.len() as u8])?; + write.write_all(&data[..])?; + } Ok(TermDictionaryBuilderImpl { write: CountingWriter::wrap(write), - block_index: fst::MapBuilder::new(buffer).expect("This cannot fail"), + block_index: fst::MapBuilder::new(vec![]).expect("This cannot fail"), last_key: Vec::with_capacity(128), len: 0, - _phantom_: PhantomData, + deserializer_options: deserializer_options, }) } /// Inserts a `(key, value)` pair in the term dictionary. 
/// /// *Keys have to be inserted in order.* - fn insert>(&mut self, key_ref: K, value: &V) -> io::Result<()> { + fn insert>(&mut self, key_ref: K, value: &TermInfo) -> io::Result<()> { let key = key_ref.as_ref(); self.insert_key(key)?; self.insert_value(value) @@ -138,16 +152,14 @@ fn open_fst_index(source: ReadOnlySource) -> io::Result { } /// See [`TermDictionary`](./trait.TermDictionary.html) -pub struct TermDictionaryImpl - where V: BinarySerializable + Default +pub struct TermDictionaryImpl { stream_data: ReadOnlySource, fst_index: fst::Map, - _phantom_: PhantomData, + deserializer_option: TermDeserializerOption, } -impl TermDictionaryImpl - where V: BinarySerializable + Default +impl TermDictionaryImpl { pub(crate) fn stream_data(&self) -> &[u8] { self.stream_data.as_slice() @@ -200,15 +212,23 @@ impl TermDictionaryImpl } -impl<'a, V> TermDictionary<'a, V> for TermDictionaryImpl - where V: BinarySerializable + Default + 'a -{ - type Streamer = TermStreamerImpl<'a, V>; - type StreamBuilder = TermStreamerBuilderImpl<'a, V>; +impl<'a> TermDictionary<'a> for TermDictionaryImpl +{ + type Streamer = TermStreamerImpl<'a>; + + type StreamBuilder = TermStreamerBuilderImpl<'a>; /// Opens a `TermDictionary` given a data source. - fn from_source(source: ReadOnlySource) -> io::Result { + fn from_source(mut source: ReadOnlySource) -> io::Result { + // it won't take more than 100 bytes + let deserialize_option_len = source.slice(0, 1).as_slice()[0] as usize; + let deserialize_option_source = source.slice(1, 1 + deserialize_option_len); + let deserialize_option_buffer: &[u8] = deserialize_option_source.as_slice(); + let deserializer_option: TermDeserializerOption = bincode::deserialize(deserialize_option_buffer) + .expect("Field dictionary data is corrupted. Failed to deserialize field type."); + source = source.slice_from(1 + deserialize_option_len); + let total_len = source.len(); let length_offset = total_len - 8; let split_len: usize = { @@ -220,15 +240,15 @@ impl<'a, V> TermDictionary<'a, V> for TermDictionaryImpl let fst_index = open_fst_index(fst_data)?; Ok(TermDictionaryImpl { - stream_data: stream_data, - fst_index: fst_index, - _phantom_: PhantomData, - }) + stream_data: stream_data, + fst_index: fst_index, + deserializer_option: deserializer_option, + }) } /// Lookups the value corresponding to the key. - fn get>(&self, target_key: K) -> Option { - let mut streamer = stream_before(self, target_key.as_ref()); + fn get>(&self, target_key: K) -> Option { + let mut streamer = stream_before(self, target_key.as_ref(), self.deserializer_option); while streamer.advance() { let position = streamer.key().cmp(target_key.as_ref()); match position { @@ -245,6 +265,6 @@ impl<'a, V> TermDictionary<'a, V> for TermDictionaryImpl /// Returns a range builder, to stream all of the terms /// within an interval. fn range(&'a self) -> Self::StreamBuilder { - Self::StreamBuilder::new(self) + Self::StreamBuilder::new(self, self.deserializer_option) } } From 303fc7e8200873040578d80bcb3cd7450904705a Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 17 Aug 2017 12:08:39 +0900 Subject: [PATCH 19/29] Better unit test for termdict. 
Checking the TermInfo --- src/termdict/mod.rs | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index 73ed0a1b3..1d1912f34 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -198,23 +198,25 @@ mod tests { use super::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl}; use directory::{RAMDirectory, Directory, ReadOnlySource}; use std::path::PathBuf; - use schema::{Term, SchemaBuilder, Document, TEXT}; + use schema::{FieldType, Term, SchemaBuilder, Document, TEXT}; use core::Index; use std::str; use termdict::TermStreamer; use termdict::TermStreamerBuilder; use termdict::TermDictionary; use termdict::TermDictionaryBuilder; - use schema::{FieldType, TextOptions}; use postings::TermInfo; const BLOCK_SIZE: usize = 1_500; fn make_term_info(val: u32) -> TermInfo { - let mut term_info = TermInfo::default(); - term_info.doc_freq = val; - term_info + TermInfo { + doc_freq: val, + positions_offset: val * 2u32, + postings_offset: val * 3u32, + positions_inner_offset: 5u8, + } } #[test] @@ -223,7 +225,7 @@ mod tests { let path = PathBuf::from("TermDictionary"); { let write = directory.open_write(&path).unwrap(); - let field_type = FieldType::Str(TextOptions::default()); + let field_type = FieldType::Str(TEXT); let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(write, field_type).unwrap(); term_dictionary_builder .insert("abc".as_bytes(), &make_term_info(34u32)) @@ -310,7 +312,7 @@ mod tests { let ids: Vec<_> = (0u32..10_000u32) .map(|i| (format!("doc{:0>6}", i), i)) .collect(); - let field_type = FieldType::Str(TextOptions::default()); + let field_type = FieldType::Str(TEXT); let buffer: Vec = { let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap(); for &(ref id, ref i) in &ids { @@ -327,7 +329,7 @@ mod tests { while let Some((streamer_k, streamer_v)) = streamer.next() { let &(ref key, ref v) = &ids[i]; assert_eq!(streamer_k.as_ref(), key.as_bytes()); - assert_eq!(streamer_v.doc_freq, *v); + assert_eq!(streamer_v, &make_term_info(*v)); i += 1; } } @@ -341,7 +343,7 @@ mod tests { let ids: Vec<_> = (0u32..50_000u32) .map(|i| (format!("doc{:0>6}", i), i)) .collect(); - let field_type = FieldType::Str(TextOptions::default()); + let field_type = FieldType::Str(TEXT); let buffer: Vec = { let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap(); for &(ref id, ref i) in &ids { @@ -366,6 +368,7 @@ mod tests { let &(ref key, ref v) = &ids[i + j]; assert_eq!(str::from_utf8(streamer_k.as_ref()).unwrap(), key); assert_eq!(streamer_v.doc_freq, *v); + assert_eq!(streamer_v, &make_term_info(*v)); } } } @@ -408,7 +411,7 @@ mod tests { #[test] fn test_stream_range_boundaries() { - let field_type = FieldType::Str(TextOptions::default()); + let field_type = FieldType::Str(TEXT); let buffer: Vec = { let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap(); for i in 0u8..10u8 { From d0d5db4515dc820f8daee1e043b5231c3b3ac7fb Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 19 Aug 2017 12:03:04 +0900 Subject: [PATCH 20/29] Streamdict using SIMD instruction. 
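[Editorial sketch, not part of the patch] This patch replaces the per-term VInt prefix coding with block-wise encoding: within a block, each term is stored as a "pop" length (bytes to drop from the end of the previous term), a "push" length, and the raw suffix bytes, and the two length arrays are compressed with the existing block encoder. A minimal, self-contained sketch of that pop/push scheme follows; `PrefixBlock`, `encode_block`, and `decode_block` are hypothetical names, and unlike the real `TermBlockEncoder`/`TermBlockDecoder` below, this sketch keeps the length arrays as plain vectors instead of padding each block to `NUM_DOCS_PER_BLOCK` entries and bit-packing them.

```rust
// Illustrative sketch of pop/push prefix compression over a sorted term block.
fn common_prefix_len(left: &[u8], right: &[u8]) -> usize {
    left.iter()
        .cloned()
        .zip(right.iter().cloned())
        .take_while(|&(b1, b2)| b1 == b2)
        .count()
}

struct PrefixBlock {
    pop_lens: Vec<u32>,  // bytes to drop from the end of the previous term
    push_lens: Vec<u32>, // bytes to append, taken from `suffixes`
    suffixes: Vec<u8>,   // concatenated suffix bytes of the whole block
}

fn encode_block(sorted_terms: &[&[u8]]) -> PrefixBlock {
    let mut block = PrefixBlock {
        pop_lens: Vec::new(),
        push_lens: Vec::new(),
        suffixes: Vec::new(),
    };
    let mut previous: Vec<u8> = Vec::new();
    for term in sorted_terms {
        let prefix = common_prefix_len(&previous, term);
        block.pop_lens.push((previous.len() - prefix) as u32);
        block.push_lens.push((term.len() - prefix) as u32);
        block.suffixes.extend_from_slice(&term[prefix..]);
        previous.clear();
        previous.extend_from_slice(term);
    }
    block
}

fn decode_block(block: &PrefixBlock) -> Vec<Vec<u8>> {
    let mut terms = Vec::new();
    let mut current: Vec<u8> = Vec::new();
    let mut suffixes: &[u8] = &block.suffixes;
    for (&pop, &push) in block.pop_lens.iter().zip(block.push_lens.iter()) {
        let keep = current.len() - pop as usize;
        current.truncate(keep);
        current.extend_from_slice(&suffixes[..push as usize]);
        suffixes = &suffixes[push as usize..];
        terms.push(current.clone());
    }
    terms
}

fn main() {
    let terms: Vec<&[u8]> = vec![&b"search"[..], &b"searcher"[..], &b"seat"[..]];
    let block = encode_block(&terms);
    let decoded = decode_block(&block);
    let expected: Vec<Vec<u8>> = terms.iter().map(|t| t.to_vec()).collect();
    assert_eq!(decoded, expected);
}
```

Keeping pop/push lengths in fixed-size arrays, as the patch below does, is what makes the length arrays amenable to the SIMD bit-packing used for postings blocks.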
--- src/lib.rs | 3 +- src/termdict/streamdict/mod.rs | 15 ++ src/termdict/streamdict/streamer.rs | 132 +++++++------- src/termdict/streamdict/term_block_encoder.rs | 164 ++++++++++++++++++ src/termdict/streamdict/termdict.rs | 69 ++++---- .../streamdict/terminfo_block_encoder.rs | 117 +++++++++++++ 6 files changed, 404 insertions(+), 96 deletions(-) create mode 100644 src/termdict/streamdict/term_block_encoder.rs create mode 100644 src/termdict/streamdict/terminfo_block_encoder.rs diff --git a/src/lib.rs b/src/lib.rs index c926b67b3..5f4cfa3ee 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -262,7 +262,7 @@ mod tests { } #[test] - fn test_docfreq() { + fn test_docfreq1() { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); let index = Index::create_in_ram(schema_builder.build()); @@ -301,7 +301,6 @@ mod tests { } } - #[test] fn test_fieldnorm() { let mut schema_builder = SchemaBuilder::default(); diff --git a/src/termdict/streamdict/mod.rs b/src/termdict/streamdict/mod.rs index 101d8e9fb..4a4db7690 100644 --- a/src/termdict/streamdict/mod.rs +++ b/src/termdict/streamdict/mod.rs @@ -1,11 +1,15 @@ mod termdict; mod streamer; +mod term_block_encoder; +mod terminfo_block_encoder; pub use self::termdict::TermDictionaryImpl; pub use self::termdict::TermDictionaryBuilderImpl; pub use self::streamer::TermStreamerImpl; pub use self::streamer::TermStreamerBuilderImpl; +use self::term_block_encoder::{TermBlockEncoder, TermBlockDecoder}; +use self::terminfo_block_encoder::{TermInfoBlockEncoder, TermInfoBlockDecoder}; use schema::FieldType; @@ -16,6 +20,17 @@ pub(crate) enum TermDeserializerOption { U64, } +impl TermDeserializerOption { + + pub fn has_positions(&self) -> bool { + match *self { + TermDeserializerOption::StrWithPositions => true, + _ => false + } + } + +} + fn make_deserializer_options(field_type: &FieldType) -> TermDeserializerOption { match *field_type { FieldType::Str(ref text_options) => { diff --git a/src/termdict/streamdict/streamer.rs b/src/termdict/streamdict/streamer.rs index 2f302c8fd..419468308 100644 --- a/src/termdict/streamdict/streamer.rs +++ b/src/termdict/streamdict/streamer.rs @@ -3,6 +3,7 @@ use std::cmp::max; use super::TermDictionaryImpl; use termdict::{TermStreamerBuilder, TermStreamer}; +use super::{TermBlockDecoder, TermInfoBlockDecoder}; use postings::TermInfo; use super::TermDeserializerOption; @@ -15,10 +16,10 @@ pub(crate) fn stream_before<'a>(term_dictionary: &'a TermDictionaryImpl, let (prev_key, offset) = term_dictionary.strictly_previous_key(target_key.as_ref()); let offset: usize = offset as usize; TermStreamerImpl { + remaining_in_block: 0, + term_block_decoder: TermBlockDecoder::given_previous_term(&prev_key[..]), + terminfo_block_decoder: TermInfoBlockDecoder::new(deserializer_option.has_positions()), cursor: &term_dictionary.stream_data()[offset..], - current_key: Vec::from(prev_key), - current_value: TermInfo::default(), - term_deserializer_option: deserializer_option, } } @@ -27,7 +28,9 @@ pub(crate) fn stream_before<'a>(term_dictionary: &'a TermDictionaryImpl, pub struct TermStreamerBuilderImpl<'a> { term_dictionary: &'a TermDictionaryImpl, + block_start: &'a [u8], origin: usize, + cursor: usize, offset_from: usize, offset_to: usize, current_key: Vec, @@ -40,44 +43,60 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> /// Limit the range to terms greater or equal to the bound fn ge>(mut self, bound: T) -> Self { + unimplemented!(); + /* let target_key = bound.as_ref(); let 
streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.deserializer_option); let smaller_than = |k: &[u8]| k.lt(target_key); - let (offset_before, current_key) = get_offset(smaller_than, streamer); + let (block_start, cursor, current_key) = get_offset(smaller_than, streamer); + self.block_start = block_start; self.current_key = current_key; - self.offset_from = offset_before - self.origin; + self.cursor = cursor; + //self.offset_from = ; + */ self } /// Limit the range to terms strictly greater than the bound fn gt>(mut self, bound: T) -> Self { + unimplemented!(); + /* let target_key = bound.as_ref(); let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.deserializer_option); let smaller_than = |k: &[u8]| k.le(target_key); - let (offset_before, current_key) = get_offset(smaller_than, streamer); + let (block_start, cursor, current_key) = get_offset(smaller_than, streamer); + self.block_start = block_start; self.current_key = current_key; - self.offset_from = offset_before - self.origin; + self.cursor = cursor; + //self.offset_from = offset_before - self.origin; + */ self } /// Limit the range to terms lesser or equal to the bound fn lt>(mut self, bound: T) -> Self { + unimplemented!(); + /* let target_key = bound.as_ref(); let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.deserializer_option); let smaller_than = |k: &[u8]| k.lt(target_key); let (offset_before, _) = get_offset(smaller_than, streamer); self.offset_to = offset_before - self.origin; self + */ } /// Limit the range to terms lesser or equal to the bound fn le>(mut self, bound: T) -> Self { + unimplemented!(); + /* let target_key = bound.as_ref(); let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.deserializer_option); let smaller_than = |k: &[u8]| k.le(target_key); let (offset_before, _) = get_offset(smaller_than, streamer); self.offset_to = offset_before - self.origin; self + */ } /// Build the streamer. @@ -85,12 +104,12 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> let data: &[u8] = self.term_dictionary.stream_data(); let start = self.offset_from; let stop = max(self.offset_to, start); - + println!("current_key {:?}", self.current_key); TermStreamerImpl { + remaining_in_block: 0, cursor: &data[start..stop], - current_key: self.current_key, - current_value: TermInfo::default(), - term_deserializer_option: self.deserializer_option, + term_block_decoder: TermBlockDecoder::given_previous_term(&self.current_key), + terminfo_block_decoder: TermInfoBlockDecoder::new(self.deserializer_option.has_positions()), } } } @@ -98,24 +117,30 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> /// Returns offset information for the first /// key in the stream matching a given predicate. 
/// -/// returns (start offset, the data required to load the value) +/// returns +/// - the block start +/// - the index within this block +/// - the term_buffer state to initialize the block) fn get_offset<'a, P: Fn(&[u8]) -> bool>(predicate: P, mut streamer: TermStreamerImpl<'a>) - -> (usize, Vec) -{ - let mut prev: &[u8] = streamer.cursor; + -> (&'a [u8], usize, Vec) +{//&'a [u8] + let mut block_start: &[u8] = streamer.cursor; + let mut cursor = 0; + let mut term_buffer: Vec = vec!(); - let mut prev_data: Vec = streamer.current_key.clone(); - - while let Some((iter_key, _)) = streamer.next() { + while streamer.advance() { + let iter_key = streamer.key(); if !predicate(iter_key.as_ref()) { - return (prev.as_ptr() as usize, prev_data); + return (block_start, streamer.term_block_decoder.cursor() - 1, term_buffer); + } + if streamer.remaining_in_block == 0 { + block_start = streamer.cursor; + term_buffer.clear(); + term_buffer.extend_from_slice(iter_key.as_ref()); } - prev = streamer.cursor; - prev_data.clear(); - prev_data.extend_from_slice(iter_key.as_ref()); } - (prev.as_ptr() as usize, prev_data) + (block_start, streamer.term_block_decoder.cursor() - 1, term_buffer) } impl<'a> TermStreamerBuilderImpl<'a> @@ -127,6 +152,8 @@ impl<'a> TermStreamerBuilderImpl<'a> let origin = data.as_ptr() as usize; TermStreamerBuilderImpl { term_dictionary: term_dictionary, + block_start: term_dictionary.stream_data().as_ref(), + cursor: 0, origin: origin, offset_from: 0, offset_to: data.len(), @@ -141,66 +168,49 @@ impl<'a> TermStreamerBuilderImpl<'a> /// See [`TermStreamer`](./trait.TermStreamer.html) pub struct TermStreamerImpl<'a> { + remaining_in_block: usize, + term_block_decoder: TermBlockDecoder<'a>, + terminfo_block_decoder: TermInfoBlockDecoder<'a>, cursor: &'a [u8], - current_key: Vec, - current_value: TermInfo, - term_deserializer_option: TermDeserializerOption, } impl<'a> TermStreamerImpl<'a> { - pub(crate) fn extract_value(self) -> TermInfo { - self.current_value - } - - fn deserialize_value(&mut self) { - self.current_value.doc_freq = deserialize_vint(&mut self.cursor) as u32; - self.current_value.postings_offset = deserialize_vint(&mut self.cursor) as u32; - if self.term_deserializer_option == TermDeserializerOption::StrWithPositions { - self.current_value.positions_offset = deserialize_vint(&mut self.cursor) as u32; - self.current_value.positions_inner_offset = self.cursor[0]; + fn load_block(&mut self) -> bool { + self.remaining_in_block = self.cursor[0] as usize; + if self.remaining_in_block == 0 { + false + } + else { self.cursor = &self.cursor[1..]; + self.cursor = self.term_block_decoder.decode_block(self.cursor); + self.cursor = self.terminfo_block_decoder.decode_block(self.cursor, self.remaining_in_block); + true } } } -fn deserialize_vint(data: &mut &[u8]) -> u64 { - let mut res = 0; - let mut shift = 0; - for i in 0.. 
{ - let b = data[i]; - res |= ((b % 128u8) as u64) << shift; - if b & 128u8 != 0u8 { - *data = &data[(i + 1)..]; - break; - } - shift += 7; - } - res -} impl<'a> TermStreamer for TermStreamerImpl<'a> { fn advance(&mut self) -> bool { - if self.cursor.is_empty() { - return false; + if self.remaining_in_block == 0 { + if !self.load_block() { + return false; + } } - let common_length: usize = deserialize_vint(&mut self.cursor) as usize; - self.current_key.truncate(common_length); - let added_length: usize = deserialize_vint(&mut self.cursor) as usize; - self.current_key.extend(&self.cursor[..added_length]); - - self.cursor = &self.cursor[added_length..]; - self.deserialize_value(); + self.remaining_in_block -= 1; + self.term_block_decoder.advance(); + self.terminfo_block_decoder.advance(); true } fn key(&self) -> &[u8] { - &self.current_key + self.term_block_decoder.term() } fn value(&self) -> &TermInfo { - &self.current_value + self.terminfo_block_decoder.term_info() } } diff --git a/src/termdict/streamdict/term_block_encoder.rs b/src/termdict/streamdict/term_block_encoder.rs new file mode 100644 index 000000000..157a3cf28 --- /dev/null +++ b/src/termdict/streamdict/term_block_encoder.rs @@ -0,0 +1,164 @@ +use compression::{BlockEncoder, BlockDecoder, NUM_DOCS_PER_BLOCK}; +use std::io::{self, Write}; + +fn compute_common_prefix_length(left: &[u8], right: &[u8]) -> usize { + left.iter() + .cloned() + .zip(right.iter().cloned()) + .take_while(|&(b1, b2)| b1 == b2) + .count() +} + + +pub struct TermBlockEncoder { + block_encoder: BlockEncoder, + + pop_lens: [u32; NUM_DOCS_PER_BLOCK], + push_lens: [u32; NUM_DOCS_PER_BLOCK], + suffixes: Vec, + + previous_key: Vec, + count: usize, +} + +impl TermBlockEncoder { + pub fn new() -> TermBlockEncoder { + TermBlockEncoder { + block_encoder: BlockEncoder::new(), + pop_lens: [0u32; NUM_DOCS_PER_BLOCK], + push_lens: [0u32; NUM_DOCS_PER_BLOCK], + suffixes: Vec::with_capacity(NUM_DOCS_PER_BLOCK*5), + + previous_key: Vec::with_capacity(30), + + count: 0, + } + } + + pub fn encode(&mut self, key: &[u8]) { + let common_prefix_len = compute_common_prefix_length(&self.previous_key, key); + self.pop_lens[self.count] = (self.previous_key.len() - common_prefix_len) as u32; + self.push_lens[self.count] = (key.len() - common_prefix_len) as u32; + self.previous_key.clear(); + let suffix = &key[common_prefix_len..]; + self.suffixes.extend_from_slice(suffix); + self.previous_key.extend_from_slice(key); + self.count += 1; + } + + pub fn len(&self) -> usize { + self.count + } + + pub fn flush(&mut self, output: &mut W) -> io::Result<()> { + for i in self.count..NUM_DOCS_PER_BLOCK { + self.pop_lens[i] = 0u32; + self.push_lens[i] = 0u32; + } + output.write_all(self.block_encoder.compress_block_unsorted(&self.pop_lens))?; + output.write_all(self.block_encoder.compress_block_unsorted(&self.push_lens))?; + output.write_all(&self.suffixes[..])?; + self.suffixes.clear(); + self.count = 0; + Ok(()) + } +} + + + +pub struct TermBlockDecoder<'a> { + pop_lens_decoder: BlockDecoder, + push_lens_decoder: BlockDecoder, + suffixes: &'a [u8], + current_key: Vec, + cursor: usize, +} + + +impl<'a> TermBlockDecoder<'a> { + pub fn new() -> TermBlockDecoder<'a> { + TermBlockDecoder::given_previous_term(&[]) + } + + pub fn cursor(&self) -> usize { + self.cursor + } + + pub fn given_previous_term(previous_term: &[u8]) -> TermBlockDecoder<'a> { + let mut current_key = Vec::with_capacity(30); + current_key.extend_from_slice(previous_term); + TermBlockDecoder { + pop_lens_decoder: 
BlockDecoder::new(), + push_lens_decoder: BlockDecoder::new(), + current_key: current_key, + suffixes: &[], + cursor: 0, + } + } + + pub fn term(&self) -> &[u8] { + &self.current_key + } + + pub fn decode_block(&mut self, mut compressed_data: &'a [u8]) -> &'a [u8] { + { + let consumed_data_len = self.pop_lens_decoder.uncompress_block_unsorted(compressed_data); + compressed_data = &compressed_data[consumed_data_len..]; + } + { + let consumed_data_len = self.push_lens_decoder.uncompress_block_unsorted(compressed_data); + compressed_data = &compressed_data[consumed_data_len..]; + } + let suffix_len: u32 = self.push_lens_decoder.output_array()[0..].iter().cloned().sum(); + let suffix_len: usize = suffix_len as usize; + self.suffixes = &compressed_data[..suffix_len]; + self.cursor = 0; + &compressed_data[suffix_len..] + } + + pub fn advance(&mut self) { + assert!(self.cursor < NUM_DOCS_PER_BLOCK); + let pop_len = self.pop_lens_decoder.output(self.cursor) as usize; + let push_len = self.push_lens_decoder.output(self.cursor) as usize; + let previous_len = self.current_key.len(); + self.current_key.truncate(previous_len - pop_len); + self.current_key.extend_from_slice(&self.suffixes[..push_len]); + self.suffixes = &self.suffixes[push_len..]; + self.cursor += 1; + } +} + + + +#[cfg(test)] +mod tests { + use super::{TermBlockEncoder, TermBlockDecoder}; + + #[test] + fn test_encoding_terms() { + let mut buffer: Vec = vec!(); + let mut terms = vec!(); + { + let mut term_block_encoder = TermBlockEncoder::new(); + for i in 0..128 { + terms.push(format!("term{}", i * 7231)); + } + for term in &terms { + term_block_encoder.encode(term.as_bytes()); + } + term_block_encoder.flush(&mut buffer).unwrap(); + } + assert_eq!(buffer.len(), 711); + + let mut block_decoder = TermBlockDecoder::new(); + assert_eq!(block_decoder.decode_block(&buffer[..]).len(), 0); + for i in 0..128 { + block_decoder.advance(); + assert_eq!(block_decoder.term(), terms[i].as_bytes()); + } + + } +} + + + diff --git a/src/termdict/streamdict/termdict.rs b/src/termdict/streamdict/termdict.rs index 3a3b6101b..5c15652cb 100644 --- a/src/termdict/streamdict/termdict.rs +++ b/src/termdict/streamdict/termdict.rs @@ -3,7 +3,6 @@ use std::io::{self, Write}; use fst; use fst::raw::Fst; -use common::VInt; use directory::ReadOnlySource; use common::BinarySerializable; use common::CountingWriter; @@ -12,13 +11,15 @@ use std::cmp::Ordering; use postings::TermInfo; use schema::FieldType; use fst::raw::Node; +use compression::NUM_DOCS_PER_BLOCK; use super::make_deserializer_options; use super::TermDeserializerOption; use super::streamer::stream_before; use termdict::{TermDictionary, TermDictionaryBuilder, TermStreamer}; +use super::{TermBlockEncoder, TermInfoBlockEncoder}; use super::{TermStreamerImpl, TermStreamerBuilderImpl}; -const BLOCK_SIZE: usize = 1024; +const INDEX_INTERVAL: usize = 1024; fn convert_fst_error(e: fst::Error) -> io::Error { io::Error::new(io::ErrorKind::Other, e) @@ -28,19 +29,16 @@ fn convert_fst_error(e: fst::Error) -> io::Error { pub struct TermDictionaryBuilderImpl { write: CountingWriter, + + term_block_encoder: TermBlockEncoder, + terminfo_block_encoder: TermInfoBlockEncoder, + block_index: fst::MapBuilder>, last_key: Vec, + len: usize, - deserializer_options: TermDeserializerOption, } -fn common_prefix_length(left: &[u8], right: &[u8]) -> usize { - left.iter() - .cloned() - .zip(right.iter().cloned()) - .take_while(|&(b1, b2)| b1 == b2) - .count() -} fn fill_last<'a>(fst: &'a Fst, mut node: Node<'a>, buffer: &mut Vec) { 
while let Some(transition) = node.transitions().last() { @@ -66,30 +64,31 @@ impl TermDictionaryBuilderImpl /// /// Prefer using `.insert(key, value)` pub(crate) fn insert_key(&mut self, key: &[u8]) -> io::Result<()> { - if self.len % BLOCK_SIZE == 0 { + if self.len % INDEX_INTERVAL == 0 { self.add_index_entry(); } + self.last_key.clear(); + self.last_key.extend_from_slice(key); + self.term_block_encoder.encode(key); self.len += 1; - let common_len = common_prefix_length(key, &self.last_key); - VInt(common_len as u64).serialize(&mut self.write)?; - self.last_key.truncate(common_len); - self.last_key.extend_from_slice(&key[common_len..]); - VInt((key.len() - common_len) as u64) - .serialize(&mut self.write)?; - self.write.write_all(&key[common_len..])?; + Ok(()) + } + + fn flush_block(&mut self) -> io::Result<()> { + let block_size = self.term_block_encoder.len(); + if block_size > 0 { + self.write.write(&[block_size as u8])?; + self.term_block_encoder.flush(&mut self.write)?; + self.terminfo_block_encoder.flush(&mut self.write)?; + } Ok(()) } pub(crate) fn insert_value(&mut self, value: &TermInfo) -> io::Result<()> { - - VInt(value.doc_freq as u64).serialize(&mut self.write)?; - VInt(value.postings_offset as u64).serialize(&mut self.write)?; - - if self.deserializer_options == TermDeserializerOption::StrWithPositions { - VInt(value.positions_offset as u64).serialize(&mut self.write)?; - self.write.write_all(&[value.positions_inner_offset])?; + self.terminfo_block_encoder.encode(value); + if self.len % NUM_DOCS_PER_BLOCK == 0 { + self.flush_block()?; } - Ok(()) } } @@ -107,13 +106,15 @@ impl TermDictionaryBuilder for TermDictionaryBuilderImpl write.write_all(&[data.len() as u8])?; write.write_all(&data[..])?; } + let has_positions = deserializer_options.has_positions(); Ok(TermDictionaryBuilderImpl { - write: CountingWriter::wrap(write), - block_index: fst::MapBuilder::new(vec![]).expect("This cannot fail"), - last_key: Vec::with_capacity(128), - len: 0, - deserializer_options: deserializer_options, - }) + term_block_encoder: TermBlockEncoder::new(), + terminfo_block_encoder: TermInfoBlockEncoder::new(has_positions), + write: CountingWriter::wrap(write), + block_index: fst::MapBuilder::new(vec![]).expect("This cannot fail"), + last_key: Vec::with_capacity(128), + len: 0, + }) } /// Inserts a `(key, value)` pair in the term dictionary. @@ -128,7 +129,9 @@ impl TermDictionaryBuilder for TermDictionaryBuilderImpl /// Finalize writing the builder, and returns the underlying /// `Write` object. 
fn finish(mut self) -> io::Result { + self.flush_block()?; self.add_index_entry(); + self.write.write_all(&[0u8])?; let (mut w, split_len) = self.write.finish()?; let fst_write = self.block_index.into_inner().map_err(convert_fst_error)?; w.write_all(&fst_write)?; @@ -253,7 +256,7 @@ impl<'a> TermDictionary<'a> for TermDictionaryImpl let position = streamer.key().cmp(target_key.as_ref()); match position { Ordering::Less => {} - Ordering::Equal => return Some(streamer.extract_value()), + Ordering::Equal => return Some(streamer.value().clone()), Ordering::Greater => { return None; } diff --git a/src/termdict/streamdict/terminfo_block_encoder.rs b/src/termdict/streamdict/terminfo_block_encoder.rs new file mode 100644 index 000000000..c2d2ef26b --- /dev/null +++ b/src/termdict/streamdict/terminfo_block_encoder.rs @@ -0,0 +1,117 @@ +use compression::{BlockEncoder, BlockDecoder, VIntEncoder, VIntDecoder, NUM_DOCS_PER_BLOCK}; +use postings::TermInfo; +use std::io::{self, Write}; + +pub struct TermInfoBlockEncoder { + block_encoder: BlockEncoder, + + doc_freqs: [u32; NUM_DOCS_PER_BLOCK], + postings_offsets: [u32; NUM_DOCS_PER_BLOCK], + positions_offsets: [u32; NUM_DOCS_PER_BLOCK], + positions_inner_offset: [u8; NUM_DOCS_PER_BLOCK], + + cursor: usize, + encode_positions: bool, +} + +impl TermInfoBlockEncoder { + pub fn new(encode_positions: bool) -> TermInfoBlockEncoder { + TermInfoBlockEncoder { + block_encoder: BlockEncoder::new(), + + doc_freqs: [0u32; NUM_DOCS_PER_BLOCK], + postings_offsets: [0u32; NUM_DOCS_PER_BLOCK], + positions_offsets: [0u32; NUM_DOCS_PER_BLOCK], + positions_inner_offset: [0u8; NUM_DOCS_PER_BLOCK], + + cursor: 0, + encode_positions: encode_positions, + } + } + + pub fn encode(&mut self, term_info: &TermInfo) { + self.doc_freqs[self.cursor] = term_info.doc_freq; + self.postings_offsets[self.cursor] = term_info.postings_offset; + self.positions_offsets[self.cursor] = term_info.positions_offset; + self.positions_inner_offset[self.cursor] = term_info.positions_inner_offset; + self.cursor += 1; + } + + pub fn flush(&mut self, output: &mut W) -> io::Result<()> { + output.write_all(self.block_encoder.compress_vint_unsorted(&self.doc_freqs[..self.cursor]))?; + output.write_all(self.block_encoder.compress_vint_sorted(&self.postings_offsets[..self.cursor], 0u32))?; + if self.encode_positions { + output.write_all(self.block_encoder.compress_vint_sorted(&self.positions_offsets[..self.cursor], 0u32))?; + output.write_all(&self.positions_inner_offset[..self.cursor])?; + } + self.cursor = 0; + Ok(()) + } +} + + + +pub struct TermInfoBlockDecoder<'a> { + doc_freq_decoder: BlockDecoder, + postings_decoder: BlockDecoder, + positions_decoder: BlockDecoder, + positions_inner_offset: &'a [u8], + current_term_info: TermInfo, + + cursor: usize, + has_positions: bool, +} + + +impl<'a> TermInfoBlockDecoder<'a> { + pub fn new(has_positions: bool) -> TermInfoBlockDecoder<'a> { + TermInfoBlockDecoder { + doc_freq_decoder: BlockDecoder::new(), + postings_decoder: BlockDecoder::new(), + positions_decoder: BlockDecoder::new(), + positions_inner_offset: &[], + + current_term_info: TermInfo::default(), + cursor: 0, + has_positions: has_positions, + } + } + + + pub fn term_info(&self) -> &TermInfo { + &self.current_term_info + } + + pub fn decode_block(&mut self, mut compressed_data: &'a [u8], num_els: usize) -> &'a [u8] { + self.cursor = 0; + { + let consumed_size = self.doc_freq_decoder.uncompress_vint_unsorted(compressed_data, num_els); + compressed_data = &compressed_data[consumed_size..]; + } + { + let 
consumed_size = self.postings_decoder.uncompress_vint_sorted(compressed_data, 0u32, num_els); + compressed_data = &compressed_data[consumed_size..]; + } + if self.has_positions { + let consumed_size = self.positions_decoder.uncompress_vint_sorted(compressed_data, 0u32, num_els); + compressed_data = &compressed_data[consumed_size..]; + self.positions_inner_offset = &compressed_data[..num_els]; + &compressed_data[num_els..] + } + else { + compressed_data + } + } + + pub fn advance(&mut self) { + assert!(self.cursor < NUM_DOCS_PER_BLOCK); + self.current_term_info.doc_freq = self.doc_freq_decoder.output(self.cursor); + self.current_term_info.postings_offset = self.postings_decoder.output(self.cursor); + if self.has_positions { + self.current_term_info.positions_offset = self.positions_decoder.output(self.cursor); + self.current_term_info.positions_inner_offset = self.positions_inner_offset[self.cursor]; + } + self.cursor += 1; + } + +} \ No newline at end of file From 850f10c1feea01f2a304b2c717ca22e5ea71b4e0 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 22 Aug 2017 18:21:35 +0900 Subject: [PATCH 21/29] Exposing Field --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 5f4cfa3ee..6a1e6be70 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -128,7 +128,7 @@ pub use directory::Directory; pub use core::{Index, Segment, SegmentId, SegmentMeta, Searcher}; pub use indexer::IndexWriter; pub use schema::{Term, Document}; -pub use core::SegmentReader; +pub use core::{SegmentReader, FieldReader}; pub use self::common::TimerTree; pub use postings::DocSet; From b3a8074826000db527f1fe02e893ff3315c7927a Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 22 Aug 2017 18:58:17 +0900 Subject: [PATCH 22/29] removed println --- src/termdict/streamdict/streamer.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/termdict/streamdict/streamer.rs b/src/termdict/streamdict/streamer.rs index 419468308..b326334ee 100644 --- a/src/termdict/streamdict/streamer.rs +++ b/src/termdict/streamdict/streamer.rs @@ -104,7 +104,6 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> let data: &[u8] = self.term_dictionary.stream_data(); let start = self.offset_from; let stop = max(self.offset_to, start); - println!("current_key {:?}", self.current_key); TermStreamerImpl { remaining_in_block: 0, cursor: &data[start..stop], From 8e450c770a55a6ee636c5ab5ac2357683bbf469d Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 26 Aug 2017 18:40:30 +0900 Subject: [PATCH 23/29] Better error handling. Some doc. 
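[Editorial sketch, not part of the patch] Among other cleanups, this patch factors the prefix handling into a small `DeltaEncoder`/`DeltaDecoder` pair (added in `src/termdict/streamdict/delta_encoder.rs` below). A usage sketch of that pair, assuming the API exactly as added in this patch; the function name and the literal terms are illustrative only:

```rust
// Assumes DeltaEncoder and DeltaDecoder from delta_encoder.rs (below) are in scope.
fn delta_round_trip() {
    let mut encoder = DeltaEncoder::default();
    // The first term shares no prefix with the (empty) previous term.
    assert_eq!(encoder.encode(b"search"), (0, &b"search"[..]));
    // "searcher" shares the 6-byte prefix "search" with the previous term.
    assert_eq!(encoder.encode(b"searcher"), (6, &b"er"[..]));

    // Decoding re-applies (prefix_len, suffix) on top of the previous term.
    let mut decoder = DeltaDecoder::with_previous_term(b"search".to_vec());
    assert_eq!(decoder.decode(6, b"er"), &b"searcher"[..]);
}
```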
--- src/common/mod.rs | 1 + src/compression/stream.rs | 2 +- src/core/segment_reader.rs | 28 ++- src/error.rs | 1 + src/fastfield/reader.rs | 9 +- src/indexer/segment_manager.rs | 2 +- src/lib.rs | 2 + src/termdict/mod.rs | 3 +- src/termdict/streamdict/delta_encoder.rs | 48 +++++ src/termdict/streamdict/mod.rs | 44 +---- src/termdict/streamdict/streamer.rs | 143 +++++++-------- src/termdict/streamdict/term_block_encoder.rs | 164 ------------------ src/termdict/streamdict/termdict.rs | 115 ++++++------ .../streamdict/terminfo_block_encoder.rs | 117 ------------- 14 files changed, 199 insertions(+), 480 deletions(-) create mode 100644 src/termdict/streamdict/delta_encoder.rs delete mode 100644 src/termdict/streamdict/term_block_encoder.rs delete mode 100644 src/termdict/streamdict/terminfo_block_encoder.rs diff --git a/src/common/mod.rs b/src/common/mod.rs index e8c8763f1..803fe8bde 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -1,3 +1,4 @@ + mod serialize; mod timer; mod vint; diff --git a/src/compression/stream.rs b/src/compression/stream.rs index 0af50ca5b..29d180353 100644 --- a/src/compression/stream.rs +++ b/src/compression/stream.rs @@ -18,7 +18,7 @@ impl CompressedIntStream { } } - pub fn read(&mut self, mut output: &mut [u32]) { + pub fn read(&mut self, output: &mut [u32]) { let mut num_els: usize = output.len(); let mut start: usize = 0; loop { diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index e3d203488..80bc2525e 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -4,10 +4,12 @@ use core::SegmentId; use core::SegmentComponent; use std::sync::RwLock; use common::HasLen; +use error::ErrorKind; use core::SegmentMeta; use fastfield::{self, FastFieldNotAvailableError}; use fastfield::DeleteBitSet; use store::StoreReader; +use directory::ReadOnlySource; use schema::Document; use DocId; use std::str; @@ -171,25 +173,39 @@ impl SegmentReader { }) } + + /// Returns a field reader associated to the field given in argument. + /// + /// The field reader is in charge of iterating through the + /// term dictionary associated to a specific field, + /// and opening the posting list associated to any term. pub fn field_reader(&self, field: Field) -> Result> { if let Some(field_reader) = self.field_reader_cache.read() - .unwrap() // TODO + .expect("Lock poisoned. This should never happen") .get(&field) { return Ok(field_reader.clone()); } // TODO better error - let termdict_source = self.termdict_composite + let termdict_source: ReadOnlySource = self.termdict_composite .open_read(field) - .ok_or("Field not found")?; + .ok_or_else(|| { + ErrorKind::SchemaError( + format!("Could not find {:?} term dictionary", field) + ) + })?; let postings_source = self.postings_composite .open_read(field) - .ok_or("field not found")?; + .ok_or_else(|| { + ErrorKind::SchemaError(format!("Could not find {:?} postings", field)) + })?; let positions_source = self.positions_composite .open_read(field) - .ok_or("field not found")?; + .ok_or_else(|| { + ErrorKind::SchemaError(format!("Could not find {:?} positions", field)) + })?; let field_reader = Arc::new(FieldReader::new( termdict_source, @@ -201,7 +217,7 @@ impl SegmentReader { self.field_reader_cache .write() - .unwrap() // TODO + .expect("Field reader cache lock poisoned. 
This should never happen.") .insert(field, field_reader.clone()); Ok(field_reader) } diff --git a/src/error.rs b/src/error.rs index d6ce4a33d..8b345717d 100644 --- a/src/error.rs +++ b/src/error.rs @@ -10,6 +10,7 @@ use schema; use fastfield::FastFieldNotAvailableError; use serde_json; + error_chain!( errors { /// Path does not exist. diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index a2992c361..0c59cba05 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -136,9 +136,12 @@ impl From> for U64FastFieldReader { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); - for val in vals { - let mut fast_field_writer = fast_field_writers.get_field_writer(field).unwrap(); - fast_field_writer.add_val(val); + // TODO Error not unwrap + { + let fast_field_writer = fast_field_writers.get_field_writer(field).unwrap(); + for val in vals { + fast_field_writer.add_val(val); + } } fast_field_writers.serialize(&mut serializer).unwrap(); serializer.close().unwrap(); diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 35c264cdc..7a37f3574 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -194,7 +194,7 @@ impl SegmentManager { .writing .remove(&after_merge_segment_entry.segment_id()); - let mut target_register: &mut SegmentRegister = { + let target_register: &mut SegmentRegister = { if registers_lock .uncommitted .contains_all(before_merge_segment_ids) { diff --git a/src/lib.rs b/src/lib.rs index 6a1e6be70..d8f2acc31 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -98,6 +98,8 @@ mod core; mod compression; mod indexer; mod common; + +#[allow(unused_doc_comment)] mod error; mod analyzer; mod datastruct; diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index 1d1912f34..da8b65910 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -220,7 +220,7 @@ mod tests { } #[test] - fn test_term_dictionary() { + fn test_term_dictionary_simple() { let mut directory = RAMDirectory::create(); let path = PathBuf::from("TermDictionary"); { @@ -347,6 +347,7 @@ mod tests { let buffer: Vec = { let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap(); for &(ref id, ref i) in &ids { + println!("doc {}", id); term_dictionary_builder.insert(id.as_bytes(), &make_term_info(*i)).unwrap(); } term_dictionary_builder.finish().unwrap() diff --git a/src/termdict/streamdict/delta_encoder.rs b/src/termdict/streamdict/delta_encoder.rs new file mode 100644 index 000000000..21e5aac74 --- /dev/null +++ b/src/termdict/streamdict/delta_encoder.rs @@ -0,0 +1,48 @@ +pub fn common_prefix_len(s1: &[u8], s2: &[u8]) -> usize { + s1.iter() + .zip(s2.iter()) + .take_while(|&(a, b)| a==b) + .count() +} + + +#[derive(Default)] +pub struct DeltaEncoder { + last_term: Vec, +} + +impl DeltaEncoder { + pub fn encode<'a>(&mut self, term: &'a [u8]) -> (usize, &'a [u8]) { + let prefix_len = common_prefix_len(term, &self.last_term); + self.last_term.truncate(prefix_len); + self.last_term.extend_from_slice(&term[prefix_len..]); + (prefix_len, &term[prefix_len..]) + } + + pub fn term(&self) -> &[u8] { + &self.last_term[..] 
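// Worked example of the delta encoding above: encoding "search" and then "seaside"
// makes the second call return (3, b"side") (3 shared prefix bytes, then the new
// suffix), and `last_term` is left holding "seaside".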
+ } +} + +#[derive(Default)] +pub struct DeltaDecoder { + term: Vec, +} + +impl DeltaDecoder { + pub fn with_previous_term(term: Vec) -> DeltaDecoder { + DeltaDecoder { + term: Vec::from(term) + } + } + + pub fn decode(&mut self, prefix_len: usize, suffix: &[u8]) -> &[u8] { + self.term.truncate(prefix_len); + self.term.extend_from_slice(suffix); + &self.term[..] + } + + pub fn term(&self) -> &[u8] { + &self.term[..] + } +} diff --git a/src/termdict/streamdict/mod.rs b/src/termdict/streamdict/mod.rs index 4a4db7690..96a2c4141 100644 --- a/src/termdict/streamdict/mod.rs +++ b/src/termdict/streamdict/mod.rs @@ -1,49 +1,9 @@ - mod termdict; mod streamer; -mod term_block_encoder; -mod terminfo_block_encoder; +mod delta_encoder; +pub use self::delta_encoder::{DeltaEncoder, DeltaDecoder}; pub use self::termdict::TermDictionaryImpl; pub use self::termdict::TermDictionaryBuilderImpl; pub use self::streamer::TermStreamerImpl; pub use self::streamer::TermStreamerBuilderImpl; -use self::term_block_encoder::{TermBlockEncoder, TermBlockDecoder}; -use self::terminfo_block_encoder::{TermInfoBlockEncoder, TermInfoBlockDecoder}; - -use schema::FieldType; - -#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy)] -pub(crate) enum TermDeserializerOption { - StrNoPositions, - StrWithPositions, - U64, -} - -impl TermDeserializerOption { - - pub fn has_positions(&self) -> bool { - match *self { - TermDeserializerOption::StrWithPositions => true, - _ => false - } - } - -} - -fn make_deserializer_options(field_type: &FieldType) -> TermDeserializerOption { - match *field_type { - FieldType::Str(ref text_options) => { - let indexing_options = text_options.get_indexing_options(); - if indexing_options.is_position_enabled() { - TermDeserializerOption::StrWithPositions - } - else { - TermDeserializerOption::StrNoPositions - } - } - _ => { - TermDeserializerOption::U64 - } - } -} \ No newline at end of file diff --git a/src/termdict/streamdict/streamer.rs b/src/termdict/streamdict/streamer.rs index b326334ee..8ed95fda9 100644 --- a/src/termdict/streamdict/streamer.rs +++ b/src/termdict/streamdict/streamer.rs @@ -3,23 +3,22 @@ use std::cmp::max; use super::TermDictionaryImpl; use termdict::{TermStreamerBuilder, TermStreamer}; -use super::{TermBlockDecoder, TermInfoBlockDecoder}; use postings::TermInfo; -use super::TermDeserializerOption; +use super::delta_encoder::DeltaDecoder; -pub(crate) fn stream_before<'a>(term_dictionary: &'a TermDictionaryImpl, +fn stream_before<'a>(term_dictionary: &'a TermDictionaryImpl, target_key: &[u8], - deserializer_option: TermDeserializerOption) + has_positions: bool) -> TermStreamerImpl<'a> { let (prev_key, offset) = term_dictionary.strictly_previous_key(target_key.as_ref()); let offset: usize = offset as usize; TermStreamerImpl { - remaining_in_block: 0, - term_block_decoder: TermBlockDecoder::given_previous_term(&prev_key[..]), - terminfo_block_decoder: TermInfoBlockDecoder::new(deserializer_option.has_positions()), cursor: &term_dictionary.stream_data()[offset..], + delta_decoder: DeltaDecoder::with_previous_term(prev_key), + term_info: TermInfo::default(), + has_positions: has_positions, } } @@ -28,13 +27,11 @@ pub(crate) fn stream_before<'a>(term_dictionary: &'a TermDictionaryImpl, pub struct TermStreamerBuilderImpl<'a> { term_dictionary: &'a TermDictionaryImpl, - block_start: &'a [u8], origin: usize, - cursor: usize, offset_from: usize, offset_to: usize, current_key: Vec, - deserializer_option: TermDeserializerOption, + has_positions: bool, } impl<'a> TermStreamerBuilder for 
TermStreamerBuilderImpl<'a> @@ -43,60 +40,44 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> /// Limit the range to terms greater or equal to the bound fn ge>(mut self, bound: T) -> Self { - unimplemented!(); - /* let target_key = bound.as_ref(); - let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.deserializer_option); + let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.has_positions); let smaller_than = |k: &[u8]| k.lt(target_key); - let (block_start, cursor, current_key) = get_offset(smaller_than, streamer); - self.block_start = block_start; + let (offset_before, current_key) = get_offset(smaller_than, streamer); self.current_key = current_key; - self.cursor = cursor; - //self.offset_from = ; - */ + self.offset_from = offset_before - self.origin; self } /// Limit the range to terms strictly greater than the bound fn gt>(mut self, bound: T) -> Self { - unimplemented!(); - /* let target_key = bound.as_ref(); - let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.deserializer_option); + let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.has_positions); let smaller_than = |k: &[u8]| k.le(target_key); - let (block_start, cursor, current_key) = get_offset(smaller_than, streamer); - self.block_start = block_start; + let (offset_before, current_key) = get_offset(smaller_than, streamer); self.current_key = current_key; - self.cursor = cursor; - //self.offset_from = offset_before - self.origin; - */ + self.offset_from = offset_before - self.origin; self } /// Limit the range to terms lesser or equal to the bound fn lt>(mut self, bound: T) -> Self { - unimplemented!(); - /* let target_key = bound.as_ref(); - let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.deserializer_option); + let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.has_positions); let smaller_than = |k: &[u8]| k.lt(target_key); let (offset_before, _) = get_offset(smaller_than, streamer); self.offset_to = offset_before - self.origin; self - */ } /// Limit the range to terms lesser or equal to the bound fn le>(mut self, bound: T) -> Self { - unimplemented!(); - /* let target_key = bound.as_ref(); - let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.deserializer_option); + let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.has_positions); let smaller_than = |k: &[u8]| k.le(target_key); let (offset_before, _) = get_offset(smaller_than, streamer); self.offset_to = offset_before - self.origin; self - */ } /// Build the streamer. 
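// How the four bound methods above find their offsets: `stream_before` starts a
// decoder at the last indexed key strictly before the target (looked up in the fst
// block index), and `get_offset` then walks forward entry by entry until the
// predicate (`lt` for ge/lt, `le` for gt/le) stops holding. It returns the raw
// cursor address right before the first rejected entry together with the last
// decoded key, which the builder turns into `offset_from` / `offset_to` (and, for
// the lower bounds `ge`/`gt`, the initial `current_key`).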
@@ -105,10 +86,10 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> let start = self.offset_from; let stop = max(self.offset_to, start); TermStreamerImpl { - remaining_in_block: 0, cursor: &data[start..stop], - term_block_decoder: TermBlockDecoder::given_previous_term(&self.current_key), - terminfo_block_decoder: TermInfoBlockDecoder::new(self.deserializer_option.has_positions()), + delta_decoder: DeltaDecoder::with_previous_term(self.current_key), + term_info: TermInfo::default(), + has_positions: self.has_positions, } } } @@ -122,42 +103,37 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> /// - the term_buffer state to initialize the block) fn get_offset<'a, P: Fn(&[u8]) -> bool>(predicate: P, mut streamer: TermStreamerImpl<'a>) - -> (&'a [u8], usize, Vec) -{//&'a [u8] - let mut block_start: &[u8] = streamer.cursor; - let mut cursor = 0; - let mut term_buffer: Vec = vec!(); + -> (usize, Vec) +{ + let mut prev: &[u8] = streamer.cursor; - while streamer.advance() { - let iter_key = streamer.key(); + let mut prev_data: Vec = Vec::from(streamer.delta_decoder.term()); + + while let Some((iter_key, _)) = streamer.next() { if !predicate(iter_key.as_ref()) { - return (block_start, streamer.term_block_decoder.cursor() - 1, term_buffer); - } - if streamer.remaining_in_block == 0 { - block_start = streamer.cursor; - term_buffer.clear(); - term_buffer.extend_from_slice(iter_key.as_ref()); + return (prev.as_ptr() as usize, prev_data); } + prev = streamer.cursor; + prev_data.clear(); + prev_data.extend_from_slice(iter_key.as_ref()); } - (block_start, streamer.term_block_decoder.cursor() - 1, term_buffer) + (prev.as_ptr() as usize, prev_data) } impl<'a> TermStreamerBuilderImpl<'a> { pub(crate) fn new( term_dictionary: &'a TermDictionaryImpl, - deserializer_option: TermDeserializerOption) -> Self { + has_positions: bool) -> Self { let data = term_dictionary.stream_data(); let origin = data.as_ptr() as usize; TermStreamerBuilderImpl { term_dictionary: term_dictionary, - block_start: term_dictionary.stream_data().as_ref(), - cursor: 0, origin: origin, offset_from: 0, offset_to: data.len(), current_key: Vec::with_capacity(300), - deserializer_option: deserializer_option, + has_positions: has_positions, } } } @@ -167,49 +143,56 @@ impl<'a> TermStreamerBuilderImpl<'a> /// See [`TermStreamer`](./trait.TermStreamer.html) pub struct TermStreamerImpl<'a> { - remaining_in_block: usize, - term_block_decoder: TermBlockDecoder<'a>, - terminfo_block_decoder: TermInfoBlockDecoder<'a>, cursor: &'a [u8], + delta_decoder: DeltaDecoder, + term_info: TermInfo, + has_positions: bool } -impl<'a> TermStreamerImpl<'a> -{ - fn load_block(&mut self) -> bool { - self.remaining_in_block = self.cursor[0] as usize; - if self.remaining_in_block == 0 { - false - } - else { - self.cursor = &self.cursor[1..]; - self.cursor = self.term_block_decoder.decode_block(self.cursor); - self.cursor = self.terminfo_block_decoder.decode_block(self.cursor, self.remaining_in_block); - true + +fn deserialize_vint(data: &mut &[u8]) -> u64 { + let mut res = 0; + let mut shift = 0; + for i in 0.. 
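    // Each byte carries 7 payload bits, least-significant group first; the byte
    // with the high bit (0x80) set is the last one. E.g. the two bytes [0x2C, 0x82]
    // decode to 0x2C + (0x02 << 7) = 300, and the slice is advanced past both bytes.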
{ + let b = data[i]; + res |= ((b % 128u8) as u64) << shift; + if b & 128u8 != 0u8 { + *data = &data[(i + 1)..]; + break; } + shift += 7; } + res } - impl<'a> TermStreamer for TermStreamerImpl<'a> { fn advance(&mut self) -> bool { - if self.remaining_in_block == 0 { - if !self.load_block() { - return false; - } + if self.cursor.is_empty() { + return false; + } + let common_length: usize = deserialize_vint(&mut self.cursor) as usize; + let suffix_length: usize = deserialize_vint(&mut self.cursor) as usize; + self.delta_decoder.decode(common_length, &self.cursor[..suffix_length]); + self.cursor = &self.cursor[suffix_length..]; + + self.term_info.doc_freq = deserialize_vint(&mut self.cursor) as u32; + self.term_info.postings_offset = deserialize_vint(&mut self.cursor) as u32; + + if self.has_positions { + self.term_info.positions_offset = deserialize_vint(&mut self.cursor) as u32; + self.term_info.positions_inner_offset = self.cursor[0]; + self.cursor = &self.cursor[1..]; } - self.remaining_in_block -= 1; - self.term_block_decoder.advance(); - self.terminfo_block_decoder.advance(); true } fn key(&self) -> &[u8] { - self.term_block_decoder.term() + self.delta_decoder.term() } fn value(&self) -> &TermInfo { - self.terminfo_block_decoder.term_info() + &self.term_info } } diff --git a/src/termdict/streamdict/term_block_encoder.rs b/src/termdict/streamdict/term_block_encoder.rs deleted file mode 100644 index 157a3cf28..000000000 --- a/src/termdict/streamdict/term_block_encoder.rs +++ /dev/null @@ -1,164 +0,0 @@ -use compression::{BlockEncoder, BlockDecoder, NUM_DOCS_PER_BLOCK}; -use std::io::{self, Write}; - -fn compute_common_prefix_length(left: &[u8], right: &[u8]) -> usize { - left.iter() - .cloned() - .zip(right.iter().cloned()) - .take_while(|&(b1, b2)| b1 == b2) - .count() -} - - -pub struct TermBlockEncoder { - block_encoder: BlockEncoder, - - pop_lens: [u32; NUM_DOCS_PER_BLOCK], - push_lens: [u32; NUM_DOCS_PER_BLOCK], - suffixes: Vec, - - previous_key: Vec, - count: usize, -} - -impl TermBlockEncoder { - pub fn new() -> TermBlockEncoder { - TermBlockEncoder { - block_encoder: BlockEncoder::new(), - pop_lens: [0u32; NUM_DOCS_PER_BLOCK], - push_lens: [0u32; NUM_DOCS_PER_BLOCK], - suffixes: Vec::with_capacity(NUM_DOCS_PER_BLOCK*5), - - previous_key: Vec::with_capacity(30), - - count: 0, - } - } - - pub fn encode(&mut self, key: &[u8]) { - let common_prefix_len = compute_common_prefix_length(&self.previous_key, key); - self.pop_lens[self.count] = (self.previous_key.len() - common_prefix_len) as u32; - self.push_lens[self.count] = (key.len() - common_prefix_len) as u32; - self.previous_key.clear(); - let suffix = &key[common_prefix_len..]; - self.suffixes.extend_from_slice(suffix); - self.previous_key.extend_from_slice(key); - self.count += 1; - } - - pub fn len(&self) -> usize { - self.count - } - - pub fn flush(&mut self, output: &mut W) -> io::Result<()> { - for i in self.count..NUM_DOCS_PER_BLOCK { - self.pop_lens[i] = 0u32; - self.push_lens[i] = 0u32; - } - output.write_all(self.block_encoder.compress_block_unsorted(&self.pop_lens))?; - output.write_all(self.block_encoder.compress_block_unsorted(&self.push_lens))?; - output.write_all(&self.suffixes[..])?; - self.suffixes.clear(); - self.count = 0; - Ok(()) - } -} - - - -pub struct TermBlockDecoder<'a> { - pop_lens_decoder: BlockDecoder, - push_lens_decoder: BlockDecoder, - suffixes: &'a [u8], - current_key: Vec, - cursor: usize, -} - - -impl<'a> TermBlockDecoder<'a> { - pub fn new() -> TermBlockDecoder<'a> { - 
TermBlockDecoder::given_previous_term(&[]) - } - - pub fn cursor(&self) -> usize { - self.cursor - } - - pub fn given_previous_term(previous_term: &[u8]) -> TermBlockDecoder<'a> { - let mut current_key = Vec::with_capacity(30); - current_key.extend_from_slice(previous_term); - TermBlockDecoder { - pop_lens_decoder: BlockDecoder::new(), - push_lens_decoder: BlockDecoder::new(), - current_key: current_key, - suffixes: &[], - cursor: 0, - } - } - - pub fn term(&self) -> &[u8] { - &self.current_key - } - - pub fn decode_block(&mut self, mut compressed_data: &'a [u8]) -> &'a [u8] { - { - let consumed_data_len = self.pop_lens_decoder.uncompress_block_unsorted(compressed_data); - compressed_data = &compressed_data[consumed_data_len..]; - } - { - let consumed_data_len = self.push_lens_decoder.uncompress_block_unsorted(compressed_data); - compressed_data = &compressed_data[consumed_data_len..]; - } - let suffix_len: u32 = self.push_lens_decoder.output_array()[0..].iter().cloned().sum(); - let suffix_len: usize = suffix_len as usize; - self.suffixes = &compressed_data[..suffix_len]; - self.cursor = 0; - &compressed_data[suffix_len..] - } - - pub fn advance(&mut self) { - assert!(self.cursor < NUM_DOCS_PER_BLOCK); - let pop_len = self.pop_lens_decoder.output(self.cursor) as usize; - let push_len = self.push_lens_decoder.output(self.cursor) as usize; - let previous_len = self.current_key.len(); - self.current_key.truncate(previous_len - pop_len); - self.current_key.extend_from_slice(&self.suffixes[..push_len]); - self.suffixes = &self.suffixes[push_len..]; - self.cursor += 1; - } -} - - - -#[cfg(test)] -mod tests { - use super::{TermBlockEncoder, TermBlockDecoder}; - - #[test] - fn test_encoding_terms() { - let mut buffer: Vec = vec!(); - let mut terms = vec!(); - { - let mut term_block_encoder = TermBlockEncoder::new(); - for i in 0..128 { - terms.push(format!("term{}", i * 7231)); - } - for term in &terms { - term_block_encoder.encode(term.as_bytes()); - } - term_block_encoder.flush(&mut buffer).unwrap(); - } - assert_eq!(buffer.len(), 711); - - let mut block_decoder = TermBlockDecoder::new(); - assert_eq!(block_decoder.decode_block(&buffer[..]).len(), 0); - for i in 0..128 { - block_decoder.advance(); - assert_eq!(block_decoder.term(), terms[i].as_bytes()); - } - - } -} - - - diff --git a/src/termdict/streamdict/termdict.rs b/src/termdict/streamdict/termdict.rs index 5c15652cb..9c0dfb841 100644 --- a/src/termdict/streamdict/termdict.rs +++ b/src/termdict/streamdict/termdict.rs @@ -6,18 +6,14 @@ use fst::raw::Fst; use directory::ReadOnlySource; use common::BinarySerializable; use common::CountingWriter; -use bincode; -use std::cmp::Ordering; use postings::TermInfo; use schema::FieldType; +use super::DeltaEncoder; use fst::raw::Node; -use compression::NUM_DOCS_PER_BLOCK; -use super::make_deserializer_options; -use super::TermDeserializerOption; -use super::streamer::stream_before; +use common::VInt; use termdict::{TermDictionary, TermDictionaryBuilder, TermStreamer}; -use super::{TermBlockEncoder, TermInfoBlockEncoder}; use super::{TermStreamerImpl, TermStreamerBuilderImpl}; +use termdict::TermStreamerBuilder; const INDEX_INTERVAL: usize = 1024; @@ -25,17 +21,30 @@ fn convert_fst_error(e: fst::Error) -> io::Error { io::Error::new(io::ErrorKind::Other, e) } +fn has_positions(field_type: &FieldType) -> bool { + match *field_type { + FieldType::Str(ref text_options) => { + let indexing_options = text_options.get_indexing_options(); + if indexing_options.is_position_enabled() { + true + } + else { + 
false + } + } + _ => { + false + } + } +} + /// See [`TermDictionaryBuilder`](./trait.TermDictionaryBuilder.html) pub struct TermDictionaryBuilderImpl { + has_positions: bool, write: CountingWriter, - - term_block_encoder: TermBlockEncoder, - terminfo_block_encoder: TermInfoBlockEncoder, - + delta_encoder: DeltaEncoder, block_index: fst::MapBuilder>, - last_key: Vec, - len: usize, } @@ -52,7 +61,7 @@ impl TermDictionaryBuilderImpl { fn add_index_entry(&mut self) { self.block_index - .insert(&self.last_key, self.write.written_bytes() as u64) + .insert(&self.delta_encoder.term(), self.write.written_bytes() as u64) .unwrap(); } @@ -67,27 +76,20 @@ impl TermDictionaryBuilderImpl if self.len % INDEX_INTERVAL == 0 { self.add_index_entry(); } - self.last_key.clear(); - self.last_key.extend_from_slice(key); - self.term_block_encoder.encode(key); + let (common_prefix_len, suffix) = self.delta_encoder.encode(key); + VInt(common_prefix_len as u64).serialize(&mut self.write)?; + VInt(suffix.len() as u64).serialize(&mut self.write)?; + self.write.write_all(suffix)?; self.len += 1; Ok(()) } - fn flush_block(&mut self) -> io::Result<()> { - let block_size = self.term_block_encoder.len(); - if block_size > 0 { - self.write.write(&[block_size as u8])?; - self.term_block_encoder.flush(&mut self.write)?; - self.terminfo_block_encoder.flush(&mut self.write)?; - } - Ok(()) - } - pub(crate) fn insert_value(&mut self, value: &TermInfo) -> io::Result<()> { - self.terminfo_block_encoder.encode(value); - if self.len % NUM_DOCS_PER_BLOCK == 0 { - self.flush_block()?; + VInt(value.doc_freq as u64).serialize(&mut self.write)?; + VInt(value.postings_offset as u64).serialize(&mut self.write)?; + if self.has_positions { + VInt(value.positions_offset as u64).serialize(&mut self.write)?; + self.write.write(&[value.positions_inner_offset])?; } Ok(()) } @@ -98,21 +100,14 @@ impl TermDictionaryBuilder for TermDictionaryBuilderImpl { /// Creates a new `TermDictionaryBuilder` fn new(mut write: W, field_type: FieldType) -> io::Result { - let deserializer_options = make_deserializer_options(&field_type); - { - // serialize the field type. - let data: Vec = bincode::serialize(&deserializer_options, bincode::Bounded(256u64)) - .expect("Failed to serialize field type within 256 bytes. This should never be a problem."); - write.write_all(&[data.len() as u8])?; - write.write_all(&data[..])?; - } - let has_positions = deserializer_options.has_positions(); + let has_positions = has_positions(&field_type); + let has_positions_code = if has_positions { 255u8 } else { 0u8 }; + write.write_all(&[has_positions_code])?; Ok(TermDictionaryBuilderImpl { - term_block_encoder: TermBlockEncoder::new(), - terminfo_block_encoder: TermInfoBlockEncoder::new(has_positions), + has_positions: has_positions, write: CountingWriter::wrap(write), + delta_encoder: DeltaEncoder::default(), block_index: fst::MapBuilder::new(vec![]).expect("This cannot fail"), - last_key: Vec::with_capacity(128), len: 0, }) } @@ -129,9 +124,7 @@ impl TermDictionaryBuilder for TermDictionaryBuilderImpl /// Finalize writing the builder, and returns the underlying /// `Write` object. 
fn finish(mut self) -> io::Result { - self.flush_block()?; self.add_index_entry(); - self.write.write_all(&[0u8])?; let (mut w, split_len) = self.write.finish()?; let fst_write = self.block_index.into_inner().map_err(convert_fst_error)?; w.write_all(&fst_write)?; @@ -159,7 +152,7 @@ pub struct TermDictionaryImpl { stream_data: ReadOnlySource, fst_index: fst::Map, - deserializer_option: TermDeserializerOption, + has_positions: bool, } impl TermDictionaryImpl @@ -224,13 +217,8 @@ impl<'a> TermDictionary<'a> for TermDictionaryImpl /// Opens a `TermDictionary` given a data source. fn from_source(mut source: ReadOnlySource) -> io::Result { - // it won't take more than 100 bytes - let deserialize_option_len = source.slice(0, 1).as_slice()[0] as usize; - let deserialize_option_source = source.slice(1, 1 + deserialize_option_len); - let deserialize_option_buffer: &[u8] = deserialize_option_source.as_slice(); - let deserializer_option: TermDeserializerOption = bincode::deserialize(deserialize_option_buffer) - .expect("Field dictionary data is corrupted. Failed to deserialize field type."); - source = source.slice_from(1 + deserialize_option_len); + let has_positions = source.slice(0, 1).as_ref()[0] == 255u8; + source = source.slice_from(1); let total_len = source.len(); let length_offset = total_len - 8; @@ -243,31 +231,28 @@ impl<'a> TermDictionary<'a> for TermDictionaryImpl let fst_index = open_fst_index(fst_data)?; Ok(TermDictionaryImpl { + has_positions: has_positions, stream_data: stream_data, fst_index: fst_index, - deserializer_option: deserializer_option, }) } /// Lookups the value corresponding to the key. fn get>(&self, target_key: K) -> Option { - let mut streamer = stream_before(self, target_key.as_ref(), self.deserializer_option); - while streamer.advance() { - let position = streamer.key().cmp(target_key.as_ref()); - match position { - Ordering::Less => {} - Ordering::Equal => return Some(streamer.value().clone()), - Ordering::Greater => { - return None; - } - } + let mut streamer = self.range() + .ge(&target_key) + .into_stream(); + if streamer.advance() && streamer.key() == target_key.as_ref() { + Some(streamer.value().clone()) + } + else { + None } - None } /// Returns a range builder, to stream all of the terms /// within an interval. 
fn range(&'a self) -> Self::StreamBuilder { - Self::StreamBuilder::new(self, self.deserializer_option) + Self::StreamBuilder::new(self, self.has_positions) } } diff --git a/src/termdict/streamdict/terminfo_block_encoder.rs b/src/termdict/streamdict/terminfo_block_encoder.rs deleted file mode 100644 index c2d2ef26b..000000000 --- a/src/termdict/streamdict/terminfo_block_encoder.rs +++ /dev/null @@ -1,117 +0,0 @@ -use compression::{BlockEncoder, BlockDecoder, VIntEncoder, VIntDecoder, NUM_DOCS_PER_BLOCK}; -use postings::TermInfo; -use std::io::{self, Write}; - -pub struct TermInfoBlockEncoder { - block_encoder: BlockEncoder, - - doc_freqs: [u32; NUM_DOCS_PER_BLOCK], - postings_offsets: [u32; NUM_DOCS_PER_BLOCK], - positions_offsets: [u32; NUM_DOCS_PER_BLOCK], - positions_inner_offset: [u8; NUM_DOCS_PER_BLOCK], - - cursor: usize, - encode_positions: bool, -} - -impl TermInfoBlockEncoder { - pub fn new(encode_positions: bool) -> TermInfoBlockEncoder { - TermInfoBlockEncoder { - block_encoder: BlockEncoder::new(), - - doc_freqs: [0u32; NUM_DOCS_PER_BLOCK], - postings_offsets: [0u32; NUM_DOCS_PER_BLOCK], - positions_offsets: [0u32; NUM_DOCS_PER_BLOCK], - positions_inner_offset: [0u8; NUM_DOCS_PER_BLOCK], - - cursor: 0, - encode_positions: encode_positions, - } - } - - pub fn encode(&mut self, term_info: &TermInfo) { - self.doc_freqs[self.cursor] = term_info.doc_freq; - self.postings_offsets[self.cursor] = term_info.postings_offset; - self.positions_offsets[self.cursor] = term_info.positions_offset; - self.positions_inner_offset[self.cursor] = term_info.positions_inner_offset; - self.cursor += 1; - } - - pub fn flush(&mut self, output: &mut W) -> io::Result<()> { - output.write_all(self.block_encoder.compress_vint_unsorted(&self.doc_freqs[..self.cursor]))?; - output.write_all(self.block_encoder.compress_vint_sorted(&self.postings_offsets[..self.cursor], 0u32))?; - if self.encode_positions { - output.write_all(self.block_encoder.compress_vint_sorted(&self.positions_offsets[..self.cursor], 0u32))?; - output.write_all(&self.positions_inner_offset[..self.cursor])?; - } - self.cursor = 0; - Ok(()) - } -} - - - -pub struct TermInfoBlockDecoder<'a> { - doc_freq_decoder: BlockDecoder, - postings_decoder: BlockDecoder, - positions_decoder: BlockDecoder, - positions_inner_offset: &'a [u8], - current_term_info: TermInfo, - - cursor: usize, - has_positions: bool, -} - - -impl<'a> TermInfoBlockDecoder<'a> { - pub fn new(has_positions: bool) -> TermInfoBlockDecoder<'a> { - TermInfoBlockDecoder { - doc_freq_decoder: BlockDecoder::new(), - postings_decoder: BlockDecoder::new(), - positions_decoder: BlockDecoder::new(), - positions_inner_offset: &[], - - current_term_info: TermInfo::default(), - cursor: 0, - has_positions: has_positions, - } - } - - - pub fn term_info(&self) -> &TermInfo { - &self.current_term_info - } - - pub fn decode_block(&mut self, mut compressed_data: &'a [u8], num_els: usize) -> &'a [u8] { - self.cursor = 0; - { - let consumed_size = self.doc_freq_decoder.uncompress_vint_unsorted(compressed_data, num_els); - compressed_data = &compressed_data[consumed_size..]; - } - { - let consumed_size = self.postings_decoder.uncompress_vint_sorted(compressed_data, 0u32, num_els); - compressed_data = &compressed_data[consumed_size..]; - } - if self.has_positions { - let consumed_size = self.positions_decoder.uncompress_vint_sorted(compressed_data, 0u32, num_els); - compressed_data = &compressed_data[consumed_size..]; - self.positions_inner_offset = &compressed_data[..num_els]; - 
&compressed_data[num_els..] - } - else { - compressed_data - } - } - - pub fn advance(&mut self) { - assert!(self.cursor < NUM_DOCS_PER_BLOCK); - self.current_term_info.doc_freq = self.doc_freq_decoder.output(self.cursor); - self.current_term_info.postings_offset = self.postings_decoder.output(self.cursor); - if self.has_positions { - self.current_term_info.positions_offset = self.positions_decoder.output(self.cursor); - self.current_term_info.positions_inner_offset = self.positions_inner_offset[self.cursor]; - } - self.cursor += 1; - } - -} \ No newline at end of file From 3d0082d0202c8a0c0b6b82c503605c427b6d72bd Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 26 Aug 2017 19:38:29 +0900 Subject: [PATCH 24/29] Delta encoded. Range and get are broken --- src/compression/stream.rs | 11 +-- src/core/field_reader.rs | 4 +- src/directory/mod.rs | 9 +- src/directory/read_only_source.rs | 9 +- src/postings/docset.rs | 16 +++ src/postings/serializer.rs | 11 ++- src/termdict/mod.rs | 1 - src/termdict/streamdict/delta_encoder.rs | 118 ++++++++++++++++++++--- src/termdict/streamdict/mod.rs | 4 +- src/termdict/streamdict/streamer.rs | 53 +++------- src/termdict/streamdict/termdict.rs | 50 +++++----- 11 files changed, 189 insertions(+), 97 deletions(-) diff --git a/src/compression/stream.rs b/src/compression/stream.rs index 29d180353..de902da85 100644 --- a/src/compression/stream.rs +++ b/src/compression/stream.rs @@ -1,7 +1,7 @@ use compression::BlockDecoder; use compression::NUM_DOCS_PER_BLOCK; use compression::compressed_block_size; -use directory::SourceRead; +use directory::{ReadOnlySource, SourceRead}; pub struct CompressedIntStream { buffer: SourceRead, @@ -10,9 +10,9 @@ pub struct CompressedIntStream { } impl CompressedIntStream { - pub fn wrap(buffer: SourceRead) -> CompressedIntStream { + pub(crate) fn wrap(source: ReadOnlySource) -> CompressedIntStream { CompressedIntStream { - buffer: buffer, + buffer: SourceRead::from(source), block_decoder: BlockDecoder::new(), inner_offset: NUM_DOCS_PER_BLOCK, } @@ -72,7 +72,7 @@ pub mod tests { use compression::compressed_block_size; use compression::NUM_DOCS_PER_BLOCK; use compression::BlockEncoder; - use directory::{SourceRead, ReadOnlySource}; + use directory::ReadOnlySource; fn create_stream_buffer() -> ReadOnlySource { let mut buffer: Vec = vec!(); @@ -90,8 +90,7 @@ pub mod tests { #[test] fn test_compressed_int_stream() { let buffer = create_stream_buffer(); - let buffer_reader = SourceRead::from(buffer); - let mut stream = CompressedIntStream::wrap(buffer_reader); + let mut stream = CompressedIntStream::wrap(buffer); let mut block: [u32; NUM_DOCS_PER_BLOCK] = [0u32; NUM_DOCS_PER_BLOCK]; stream.read(&mut block[0..2]); diff --git a/src/core/field_reader.rs b/src/core/field_reader.rs index eaf35514b..ca0e95111 100644 --- a/src/core/field_reader.rs +++ b/src/core/field_reader.rs @@ -100,8 +100,8 @@ impl FieldReader { let position_stream = { if option.has_positions() { let position_offset = term_info.positions_offset; - let positions_reader = SourceRead::from(self.positions_source.slice_from(position_offset as usize)); - let mut stream = CompressedIntStream::wrap(positions_reader); + let positions_source = self.positions_source.slice_from(position_offset as usize); + let mut stream = CompressedIntStream::wrap(positions_source); stream.skip(term_info.positions_inner_offset as usize); Some(stream) } diff --git a/src/directory/mod.rs b/src/directory/mod.rs index cfdaee719..b4c18b359 100644 --- a/src/directory/mod.rs +++ b/src/directory/mod.rs @@ 
-13,14 +13,15 @@ mod managed_directory; /// Errors specific to the directory module. pub mod error; -use std::io::{Write, Seek}; +use std::io::{Write, Seek, BufWriter}; -use std::io::BufWriter; -pub use self::read_only_source::{SourceRead, ReadOnlySource}; +pub use self::read_only_source::ReadOnlySource; pub use self::directory::Directory; pub use self::ram_directory::RAMDirectory; pub use self::mmap_directory::MmapDirectory; -pub use self::managed_directory::{ManagedDirectory, FileProtection}; + +pub(crate) use self::read_only_source::SourceRead; +pub(crate) use self::managed_directory::{ManagedDirectory, FileProtection}; /// Synonym of Seek + Write pub trait SeekableWrite: Seek + Write {} diff --git a/src/directory/read_only_source.rs b/src/directory/read_only_source.rs index 1fd0afc0f..3db74bb01 100644 --- a/src/directory/read_only_source.rs +++ b/src/directory/read_only_source.rs @@ -65,6 +65,8 @@ impl ReadOnlySource { } } + /// Like `.slice(...)` but enforcing only the `from` + /// boundary. pub fn slice_from(&self, from_offset: usize) -> ReadOnlySource { let len = self.len(); self.slice(from_offset, len) @@ -90,12 +92,15 @@ impl From> for ReadOnlySource { } } -pub struct SourceRead { + +/// Acts as a owning cursor over the data backed up by a ReadOnlySource +pub(crate) struct SourceRead { _data_owner: ReadOnlySource, cursor: &'static [u8] } impl SourceRead { + // Advance the cursor by a given number of bytes. pub fn advance(&mut self, len: usize) { self.cursor = &self.cursor[len..]; } @@ -108,6 +113,8 @@ impl AsRef<[u8]> for SourceRead { } impl From for SourceRead { + + // Creates a new `SourceRead` from a given `ReadOnlySource` fn from(source: ReadOnlySource) -> SourceRead { let len = source.len(); let slice_ptr = source.as_slice().as_ptr(); diff --git a/src/postings/docset.rs b/src/postings/docset.rs index 219a85dcb..4b1ea3c7a 100644 --- a/src/postings/docset.rs +++ b/src/postings/docset.rs @@ -52,6 +52,22 @@ pub trait DocSet { } } + + /// Fills a given mutable buffer with the next doc ids from the + /// `DocSet` + /// + /// If that many `DocId`s are available, the method should + /// fill the entire buffer and return the length of the buffer. + /// + /// If we reach the end of the `DocSet` before filling + /// it entirely, then the buffer is filled up to this point, and + /// return value is the number of elements that were filled. + /// + /// # Warning + /// + /// This method is only here for specific high-performance + /// use case where batching. The normal way to + /// go through the `DocId`'s is to call `.advance()`. fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize { for (i, buffer_val) in buffer.iter_mut().enumerate() { if self.advance() { diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index caec58b1f..5c24256cc 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -108,6 +108,8 @@ impl InvertedIndexSerializer { } +/// The field serializer is in charge of +/// the serialization of a specific field. pub struct FieldSerializer<'a> { term_dictionary_builder: TermDictionaryBuilderImpl<&'a mut CountingWriter>, postings_serializer: PostingsSerializer<&'a mut CountingWriter>, @@ -173,9 +175,10 @@ impl<'a> FieldSerializer<'a> { /// to the lexicographical order. /// * doc_freq - return the number of document containing the term. 
pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> { - if self.term_open { - panic!("Called new_term, while the previous term was not closed."); - } + assert!( + !self.term_open, + "Called new_term, while the previous term was not closed." + ); self.term_open = true; self.postings_serializer.clear(); self.current_term_info = self.current_term_info(); @@ -217,6 +220,8 @@ impl<'a> FieldSerializer<'a> { Ok(()) } + + /// Closes the current current field. pub fn close(mut self) -> io::Result<()> { self.close_term()?; if let Some(positions_serializer) = self.positions_serializer_opt { diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index da8b65910..1ce1d6c54 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -347,7 +347,6 @@ mod tests { let buffer: Vec = { let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap(); for &(ref id, ref i) in &ids { - println!("doc {}", id); term_dictionary_builder.insert(id.as_bytes(), &make_term_info(*i)).unwrap(); } term_dictionary_builder.finish().unwrap() diff --git a/src/termdict/streamdict/delta_encoder.rs b/src/termdict/streamdict/delta_encoder.rs index 21e5aac74..7418e4f85 100644 --- a/src/termdict/streamdict/delta_encoder.rs +++ b/src/termdict/streamdict/delta_encoder.rs @@ -1,4 +1,15 @@ -pub fn common_prefix_len(s1: &[u8], s2: &[u8]) -> usize { +use postings::TermInfo; +use common::VInt; +use common::BinarySerializable; +use std::io::{self, Write}; +use std::mem; + +/// Returns the len of the longest +/// common prefix of `s1` and `s2`. +/// +/// ie: the greatest `L` such that +/// for all `0 <= i < L`, `s1[i] == s2[i]` +fn common_prefix_len(s1: &[u8], s2: &[u8]) -> usize { s1.iter() .zip(s2.iter()) .take_while(|&(a, b)| a==b) @@ -7,16 +18,20 @@ pub fn common_prefix_len(s1: &[u8], s2: &[u8]) -> usize { #[derive(Default)] -pub struct DeltaEncoder { +pub struct TermDeltaEncoder { last_term: Vec, } -impl DeltaEncoder { - pub fn encode<'a>(&mut self, term: &'a [u8]) -> (usize, &'a [u8]) { +impl TermDeltaEncoder { + pub fn encode<'a, W: Write>(&mut self, term: &'a [u8], write: &mut W) -> io::Result<()> { let prefix_len = common_prefix_len(term, &self.last_term); self.last_term.truncate(prefix_len); self.last_term.extend_from_slice(&term[prefix_len..]); - (prefix_len, &term[prefix_len..]) + let suffix = &term[prefix_len..]; + VInt(prefix_len as u64).serialize(write)?; + VInt(suffix.len() as u64).serialize(write)?; + write.write_all(suffix)?; + Ok(()) } pub fn term(&self) -> &[u8] { @@ -25,24 +40,105 @@ impl DeltaEncoder { } #[derive(Default)] -pub struct DeltaDecoder { +pub struct TermDeltaDecoder { term: Vec, } -impl DeltaDecoder { - pub fn with_previous_term(term: Vec) -> DeltaDecoder { - DeltaDecoder { +impl TermDeltaDecoder { + pub fn with_previous_term(term: Vec) -> TermDeltaDecoder { + TermDeltaDecoder { term: Vec::from(term) } } - pub fn decode(&mut self, prefix_len: usize, suffix: &[u8]) -> &[u8] { + pub fn decode(&mut self, cursor: &mut &[u8]) { + let prefix_len: usize = deserialize_vint(cursor) as usize; + let suffix_length: usize = deserialize_vint(cursor) as usize; + let suffix = &cursor[..suffix_length]; + *cursor = &cursor[suffix_length..]; self.term.truncate(prefix_len); self.term.extend_from_slice(suffix); - &self.term[..] } pub fn term(&self) -> &[u8] { &self.term[..] 
} } + + + +pub struct TermInfoDeltaEncoder { + term_info: TermInfo, + has_positions: bool, +} + +impl TermInfoDeltaEncoder { + + pub fn new(has_positions: bool) -> Self { + TermInfoDeltaEncoder { + term_info: TermInfo::default(), + has_positions: has_positions, + } + } + + pub fn encode(&mut self, term_info: TermInfo, write: &mut W) -> io::Result<()> { + VInt(term_info.doc_freq as u64).serialize(write)?; + let delta_postings_offset = term_info.postings_offset - self.term_info.postings_offset; + VInt(delta_postings_offset as u64).serialize(write)?; + if self.has_positions { + let delta_positions_offset = term_info.positions_offset - self.term_info.positions_offset; + VInt(delta_positions_offset as u64).serialize(write)?; + write.write(&[term_info.positions_inner_offset])?; + } + mem::replace(&mut self.term_info, term_info); + Ok(()) + } +} + +fn deserialize_vint(data: &mut &[u8]) -> u64 { + let mut res = 0; + let mut shift = 0; + for i in 0.. { + let b = data[i]; + res |= ((b % 128u8) as u64) << shift; + if b & 128u8 != 0u8 { + *data = &data[(i + 1)..]; + break; + } + shift += 7; + } + res +} + +pub struct TermInfoDeltaDecoder { + term_info: TermInfo, + has_positions: bool, +} + +impl TermInfoDeltaDecoder { + pub fn new(has_positions: bool) -> TermInfoDeltaDecoder { + TermInfoDeltaDecoder { + term_info: TermInfo::default(), + has_positions: has_positions, + } + } + + pub fn decode(&mut self, cursor: &mut &[u8]) { + let doc_freq = deserialize_vint(cursor) as u32; + self.term_info.doc_freq = doc_freq; + let delta_postings = deserialize_vint(cursor) as u32; + self.term_info.postings_offset += delta_postings; + if self.has_positions { + let delta_positions = deserialize_vint(cursor) as u32; + self.term_info.positions_offset += delta_positions; + let position_inner_offset = cursor[0]; + *cursor = &cursor[1..]; + self.term_info.positions_inner_offset = position_inner_offset; + } + } + + pub fn term_info(&self) -> &TermInfo { + &self.term_info + } +} + diff --git a/src/termdict/streamdict/mod.rs b/src/termdict/streamdict/mod.rs index 96a2c4141..1c9a148a1 100644 --- a/src/termdict/streamdict/mod.rs +++ b/src/termdict/streamdict/mod.rs @@ -2,7 +2,9 @@ mod termdict; mod streamer; mod delta_encoder; -pub use self::delta_encoder::{DeltaEncoder, DeltaDecoder}; +pub use self::delta_encoder::{TermDeltaEncoder, TermDeltaDecoder}; +pub use self::delta_encoder::{TermInfoDeltaEncoder, TermInfoDeltaDecoder}; + pub use self::termdict::TermDictionaryImpl; pub use self::termdict::TermDictionaryBuilderImpl; pub use self::streamer::TermStreamerImpl; diff --git a/src/termdict/streamdict/streamer.rs b/src/termdict/streamdict/streamer.rs index 8ed95fda9..1363bf50a 100644 --- a/src/termdict/streamdict/streamer.rs +++ b/src/termdict/streamdict/streamer.rs @@ -4,7 +4,7 @@ use std::cmp::max; use super::TermDictionaryImpl; use termdict::{TermStreamerBuilder, TermStreamer}; use postings::TermInfo; -use super::delta_encoder::DeltaDecoder; +use super::delta_encoder::{TermInfoDeltaDecoder, TermDeltaDecoder}; fn stream_before<'a>(term_dictionary: &'a TermDictionaryImpl, @@ -16,9 +16,8 @@ fn stream_before<'a>(term_dictionary: &'a TermDictionaryImpl, let offset: usize = offset as usize; TermStreamerImpl { cursor: &term_dictionary.stream_data()[offset..], - delta_decoder: DeltaDecoder::with_previous_term(prev_key), - term_info: TermInfo::default(), - has_positions: has_positions, + term_delta_decoder: TermDeltaDecoder::with_previous_term(prev_key), + term_info_decoder: TermInfoDeltaDecoder::new(has_positions), // TODO checkpoint } 
} @@ -87,9 +86,8 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> let stop = max(self.offset_to, start); TermStreamerImpl { cursor: &data[start..stop], - delta_decoder: DeltaDecoder::with_previous_term(self.current_key), - term_info: TermInfo::default(), - has_positions: self.has_positions, + term_delta_decoder: TermDeltaDecoder::with_previous_term(self.current_key), + term_info_decoder: TermInfoDeltaDecoder::new(self.has_positions), // TODO checkpoint } } } @@ -107,7 +105,7 @@ fn get_offset<'a, P: Fn(&[u8]) -> bool>(predicate: P, { let mut prev: &[u8] = streamer.cursor; - let mut prev_data: Vec = Vec::from(streamer.delta_decoder.term()); + let mut prev_data: Vec = Vec::from(streamer.term_delta_decoder.term()); while let Some((iter_key, _)) = streamer.next() { if !predicate(iter_key.as_ref()) { @@ -144,26 +142,12 @@ impl<'a> TermStreamerBuilderImpl<'a> pub struct TermStreamerImpl<'a> { cursor: &'a [u8], - delta_decoder: DeltaDecoder, - term_info: TermInfo, - has_positions: bool + term_delta_decoder: TermDeltaDecoder, + term_info_decoder: TermInfoDeltaDecoder, } -fn deserialize_vint(data: &mut &[u8]) -> u64 { - let mut res = 0; - let mut shift = 0; - for i in 0.. { - let b = data[i]; - res |= ((b % 128u8) as u64) << shift; - if b & 128u8 != 0u8 { - *data = &data[(i + 1)..]; - break; - } - shift += 7; - } - res -} + impl<'a> TermStreamer for TermStreamerImpl<'a> { @@ -171,28 +155,17 @@ impl<'a> TermStreamer for TermStreamerImpl<'a> if self.cursor.is_empty() { return false; } - let common_length: usize = deserialize_vint(&mut self.cursor) as usize; - let suffix_length: usize = deserialize_vint(&mut self.cursor) as usize; - self.delta_decoder.decode(common_length, &self.cursor[..suffix_length]); - self.cursor = &self.cursor[suffix_length..]; - - self.term_info.doc_freq = deserialize_vint(&mut self.cursor) as u32; - self.term_info.postings_offset = deserialize_vint(&mut self.cursor) as u32; - - if self.has_positions { - self.term_info.positions_offset = deserialize_vint(&mut self.cursor) as u32; - self.term_info.positions_inner_offset = self.cursor[0]; - self.cursor = &self.cursor[1..]; - } + self.term_delta_decoder.decode(&mut self.cursor); + self.term_info_decoder.decode(&mut self.cursor); true } fn key(&self) -> &[u8] { - self.delta_decoder.term() + self.term_delta_decoder.term() } fn value(&self) -> &TermInfo { - &self.term_info + &self.term_info_decoder.term_info() } } diff --git a/src/termdict/streamdict/termdict.rs b/src/termdict/streamdict/termdict.rs index 9c0dfb841..ab5f9dfc3 100644 --- a/src/termdict/streamdict/termdict.rs +++ b/src/termdict/streamdict/termdict.rs @@ -8,9 +8,8 @@ use common::BinarySerializable; use common::CountingWriter; use postings::TermInfo; use schema::FieldType; -use super::DeltaEncoder; +use super::{TermDeltaEncoder, TermInfoDeltaEncoder}; use fst::raw::Node; -use common::VInt; use termdict::{TermDictionary, TermDictionaryBuilder, TermStreamer}; use super::{TermStreamerImpl, TermStreamerBuilderImpl}; use termdict::TermStreamerBuilder; @@ -41,9 +40,9 @@ fn has_positions(field_type: &FieldType) -> bool { /// See [`TermDictionaryBuilder`](./trait.TermDictionaryBuilder.html) pub struct TermDictionaryBuilderImpl { - has_positions: bool, write: CountingWriter, - delta_encoder: DeltaEncoder, + term_delta_encoder: TermDeltaEncoder, + term_info_encoder: TermInfoDeltaEncoder, block_index: fst::MapBuilder>, len: usize, } @@ -61,7 +60,7 @@ impl TermDictionaryBuilderImpl { fn add_index_entry(&mut self) { self.block_index - .insert(&self.delta_encoder.term(), 
self.write.written_bytes() as u64) + .insert(&self.term_delta_encoder.term(), self.write.written_bytes() as u64) .unwrap(); } @@ -76,21 +75,13 @@ impl TermDictionaryBuilderImpl if self.len % INDEX_INTERVAL == 0 { self.add_index_entry(); } - let (common_prefix_len, suffix) = self.delta_encoder.encode(key); - VInt(common_prefix_len as u64).serialize(&mut self.write)?; - VInt(suffix.len() as u64).serialize(&mut self.write)?; - self.write.write_all(suffix)?; + self.term_delta_encoder.encode(key, &mut self.write)?; self.len += 1; Ok(()) } - pub(crate) fn insert_value(&mut self, value: &TermInfo) -> io::Result<()> { - VInt(value.doc_freq as u64).serialize(&mut self.write)?; - VInt(value.postings_offset as u64).serialize(&mut self.write)?; - if self.has_positions { - VInt(value.positions_offset as u64).serialize(&mut self.write)?; - self.write.write(&[value.positions_inner_offset])?; - } + pub(crate) fn insert_value(&mut self, term_info: &TermInfo) -> io::Result<()> { + self.term_info_encoder.encode(term_info.clone(), &mut self.write)?; Ok(()) } } @@ -104,9 +95,9 @@ impl TermDictionaryBuilder for TermDictionaryBuilderImpl let has_positions_code = if has_positions { 255u8 } else { 0u8 }; write.write_all(&[has_positions_code])?; Ok(TermDictionaryBuilderImpl { - has_positions: has_positions, write: CountingWriter::wrap(write), - delta_encoder: DeltaEncoder::default(), + term_delta_encoder: TermDeltaEncoder::default(), + term_info_encoder: TermInfoDeltaEncoder::new(has_positions), block_index: fst::MapBuilder::new(vec![]).expect("This cannot fail"), len: 0, }) @@ -118,7 +109,8 @@ impl TermDictionaryBuilder for TermDictionaryBuilderImpl fn insert>(&mut self, key_ref: K, value: &TermInfo) -> io::Result<()> { let key = key_ref.as_ref(); self.insert_key(key)?; - self.insert_value(value) + self.insert_value(value)?; + Ok(()) } /// Finalize writing the builder, and returns the underlying @@ -136,15 +128,17 @@ impl TermDictionaryBuilder for TermDictionaryBuilderImpl fn open_fst_index(source: ReadOnlySource) -> io::Result { - Ok(fst::Map::from(match source { - ReadOnlySource::Anonymous(data) => { - try!(Fst::from_shared_bytes(data.data, data.start, data.len) - .map_err(convert_fst_error)) - } - ReadOnlySource::Mmap(mmap_readonly) => { - try!(Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)) - } - })) + use self::ReadOnlySource::*; + let fst_result = match source { + Anonymous(data) => { + Fst::from_shared_bytes(data.data, data.start, data.len) + } + Mmap(mmap_readonly) => { + Fst::from_mmap(mmap_readonly) + } + }; + let fst = fst_result.map_err(convert_fst_error)?; + Ok(fst::Map::from(fst)) } /// See [`TermDictionary`](./trait.TermDictionary.html) From 69351fb4a59681ee309c344a6afa5dd7e14d1cc8 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 27 Aug 2017 18:44:37 +0900 Subject: [PATCH 25/29] Toward a new codec --- src/termdict/streamdict/delta_encoder.rs | 97 ++++++++++++------------ src/termdict/streamdict/mod.rs | 3 +- src/termdict/streamdict/streamer.rs | 27 ++++++- src/termdict/streamdict/termdict.rs | 86 +++++++++++++++++++-- 4 files changed, 153 insertions(+), 60 deletions(-) diff --git a/src/termdict/streamdict/delta_encoder.rs b/src/termdict/streamdict/delta_encoder.rs index 7418e4f85..34152961b 100644 --- a/src/termdict/streamdict/delta_encoder.rs +++ b/src/termdict/streamdict/delta_encoder.rs @@ -1,7 +1,4 @@ use postings::TermInfo; -use common::VInt; -use common::BinarySerializable; -use std::io::{self, Write}; use std::mem; /// Returns the len of the longest @@ -20,23 +17,23 @@ fn 
common_prefix_len(s1: &[u8], s2: &[u8]) -> usize { #[derive(Default)] pub struct TermDeltaEncoder { last_term: Vec, + prefix_len: usize, } impl TermDeltaEncoder { - pub fn encode<'a, W: Write>(&mut self, term: &'a [u8], write: &mut W) -> io::Result<()> { - let prefix_len = common_prefix_len(term, &self.last_term); - self.last_term.truncate(prefix_len); - self.last_term.extend_from_slice(&term[prefix_len..]); - let suffix = &term[prefix_len..]; - VInt(prefix_len as u64).serialize(write)?; - VInt(suffix.len() as u64).serialize(write)?; - write.write_all(suffix)?; - Ok(()) + pub fn encode<'a>(&mut self, term: &'a [u8]) { + self.prefix_len = common_prefix_len(term, &self.last_term); + self.last_term.truncate(self.prefix_len); + self.last_term.extend_from_slice(&term[self.prefix_len..]); } pub fn term(&self) -> &[u8] { &self.last_term[..] } + + pub fn prefix_suffix(&mut self) -> (usize, &[u8]) { + (self.prefix_len, &self.last_term[self.prefix_len..]) + } } #[derive(Default)] @@ -51,11 +48,7 @@ impl TermDeltaDecoder { } } - pub fn decode(&mut self, cursor: &mut &[u8]) { - let prefix_len: usize = deserialize_vint(cursor) as usize; - let suffix_length: usize = deserialize_vint(cursor) as usize; - let suffix = &cursor[..suffix_length]; - *cursor = &cursor[suffix_length..]; + pub fn decode(&mut self, prefix_len: usize, suffix: &[u8]) { self.term.truncate(prefix_len); self.term.extend_from_slice(suffix); } @@ -65,11 +58,17 @@ impl TermDeltaDecoder { } } - +#[derive(Default)] +pub struct DeltaTermInfo { + pub doc_freq: u32, + pub delta_postings_offset: u32, + pub delta_positions_offset: u32, + pub positions_inner_offset: u8, +} pub struct TermInfoDeltaEncoder { term_info: TermInfo, - has_positions: bool, + pub has_positions: bool, } impl TermInfoDeltaEncoder { @@ -81,34 +80,22 @@ impl TermInfoDeltaEncoder { } } - pub fn encode(&mut self, term_info: TermInfo, write: &mut W) -> io::Result<()> { - VInt(term_info.doc_freq as u64).serialize(write)?; - let delta_postings_offset = term_info.postings_offset - self.term_info.postings_offset; - VInt(delta_postings_offset as u64).serialize(write)?; + pub fn encode(&mut self, term_info: TermInfo) -> DeltaTermInfo { + let mut delta_term_info = DeltaTermInfo { + doc_freq: term_info.doc_freq, + delta_postings_offset: term_info.postings_offset - self.term_info.postings_offset, + delta_positions_offset: 0, + positions_inner_offset: 0, + }; if self.has_positions { - let delta_positions_offset = term_info.positions_offset - self.term_info.positions_offset; - VInt(delta_positions_offset as u64).serialize(write)?; - write.write(&[term_info.positions_inner_offset])?; + delta_term_info.delta_positions_offset = term_info.positions_offset - self.term_info.positions_offset; + delta_term_info.positions_inner_offset = term_info.positions_inner_offset; } mem::replace(&mut self.term_info, term_info); - Ok(()) + delta_term_info } } -fn deserialize_vint(data: &mut &[u8]) -> u64 { - let mut res = 0; - let mut shift = 0; - for i in 0.. 
{ - let b = data[i]; - res |= ((b % 128u8) as u64) << shift; - if b & 128u8 != 0u8 { - *data = &data[(i + 1)..]; - break; - } - shift += 7; - } - res -} pub struct TermInfoDeltaDecoder { term_info: TermInfo, @@ -123,17 +110,27 @@ impl TermInfoDeltaDecoder { } } - pub fn decode(&mut self, cursor: &mut &[u8]) { - let doc_freq = deserialize_vint(cursor) as u32; + pub fn decode(&mut self, code: u8, cursor: &mut &[u8]) { + let num_bytes_docfreq: usize = ((code >> 1) & 3) as usize; + let num_bytes_postings_offset: usize = ((code >> 3) & 3) as usize; + const MASK: [u32; 4] = [ + 0xffu32, + 0xffffu32, + 0xffffffu32, + 0xffffffffu32, + ]; + let doc_freq: u32 = unsafe { *(cursor.as_ptr() as *const u32) } & MASK[num_bytes_docfreq]; + *cursor = &cursor[num_bytes_docfreq + 1 ..]; + let delta_postings_offset: u32 = unsafe { *(cursor.as_ptr() as *const u32) } & MASK[num_bytes_postings_offset]; + *cursor = &cursor[num_bytes_postings_offset + 1..]; self.term_info.doc_freq = doc_freq; - let delta_postings = deserialize_vint(cursor) as u32; - self.term_info.postings_offset += delta_postings; + self.term_info.postings_offset += delta_postings_offset; if self.has_positions { - let delta_positions = deserialize_vint(cursor) as u32; - self.term_info.positions_offset += delta_positions; - let position_inner_offset = cursor[0]; - *cursor = &cursor[1..]; - self.term_info.positions_inner_offset = position_inner_offset; + let num_bytes_positions_offset = ((code >> 5) & 3) as usize; + let delta_positions_offset: u32 = unsafe { *(cursor.as_ptr() as *const u32) } & MASK[num_bytes_positions_offset]; + self.term_info.positions_offset += delta_positions_offset; + self.term_info.positions_inner_offset = cursor[num_bytes_positions_offset + 1]; + *cursor = &cursor[num_bytes_positions_offset + 2..]; } } diff --git a/src/termdict/streamdict/mod.rs b/src/termdict/streamdict/mod.rs index 1c9a148a1..faf9c13fd 100644 --- a/src/termdict/streamdict/mod.rs +++ b/src/termdict/streamdict/mod.rs @@ -3,9 +3,10 @@ mod streamer; mod delta_encoder; pub use self::delta_encoder::{TermDeltaEncoder, TermDeltaDecoder}; -pub use self::delta_encoder::{TermInfoDeltaEncoder, TermInfoDeltaDecoder}; +pub use self::delta_encoder::{TermInfoDeltaEncoder, TermInfoDeltaDecoder, DeltaTermInfo}; pub use self::termdict::TermDictionaryImpl; pub use self::termdict::TermDictionaryBuilderImpl; pub use self::streamer::TermStreamerImpl; pub use self::streamer::TermStreamerBuilderImpl; + diff --git a/src/termdict/streamdict/streamer.rs b/src/termdict/streamdict/streamer.rs index 1363bf50a..4779a65c5 100644 --- a/src/termdict/streamdict/streamer.rs +++ b/src/termdict/streamdict/streamer.rs @@ -4,9 +4,9 @@ use std::cmp::max; use super::TermDictionaryImpl; use termdict::{TermStreamerBuilder, TermStreamer}; use postings::TermInfo; +use common::BinarySerializable; use super::delta_encoder::{TermInfoDeltaDecoder, TermDeltaDecoder}; - fn stream_before<'a>(term_dictionary: &'a TermDictionaryImpl, target_key: &[u8], has_positions: bool) @@ -155,8 +155,29 @@ impl<'a> TermStreamer for TermStreamerImpl<'a> if self.cursor.is_empty() { return false; } - self.term_delta_decoder.decode(&mut self.cursor); - self.term_info_decoder.decode(&mut self.cursor); + let code: u8 = self.cursor[0]; + let mut cursor: &[u8] = &self.cursor[1..]; + + let prefix_suffix_packed = (code & 1u8) == 1u8; + let (prefix_len, suffix_len): (usize, usize) = + if prefix_suffix_packed { + let b = cursor[0]; + cursor = &cursor[1..]; + let prefix_len = (b & 15u8) as usize; + let suffix_len = (b >> 4u8) as usize; + 
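    // Packed form (code bit 0 set): a single byte holds both lengths, low nibble =
    // prefix length, high nibble = suffix length, so 0x5A means prefix_len = 10 and
    // suffix_len = 5. This only applies when both lengths are < 16; otherwise they
    // are written out as two full u32 values (the `else` branch below).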
(prefix_len, suffix_len) + } + else { + let prefix_len = u32::deserialize(&mut cursor).unwrap(); + let suffix_len = u32::deserialize(&mut cursor).unwrap(); + (prefix_len as usize, suffix_len as usize) + }; + + let suffix = &cursor[..suffix_len]; + self.term_delta_decoder.decode(prefix_len, suffix); + cursor = &cursor[suffix_len..]; + self.term_info_decoder.decode(code, &mut cursor); + self.cursor = cursor; true } diff --git a/src/termdict/streamdict/termdict.rs b/src/termdict/streamdict/termdict.rs index ab5f9dfc3..e5487b5f3 100644 --- a/src/termdict/streamdict/termdict.rs +++ b/src/termdict/streamdict/termdict.rs @@ -2,18 +2,21 @@ use std::io::{self, Write}; use fst; + use fst::raw::Fst; use directory::ReadOnlySource; use common::BinarySerializable; use common::CountingWriter; use postings::TermInfo; use schema::FieldType; -use super::{TermDeltaEncoder, TermInfoDeltaEncoder}; +use super::{TermDeltaEncoder, TermInfoDeltaEncoder, DeltaTermInfo}; use fst::raw::Node; use termdict::{TermDictionary, TermDictionaryBuilder, TermStreamer}; use super::{TermStreamerImpl, TermStreamerBuilderImpl}; use termdict::TermStreamerBuilder; +use std::mem::transmute; +const PADDING_SIZE: usize = 16; const INDEX_INTERVAL: usize = 1024; fn convert_fst_error(e: fst::Error) -> io::Error { @@ -75,17 +78,71 @@ impl TermDictionaryBuilderImpl if self.len % INDEX_INTERVAL == 0 { self.add_index_entry(); } - self.term_delta_encoder.encode(key, &mut self.write)?; - self.len += 1; + self.term_delta_encoder.encode(key); Ok(()) } pub(crate) fn insert_value(&mut self, term_info: &TermInfo) -> io::Result<()> { - self.term_info_encoder.encode(term_info.clone(), &mut self.write)?; + let delta_term_info = self.term_info_encoder.encode(term_info.clone()); + let (prefix_len, suffix) = self.term_delta_encoder.prefix_suffix(); + write_term_kv(prefix_len, suffix, &delta_term_info, self.term_info_encoder.has_positions, &mut self.write)?; + self.len += 1; Ok(()) } } +fn num_bytes_required(mut n: u32) -> u8 { + for i in 1u8..5u8 { + if n < 256u32 { + return i; + } + else { + n /= 256; + } + } + 0u8 +} + +fn write_term_kv(prefix_len: usize, + suffix: &[u8], + delta_term_info: &DeltaTermInfo, + has_positions: bool, + write: &mut W) -> io::Result<()> { + let suffix_len = suffix.len(); + let mut code = 0u8; + let num_bytes_docfreq = num_bytes_required(delta_term_info.doc_freq); + let num_bytes_postings_offset = num_bytes_required(delta_term_info.delta_postings_offset); + let num_bytes_positions_offset = num_bytes_required(delta_term_info.delta_positions_offset); + code |= (num_bytes_docfreq - 1) << 1u8; + code |= (num_bytes_postings_offset - 1) << 3u8; + code |= (num_bytes_positions_offset - 1) << 5u8; + if (prefix_len < 16) && (suffix_len < 16) { + code |= 1u8; + write.write_all(&[code, (prefix_len as u8) | ((suffix_len as u8) << 4u8)])?; + } + else { + write.write_all(&[code])?; + (prefix_len as u32).serialize(write)?; + (suffix_len as u32).serialize(write)?; + } + write.write_all(suffix)?; + { + let bytes: [u8; 4] = unsafe { transmute(delta_term_info.doc_freq) }; + write.write_all(&bytes[0..num_bytes_docfreq as usize])?; + } + { + let bytes: [u8; 4] = unsafe { transmute(delta_term_info.delta_postings_offset) }; + write.write_all(&bytes[0..num_bytes_postings_offset as usize])?; + } + if has_positions { + let bytes: [u8; 4] = unsafe { transmute(delta_term_info.delta_positions_offset) }; + write.write_all(&bytes[0..num_bytes_positions_offset as usize])?; + write.write_all(&[delta_term_info.positions_inner_offset])?; + } + Ok(()) + +} + impl 
TermDictionaryBuilder for TermDictionaryBuilderImpl where W: Write { @@ -116,7 +173,8 @@ impl TermDictionaryBuilder for TermDictionaryBuilderImpl /// Finalize writing the builder, and returns the underlying /// `Write` object. fn finish(mut self) -> io::Result { - self.add_index_entry(); + self.write.write_all(&[0u8; PADDING_SIZE])?; + // self.add_index_entry(); let (mut w, split_len) = self.write.finish()?; let fst_write = self.block_index.into_inner().map_err(convert_fst_error)?; w.write_all(&fst_write)?; @@ -224,9 +282,10 @@ impl<'a> TermDictionary<'a> for TermDictionaryImpl let fst_data = source.slice(split_len, length_offset); let fst_index = open_fst_index(fst_data)?; + let len_without_padding = stream_data.len() - PADDING_SIZE; Ok(TermDictionaryImpl { has_positions: has_positions, - stream_data: stream_data, + stream_data: stream_data.slice(0, len_without_padding), fst_index: fst_index, }) } @@ -250,3 +309,18 @@ impl<'a> TermDictionary<'a> for TermDictionaryImpl Self::StreamBuilder::new(self, self.has_positions) } } + + +#[cfg(test)] +mod tests { + use super::num_bytes_required; + + #[test] + fn test_num_bytes_required() { + assert_eq!(num_bytes_required(0), 1); + assert_eq!(num_bytes_required(1), 1); + assert_eq!(num_bytes_required(255), 1); + assert_eq!(num_bytes_required(256), 2); + assert_eq!(num_bytes_required(u32::max_value()), 4); + } +} \ No newline at end of file From 5b1e71947fc578a85f51448bed51ade40ce61f60 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 27 Aug 2017 20:20:38 +0900 Subject: [PATCH 26/29] Stream working, all test passing --- src/directory/read_only_source.rs | 6 ++ src/termdict/mod.rs | 3 +- src/termdict/streamdict/delta_encoder.rs | 23 +++++++- src/termdict/streamdict/mod.rs | 31 ++++++++++ src/termdict/streamdict/streamer.rs | 34 ++++++----- src/termdict/streamdict/termdict.rs | 72 +++++++++++++++++------- 6 files changed, 132 insertions(+), 37 deletions(-) diff --git a/src/directory/read_only_source.rs b/src/directory/read_only_source.rs index 3db74bb01..8ddec278b 100644 --- a/src/directory/read_only_source.rs +++ b/src/directory/read_only_source.rs @@ -43,6 +43,12 @@ impl ReadOnlySource { } } + pub fn split(self, addr: usize) -> (ReadOnlySource, ReadOnlySource) { + let left = self.slice(0, addr); + let right = self.slice_from(addr); + (left, right) + } + /// Creates a ReadOnlySource that is just a /// view over a slice of the data. 
/// diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index 1ce1d6c54..5930eeaaf 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -340,7 +340,8 @@ mod tests { #[test] fn test_stream_range() { - let ids: Vec<_> = (0u32..50_000u32) +// let ids: Vec<_> = (0u32..10_000u32) + let ids: Vec<_> = (0u32..10_000u32) .map(|i| (format!("doc{:0>6}", i), i)) .collect(); let field_type = FieldType::Str(TEXT); diff --git a/src/termdict/streamdict/delta_encoder.rs b/src/termdict/streamdict/delta_encoder.rs index 34152961b..1666e0161 100644 --- a/src/termdict/streamdict/delta_encoder.rs +++ b/src/termdict/streamdict/delta_encoder.rs @@ -1,4 +1,5 @@ use postings::TermInfo; +use super::CheckPoint; use std::mem; /// Returns the len of the longest @@ -80,6 +81,10 @@ impl TermInfoDeltaEncoder { } } + pub fn term_info(&self) -> &TermInfo { + &self.term_info + } + pub fn encode(&mut self, term_info: TermInfo) -> DeltaTermInfo { let mut delta_term_info = DeltaTermInfo { doc_freq: term_info.doc_freq, @@ -102,14 +107,28 @@ pub struct TermInfoDeltaDecoder { has_positions: bool, } + impl TermInfoDeltaDecoder { - pub fn new(has_positions: bool) -> TermInfoDeltaDecoder { + + pub fn from_term_info(term_info: TermInfo, has_positions: bool) -> TermInfoDeltaDecoder { TermInfoDeltaDecoder { - term_info: TermInfo::default(), + term_info: term_info, has_positions: has_positions, } } + pub fn from_checkpoint(checkpoint: &CheckPoint, has_positions: bool) -> TermInfoDeltaDecoder { + TermInfoDeltaDecoder { + term_info: TermInfo { + doc_freq: 0u32, + postings_offset: checkpoint.postings_offset, + positions_offset: checkpoint.positions_offset, + positions_inner_offset: 0u8, + }, + has_positions: has_positions + } + } + pub fn decode(&mut self, code: u8, cursor: &mut &[u8]) { let num_bytes_docfreq: usize = ((code >> 1) & 3) as usize; let num_bytes_postings_offset: usize = ((code >> 3) & 3) as usize; diff --git a/src/termdict/streamdict/mod.rs b/src/termdict/streamdict/mod.rs index faf9c13fd..f9c01529e 100644 --- a/src/termdict/streamdict/mod.rs +++ b/src/termdict/streamdict/mod.rs @@ -1,7 +1,11 @@ +use std::io::{self, Write, Read}; +use common::BinarySerializable; + mod termdict; mod streamer; mod delta_encoder; + pub use self::delta_encoder::{TermDeltaEncoder, TermDeltaDecoder}; pub use self::delta_encoder::{TermInfoDeltaEncoder, TermInfoDeltaDecoder, DeltaTermInfo}; @@ -10,3 +14,30 @@ pub use self::termdict::TermDictionaryBuilderImpl; pub use self::streamer::TermStreamerImpl; pub use self::streamer::TermStreamerBuilderImpl; +#[derive(Debug)] +pub struct CheckPoint { + pub stream_offset: u32, + pub postings_offset: u32, + pub positions_offset: u32, +} + +impl BinarySerializable for CheckPoint { + + fn serialize(&self, writer: &mut W) -> io::Result<()> { + self.stream_offset.serialize(writer)?; + self.postings_offset.serialize(writer)?; + self.positions_offset.serialize(writer)?; + Ok(()) + } + + fn deserialize(reader: &mut R) -> io::Result { + let stream_offset = u32::deserialize(reader)?; + let postings_offset = u32::deserialize(reader)?; + let positions_offset = u32::deserialize(reader)?; + Ok(CheckPoint { + stream_offset: stream_offset, + postings_offset: postings_offset, + positions_offset: positions_offset, + }) + } +} \ No newline at end of file diff --git a/src/termdict/streamdict/streamer.rs b/src/termdict/streamdict/streamer.rs index 4779a65c5..eacffe5dc 100644 --- a/src/termdict/streamdict/streamer.rs +++ b/src/termdict/streamdict/streamer.rs @@ -7,17 +7,19 @@ use postings::TermInfo; use 
common::BinarySerializable; use super::delta_encoder::{TermInfoDeltaDecoder, TermDeltaDecoder}; + fn stream_before<'a>(term_dictionary: &'a TermDictionaryImpl, target_key: &[u8], has_positions: bool) -> TermStreamerImpl<'a> { - let (prev_key, offset) = term_dictionary.strictly_previous_key(target_key.as_ref()); - let offset: usize = offset as usize; + + let (prev_key, checkpoint) = term_dictionary.strictly_previous_key(target_key.as_ref()); + let stream_data: &'a [u8] = &term_dictionary.stream_data()[checkpoint.stream_offset as usize..]; TermStreamerImpl { - cursor: &term_dictionary.stream_data()[offset..], + cursor: stream_data, term_delta_decoder: TermDeltaDecoder::with_previous_term(prev_key), - term_info_decoder: TermInfoDeltaDecoder::new(has_positions), // TODO checkpoint + term_info_decoder: TermInfoDeltaDecoder::from_checkpoint(&checkpoint, has_positions), } } @@ -30,6 +32,7 @@ pub struct TermStreamerBuilderImpl<'a> offset_from: usize, offset_to: usize, current_key: Vec, + term_info: TermInfo, has_positions: bool, } @@ -42,8 +45,9 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> let target_key = bound.as_ref(); let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.has_positions); let smaller_than = |k: &[u8]| k.lt(target_key); - let (offset_before, current_key) = get_offset(smaller_than, streamer); + let (offset_before, current_key, term_info) = get_offset(smaller_than, streamer); self.current_key = current_key; + self.term_info = term_info; self.offset_from = offset_before - self.origin; self } @@ -53,8 +57,9 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> let target_key = bound.as_ref(); let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.has_positions); let smaller_than = |k: &[u8]| k.le(target_key); - let (offset_before, current_key) = get_offset(smaller_than, streamer); + let (offset_before, current_key, term_info) = get_offset(smaller_than, streamer); self.current_key = current_key; + self.term_info = term_info; self.offset_from = offset_before - self.origin; self } @@ -64,7 +69,7 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> let target_key = bound.as_ref(); let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.has_positions); let smaller_than = |k: &[u8]| k.lt(target_key); - let (offset_before, _) = get_offset(smaller_than, streamer); + let (offset_before, _, _) = get_offset(smaller_than, streamer); self.offset_to = offset_before - self.origin; self } @@ -74,7 +79,7 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> let target_key = bound.as_ref(); let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.has_positions); let smaller_than = |k: &[u8]| k.le(target_key); - let (offset_before, _) = get_offset(smaller_than, streamer); + let (offset_before, _, _) = get_offset(smaller_than, streamer); self.offset_to = offset_before - self.origin; self } @@ -87,7 +92,7 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> TermStreamerImpl { cursor: &data[start..stop], term_delta_decoder: TermDeltaDecoder::with_previous_term(self.current_key), - term_info_decoder: TermInfoDeltaDecoder::new(self.has_positions), // TODO checkpoint + term_info_decoder: TermInfoDeltaDecoder::from_term_info(self.term_info, self.has_positions), // TODO checkpoint } } } @@ -101,21 +106,23 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> /// - the term_buffer state to initialize the block) fn get_offset<'a, P: Fn(&[u8]) -> 
bool>(predicate: P, mut streamer: TermStreamerImpl<'a>) - -> (usize, Vec) + -> (usize, Vec, TermInfo) { let mut prev: &[u8] = streamer.cursor; + let mut term_info = streamer.value().clone(); let mut prev_data: Vec = Vec::from(streamer.term_delta_decoder.term()); - while let Some((iter_key, _)) = streamer.next() { + while let Some((iter_key, iter_term_info)) = streamer.next() { if !predicate(iter_key.as_ref()) { - return (prev.as_ptr() as usize, prev_data); + return (prev.as_ptr() as usize, prev_data, term_info); } prev = streamer.cursor; prev_data.clear(); prev_data.extend_from_slice(iter_key.as_ref()); + term_info = iter_term_info.clone(); } - (prev.as_ptr() as usize, prev_data) + (prev.as_ptr() as usize, prev_data, term_info) } impl<'a> TermStreamerBuilderImpl<'a> @@ -127,6 +134,7 @@ impl<'a> TermStreamerBuilderImpl<'a> let origin = data.as_ptr() as usize; TermStreamerBuilderImpl { term_dictionary: term_dictionary, + term_info: TermInfo::default(), origin: origin, offset_from: 0, offset_to: data.len(), diff --git a/src/termdict/streamdict/termdict.rs b/src/termdict/streamdict/termdict.rs index e5487b5f3..c2bdbbe68 100644 --- a/src/termdict/streamdict/termdict.rs +++ b/src/termdict/streamdict/termdict.rs @@ -1,6 +1,7 @@ #![allow(should_implement_trait)] use std::io::{self, Write}; +use super::CheckPoint; use fst; use fst::raw::Fst; @@ -30,9 +31,9 @@ fn has_positions(field_type: &FieldType) -> bool { if indexing_options.is_position_enabled() { true } - else { - false - } + else { + false + } } _ => { false @@ -47,6 +48,7 @@ pub struct TermDictionaryBuilderImpl term_delta_encoder: TermDeltaEncoder, term_info_encoder: TermInfoDeltaEncoder, block_index: fst::MapBuilder>, + checkpoints: Vec, len: usize, } @@ -62,9 +64,20 @@ impl TermDictionaryBuilderImpl where W: Write { fn add_index_entry(&mut self) { + let stream_offset = self.write.written_bytes() as u32; + let term_info = self.term_info_encoder.term_info(); + let postings_offset = term_info.postings_offset as u32; + let positions_offset = term_info.positions_offset as u32; + let checkpoint = CheckPoint { + stream_offset: stream_offset, + postings_offset: postings_offset, + positions_offset: positions_offset, + }; self.block_index - .insert(&self.term_delta_encoder.term(), self.write.written_bytes() as u64) - .unwrap(); + .insert(&self.term_delta_encoder.term(), self.checkpoints.len() as u64) + .expect("Serializing fst on a Vec should never fail. Where your terms not in order maybe?"); + checkpoint.serialize(&mut self.checkpoints) + .expect("Serializing checkpoint on a Vec should never fail."); } /// # Warning @@ -156,6 +169,7 @@ impl TermDictionaryBuilder for TermDictionaryBuilderImpl term_delta_encoder: TermDeltaEncoder::default(), term_info_encoder: TermInfoDeltaEncoder::new(has_positions), block_index: fst::MapBuilder::new(vec![]).expect("This cannot fail"), + checkpoints: vec!(), len: 0, }) } @@ -173,12 +187,16 @@ impl TermDictionaryBuilder for TermDictionaryBuilderImpl /// Finalize writing the builder, and returns the underlying /// `Write` object. 
fn finish(mut self) -> io::Result { + self.add_index_entry(); self.write.write_all(&[0u8; PADDING_SIZE])?; - // self.add_index_entry(); - let (mut w, split_len) = self.write.finish()?; + let fst_addr = self.write.written_bytes(); let fst_write = self.block_index.into_inner().map_err(convert_fst_error)?; - w.write_all(&fst_write)?; - (split_len as u64).serialize(&mut w)?; + self.write.write_all(&fst_write)?; + let check_points_addr = self.write.written_bytes(); + let (mut w, _) = self.write.finish()?; + w.write_all(&self.checkpoints)?; + (fst_addr as u64).serialize(&mut w)?; + (check_points_addr as u64).serialize(&mut w)?; w.flush()?; Ok(w) } @@ -204,6 +222,7 @@ pub struct TermDictionaryImpl { stream_data: ReadOnlySource, fst_index: fst::Map, + checkpoints_data: ReadOnlySource, has_positions: bool, } @@ -213,7 +232,15 @@ impl TermDictionaryImpl self.stream_data.as_slice() } - pub(crate) fn strictly_previous_key(&self, key: &[u8]) -> (Vec, u64) { + pub(crate) fn strictly_previous_key(&self, key: &[u8]) -> (Vec, CheckPoint) { + let (term, checkpoint_offset) = self.strictly_previous_key_checkpoint_offset(key); + let mut checkpoint_data = &self.checkpoints_data.as_slice()[checkpoint_offset..]; + let checkpoint = CheckPoint::deserialize(&mut checkpoint_data) + .expect("Checkpoint data is corrupted"); + (term, checkpoint) + } + + fn strictly_previous_key_checkpoint_offset(&self, key: &[u8]) -> (Vec, usize) { let fst_map = &self.fst_index; let fst = fst_map.as_fst(); let mut node = fst.root(); @@ -246,12 +273,12 @@ impl TermDictionaryImpl result.push(last_transition.inp); let fork_node = fst.node(last_transition.addr); fill_last(fst, fork_node, &mut result); - let val = fst_map.get(&result).unwrap(); + let val = fst_map.get(&result).expect("Fst data corrupted") as usize; return (result, val); } else if cur_node.is_final() { // the previous key is a prefix let result_buffer = Vec::from(&key[..i]); - let val = fst_map.get(&result_buffer).unwrap(); + let val = fst_map.get(&result_buffer).expect("Fst data corrupted") as usize; return (result_buffer, val); } } @@ -273,19 +300,22 @@ impl<'a> TermDictionary<'a> for TermDictionaryImpl source = source.slice_from(1); let total_len = source.len(); - let length_offset = total_len - 8; - let split_len: usize = { - let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..]; - u64::deserialize(&mut split_len_buffer)? as usize - }; - let stream_data = source.slice(0, split_len); - let fst_data = source.slice(split_len, length_offset); + let (body, footer) = source.split(total_len - 16); + + let mut footer_buffer: &[u8] = footer.as_slice(); + let fst_addr: usize = u64::deserialize(&mut footer_buffer)? as usize; + let checkpoints_addr: usize = u64::deserialize(&mut footer_buffer)? as usize; + + let stream_data = body.slice(0, fst_addr - PADDING_SIZE); + let fst_data = body.slice(fst_addr, checkpoints_addr); + let checkpoints_data = body.slice_from(checkpoints_addr); + let fst_index = open_fst_index(fst_data)?; - let len_without_padding = stream_data.len() - PADDING_SIZE; Ok(TermDictionaryImpl { has_positions: has_positions, - stream_data: stream_data.slice(0, len_without_padding), + stream_data: stream_data, + checkpoints_data: checkpoints_data, fst_index: fst_index, }) } From fc25516b7a33d863d312cb100266757508a829a2 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 28 Aug 2017 11:10:29 +0900 Subject: [PATCH 27/29] Added unit test. 
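
The new test covers terms whose common prefix or suffix no longer fits in a
single nibble. As a rough sketch (hypothetical helper names, not code from
this patch), the packed form used when both lengths are below 16, and the
fallback to two u32 lengths, work like this:

    // Minimal sketch of the one-byte prefix/suffix length packing used by
    // write_term_kv; pack_lens/unpack_lens are illustrative names only.
    fn pack_lens(prefix_len: usize, suffix_len: usize) -> u8 {
        // Only valid when both lengths fit in 4 bits.
        debug_assert!(prefix_len < 16 && suffix_len < 16);
        (prefix_len as u8) | ((suffix_len as u8) << 4)
    }

    fn unpack_lens(b: u8) -> (usize, usize) {
        ((b & 15) as usize, (b >> 4) as usize)
    }

    fn main() {
        assert_eq!(unpack_lens(pack_lens(3, 11)), (3, 11));
        // A term like "abcdefghijklmnopqrstuvwxyz" produces a suffix longer
        // than 15 bytes, so the encoder falls back to writing the two
        // lengths as u32 values instead of a single packed byte.
    }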
--- Cargo.toml | 2 +- src/common/composite_file.rs | 4 +- src/compression/mod.rs | 6 +- .../pack/compression_pack_nosimd.rs | 22 +++---- src/compression/pack/compression_pack_simd.rs | 8 +-- src/compression/stream.rs | 18 +++--- src/directory/read_only_source.rs | 12 ++++ src/postings/segment_postings.rs | 18 +++--- src/postings/serializer.rs | 14 ++--- src/termdict/mod.rs | 28 ++++++++- src/termdict/streamdict/delta_encoder.rs | 62 ++++++++++++------- src/termdict/streamdict/streamer.rs | 27 ++------ src/termdict/streamdict/termdict.rs | 2 +- 13 files changed, 131 insertions(+), 92 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 11711a088..845f1d31d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,7 +61,7 @@ debug-assertions = false [features] -default = ["simdcompression", "streamdict"] +default = ["simdcompression"] simdcompression = ["libc", "gcc"] streamdict = [] diff --git a/src/common/composite_file.rs b/src/common/composite_file.rs index 60f9286df..db9b8ba31 100644 --- a/src/common/composite_file.rs +++ b/src/common/composite_file.rs @@ -64,7 +64,7 @@ pub struct CompositeFile { impl CompositeFile { pub fn open(data: ReadOnlySource) -> io::Result { let end = data.len(); - let footer_len_data = data.slice(end - 4, end); + let footer_len_data = data.slice_from(end - 4); let footer_len = u32::deserialize(&mut footer_len_data.as_slice())? as usize; let footer_start = end - 4 - footer_len; @@ -93,7 +93,7 @@ impl CompositeFile { } Ok(CompositeFile { - data: data.slice(0, footer_start), + data: data.slice_to(footer_start), offsets_index: field_index, }) } diff --git a/src/compression/mod.rs b/src/compression/mod.rs index 43096622c..a6bb0eb17 100644 --- a/src/compression/mod.rs +++ b/src/compression/mod.rs @@ -83,7 +83,7 @@ impl VIntDecoder for BlockDecoder { } -pub const NUM_DOCS_PER_BLOCK: usize = 128; //< should be a power of 2 to let the compiler optimize. 
+pub const COMPRESSION_BLOCK_SIZE: usize = 128; #[cfg(test)] pub mod tests { @@ -186,14 +186,14 @@ pub mod tests { #[bench] fn bench_compress(b: &mut Bencher) { let mut encoder = BlockEncoder::new(); - let data = tests::generate_array(NUM_DOCS_PER_BLOCK, 0.1); + let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1); b.iter(|| { encoder.compress_block_sorted(&data, 0u32); }); } #[bench] fn bench_uncompress(b: &mut Bencher) { let mut encoder = BlockEncoder::new(); - let data = tests::generate_array(NUM_DOCS_PER_BLOCK, 0.1); + let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1); let compressed = encoder.compress_block_sorted(&data, 0u32); let mut decoder = BlockDecoder::new(); b.iter(|| { decoder.uncompress_block_sorted(compressed, 0u32); }); diff --git a/src/compression/pack/compression_pack_nosimd.rs b/src/compression/pack/compression_pack_nosimd.rs index 1c15567e4..24379b9a4 100644 --- a/src/compression/pack/compression_pack_nosimd.rs +++ b/src/compression/pack/compression_pack_nosimd.rs @@ -2,15 +2,15 @@ use common::bitpacker::compute_num_bits; use common::bitpacker::{BitPacker, BitUnpacker}; use std::cmp; use std::io::Write; -use super::super::NUM_DOCS_PER_BLOCK; +use super::super::COMPRESSION_BLOCK_SIZE; -const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1; +const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1; pub fn compress_sorted(vals: &mut [u32], mut output: &mut [u8], offset: u32) -> usize { let mut max_delta = 0; { let mut local_offset = offset; - for i in 0..NUM_DOCS_PER_BLOCK { + for i in 0..COMPRESSION_BLOCK_SIZE { let val = vals[i]; let delta = val - local_offset; max_delta = cmp::max(max_delta, delta); @@ -35,7 +35,7 @@ pub fn compress_sorted(vals: &mut [u32], mut output: &mut [u8], offset: u32) -> pub struct BlockEncoder { pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE], pub output_len: usize, - input_buffer: [u32; NUM_DOCS_PER_BLOCK], + input_buffer: [u32; COMPRESSION_BLOCK_SIZE], } impl BlockEncoder { @@ -43,7 +43,7 @@ impl BlockEncoder { BlockEncoder { output: [0u8; COMPRESSED_BLOCK_MAX_SIZE], output_len: 0, - input_buffer: [0u32; NUM_DOCS_PER_BLOCK], + input_buffer: [0u32; COMPRESSION_BLOCK_SIZE], } } @@ -100,26 +100,26 @@ impl BlockDecoder { let consumed_size = { let num_bits = compressed_data[0]; let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize); - for i in 0..NUM_DOCS_PER_BLOCK { + for i in 0..COMPRESSION_BLOCK_SIZE { let delta = bit_unpacker.get(i); let val = offset + delta; self.output[i] = val; offset = val; } - 1 + (num_bits as usize * NUM_DOCS_PER_BLOCK + 7) / 8 + 1 + (num_bits as usize * COMPRESSION_BLOCK_SIZE + 7) / 8 }; - self.output_len = NUM_DOCS_PER_BLOCK; + self.output_len = COMPRESSION_BLOCK_SIZE; &compressed_data[consumed_size..] } pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a [u8] { let num_bits = compressed_data[0]; let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize); - for i in 0..NUM_DOCS_PER_BLOCK { + for i in 0..COMPRESSION_BLOCK_SIZE { self.output[i] = bit_unpacker.get(i); } - let consumed_size = 1 + (num_bits as usize * NUM_DOCS_PER_BLOCK + 7) / 8; - self.output_len = NUM_DOCS_PER_BLOCK; + let consumed_size = 1 + (num_bits as usize * COMPRESSION_BLOCK_SIZE + 7) / 8; + self.output_len = COMPRESSION_BLOCK_SIZE; &compressed_data[consumed_size..] 
} diff --git a/src/compression/pack/compression_pack_simd.rs b/src/compression/pack/compression_pack_simd.rs index c430a728f..d24d0f65b 100644 --- a/src/compression/pack/compression_pack_simd.rs +++ b/src/compression/pack/compression_pack_simd.rs @@ -1,6 +1,6 @@ -use super::super::NUM_DOCS_PER_BLOCK; +use super::super::COMPRESSION_BLOCK_SIZE; -const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1; +const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1; mod simdcomp { use libc::size_t; @@ -83,13 +83,13 @@ impl BlockDecoder { offset: u32) -> usize { let consumed_size = uncompress_sorted(compressed_data, &mut self.output, offset); - self.output_len = NUM_DOCS_PER_BLOCK; + self.output_len = COMPRESSION_BLOCK_SIZE; consumed_size } pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize { let consumed_size = uncompress_unsorted(compressed_data, &mut self.output); - self.output_len = NUM_DOCS_PER_BLOCK; + self.output_len = COMPRESSION_BLOCK_SIZE; consumed_size } diff --git a/src/compression/stream.rs b/src/compression/stream.rs index de902da85..9829fe20a 100644 --- a/src/compression/stream.rs +++ b/src/compression/stream.rs @@ -1,5 +1,5 @@ use compression::BlockDecoder; -use compression::NUM_DOCS_PER_BLOCK; +use compression::COMPRESSION_BLOCK_SIZE; use compression::compressed_block_size; use directory::{ReadOnlySource, SourceRead}; @@ -14,7 +14,7 @@ impl CompressedIntStream { CompressedIntStream { buffer: SourceRead::from(source), block_decoder: BlockDecoder::new(), - inner_offset: NUM_DOCS_PER_BLOCK, + inner_offset: COMPRESSION_BLOCK_SIZE, } } @@ -22,7 +22,7 @@ impl CompressedIntStream { let mut num_els: usize = output.len(); let mut start: usize = 0; loop { - let available = NUM_DOCS_PER_BLOCK - self.inner_offset; + let available = COMPRESSION_BLOCK_SIZE - self.inner_offset; if num_els >= available { if available > 0 { let uncompressed_block = &self.block_decoder.output_array()[self.inner_offset..]; @@ -44,15 +44,15 @@ impl CompressedIntStream { } pub fn skip(&mut self, mut skip_len: usize) { - let available = NUM_DOCS_PER_BLOCK - self.inner_offset; + let available = COMPRESSION_BLOCK_SIZE - self.inner_offset; if available >= skip_len { self.inner_offset += skip_len; } else { skip_len -= available; // entirely skip decompressing some blocks. 
- while skip_len >= NUM_DOCS_PER_BLOCK { - skip_len -= NUM_DOCS_PER_BLOCK; + while skip_len >= COMPRESSION_BLOCK_SIZE { + skip_len -= COMPRESSION_BLOCK_SIZE; let num_bits: u8 = self.buffer.as_ref()[0]; let block_len = compressed_block_size(num_bits); self.buffer.advance(block_len); @@ -70,7 +70,7 @@ pub mod tests { use super::CompressedIntStream; use compression::compressed_block_size; - use compression::NUM_DOCS_PER_BLOCK; + use compression::COMPRESSION_BLOCK_SIZE; use compression::BlockEncoder; use directory::ReadOnlySource; @@ -78,7 +78,7 @@ pub mod tests { let mut buffer: Vec = vec!(); let mut encoder = BlockEncoder::new(); let vals: Vec = (0u32..1_025u32).collect(); - for chunk in vals.chunks(NUM_DOCS_PER_BLOCK) { + for chunk in vals.chunks(COMPRESSION_BLOCK_SIZE) { let compressed_block = encoder.compress_block_unsorted(chunk); let num_bits = compressed_block[0]; assert_eq!(compressed_block_size(num_bits), compressed_block.len()); @@ -91,7 +91,7 @@ pub mod tests { fn test_compressed_int_stream() { let buffer = create_stream_buffer(); let mut stream = CompressedIntStream::wrap(buffer); - let mut block: [u32; NUM_DOCS_PER_BLOCK] = [0u32; NUM_DOCS_PER_BLOCK]; + let mut block: [u32; COMPRESSION_BLOCK_SIZE] = [0u32; COMPRESSION_BLOCK_SIZE]; stream.read(&mut block[0..2]); assert_eq!(block[0], 0); diff --git a/src/directory/read_only_source.rs b/src/directory/read_only_source.rs index 8ddec278b..32423ff96 100644 --- a/src/directory/read_only_source.rs +++ b/src/directory/read_only_source.rs @@ -43,6 +43,8 @@ impl ReadOnlySource { } } + /// Splits into 2 `ReadOnlySource`, at the offset given + /// as an argument. pub fn split(self, addr: usize) -> (ReadOnlySource, ReadOnlySource) { let left = self.slice(0, addr); let right = self.slice_from(addr); @@ -73,10 +75,20 @@ impl ReadOnlySource { /// Like `.slice(...)` but enforcing only the `from` /// boundary. + /// + /// Equivalent to `.slice(from_offset, self.len())` pub fn slice_from(&self, from_offset: usize) -> ReadOnlySource { let len = self.len(); self.slice(from_offset, len) } + + /// Like `.slice(...)` but enforcing only the `to` + /// boundary. 
+ /// + /// Equivalent to `.slice(0, to_offset)` + pub fn slice_to(&self, to_offset: usize) -> ReadOnlySource { + self.slice(0, to_offset) + } } impl HasLen for ReadOnlySource { diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 0866a5fe5..7d2bab7f8 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -1,4 +1,4 @@ -use compression::{NUM_DOCS_PER_BLOCK, BlockDecoder, VIntDecoder, CompressedIntStream}; +use compression::{COMPRESSION_BLOCK_SIZE, BlockDecoder, VIntDecoder, CompressedIntStream}; use DocId; use postings::{Postings, DocSet, HasLen, SkipResult}; use std::cmp; @@ -92,7 +92,7 @@ impl SegmentPostings { }); SegmentPostings { block_cursor: segment_block_postings, - cur: NUM_DOCS_PER_BLOCK, // cursor within the block + cur: COMPRESSION_BLOCK_SIZE, // cursor within the block delete_bitset: delete_bitset, position_computer: position_computer, } @@ -104,7 +104,7 @@ impl SegmentPostings { SegmentPostings { block_cursor: empty_block_cursor, delete_bitset: DeleteBitSet::empty(), - cur: NUM_DOCS_PER_BLOCK, + cur: COMPRESSION_BLOCK_SIZE, position_computer: None, } } @@ -131,7 +131,7 @@ impl DocSet for SegmentPostings { if self.cur >= self.block_cursor.block_len() { self.cur = 0; if !self.block_cursor.advance() { - self.cur = NUM_DOCS_PER_BLOCK; + self.cur = COMPRESSION_BLOCK_SIZE; return false; } } @@ -315,8 +315,8 @@ impl BlockSegmentPostings { data: SourceRead, has_freq: bool) -> BlockSegmentPostings { - let num_binpacked_blocks: usize = (doc_freq as usize) / NUM_DOCS_PER_BLOCK; - let num_vint_docs = (doc_freq as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks; + let num_binpacked_blocks: usize = (doc_freq as usize) / COMPRESSION_BLOCK_SIZE; + let num_vint_docs = (doc_freq as usize) - COMPRESSION_BLOCK_SIZE * num_binpacked_blocks; BlockSegmentPostings { num_binpacked_blocks: num_binpacked_blocks, num_vint_docs: num_vint_docs, @@ -343,8 +343,8 @@ impl BlockSegmentPostings { // // This does not reset the positions list. pub(crate) fn reset(&mut self, doc_freq: usize, postings_data: SourceRead) { - let num_binpacked_blocks: usize = doc_freq / NUM_DOCS_PER_BLOCK; - let num_vint_docs = doc_freq & (NUM_DOCS_PER_BLOCK - 1); + let num_binpacked_blocks: usize = doc_freq / COMPRESSION_BLOCK_SIZE; + let num_vint_docs = doc_freq & (COMPRESSION_BLOCK_SIZE - 1); self.num_binpacked_blocks = num_binpacked_blocks; self.num_vint_docs = num_vint_docs; self.remaining_data = postings_data; @@ -414,7 +414,7 @@ impl BlockSegmentPostings { self.remaining_data.advance(num_consumed_bytes); } // it will be used as the next offset. 
- self.doc_offset = self.doc_decoder.output(NUM_DOCS_PER_BLOCK - 1); + self.doc_offset = self.doc_decoder.output(COMPRESSION_BLOCK_SIZE - 1); self.num_binpacked_blocks -= 1; true } else if self.num_vint_docs > 0 { diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 5c24256cc..14a22ccea 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -6,7 +6,7 @@ use schema::FieldEntry; use schema::FieldType; use schema::Schema; use directory::WritePtr; -use compression::{NUM_DOCS_PER_BLOCK, BlockEncoder}; +use compression::{COMPRESSION_BLOCK_SIZE, BlockEncoder}; use DocId; use core::Segment; use std::io::{self, Write}; @@ -264,7 +264,7 @@ impl PostingsSerializer { if self.termfreq_enabled { self.term_freqs.push(term_freq as u32); } - if self.doc_ids.len() == NUM_DOCS_PER_BLOCK { + if self.doc_ids.len() == COMPRESSION_BLOCK_SIZE { { // encode the doc ids let block_encoded: &[u8] = @@ -336,7 +336,7 @@ struct PositionSerializer { impl PositionSerializer { fn new(write: W) -> PositionSerializer { PositionSerializer { - buffer: Vec::with_capacity(NUM_DOCS_PER_BLOCK), + buffer: Vec::with_capacity(COMPRESSION_BLOCK_SIZE), write: CountingWriter::wrap(write), block_encoder: BlockEncoder::new(), } @@ -347,7 +347,7 @@ impl PositionSerializer { } fn write_block(&mut self) -> io::Result<()> { - assert_eq!(self.buffer.len(), NUM_DOCS_PER_BLOCK); + assert_eq!(self.buffer.len(), COMPRESSION_BLOCK_SIZE); let block_compressed: &[u8] = self.block_encoder.compress_block_unsorted(&self.buffer); self.write.write_all(block_compressed)?; self.buffer.clear(); @@ -356,8 +356,8 @@ impl PositionSerializer { fn write(&mut self, mut vals: &[u32]) -> io::Result<()> { let mut buffer_len = self.buffer.len(); - while vals.len() + buffer_len >= NUM_DOCS_PER_BLOCK { - let len_to_completion = NUM_DOCS_PER_BLOCK - buffer_len; + while vals.len() + buffer_len >= COMPRESSION_BLOCK_SIZE { + let len_to_completion = COMPRESSION_BLOCK_SIZE - buffer_len; self.buffer.extend_from_slice(&vals[..len_to_completion]); self.write_block()?; vals = &vals[len_to_completion..]; @@ -368,7 +368,7 @@ impl PositionSerializer { } fn close(mut self) -> io::Result<()> { - self.buffer.resize(NUM_DOCS_PER_BLOCK, 0u32); + self.buffer.resize(COMPRESSION_BLOCK_SIZE, 0u32); self.write_block()?; self.write.flush() } diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index 5930eeaaf..171a70c2f 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -338,9 +338,35 @@ mod tests { term_dictionary.get(key.as_bytes()); } + + #[test] + fn test_stream_high_range_prefix_suffix() { + let field_type = FieldType::Str(TEXT); + let buffer: Vec = { + let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap(); + // term requires more than 16bits + term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(1)).unwrap(); + term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2)).unwrap(); + term_dictionary_builder.insert("abr", &make_term_info(2)).unwrap(); + term_dictionary_builder.finish().unwrap() + }; + let source = ReadOnlySource::from(buffer); + let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source) + .unwrap(); + let mut kv_stream = term_dictionary.stream(); + assert!(kv_stream.advance()); + assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxyz".as_bytes()); + assert_eq!(kv_stream.value(), &make_term_info(1)); + assert!(kv_stream.advance()); + assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxyz".as_bytes()); + 
assert_eq!(kv_stream.value(), &make_term_info(2)); + assert!(kv_stream.advance()); + assert_eq!(kv_stream.key(), "abr".as_bytes()); + assert!(!kv_stream.advance()); + } + #[test] fn test_stream_range() { -// let ids: Vec<_> = (0u32..10_000u32) let ids: Vec<_> = (0u32..10_000u32) .map(|i| (format!("doc{:0>6}", i), i)) .collect(); diff --git a/src/termdict/streamdict/delta_encoder.rs b/src/termdict/streamdict/delta_encoder.rs index 1666e0161..9a2a4173e 100644 --- a/src/termdict/streamdict/delta_encoder.rs +++ b/src/termdict/streamdict/delta_encoder.rs @@ -1,6 +1,7 @@ use postings::TermInfo; use super::CheckPoint; use std::mem; +use common::BinarySerializable; /// Returns the len of the longest /// common prefix of `s1` and `s2`. @@ -49,9 +50,24 @@ impl TermDeltaDecoder { } } - pub fn decode(&mut self, prefix_len: usize, suffix: &[u8]) { - self.term.truncate(prefix_len); - self.term.extend_from_slice(suffix); + #[inline(always)] + pub fn decode<'a>(&mut self, code: u8, mut cursor: &'a [u8]) -> &'a [u8] { + let (prefix_len, suffix_len): (usize, usize) = + if (code & 1u8) == 1u8 { + let b = cursor[0]; + cursor = &cursor[1..]; + let prefix_len = (b & 15u8) as usize; + let suffix_len = (b >> 4u8) as usize; + (prefix_len, suffix_len) + } + else { + let prefix_len = u32::deserialize(&mut cursor).unwrap(); + let suffix_len = u32::deserialize(&mut cursor).unwrap(); + (prefix_len as usize, suffix_len as usize) + }; + unsafe { self.term.set_len(prefix_len) }; + self.term.extend_from_slice(&(*cursor)[..suffix_len]); + &cursor[suffix_len..] } pub fn term(&self) -> &[u8] { @@ -108,6 +124,12 @@ pub struct TermInfoDeltaDecoder { } +#[inline(always)] +pub fn make_mask(num_bytes: usize) -> u32 { + const MASK: [u32; 4] = [0xffu32, 0xffffu32, 0xffffffu32, 0xffffffffu32]; + *unsafe { MASK.get_unchecked(num_bytes.wrapping_sub(1) as usize) } +} + impl TermInfoDeltaDecoder { pub fn from_term_info(term_info: TermInfo, has_positions: bool) -> TermInfoDeltaDecoder { @@ -129,27 +151,26 @@ impl TermInfoDeltaDecoder { } } - pub fn decode(&mut self, code: u8, cursor: &mut &[u8]) { - let num_bytes_docfreq: usize = ((code >> 1) & 3) as usize; - let num_bytes_postings_offset: usize = ((code >> 3) & 3) as usize; - const MASK: [u32; 4] = [ - 0xffu32, - 0xffffu32, - 0xffffffu32, - 0xffffffffu32, - ]; - let doc_freq: u32 = unsafe { *(cursor.as_ptr() as *const u32) } & MASK[num_bytes_docfreq]; - *cursor = &cursor[num_bytes_docfreq + 1 ..]; - let delta_postings_offset: u32 = unsafe { *(cursor.as_ptr() as *const u32) } & MASK[num_bytes_postings_offset]; - *cursor = &cursor[num_bytes_postings_offset + 1..]; + #[inline(always)] + pub fn decode<'a>(&mut self, code: u8, mut cursor: &'a [u8]) -> &'a [u8] { + let num_bytes_docfreq: usize = ((code >> 1) & 3) as usize + 1; + let num_bytes_postings_offset: usize = ((code >> 3) & 3) as usize + 1; + let mut v: u64 = unsafe { *(cursor.as_ptr() as *const u64) }; + let doc_freq: u32 = (v as u32) & make_mask(num_bytes_docfreq); + v >>= (num_bytes_docfreq as u64) * 8u64; + let delta_postings_offset: u32 = (v as u32) & make_mask(num_bytes_postings_offset); + cursor = &cursor[num_bytes_docfreq + num_bytes_postings_offset..]; self.term_info.doc_freq = doc_freq; self.term_info.postings_offset += delta_postings_offset; if self.has_positions { - let num_bytes_positions_offset = ((code >> 5) & 3) as usize; - let delta_positions_offset: u32 = unsafe { *(cursor.as_ptr() as *const u32) } & MASK[num_bytes_positions_offset]; + let num_bytes_positions_offset = ((code >> 5) & 3) as usize + 1; + let 
delta_positions_offset: u32 = unsafe { *(cursor.as_ptr() as *const u32) } & make_mask(num_bytes_positions_offset); self.term_info.positions_offset += delta_positions_offset; - self.term_info.positions_inner_offset = cursor[num_bytes_positions_offset + 1]; - *cursor = &cursor[num_bytes_positions_offset + 2..]; + self.term_info.positions_inner_offset = cursor[num_bytes_positions_offset]; + &cursor[num_bytes_positions_offset + 1..] + } + else { + cursor } } @@ -157,4 +178,3 @@ impl TermInfoDeltaDecoder { &self.term_info } } - diff --git a/src/termdict/streamdict/streamer.rs b/src/termdict/streamdict/streamer.rs index eacffe5dc..dcb4b8bdb 100644 --- a/src/termdict/streamdict/streamer.rs +++ b/src/termdict/streamdict/streamer.rs @@ -4,7 +4,6 @@ use std::cmp::max; use super::TermDictionaryImpl; use termdict::{TermStreamerBuilder, TermStreamer}; use postings::TermInfo; -use common::BinarySerializable; use super::delta_encoder::{TermInfoDeltaDecoder, TermDeltaDecoder}; @@ -163,28 +162,10 @@ impl<'a> TermStreamer for TermStreamerImpl<'a> if self.cursor.is_empty() { return false; } - let code: u8 = self.cursor[0]; - let mut cursor: &[u8] = &self.cursor[1..]; - - let prefix_suffix_packed = (code & 1u8) == 1u8; - let (prefix_len, suffix_len): (usize, usize) = - if prefix_suffix_packed { - let b = cursor[0]; - cursor = &cursor[1..]; - let prefix_len = (b & 15u8) as usize; - let suffix_len = (b >> 4u8) as usize; - (prefix_len, suffix_len) - } - else { - let prefix_len = u32::deserialize(&mut cursor).unwrap(); - let suffix_len = u32::deserialize(&mut cursor).unwrap(); - (prefix_len as usize, suffix_len as usize) - }; - - let suffix = &cursor[..suffix_len]; - self.term_delta_decoder.decode(prefix_len, suffix); - cursor = &cursor[suffix_len..]; - self.term_info_decoder.decode(code, &mut cursor); + let mut cursor: &[u8] = &self.cursor; + let code: u8 = cursor[0]; + cursor = self.term_delta_decoder.decode(code, &cursor[1..]); + cursor = self.term_info_decoder.decode(code, cursor); self.cursor = cursor; true } diff --git a/src/termdict/streamdict/termdict.rs b/src/termdict/streamdict/termdict.rs index c2bdbbe68..bf4d899fd 100644 --- a/src/termdict/streamdict/termdict.rs +++ b/src/termdict/streamdict/termdict.rs @@ -17,7 +17,7 @@ use super::{TermStreamerImpl, TermStreamerBuilderImpl}; use termdict::TermStreamerBuilder; use std::mem::transmute; -const PADDING_SIZE: usize = 16; +const PADDING_SIZE: usize = 4; const INDEX_INTERVAL: usize = 1024; fn convert_fst_error(e: fst::Error) -> io::Error { From 8d05b8f7b217e877622524e9507bc7f00c82a476 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 28 Aug 2017 16:47:06 +0900 Subject: [PATCH 28/29] Added comments. 
Renamed field reader --- src/common/composite_file.rs | 25 ++++++++++- src/compression/mod.rs | 42 +++++++++++++++++-- src/compression/stream.rs | 15 +++++++ .../vint/compression_vint_nosimd.rs | 8 ++-- src/compression/vint/compression_vint_simd.rs | 9 ++-- ...eld_reader.rs => inverted_index_reader.rs} | 22 ++++++++-- src/core/mod.rs | 4 +- src/core/searcher.rs | 24 +++++------ src/core/segment_reader.rs | 23 +++++----- src/indexer/index_writer.rs | 4 +- src/indexer/merger.rs | 6 +-- src/lib.rs | 38 ++++++++--------- src/postings/mod.rs | 38 ++++++++--------- src/postings/segment_postings.rs | 18 ++++---- src/query/phrase_query/phrase_weight.rs | 4 +- src/query/term_query/term_weight.rs | 33 ++++++++------- src/termdict/fstdict/streamer.rs | 6 +-- src/termdict/fstdict/termdict.rs | 12 +++--- src/termdict/merger.rs | 4 ++ src/termdict/mod.rs | 4 +- 20 files changed, 214 insertions(+), 125 deletions(-) rename src/core/{field_reader.rs => inverted_index_reader.rs} (90%) diff --git a/src/common/composite_file.rs b/src/common/composite_file.rs index db9b8ba31..bc0d40786 100644 --- a/src/common/composite_file.rs +++ b/src/common/composite_file.rs @@ -9,13 +9,16 @@ use directory::ReadOnlySource; use common::BinarySerializable; - +/// A `CompositeWrite` is used to write a `CompositeFile`. pub struct CompositeWrite { write: CountingWriter, offsets: HashMap, } impl CompositeWrite { + + /// Crate a new API writer that writes a composite file + /// in a given write. pub fn wrap(w: W) -> CompositeWrite { CompositeWrite { write: CountingWriter::wrap(w), @@ -23,6 +26,7 @@ impl CompositeWrite { } } + /// Start writing a new field. pub fn for_field(&mut self, field: Field) -> &mut CountingWriter { let offset = self.write.written_bytes(); assert!(!self.offsets.contains_key(&field)); @@ -30,6 +34,11 @@ impl CompositeWrite { &mut self.write } + + /// Close the composite file. + /// + /// An index of the different field offsets + /// will be written as a footer. pub fn close(mut self) -> io::Result<()> { let footer_offset = self.write.written_bytes(); VInt(self.offsets.len() as u64).serialize(&mut self.write)?; @@ -55,6 +64,12 @@ impl CompositeWrite { } +/// A composite file is an abstraction to store a +/// file partitioned by field. +/// +/// The file needs to be written field by field. +/// A footer describes the start and stop offsets +/// for each field. #[derive(Clone)] pub struct CompositeFile { data: ReadOnlySource, @@ -62,6 +77,9 @@ pub struct CompositeFile { } impl CompositeFile { + + /// Opens a composite file stored in a given + /// `ReadOnlySource`. pub fn open(data: ReadOnlySource) -> io::Result { let end = data.len(); let footer_len_data = data.slice_from(end - 4); @@ -98,6 +116,8 @@ impl CompositeFile { }) } + /// Returns a composite file that stores + /// no fields. pub fn empty() -> CompositeFile { CompositeFile { offsets_index: HashMap::new(), @@ -105,7 +125,8 @@ impl CompositeFile { } } - + /// Returns the `ReadOnlySource` associated + /// to a given `Field` and stored in a `CompositeFile`. 
pub fn open_read(&self, field: Field) -> Option { self.offsets_index .get(&field) diff --git a/src/compression/mod.rs b/src/compression/mod.rs index a6bb0eb17..03750074b 100644 --- a/src/compression/mod.rs +++ b/src/compression/mod.rs @@ -8,13 +8,13 @@ pub use self::stream::CompressedIntStream; #[cfg(not(feature="simdcompression"))] mod pack { mod compression_pack_nosimd; - pub use self::compression_pack_nosimd::*; + pub use self::compression_pack_nosimd::{BlockEncoder, BlockDecoder}; } #[cfg(feature="simdcompression")] mod pack { mod compression_pack_simd; - pub use self::compression_pack_simd::*; + pub use self::compression_pack_simd::{BlockEncoder, BlockDecoder}; } pub use self::pack::{BlockEncoder, BlockDecoder}; @@ -22,13 +22,13 @@ pub use self::pack::{BlockEncoder, BlockDecoder}; #[cfg( any(not(feature="simdcompression"), target_env="msvc") )] mod vint { mod compression_vint_nosimd; - pub use self::compression_vint_nosimd::*; + pub(crate) use self::compression_vint_nosimd::*; } #[cfg( all(feature="simdcompression", not(target_env="msvc")) )] mod vint { mod compression_vint_simd; - pub use self::compression_vint_simd::*; + pub(crate) use self::compression_vint_simd::*; } /// Returns the size in bytes of a compressed block, given num_bits. @@ -37,16 +37,50 @@ pub fn compressed_block_size(num_bits: u8) -> usize { } pub trait VIntEncoder { + /// Compresses an array of `u32` integers, + /// using [delta-encoding](https://en.wikipedia.org/wiki/Delta_encoding) + /// and variable bytes encoding. + /// + /// The method takes an array of ints to compress, and returns + /// a `&[u8]` representing the compressed data. + /// + /// The method also takes an offset to give the value of the + /// hypothetical previous element in the delta-encoding. fn compress_vint_sorted(&mut self, input: &[u32], offset: u32) -> &[u8]; + + /// Compresses an array of `u32` integers, + /// using variable bytes encoding. + /// + /// The method takes an array of ints to compress, and returns + /// a `&[u8]` representing the compressed data. fn compress_vint_unsorted(&mut self, input: &[u32]) -> &[u8]; } pub trait VIntDecoder { + /// Uncompress an array of `u32` integers, + /// that were compressed using [delta-encoding](https://en.wikipedia.org/wiki/Delta_encoding) + /// and variable bytes encoding. + /// + /// The method takes a number of int to decompress, and returns + /// the amount of bytes that were read to decompress them. + /// + /// The method also takes an offset to give the value of the + /// hypothetical previous element in the delta-encoding. + /// + /// For instance, if delta encoded are `1, 3, 9`, and the + /// `offset` is 5, then the output will be: + /// `5 + 1 = 6, 6 + 3= 9, 9 + 9 = 18` fn uncompress_vint_sorted<'a>(&mut self, compressed_data: &'a [u8], offset: u32, num_els: usize) -> usize; + + /// Uncompress an array of `u32s`, compressed using variable + /// byte encoding. + /// + /// The method takes a number of int to decompress, and returns + /// the amount of bytes that were read to decompress them. fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) diff --git a/src/compression/stream.rs b/src/compression/stream.rs index 9829fe20a..a4c4422c5 100644 --- a/src/compression/stream.rs +++ b/src/compression/stream.rs @@ -3,6 +3,12 @@ use compression::COMPRESSION_BLOCK_SIZE; use compression::compressed_block_size; use directory::{ReadOnlySource, SourceRead}; +/// Reads a stream of compressed ints. 
+/// +/// Tantivy uses `CompressedIntStream` to read +/// the position file. +/// The `.skip(...)` makes it possible to avoid +/// decompressing blocks that are not required. pub struct CompressedIntStream { buffer: SourceRead, block_decoder: BlockDecoder, @@ -10,6 +16,8 @@ } impl CompressedIntStream { + + /// Opens a compressed int stream. pub(crate) fn wrap(source: ReadOnlySource) -> CompressedIntStream { CompressedIntStream { buffer: SourceRead::from(source), @@ -18,6 +26,8 @@ impl CompressedIntStream { } } + /// Fills a buffer with the next `output.len()` integers, + /// and advances the stream by that many elements. pub fn read(&mut self, output: &mut [u32]) { let mut num_els: usize = output.len(); let mut start: usize = 0; @@ -43,6 +53,11 @@ impl CompressedIntStream { } } + + /// Skips the next `skip_len` integers. + /// + /// If a full block is skipped, calling + /// `.skip(...)` will avoid decompressing it. pub fn skip(&mut self, mut skip_len: usize) { let available = COMPRESSION_BLOCK_SIZE - self.inner_offset; if available >= skip_len { diff --git a/src/compression/vint/compression_vint_nosimd.rs b/src/compression/vint/compression_vint_nosimd.rs index a3af5e489..4b5e6ec3d 100644 --- a/src/compression/vint/compression_vint_nosimd.rs +++ b/src/compression/vint/compression_vint_nosimd.rs @@ -1,6 +1,6 @@ #[inline(always)]
[u8]) -> &'a [u8] { let compress_length = unsafe { streamvbyte::streamvbyte_encode(input.as_ptr(), input.len() as u32, output.as_mut_ptr()) }; @@ -46,7 +47,7 @@ pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] { } #[inline(always)] -pub fn uncompress_sorted<'a>(compressed_data: &'a [u8], +pub(crate) fn uncompress_sorted<'a>(compressed_data: &'a [u8], output: &mut [u32], offset: u32) -> usize { @@ -59,7 +60,7 @@ pub fn uncompress_sorted<'a>(compressed_data: &'a [u8], } #[inline(always)] -pub fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> usize { +pub(crate) fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> usize { unsafe { streamvbyte::streamvbyte_decode(compressed_data.as_ptr(), output.as_mut_ptr(), output.len()) } diff --git a/src/core/field_reader.rs b/src/core/inverted_index_reader.rs similarity index 90% rename from src/core/field_reader.rs rename to src/core/inverted_index_reader.rs index ca0e95111..b44cc176c 100644 --- a/src/core/field_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -10,7 +10,21 @@ use fastfield::DeleteBitSet; use schema::Schema; use compression::CompressedIntStream; -pub struct FieldReader { + +/// The inverted index reader is in charge of accessing +/// the inverted index associated to a specific field. +/// +/// # Note +/// +/// It is safe to delete the segment associated to +/// an `InvertedIndexReader`. As long as it is open, +/// the `ReadOnlySource` it is relying on should +/// stay available. +/// +/// +/// `InvertedIndexReader` are created by calling +/// the `SegmentReader`'s [`.inverted_index(...)`] method +pub struct InvertedIndexReader { termdict: TermDictionaryImpl, postings_source: ReadOnlySource, positions_source: ReadOnlySource, @@ -18,7 +32,7 @@ pub struct FieldReader { schema: Schema, } -impl FieldReader { +impl InvertedIndexReader { pub(crate) fn new( termdict_source: ReadOnlySource, @@ -26,9 +40,9 @@ impl FieldReader { positions_source: ReadOnlySource, delete_bitset: DeleteBitSet, schema: Schema, - ) -> io::Result { + ) -> io::Result { - Ok(FieldReader { + Ok(InvertedIndexReader { termdict: TermDictionaryImpl::from_source(termdict_source)?, postings_source: postings_source, positions_source: positions_source, diff --git a/src/core/mod.rs b/src/core/mod.rs index bba1447ef..3a6c9568a 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -7,9 +7,9 @@ mod segment; mod index_meta; mod pool; mod segment_meta; -mod field_reader; +mod inverted_index_reader; -pub use self::field_reader::FieldReader; +pub use self::inverted_index_reader::InvertedIndexReader; pub use self::searcher::Searcher; pub use self::segment_component::SegmentComponent; pub use self::segment_id::SegmentId; diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 515b8fda6..5afdc4684 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -10,7 +10,7 @@ use schema::{Term, Field}; use termdict::{TermMerger, TermDictionary}; use std::sync::Arc; use std::fmt; -use core::FieldReader; +use core::InvertedIndexReader; /// Holds a list of `SegmentReader`s ready for search. @@ -48,7 +48,7 @@ impl Searcher { .iter() .map(|segment_reader| { segment_reader - .field_reader(term.field()) + .inverted_index(term.field()) .unwrap() // TODO error handling .doc_freq(term) }) @@ -71,15 +71,15 @@ impl Searcher { } - // This API may change in the future. 
+ /// pub fn field(&self, field: Field) -> Result { - let field_readers = self.segment_readers + let inv_index_readers = self.segment_readers .iter() .map(|segment_reader| { - segment_reader.field_reader(field) + segment_reader.inverted_index(field) }) .collect::>>()?; - Ok(FieldSearcher::new(field_readers)) + Ok(FieldSearcher::new(inv_index_readers)) } } @@ -87,15 +87,15 @@ impl Searcher { pub struct FieldSearcher { - field_readers: Vec>, + inv_index_readers: Vec>, } impl FieldSearcher { - fn new(field_readers: Vec>) -> FieldSearcher { + fn new(inv_index_readers: Vec>) -> FieldSearcher { FieldSearcher { - field_readers: field_readers, + inv_index_readers: inv_index_readers, } } @@ -103,10 +103,10 @@ impl FieldSearcher { /// Returns a Stream over all of the sorted unique terms of /// for the given field. pub fn terms(&self) -> TermMerger { - let term_streamers: Vec<_> = self.field_readers + let term_streamers: Vec<_> = self.inv_index_readers .iter() - .map(|field_reader| { - field_reader.terms().stream() + .map(|inverted_index| { + inverted_index.terms().stream() }) .collect(); TermMerger::new(term_streamers) diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 80bc2525e..77195304e 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -17,7 +17,7 @@ use std::sync::Arc; use std::collections::HashMap; use common::CompositeFile; use std::fmt; -use core::FieldReader; +use core::InvertedIndexReader; use schema::Field; use fastfield::{FastFieldReader, U64FastFieldReader}; use schema::Schema; @@ -37,7 +37,7 @@ use schema::Schema; /// #[derive(Clone)] pub struct SegmentReader { - field_reader_cache: Arc>>>, + inv_idx_reader_cache: Arc>>>, segment_id: SegmentId, segment_meta: SegmentMeta, @@ -159,7 +159,7 @@ impl SegmentReader { let schema = segment.schema(); Ok(SegmentReader { - field_reader_cache: Arc::new(RwLock::new(HashMap::new())), + inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())), segment_meta: segment.meta().clone(), termdict_composite: termdict_composite, postings_composite: postings_composite, @@ -179,14 +179,13 @@ impl SegmentReader { /// The field reader is in charge of iterating through the /// term dictionary associated to a specific field, /// and opening the posting list associated to any term. - pub fn field_reader(&self, field: Field) -> Result> { - if let Some(field_reader) = self.field_reader_cache.read() + pub fn inverted_index(&self, field: Field) -> Result> { + if let Some(inv_idx_reader) = self.inv_idx_reader_cache.read() .expect("Lock poisoned. This should never happen") .get(&field) { - return Ok(field_reader.clone()); + return Ok(inv_idx_reader.clone()); } - // TODO better error let termdict_source: ReadOnlySource = self.termdict_composite .open_read(field) .ok_or_else(|| { @@ -207,7 +206,7 @@ impl SegmentReader { ErrorKind::SchemaError(format!("Could not find {:?} positions", field)) })?; - let field_reader = Arc::new(FieldReader::new( + let inv_idx_reader = Arc::new(InvertedIndexReader::new( termdict_source, postings_source, positions_source, @@ -215,11 +214,13 @@ impl SegmentReader { self.schema.clone(), )?); - self.field_reader_cache + // by releasing the lock in between, we may end up opening the inverting index + // twice, but this is fine. + self.inv_idx_reader_cache .write() .expect("Field reader cache lock poisoned. 
This should never happen.") - .insert(field, field_reader.clone()); - Ok(field_reader) + .insert(field, inv_idx_reader.clone()); + Ok(inv_idx_reader) } /// Returns the document (or to be accurate, its stored field) diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 1477fb50b..26e6c6330 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -177,9 +177,9 @@ pub fn compute_deleted_bitset(delete_bitset: &mut BitSet, // Limit doc helps identify the first document // that may be affected by the delete operation. let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp); - let field_reader = segment_reader.field_reader(delete_op.term.field())?; + let inverted_index = segment_reader.inverted_index(delete_op.term.field())?; if let Some(mut docset) = - field_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) { + inverted_index.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) { while docset.advance() { let deleted_doc = docset.doc(); if deleted_doc < limit_doc { diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 87a2f1ed5..8e26784da 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -212,7 +212,7 @@ impl IndexMerger { let field_readers = self.readers .iter() .map(|reader| - reader.field_reader(indexed_field)) + reader.inverted_index(indexed_field)) .collect::>>()?; let field_term_streams = field_readers @@ -273,8 +273,8 @@ impl IndexMerger { let segment_ord = heap_item.segment_ord; let term_info = heap_item.streamer.value(); let segment_reader = &self.readers[heap_item.segment_ord]; - let field_reader = segment_reader.field_reader(term.field()).unwrap(); // TODO fix unwrap - let mut segment_postings = field_reader + let inverted_index = segment_reader.inverted_index(term.field()).unwrap(); // TODO fix unwrap + let mut segment_postings = inverted_index .read_postings_from_terminfo(term_info, segment_postings_option); if segment_postings.advance() { Some((segment_ord, segment_postings)) diff --git a/src/lib.rs b/src/lib.rs index d8f2acc31..0b26c6197 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -130,7 +130,7 @@ pub use directory::Directory; pub use core::{Index, Segment, SegmentId, SegmentMeta, Searcher}; pub use indexer::IndexWriter; pub use schema::{Term, Document}; -pub use core::{SegmentReader, FieldReader}; +pub use core::{SegmentReader, InvertedIndexReader}; pub use self::common::TimerTree; pub use postings::DocSet; @@ -391,16 +391,16 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); - let field_reader = reader.field_reader(text_field).unwrap(); - assert!(field_reader.read_postings(&term_abcd, FreqAndPositions).is_none()); + let inverted_index = reader.inverted_index(text_field).unwrap(); + assert!(inverted_index.read_postings(&term_abcd, FreqAndPositions).is_none()); { - let mut postings = field_reader.read_postings(&term_a, FreqAndPositions).unwrap(); + let mut postings = inverted_index.read_postings(&term_a, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 5); assert!(!postings.advance()); } { - let mut postings = field_reader.read_postings(&term_b, FreqAndPositions).unwrap(); + let mut postings = inverted_index.read_postings(&term_b, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); @@ -426,17 +426,17 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = 
searcher.segment_reader(0); - let field_reader = reader.field_reader(term_abcd.field()).unwrap(); + let inverted_index = reader.inverted_index(term_abcd.field()).unwrap(); - assert!(field_reader.read_postings(&term_abcd, FreqAndPositions).is_none()); + assert!(inverted_index.read_postings(&term_abcd, FreqAndPositions).is_none()); { - let mut postings = field_reader.read_postings(&term_a, FreqAndPositions).unwrap(); + let mut postings = inverted_index.read_postings(&term_a, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 5); assert!(!postings.advance()); } { - let mut postings = field_reader.read_postings(&term_b, FreqAndPositions).unwrap(); + let mut postings = inverted_index.read_postings(&term_b, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); @@ -462,14 +462,14 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); - let field_reader = reader.field_reader(term_abcd.field()).unwrap(); - assert!(field_reader.read_postings(&term_abcd, FreqAndPositions).is_none()); + let inverted_index = reader.inverted_index(term_abcd.field()).unwrap(); + assert!(inverted_index.read_postings(&term_abcd, FreqAndPositions).is_none()); { - let mut postings = field_reader.read_postings(&term_a, FreqAndPositions).unwrap(); + let mut postings = inverted_index.read_postings(&term_a, FreqAndPositions).unwrap(); assert!(!postings.advance()); } { - let mut postings = field_reader.read_postings(&term_b, FreqAndPositions).unwrap(); + let mut postings = inverted_index.read_postings(&term_b, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); @@ -477,7 +477,7 @@ mod tests { assert!(!postings.advance()); } { - let mut postings = field_reader.read_postings(&term_c, FreqAndPositions).unwrap(); + let mut postings = inverted_index.read_postings(&term_c, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 4); assert!(!postings.advance()); @@ -501,7 +501,7 @@ mod tests { let term = Term::from_field_u64(field, 1u64); let mut postings = searcher .segment_reader(0) - .field_reader(term.field()).unwrap() + .inverted_index(term.field()).unwrap() .read_postings(&term, SegmentPostingsOption::NoFreq) .unwrap(); assert!(postings.advance()); @@ -525,7 +525,7 @@ mod tests { let term = Term::from_field_i64(value_field, negative_val); let mut postings = searcher .segment_reader(0) - .field_reader(term.field()).unwrap() + .inverted_index(term.field()).unwrap() .read_postings(&term, SegmentPostingsOption::NoFreq) .unwrap(); assert!(postings.advance()); @@ -588,11 +588,11 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); - let field_reader = reader.field_reader(text_field).unwrap(); + let inverted_index = reader.inverted_index(text_field).unwrap(); let term_abcd = Term::from_field_text(text_field, "abcd"); - assert!(field_reader.read_postings(&term_abcd, FreqAndPositions).is_none()); + assert!(inverted_index.read_postings(&term_abcd, FreqAndPositions).is_none()); let term_af = Term::from_field_text(text_field, "af"); - let mut postings = field_reader.read_postings(&term_af, FreqAndPositions).unwrap(); + let mut postings = inverted_index.read_postings(&term_af, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 0); assert_eq!(postings.term_freq(), 3); diff --git 
a/src/postings/mod.rs b/src/postings/mod.rs index 05991a1d1..9b725cd86 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -132,14 +132,14 @@ mod tests { { let term_a = Term::from_field_text(text_field, "abcdef"); assert!(segment_reader - .field_reader(term_a.field()).unwrap() + .inverted_index(term_a.field()).unwrap() .read_postings(&term_a, FreqAndPositions) .is_none()); } { let term_a = Term::from_field_text(text_field, "a"); let mut postings_a = segment_reader - .field_reader(term_a.field()).unwrap() + .inverted_index(term_a.field()).unwrap() .read_postings(&term_a, FreqAndPositions) .unwrap(); assert_eq!(postings_a.len(), 1000); @@ -162,7 +162,7 @@ mod tests { { let term_e = Term::from_field_text(text_field, "e"); let mut postings_e = segment_reader - .field_reader(term_e.field()).unwrap() + .inverted_index(term_e.field()).unwrap() .read_postings(&term_e, FreqAndPositions) .unwrap(); assert_eq!(postings_e.len(), 1000 - 2); @@ -250,7 +250,7 @@ mod tests { for i in 0..num_docs - 1 { for j in i + 1..num_docs { let mut segment_postings = segment_reader - .field_reader(term_2.field()).unwrap() + .inverted_index(term_2.field()).unwrap() .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); @@ -264,7 +264,7 @@ mod tests { { let mut segment_postings = segment_reader - .field_reader(term_2.field()).unwrap() + .inverted_index(term_2.field()).unwrap() .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); @@ -285,7 +285,7 @@ mod tests { // check that filtering works { let mut segment_postings = segment_reader - .field_reader(term_0.field()).unwrap() + .inverted_index(term_0.field()).unwrap() .read_postings(&term_0, SegmentPostingsOption::NoFreq) .unwrap(); @@ -295,7 +295,7 @@ mod tests { } let mut segment_postings = segment_reader - .field_reader(term_0.field()).unwrap() + .inverted_index(term_0.field()).unwrap() .read_postings(&term_0, SegmentPostingsOption::NoFreq) .unwrap(); @@ -320,7 +320,7 @@ mod tests { // make sure seeking still works for i in 0..num_docs { let mut segment_postings = segment_reader - .field_reader(term_2.field()).unwrap() + .inverted_index(term_2.field()).unwrap() .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); @@ -336,7 +336,7 @@ mod tests { // now try with a longer sequence { let mut segment_postings = segment_reader - .field_reader(term_2.field()).unwrap() + .inverted_index(term_2.field()).unwrap() .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); @@ -372,14 +372,14 @@ mod tests { // finally, check that it's empty { let mut segment_postings = segment_reader - .field_reader(term_2.field()).unwrap() + .inverted_index(term_2.field()).unwrap() .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); assert_eq!(segment_postings.skip_next(0), SkipResult::End); let mut segment_postings = segment_reader - .field_reader(term_2.field()).unwrap() + .inverted_index(term_2.field()).unwrap() .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); @@ -447,7 +447,7 @@ mod tests { b.iter(|| { let mut segment_postings = segment_reader - .field_reader(TERM_A.field()).unwrap() + .inverted_index(TERM_A.field()).unwrap() .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); while segment_postings.advance() {} @@ -460,19 +460,19 @@ mod tests { let segment_reader = searcher.segment_reader(0); b.iter(|| { let segment_postings_a = segment_reader - .field_reader(TERM_A.field()).unwrap() + .inverted_index(TERM_A.field()).unwrap() .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); let 
segment_postings_b = segment_reader - .field_reader(TERM_B.field()).unwrap() + .inverted_index(TERM_B.field()).unwrap() .read_postings(&*TERM_B, SegmentPostingsOption::NoFreq) .unwrap(); let segment_postings_c = segment_reader - .field_reader(TERM_C.field()).unwrap() + .inverted_index(TERM_C.field()).unwrap() .read_postings(&*TERM_C, SegmentPostingsOption::NoFreq) .unwrap(); let segment_postings_d = segment_reader - .field_reader(TERM_D.field()).unwrap() + .inverted_index(TERM_D.field()).unwrap() .read_postings(&*TERM_D, SegmentPostingsOption::NoFreq) .unwrap(); let mut intersection = IntersectionDocSet::from(vec![segment_postings_a, @@ -489,7 +489,7 @@ mod tests { let docs = tests::sample(segment_reader.num_docs(), p); let mut segment_postings = segment_reader - .field_reader(TERM_A.field()).unwrap() + .inverted_index(TERM_A.field()).unwrap() .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); @@ -506,7 +506,7 @@ mod tests { b.iter(|| { let mut segment_postings = segment_reader - .field_reader(TERM_A.field()).unwrap() + .inverted_index(TERM_A.field()).unwrap() .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); for doc in &existing_docs { @@ -544,7 +544,7 @@ mod tests { b.iter(|| { let n: u32 = test::black_box(17); let mut segment_postings = segment_reader - .field_reader(TERM_A.field()).unwrap() + .inverted_index(TERM_A.field()).unwrap() .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); let mut s = 0u32; diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 7d2bab7f8..d8d08e40b 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -508,11 +508,11 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let segment_reader = searcher.segment_reader(0); - let field_reader = segment_reader.field_reader(int_field).unwrap(); + let inverted_index = segment_reader.inverted_index(int_field).unwrap(); let term = Term::from_field_u64(int_field, 0u64); - let term_info = field_reader.get_term_info(&term).unwrap(); + let term_info = inverted_index.get_term_info(&term).unwrap(); let mut block_segments = - field_reader + inverted_index .read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq); let mut offset: u32 = 0u32; // checking that the block before calling advance is empty @@ -549,19 +549,19 @@ mod tests { let mut block_segments; { let term = Term::from_field_u64(int_field, 0u64); - let field_reader = segment_reader.field_reader(int_field).unwrap(); - let term_info = field_reader.get_term_info(&term).unwrap(); + let inverted_index = segment_reader.inverted_index(int_field).unwrap(); + let term_info = inverted_index.get_term_info(&term).unwrap(); block_segments = - field_reader + inverted_index .read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq); } assert!(block_segments.advance()); assert!(block_segments.docs() == &[0, 2, 4]); { let term = Term::from_field_u64(int_field, 1u64); - let field_reader = segment_reader.field_reader(int_field).unwrap(); - let term_info = field_reader.get_term_info(&term).unwrap(); - field_reader.reset_block_postings_from_terminfo(&term_info, &mut block_segments); + let inverted_index = segment_reader.inverted_index(int_field).unwrap(); + let term_info = inverted_index.get_term_info(&term).unwrap(); + inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments); } assert!(block_segments.advance()); assert!(block_segments.docs() == &[1, 3, 5]); diff --git 
a/src/query/phrase_query/phrase_weight.rs b/src/query/phrase_query/phrase_weight.rs index 2e9efd463..119f32dbe 100644 --- a/src/query/phrase_query/phrase_weight.rs +++ b/src/query/phrase_query/phrase_weight.rs @@ -22,9 +22,9 @@ impl Weight for PhraseWeight { fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result> { let mut term_postings_list = Vec::new(); for term in &self.phrase_terms { - let field_reader = reader.field_reader(term.field())?; + let inverted_index = reader.inverted_index(term.field())?; let term_postings_option = - field_reader.read_postings(term, SegmentPostingsOption::FreqAndPositions); + inverted_index.read_postings(term, SegmentPostingsOption::FreqAndPositions); if let Some(term_postings) = term_postings_option { term_postings_list.push(term_postings); } else { diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 65f56b054..a755a2921 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -27,26 +27,27 @@ impl TermWeight { 1.0 + (self.num_docs as f32 / (self.doc_freq as f32 + 1.0)).ln() } + /// If the field is not found, returns an empty `DocSet`. pub fn specialized_scorer(&self, reader: &SegmentReader) -> Result> { let field = self.term.field(); - let field_reader = reader.field_reader(field)?; - // TODO move field reader too + let inverted_index = reader.inverted_index(field)?; let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field); - let postings: Option = field_reader.read_postings(&self.term, self.segment_postings_options); - Ok(postings - .map(|segment_postings| { - TermScorer { - idf: self.idf(), - fieldnorm_reader_opt: fieldnorm_reader_opt, - postings: segment_postings, - } - }) - .unwrap_or(TermScorer { - idf: 1f32, - fieldnorm_reader_opt: None, - postings: SegmentPostings::empty(), - })) + let postings_opt: Option = inverted_index.read_postings(&self.term, self.segment_postings_options); + if let Some(segment_postings) = postings_opt { + Ok(TermScorer { + idf: self.idf(), + fieldnorm_reader_opt: fieldnorm_reader_opt, + postings: segment_postings, + }) + } + else { + Ok(TermScorer { + idf: 1f32, + fieldnorm_reader_opt: None, + postings: SegmentPostings::empty(), + }) + } } } diff --git a/src/termdict/fstdict/streamer.rs b/src/termdict/fstdict/streamer.rs index 823c0ba61..823e5cdc4 100644 --- a/src/termdict/fstdict/streamer.rs +++ b/src/termdict/fstdict/streamer.rs @@ -1,6 +1,5 @@ use fst::{IntoStreamer, Streamer}; use fst::map::{StreamBuilder, Stream}; -use common::BinarySerializable; use postings::TermInfo; use super::TermDictionaryImpl; use termdict::{TermStreamerBuilder, TermStreamer}; @@ -12,8 +11,7 @@ pub struct TermStreamerBuilderImpl<'a> stream_builder: StreamBuilder<'a>, } -impl<'a, V> TermStreamerBuilderImpl<'a> - where V: 'a + BinarySerializable + Default +impl<'a> TermStreamerBuilderImpl<'a> { pub(crate) fn new(fst_map: &'a TermDictionaryImpl, stream_builder: StreamBuilder<'a>) @@ -55,7 +53,7 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> stream: self.stream_builder.into_stream(), offset: 0u64, current_key: Vec::with_capacity(100), - current_value: V::default(), + current_value: TermInfo::default(), } } } diff --git a/src/termdict/fstdict/termdict.rs b/src/termdict/fstdict/termdict.rs index c9054323a..253e70b2a 100644 --- a/src/termdict/fstdict/termdict.rs +++ b/src/termdict/fstdict/termdict.rs @@ -3,7 +3,7 @@ use fst; use fst::raw::Fst; use directory::ReadOnlySource; use common::BinarySerializable; -use bincode; +use schema::FieldType; use 
postings::TermInfo; use termdict::{TermDictionary, TermDictionaryBuilder}; use super::{TermStreamerImpl, TermStreamerBuilderImpl}; @@ -39,7 +39,7 @@ impl TermDictionaryBuilderImpl /// # Warning /// /// Horribly dangerous internal API. See `.insert_key(...)`. - pub(crate) fn insert_value(&mut self, value: &V) -> io::Result<()> { + pub(crate) fn insert_value(&mut self, value: &TermInfo) -> io::Result<()> { value.serialize(&mut self.data)?; Ok(()) } @@ -48,7 +48,7 @@ impl TermDictionaryBuilderImpl impl TermDictionaryBuilder for TermDictionaryBuilderImpl where W: Write { - fn new(w: W, field_option: FieldOption) -> io::Result { + fn new(w: W, _field_type: FieldType) -> io::Result { let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?; Ok(TermDictionaryBuilderImpl { fst_builder: fst_builder, @@ -98,10 +98,10 @@ pub struct TermDictionaryImpl impl TermDictionaryImpl { /// Deserialize and returns the value at address `offset` - pub(crate) fn read_value(&self, offset: u64) -> io::Result { + pub(crate) fn read_value(&self, offset: u64) -> io::Result { let buffer = self.values_mmap.as_slice(); let mut cursor = &buffer[(offset as usize)..]; - V::deserialize(&mut cursor) + TermInfo::deserialize(&mut cursor) } } @@ -127,7 +127,7 @@ impl<'a> TermDictionary<'a> for TermDictionaryImpl }) } - fn get>(&self, key: K) -> Option { + fn get>(&self, key: K) -> Option { self.fst_index .get(key) .map(|offset| { diff --git a/src/termdict/merger.rs b/src/termdict/merger.rs index 4efdfd5d2..1e0dde82f 100644 --- a/src/termdict/merger.rs +++ b/src/termdict/merger.rs @@ -48,6 +48,10 @@ pub struct TermMerger<'a> impl<'a> TermMerger<'a> { + + /// Stream of merged term dictionary + /// + /// pub fn new(streams: Vec>) -> TermMerger<'a> { TermMerger { heap: BinaryHeap::new(), diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index 171a70c2f..13a31b6d7 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -345,7 +345,7 @@ mod tests { let buffer: Vec = { let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap(); // term requires more than 16bits - term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(1)).unwrap(); + term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1)).unwrap(); term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2)).unwrap(); term_dictionary_builder.insert("abr", &make_term_info(2)).unwrap(); term_dictionary_builder.finish().unwrap() @@ -355,7 +355,7 @@ mod tests { .unwrap(); let mut kv_stream = term_dictionary.stream(); assert!(kv_stream.advance()); - assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxyz".as_bytes()); + assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes()); assert_eq!(kv_stream.value(), &make_term_info(1)); assert!(kv_stream.advance()); assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxyz".as_bytes()); From f8710bd4b0b914d37be2671d92d67a4e5fbd941b Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 28 Aug 2017 17:42:26 +0900 Subject: [PATCH 29/29] Format --- examples/simple_search.rs | 8 +- src/collector/chained_collector.rs | 9 +- src/collector/count_collector.rs | 12 +- src/collector/facet_collector.rs | 19 +- src/collector/mod.rs | 32 +- src/collector/multi_collector.rs | 13 +- src/collector/top_collector.rs | 24 +- src/common/bitpacker.rs | 12 +- src/common/composite_file.rs | 27 +- src/common/serialize.rs | 6 +- src/common/timer.rs | 18 +- src/common/vint.rs | 7 +- src/compression/mod.rs | 47 +- 
.../pack/compression_pack_nosimd.rs | 28 +- src/compression/pack/compression_pack_simd.rs | 14 +- src/compression/stream.rs | 24 +- .../vint/compression_vint_nosimd.rs | 15 +- src/compression/vint/compression_vint_simd.rs | 60 +-- src/core/index.rs | 46 +- src/core/index_meta.rs | 2 +- src/core/inverted_index_reader.rs | 62 +-- src/core/pool.rs | 30 +- src/core/searcher.rs | 27 +- src/core/segment.rs | 18 +- src/core/segment_component.rs | 16 +- src/core/segment_meta.rs | 24 +- src/core/segment_reader.rs | 92 ++-- src/datastruct/skip/skiplist_builder.rs | 16 +- src/datastruct/stacker/hashmap.rs | 40 +- src/datastruct/stacker/heap.rs | 40 +- src/directory/error.rs | 16 +- src/directory/managed_directory.rs | 94 ++-- src/directory/mmap_directory.rs | 174 ++++---- src/directory/ram_directory.rs | 64 +-- src/directory/read_only_source.rs | 3 +- src/error.rs | 9 +- src/fastfield/mod.rs | 45 +- src/fastfield/reader.rs | 30 +- src/fastfield/serializer.rs | 35 +- src/fastfield/writer.rs | 12 +- src/indexer/delete_queue.rs | 36 +- src/indexer/doc_opstamp_mapping.rs | 6 +- src/indexer/index_writer.rs | 182 ++++---- src/indexer/log_merge_policy.rs | 50 ++- src/indexer/merger.rs | 406 +++++++++++------- src/indexer/segment_entry.rs | 9 +- src/indexer/segment_manager.rs | 105 +++-- src/indexer/segment_register.rs | 69 +-- src/indexer/segment_serializer.rs | 10 +- src/indexer/segment_updater.rs | 218 +++++----- src/indexer/segment_writer.rs | 108 +++-- src/lib.rs | 113 +++-- src/postings/docset.rs | 3 +- src/postings/mod.rs | 78 ++-- src/postings/postings_writer.rs | 83 ++-- src/postings/recorder.rs | 63 +-- src/postings/segment_postings.rs | 100 ++--- src/postings/segment_postings_option.rs | 1 - src/postings/serializer.rs | 101 ++--- src/postings/term_info.rs | 2 +- src/query/boolean_query/boolean_query.rs | 17 +- src/query/boolean_query/boolean_scorer.rs | 10 +- src/query/boolean_query/boolean_weight.rs | 11 +- src/query/boolean_query/mod.rs | 26 +- src/query/phrase_query/mod.rs | 6 +- src/query/phrase_query/phrase_weight.rs | 6 +- src/query/query.rs | 5 +- src/query/query_parser/query_grammar.rs | 53 ++- src/query/query_parser/query_parser.rs | 237 ++++++---- src/query/term_query/mod.rs | 6 +- src/query/term_query/term_scorer.rs | 12 +- src/query/term_query/term_weight.rs | 15 +- src/schema/field.rs | 2 +- src/schema/field_entry.rs | 20 +- src/schema/field_type.rs | 13 +- src/schema/schema.rs | 87 ++-- src/schema/term.rs | 10 +- src/schema/text_options.rs | 14 +- src/schema/value.rs | 16 +- src/store/mod.rs | 22 +- src/store/reader.rs | 8 +- src/store/writer.rs | 28 +- src/termdict/fstdict/streamer.rs | 26 +- src/termdict/fstdict/termdict.rs | 60 ++- src/termdict/merger.rs | 33 +- src/termdict/mod.rs | 91 ++-- src/termdict/streamdict/delta_encoder.rs | 45 +- src/termdict/streamdict/mod.rs | 3 +- src/termdict/streamdict/streamer.rs | 69 +-- src/termdict/streamdict/termdict.rs | 122 +++--- 90 files changed, 2291 insertions(+), 1795 deletions(-) diff --git a/examples/simple_search.rs b/examples/simple_search.rs index 0d35f0e42..3cc82ae4d 100644 --- a/examples/simple_search.rs +++ b/examples/simple_search.rs @@ -91,9 +91,11 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { let mut old_man_doc = Document::default(); old_man_doc.add_text(title, "The Old Man and the Sea"); - old_man_doc.add_text(body, - "He was an old man who fished alone in a skiff in the Gulf Stream and \ - he had gone eighty-four days now without taking a fish."); + old_man_doc.add_text( + body, + "He was an old man who 
fished alone in a skiff in the Gulf Stream and \ + he had gone eighty-four days now without taking a fish.", + ); // ... and add it to the `IndexWriter`. index_writer.add_document(old_man_doc); diff --git a/src/collector/chained_collector.rs b/src/collector/chained_collector.rs index 6cc5785b4..1dff3e3c6 100644 --- a/src/collector/chained_collector.rs +++ b/src/collector/chained_collector.rs @@ -38,10 +38,11 @@ impl ChainedCollector { } impl Collector for ChainedCollector { - fn set_segment(&mut self, - segment_local_id: SegmentLocalId, - segment: &SegmentReader) - -> Result<()> { + fn set_segment( + &mut self, + segment_local_id: SegmentLocalId, + segment: &SegmentReader, + ) -> Result<()> { try!(self.left.set_segment(segment_local_id, segment)); try!(self.right.set_segment(segment_local_id, segment)); Ok(()) diff --git a/src/collector/count_collector.rs b/src/collector/count_collector.rs index bfb17eb3c..1fd9613ec 100644 --- a/src/collector/count_collector.rs +++ b/src/collector/count_collector.rs @@ -45,11 +45,11 @@ mod tests { #[bench] fn build_collector(b: &mut Bencher) { b.iter(|| { - let mut count_collector = CountCollector::default(); - for doc in 0..1_000_000 { - count_collector.collect(doc, 1f32); - } - count_collector.count() - }); + let mut count_collector = CountCollector::default(); + for doc in 0..1_000_000 { + count_collector.collect(doc, 1f32); + } + count_collector.count() + }); } } diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index 2d760dfc6..b99822089 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -15,8 +15,9 @@ use SegmentLocalId; /// Facet collector for i64/u64 fast field pub struct FacetCollector - where T: FastFieldReader, - T::ValueType: Eq + Hash +where + T: FastFieldReader, + T::ValueType: Eq + Hash, { counters: HashMap, field: Field, @@ -25,8 +26,9 @@ pub struct FacetCollector impl FacetCollector - where T: FastFieldReader, - T::ValueType: Eq + Hash +where + T: FastFieldReader, + T::ValueType: Eq + Hash, { /// Creates a new facet collector for aggregating a given field. pub fn new(field: Field) -> FacetCollector { @@ -40,8 +42,9 @@ impl FacetCollector impl Collector for FacetCollector - where T: FastFieldReader, - T::ValueType: Eq + Hash +where + T: FastFieldReader, + T::ValueType: Eq + Hash, { fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> { self.ff_reader = Some(reader.get_fast_field_reader(self.field)?); @@ -51,7 +54,9 @@ impl Collector for FacetCollector fn collect(&mut self, doc: DocId, _: Score) { let val = self.ff_reader .as_ref() - .expect("collect() was called before set_segment. This should never happen.") + .expect( + "collect() was called before set_segment. This should never happen.", + ) .get(doc); *(self.counters.entry(val).or_insert(0)) += 1; } diff --git a/src/collector/mod.rs b/src/collector/mod.rs index 27435592d..3832abbd1 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -51,20 +51,22 @@ pub use self::chained_collector::chain; pub trait Collector { /// `set_segment` is called before beginning to enumerate /// on this segment. - fn set_segment(&mut self, - segment_local_id: SegmentLocalId, - segment: &SegmentReader) - -> Result<()>; + fn set_segment( + &mut self, + segment_local_id: SegmentLocalId, + segment: &SegmentReader, + ) -> Result<()>; /// The query pushes the scored document to the collector via this method. 
fn collect(&mut self, doc: DocId, score: Score); } impl<'a, C: Collector> Collector for &'a mut C { - fn set_segment(&mut self, - segment_local_id: SegmentLocalId, - segment: &SegmentReader) - -> Result<()> { + fn set_segment( + &mut self, + segment_local_id: SegmentLocalId, + segment: &SegmentReader, + ) -> Result<()> { (*self).set_segment(segment_local_id, segment) } /// The query pushes the scored document to the collector via this method. @@ -169,12 +171,12 @@ pub mod tests { #[bench] fn build_collector(b: &mut Bencher) { b.iter(|| { - let mut count_collector = CountCollector::default(); - let docs: Vec = (0..1_000_000).collect(); - for doc in docs { - count_collector.collect(doc, 1f32); - } - count_collector.count() - }); + let mut count_collector = CountCollector::default(); + let docs: Vec = (0..1_000_000).collect(); + for doc in docs { + count_collector.collect(doc, 1f32); + } + count_collector.count() + }); } } diff --git a/src/collector/multi_collector.rs b/src/collector/multi_collector.rs index c2515782d..2e6bf0628 100644 --- a/src/collector/multi_collector.rs +++ b/src/collector/multi_collector.rs @@ -23,10 +23,11 @@ impl<'a> MultiCollector<'a> { impl<'a> Collector for MultiCollector<'a> { - fn set_segment(&mut self, - segment_local_id: SegmentLocalId, - segment: &SegmentReader) - -> Result<()> { + fn set_segment( + &mut self, + segment_local_id: SegmentLocalId, + segment: &SegmentReader, + ) -> Result<()> { for collector in &mut self.collectors { try!(collector.set_segment(segment_local_id, segment)); } @@ -53,8 +54,8 @@ mod tests { let mut top_collector = TopCollector::with_limit(2); let mut count_collector = CountCollector::default(); { - let mut collectors = MultiCollector::from(vec![&mut top_collector, - &mut count_collector]); + let mut collectors = + MultiCollector::from(vec![&mut top_collector, &mut count_collector]); collectors.collect(1, 0.2); collectors.collect(2, 0.1); collectors.collect(3, 0.5); diff --git a/src/collector/top_collector.rs b/src/collector/top_collector.rs index 7d3c33c9e..e022c4ba9 100644 --- a/src/collector/top_collector.rs +++ b/src/collector/top_collector.rs @@ -24,10 +24,9 @@ impl PartialOrd for GlobalScoredDoc { impl Ord for GlobalScoredDoc { #[inline] fn cmp(&self, other: &GlobalScoredDoc) -> Ordering { - other - .score - .partial_cmp(&self.score) - .unwrap_or_else(|| other.doc_address.cmp(&self.doc_address)) + other.score.partial_cmp(&self.score).unwrap_or_else(|| { + other.doc_address.cmp(&self.doc_address) + }) } } @@ -87,7 +86,9 @@ impl TopCollector { scored_docs.sort(); scored_docs .into_iter() - .map(|GlobalScoredDoc { score, doc_address }| (score, doc_address)) + .map(|GlobalScoredDoc { score, doc_address }| { + (score, doc_address) + }) .collect() } @@ -108,14 +109,13 @@ impl Collector for TopCollector { fn collect(&mut self, doc: DocId, score: Score) { if self.at_capacity() { // It's ok to unwrap as long as a limit of 0 is forbidden. 
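For context on the `TopCollector` ordering above: the collector keeps only its `limit` best hits by treating the binary heap as a min-heap over scores, so the weakest kept hit sits at the head and can be evicted cheaply. A minimal standalone sketch of that bounded top-k pattern, assuming integer scores so the derived tuple ordering applies (the collector itself ranks `f32` scores with `partial_cmp` and breaks ties on the doc address):

    use std::cmp::Reverse;
    use std::collections::BinaryHeap;

    /// Keep the `limit` best (score, doc) pairs seen so far.
    /// `Reverse` turns the std max-heap into a min-heap, so the head is
    /// always the weakest kept hit and can be replaced in O(log limit).
    fn top_k(hits: impl IntoIterator<Item = (u64, u32)>, limit: usize) -> Vec<(u64, u32)> {
        assert!(limit > 0, "a limit of 0 is forbidden");
        let mut heap: BinaryHeap<Reverse<(u64, u32)>> = BinaryHeap::with_capacity(limit);
        for hit in hits {
            if heap.len() < limit {
                heap.push(Reverse(hit));
                continue;
            }
            // Same move as `peek_mut` in TopCollector: overwrite the weakest
            // entry in place; the heap re-sifts when the guard is dropped.
            let mut weakest = heap.peek_mut().expect("heap is non-empty at capacity");
            if weakest.0 < hit {
                *weakest = Reverse(hit);
            }
        }
        let mut best: Vec<(u64, u32)> = heap.into_iter().map(|Reverse(hit)| hit).collect();
        best.sort_by(|a, b| b.cmp(a)); // highest score first
        best
    }

For example, `top_k(vec![(3, 0), (9, 1), (5, 2)], 2)` yields `[(9, 1), (5, 2)]`.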
- let limit_doc: GlobalScoredDoc = - *self.heap - .peek() - .expect("Top collector with size 0 is forbidden"); + let limit_doc: GlobalScoredDoc = *self.heap.peek().expect( + "Top collector with size 0 is forbidden", + ); if limit_doc.score < score { - let mut mut_head = self.heap - .peek_mut() - .expect("Top collector with size 0 is forbidden"); + let mut mut_head = self.heap.peek_mut().expect( + "Top collector with size 0 is forbidden", + ); mut_head.score = score; mut_head.doc_address = DocAddress(self.segment_id, doc); } diff --git a/src/common/bitpacker.rs b/src/common/bitpacker.rs index 7d7aeb23c..a900ae92a 100644 --- a/src/common/bitpacker.rs +++ b/src/common/bitpacker.rs @@ -88,7 +88,8 @@ impl BitPacker { pub struct BitUnpacker - where Data: Deref +where + Data: Deref, { num_bits: usize, mask: u64, @@ -96,7 +97,8 @@ pub struct BitUnpacker } impl BitUnpacker - where Data: Deref +where + Data: Deref, { pub fn new(data: Data, num_bits: usize) -> BitUnpacker { let mask: u64 = if num_bits == 64 { @@ -121,8 +123,10 @@ impl BitUnpacker let addr_in_bits = idx * num_bits; let addr = addr_in_bits >> 3; let bit_shift = addr_in_bits & 7; - debug_assert!(addr + 8 <= data.len(), - "The fast field field should have been padded with 7 bytes."); + debug_assert!( + addr + 8 <= data.len(), + "The fast field field should have been padded with 7 bytes." + ); let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) }; let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; (val_shifted & mask) diff --git a/src/common/composite_file.rs b/src/common/composite_file.rs index bc0d40786..4ab843d38 100644 --- a/src/common/composite_file.rs +++ b/src/common/composite_file.rs @@ -10,13 +10,12 @@ use common::BinarySerializable; /// A `CompositeWrite` is used to write a `CompositeFile`. -pub struct CompositeWrite { +pub struct CompositeWrite { write: CountingWriter, offsets: HashMap, } impl CompositeWrite { - /// Crate a new API writer that writes a composite file /// in a given write. pub fn wrap(w: W) -> CompositeWrite { @@ -43,7 +42,8 @@ impl CompositeWrite { let footer_offset = self.write.written_bytes(); VInt(self.offsets.len() as u64).serialize(&mut self.write)?; - let mut offset_fields: Vec<_> = self.offsets.iter() + let mut offset_fields: Vec<_> = self.offsets + .iter() .map(|(field, offset)| (offset, field)) .collect(); @@ -51,7 +51,9 @@ impl CompositeWrite { let mut prev_offset = 0; for (offset, field) in offset_fields { - VInt( (offset - prev_offset) as u64).serialize(&mut self.write)?; + VInt((offset - prev_offset) as u64).serialize( + &mut self.write, + )?; field.serialize(&mut self.write)?; prev_offset = *offset; } @@ -77,7 +79,6 @@ pub struct CompositeFile { } impl CompositeFile { - /// Opens a composite file stored in a given /// `ReadOnlySource`. 
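The `BitUnpacker::get` hunk above packs its index arithmetic into a few dense lines. A safe standalone sketch of the same lookup, with `u64::from_le_bytes` standing in for the unsafe unaligned pointer read; it assumes a little-endian layout, at least 7 bytes of padding after the packed values (the `debug_assert` above), and a `num_bits` small enough that `bit_shift + num_bits <= 64`:

    /// Reads the `idx`-th `num_bits`-wide value from a bit-packed buffer:
    /// locate the byte holding the value's first bit, read 8 bytes from
    /// there, shift, and mask.
    fn bitpacked_get(data: &[u8], num_bits: usize, idx: usize) -> u64 {
        let mask: u64 = if num_bits == 64 { u64::MAX } else { (1u64 << num_bits) - 1 };
        let addr_in_bits = idx * num_bits;
        let addr = addr_in_bits >> 3;     // byte holding the value's first bit
        let bit_shift = addr_in_bits & 7; // bit offset inside that byte
        let mut word = [0u8; 8];
        word.copy_from_slice(&data[addr..addr + 8]);
        (u64::from_le_bytes(word) >> bit_shift) & mask
    }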
pub fn open(data: ReadOnlySource) -> io::Result { @@ -90,8 +91,8 @@ impl CompositeFile { let mut footer_buffer = footer_data.as_slice(); let num_fields = VInt::deserialize(&mut footer_buffer)?.0 as usize; - let mut fields = vec!(); - let mut offsets = vec!(); + let mut fields = vec![]; + let mut offsets = vec![]; let mut field_index = HashMap::new(); @@ -106,7 +107,7 @@ impl CompositeFile { for i in 0..num_fields { let field = fields[i]; let start_offset = offsets[i]; - let end_offset = offsets[i+1]; + let end_offset = offsets[i + 1]; field_index.insert(field, (start_offset, end_offset)); } @@ -128,11 +129,9 @@ impl CompositeFile { /// Returns the `ReadOnlySource` associated /// to a given `Field` and stored in a `CompositeFile`. pub fn open_read(&self, field: Field) -> Option { - self.offsets_index - .get(&field) - .map(|&(from, to)| { - self.data.slice(from, to) - }) + self.offsets_index.get(&field).map(|&(from, to)| { + self.data.slice(from, to) + }) } } @@ -189,4 +188,4 @@ mod test { } -} \ No newline at end of file +} diff --git a/src/common/serialize.rs b/src/common/serialize.rs index ee86247c5..87b735769 100644 --- a/src/common/serialize.rs +++ b/src/common/serialize.rs @@ -101,9 +101,9 @@ impl BinarySerializable for String { fn deserialize(reader: &mut R) -> io::Result { let string_length = VInt::deserialize(reader)?.val() as usize; let mut result = String::with_capacity(string_length); - reader - .take(string_length as u64) - .read_to_string(&mut result)?; + reader.take(string_length as u64).read_to_string( + &mut result, + )?; Ok(result) } } diff --git a/src/common/timer.rs b/src/common/timer.rs index 035bd65de..84e0f8c3a 100644 --- a/src/common/timer.rs +++ b/src/common/timer.rs @@ -24,16 +24,14 @@ impl<'a> OpenTimer<'a> { impl<'a> Drop for OpenTimer<'a> { fn drop(&mut self) { - self.timer_tree - .timings - .push(Timing { - name: self.name, - duration: self.start - .to(PreciseTime::now()) - .num_microseconds() - .unwrap(), - depth: self.depth, - }); + self.timer_tree.timings.push(Timing { + name: self.name, + duration: self.start + .to(PreciseTime::now()) + .num_microseconds() + .unwrap(), + depth: self.depth, + }); } } diff --git a/src/common/vint.rs b/src/common/vint.rs index 39653e8a7..07cdfa24c 100644 --- a/src/common/vint.rs +++ b/src/common/vint.rs @@ -47,7 +47,12 @@ impl BinarySerializable for VInt { } shift += 7; } - _ => return Err(io::Error::new(io::ErrorKind::InvalidData, "Reach end of buffer")), + _ => { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "Reach end of buffer", + )) + } } } Ok(VInt(result)) diff --git a/src/compression/mod.rs b/src/compression/mod.rs index 03750074b..cd40e4f1a 100644 --- a/src/compression/mod.rs +++ b/src/compression/mod.rs @@ -5,13 +5,13 @@ mod stream; pub use self::stream::CompressedIntStream; -#[cfg(not(feature="simdcompression"))] +#[cfg(not(feature = "simdcompression"))] mod pack { mod compression_pack_nosimd; pub use self::compression_pack_nosimd::{BlockEncoder, BlockDecoder}; } -#[cfg(feature="simdcompression")] +#[cfg(feature = "simdcompression")] mod pack { mod compression_pack_simd; pub use self::compression_pack_simd::{BlockEncoder, BlockDecoder}; @@ -19,13 +19,13 @@ mod pack { pub use self::pack::{BlockEncoder, BlockDecoder}; -#[cfg( any(not(feature="simdcompression"), target_env="msvc") )] +#[cfg(any(not(feature = "simdcompression"), target_env = "msvc"))] mod vint { mod compression_vint_nosimd; pub(crate) use self::compression_vint_nosimd::*; } -#[cfg( all(feature="simdcompression", not(target_env="msvc")) )] 
+#[cfg(all(feature = "simdcompression", not(target_env = "msvc")))] mod vint { mod compression_vint_simd; pub(crate) use self::compression_vint_simd::*; @@ -70,21 +70,19 @@ pub trait VIntDecoder { /// For instance, if delta encoded are `1, 3, 9`, and the /// `offset` is 5, then the output will be: /// `5 + 1 = 6, 6 + 3= 9, 9 + 9 = 18` - fn uncompress_vint_sorted<'a>(&mut self, - compressed_data: &'a [u8], - offset: u32, - num_els: usize) - -> usize; + fn uncompress_vint_sorted<'a>( + &mut self, + compressed_data: &'a [u8], + offset: u32, + num_els: usize, + ) -> usize; /// Uncompress an array of `u32s`, compressed using variable /// byte encoding. /// /// The method takes a number of int to decompress, and returns /// the amount of bytes that were read to decompress them. - fn uncompress_vint_unsorted<'a>(&mut self, - compressed_data: &'a [u8], - num_els: usize) - -> usize; + fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize; } impl VIntEncoder for BlockEncoder { @@ -98,19 +96,17 @@ impl VIntEncoder for BlockEncoder { } impl VIntDecoder for BlockDecoder { - fn uncompress_vint_sorted<'a>(&mut self, - compressed_data: &'a [u8], - offset: u32, - num_els: usize) - -> usize { + fn uncompress_vint_sorted<'a>( + &mut self, + compressed_data: &'a [u8], + offset: u32, + num_els: usize, + ) -> usize { self.output_len = num_els; vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset) } - fn uncompress_vint_unsorted<'a>(&mut self, - compressed_data: &'a [u8], - num_els: usize) - -> usize { + fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize { self.output_len = num_els; vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els]) } @@ -125,7 +121,6 @@ pub mod tests { use super::*; use tests; use test::Bencher; - use std::iter; #[test] fn test_encode_sorted_block() { @@ -236,7 +231,7 @@ pub mod tests { #[test] fn test_all_docs_compression_numbits() { for num_bits in 0..33 { - let mut data: Vec = iter::repeat(0u32).take(128).collect(); + let mut data = [0u32; 128]; if num_bits > 0 { data[0] = 1 << (num_bits - 1); } @@ -262,7 +257,9 @@ pub mod tests { let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001); let compressed = encoder.compress_vint_sorted(&data, 0u32); let mut decoder = BlockDecoder::new(); - b.iter(|| { decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT); }); + b.iter(|| { + decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT); + }); } } diff --git a/src/compression/pack/compression_pack_nosimd.rs b/src/compression/pack/compression_pack_nosimd.rs index 24379b9a4..7780d63b9 100644 --- a/src/compression/pack/compression_pack_nosimd.rs +++ b/src/compression/pack/compression_pack_nosimd.rs @@ -25,9 +25,9 @@ pub fn compress_sorted(vals: &mut [u32], mut output: &mut [u8], offset: u32) -> bit_packer.write(*val, &mut output).unwrap(); } 1 + - bit_packer - .close(&mut output) - .expect("packing in memory should never fail") + bit_packer.close(&mut output).expect( + "packing in memory should never fail", + ) } @@ -56,10 +56,9 @@ impl BlockEncoder { pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] { let compressed_size: usize = { let mut output: &mut [u8] = &mut self.output; - let max = vals.iter() - .cloned() - .max() - .expect("compress unsorted called with an empty array"); + let max = vals.iter().cloned().max().expect( + "compress unsorted called with an empty array", + ); let num_bits = compute_num_bits(max); 
output.write_all(&[num_bits]).unwrap(); let mut bit_packer = BitPacker::new(num_bits as usize); @@ -67,9 +66,9 @@ impl BlockEncoder { bit_packer.write(*val, &mut output).unwrap(); } 1 + - bit_packer - .close(&mut output) - .expect("packing in memory should never fail") + bit_packer.close(&mut output).expect( + "packing in memory should never fail", + ) }; &self.output[..compressed_size] } @@ -93,10 +92,11 @@ impl BlockDecoder { } } - pub fn uncompress_block_sorted<'a>(&mut self, - compressed_data: &'a [u8], - mut offset: u32) - -> &'a [u8] { + pub fn uncompress_block_sorted<'a>( + &mut self, + compressed_data: &'a [u8], + mut offset: u32, + ) -> &'a [u8] { let consumed_size = { let num_bits = compressed_data[0]; let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize); diff --git a/src/compression/pack/compression_pack_simd.rs b/src/compression/pack/compression_pack_simd.rs index d24d0f65b..498eb7852 100644 --- a/src/compression/pack/compression_pack_simd.rs +++ b/src/compression/pack/compression_pack_simd.rs @@ -8,10 +8,11 @@ mod simdcomp { extern "C" { pub fn compress_sorted(data: *const u32, output: *mut u8, offset: u32) -> size_t; - pub fn uncompress_sorted(compressed_data: *const u8, - output: *mut u32, - offset: u32) - -> size_t; + pub fn uncompress_sorted( + compressed_data: *const u8, + output: *mut u32, + offset: u32, + ) -> size_t; pub fn compress_unsorted(data: *const u32, output: *mut u8) -> size_t; @@ -78,10 +79,7 @@ impl BlockDecoder { } } - pub fn uncompress_block_sorted(&mut self, - compressed_data: &[u8], - offset: u32) - -> usize { + pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32) -> usize { let consumed_size = uncompress_sorted(compressed_data, &mut self.output, offset); self.output_len = COMPRESSION_BLOCK_SIZE; consumed_size diff --git a/src/compression/stream.rs b/src/compression/stream.rs index a4c4422c5..cd1771759 100644 --- a/src/compression/stream.rs +++ b/src/compression/stream.rs @@ -16,7 +16,6 @@ pub struct CompressedIntStream { } impl CompressedIntStream { - /// Opens a compressed int stream. pub(crate) fn wrap(source: ReadOnlySource) -> CompressedIntStream { CompressedIntStream { @@ -35,17 +34,21 @@ impl CompressedIntStream { let available = COMPRESSION_BLOCK_SIZE - self.inner_offset; if num_els >= available { if available > 0 { - let uncompressed_block = &self.block_decoder.output_array()[self.inner_offset..]; + let uncompressed_block = &self.block_decoder.output_array() + [self.inner_offset..]; &mut output[start..start + available].clone_from_slice(uncompressed_block); } num_els -= available; start += available; - let num_consumed_bytes = self.block_decoder.uncompress_block_unsorted(self.buffer.as_ref()); + let num_consumed_bytes = self.block_decoder.uncompress_block_unsorted( + self.buffer.as_ref(), + ); self.buffer.advance(num_consumed_bytes); self.inner_offset = 0; - } - else { - let uncompressed_block = &self.block_decoder.output_array()[self.inner_offset..self.inner_offset + num_els]; + } else { + let uncompressed_block = &self.block_decoder.output_array()[self.inner_offset.. + self.inner_offset + + num_els]; &output[start..start + num_els].clone_from_slice(uncompressed_block); self.inner_offset += num_els; break; @@ -62,8 +65,7 @@ impl CompressedIntStream { let available = COMPRESSION_BLOCK_SIZE - self.inner_offset; if available >= skip_len { self.inner_offset += skip_len; - } - else { + } else { skip_len -= available; // entirely skip decompressing some blocks. 
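The skip fast path in `CompressedIntStream` walks over whole compressed blocks without decoding them, using only each block's bit-width header (the byte written by `compress_block_unsorted` above). A standalone sketch of that walk; the 128-values-per-block constant and the `block_payload_len` formula are illustrative assumptions, not the crate's `compressed_block_size`:

    const BLOCK_LEN: usize = 128; // assumed number of values per block

    /// Payload bytes of one bit-packed block, excluding the leading
    /// bit-width byte (assumes the writer pads to whole bytes).
    fn block_payload_len(num_bits: usize) -> usize {
        (BLOCK_LEN * num_bits + 7) / 8
    }

    /// Advance `cursor` past `n` whole blocks without decompressing them.
    fn skip_whole_blocks(buffer: &[u8], mut cursor: usize, n: usize) -> usize {
        for _ in 0..n {
            let num_bits = buffer[cursor] as usize; // block header: bit width
            cursor += 1 + block_payload_len(num_bits);
        }
        cursor
    }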
while skip_len >= COMPRESSION_BLOCK_SIZE { @@ -72,7 +74,9 @@ impl CompressedIntStream { let block_len = compressed_block_size(num_bits); self.buffer.advance(block_len); } - let num_consumed_bytes = self.block_decoder.uncompress_block_unsorted(self.buffer.as_ref()); + let num_consumed_bytes = self.block_decoder.uncompress_block_unsorted( + self.buffer.as_ref(), + ); self.buffer.advance(num_consumed_bytes); self.inner_offset = skip_len; } @@ -90,7 +94,7 @@ pub mod tests { use directory::ReadOnlySource; fn create_stream_buffer() -> ReadOnlySource { - let mut buffer: Vec = vec!(); + let mut buffer: Vec = vec![]; let mut encoder = BlockEncoder::new(); let vals: Vec = (0u32..1_025u32).collect(); for chunk in vals.chunks(COMPRESSION_BLOCK_SIZE) { diff --git a/src/compression/vint/compression_vint_nosimd.rs b/src/compression/vint/compression_vint_nosimd.rs index 4b5e6ec3d..0e0e272d4 100644 --- a/src/compression/vint/compression_vint_nosimd.rs +++ b/src/compression/vint/compression_vint_nosimd.rs @@ -1,6 +1,10 @@ #[inline(always)] -pub(crate) fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] { +pub(crate) fn compress_sorted<'a>( + input: &[u32], + output: &'a mut [u8], + mut offset: u32, +) -> &'a [u8] { let mut byte_written = 0; for &v in input { let mut to_encode: u32 = v - offset; @@ -43,10 +47,11 @@ pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a } #[inline(always)] -pub(crate) fn uncompress_sorted<'a>(compressed_data: &'a [u8], - output: &mut [u32], - offset: u32) - -> &'a [u8] { +pub(crate) fn uncompress_sorted<'a>( + compressed_data: &'a [u8], + output: &mut [u32], + offset: u32, +) -> &'a [u8] { let mut read_byte = 0; let mut result = offset; let num_els = output.len(); diff --git a/src/compression/vint/compression_vint_simd.rs b/src/compression/vint/compression_vint_simd.rs index 7c4cd9fe0..0b508a812 100644 --- a/src/compression/vint/compression_vint_simd.rs +++ b/src/compression/vint/compression_vint_simd.rs @@ -4,24 +4,27 @@ mod streamvbyte { use libc::size_t; extern "C" { - pub fn streamvbyte_delta_encode(data: *const u32, - num_els: u32, - output: *mut u8, - offset: u32) - -> size_t; + pub fn streamvbyte_delta_encode( + data: *const u32, + num_els: u32, + output: *mut u8, + offset: u32, + ) -> size_t; - pub fn streamvbyte_delta_decode(compressed_data: *const u8, - output: *mut u32, - num_els: u32, - offset: u32) - -> size_t; + pub fn streamvbyte_delta_decode( + compressed_data: *const u8, + output: *mut u32, + num_els: u32, + offset: u32, + ) -> size_t; pub fn streamvbyte_encode(data: *const u32, num_els: u32, output: *mut u8) -> size_t; - pub fn streamvbyte_decode(compressed_data: *const u8, - output: *mut u32, - num_els: usize) - -> size_t; + pub fn streamvbyte_decode( + compressed_data: *const u8, + output: *mut u32, + num_els: usize, + ) -> size_t; } } @@ -29,10 +32,12 @@ mod streamvbyte { #[inline(always)] pub(crate) fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] { let compress_length = unsafe { - streamvbyte::streamvbyte_delta_encode(input.as_ptr(), - input.len() as u32, - output.as_mut_ptr(), - offset) + streamvbyte::streamvbyte_delta_encode( + input.as_ptr(), + input.len() as u32, + output.as_mut_ptr(), + offset, + ) }; &output[..compress_length] } @@ -47,15 +52,18 @@ pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a } #[inline(always)] -pub(crate) fn uncompress_sorted<'a>(compressed_data: &'a [u8], - output: &mut [u32], - offset: u32) - -> 
usize { +pub(crate) fn uncompress_sorted<'a>( + compressed_data: &'a [u8], + output: &mut [u32], + offset: u32, +) -> usize { unsafe { - streamvbyte::streamvbyte_delta_decode(compressed_data.as_ptr(), - output.as_mut_ptr(), - output.len() as u32, - offset) + streamvbyte::streamvbyte_delta_decode( + compressed_data.as_ptr(), + output.as_mut_ptr(), + output.len() as u32, + offset, + ) } } diff --git a/src/core/index.rs b/src/core/index.rs index 01a0abe54..e4acb8a07 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -48,9 +48,10 @@ impl Index { pub fn create_in_ram(schema: Schema) -> Index { let ram_directory = RAMDirectory::create(); // unwrap is ok here - let directory = ManagedDirectory::new(ram_directory) - .expect("Creating a managed directory from a brand new RAM directory \ - should never fail."); + let directory = ManagedDirectory::new(ram_directory).expect( + "Creating a managed directory from a brand new RAM directory \ + should never fail.", + ); Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail") } @@ -127,10 +128,11 @@ impl Index { /// If the lockfile already exists, returns `Error::FileAlreadyExists`. /// # Panics /// If the heap size per thread is too small, panics. - pub fn writer_with_num_threads(&self, - num_threads: usize, - heap_size_in_bytes: usize) - -> Result { + pub fn writer_with_num_threads( + &self, + num_threads: usize, + heap_size_in_bytes: usize, + ) -> Result { open_index_writer(self, num_threads, heap_size_in_bytes) } @@ -155,10 +157,12 @@ impl Index { /// Returns the list of segments that are searchable pub fn searchable_segments(&self) -> Result> { - Ok(self.searchable_segment_metas()? - .into_iter() - .map(|segment_meta| self.segment(segment_meta)) - .collect()) + Ok( + self.searchable_segment_metas()? + .into_iter() + .map(|segment_meta| self.segment(segment_meta)) + .collect(), + ) } #[doc(hidden)] @@ -190,10 +194,12 @@ impl Index { /// Returns the list of segment ids that are searchable. pub fn searchable_segment_ids(&self) -> Result> { - Ok(self.searchable_segment_metas()? - .iter() - .map(|segment_meta| segment_meta.id()) - .collect()) + Ok( + self.searchable_segment_metas()? + .iter() + .map(|segment_meta| segment_meta.id()) + .collect(), + ) } /// Creates a new generation of searchers after @@ -203,10 +209,12 @@ impl Index { /// published or after a merge. 
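The `compression::vint` hunks above only reformat the codec, but the scheme itself is worth restating: sorted values are delta-encoded against a running `offset`, and each delta is written in 7-bit groups with the high bit marking a value's last byte. A minimal standalone sketch of encode and decode under that convention (the non-SIMD path; the streamvbyte path uses a different layout):

    /// Delta + variable-byte encoding of a sorted `u32` slice, in the
    /// spirit of the non-SIMD `compress_sorted`.
    fn vint_compress_sorted(input: &[u32], offset: u32) -> Vec<u8> {
        let mut out = Vec::new();
        let mut prev = offset;
        for &v in input {
            let mut delta = v - prev;
            prev = v;
            loop {
                let byte = (delta % 128) as u8;
                delta /= 128;
                if delta == 0 {
                    out.push(byte | 128); // high bit set: last byte of this value
                    break;
                }
                out.push(byte);
            }
        }
        out
    }

    /// Inverse of the above: rebuilds `num_els` values starting from `offset`.
    fn vint_uncompress_sorted(data: &[u8], num_els: usize, offset: u32) -> Vec<u32> {
        let mut out = Vec::with_capacity(num_els);
        let mut acc = offset;
        let mut bytes = data.iter();
        for _ in 0..num_els {
            let mut delta = 0u32;
            let mut shift = 0;
            for &byte in bytes.by_ref() {
                delta |= u32::from(byte & 127) << shift;
                if byte & 128 != 0 {
                    break;
                }
                shift += 7;
            }
            acc += delta;
            out.push(acc);
        }
        out
    }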
pub fn load_searchers(&self) -> Result<()> { let searchable_segments = self.searchable_segments()?; - let segment_readers: Vec = try!(searchable_segments - .into_iter() - .map(SegmentReader::open) - .collect()); + let segment_readers: Vec = try!( + searchable_segments + .into_iter() + .map(SegmentReader::open) + .collect() + ); let searchers = (0..NUM_SEARCHERS) .map(|_| Searcher::from(segment_readers.clone())) .collect(); diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs index 785846a0d..6eafddf77 100644 --- a/src/core/index_meta.rs +++ b/src/core/index_meta.rs @@ -9,7 +9,7 @@ use core::SegmentMeta; /// * the index docstamp /// * the schema /// -#[derive(Clone,Debug,Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct IndexMeta { pub segments: Vec, pub schema: Schema, diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index b44cc176c..06816f361 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -1,6 +1,5 @@ use directory::{SourceRead, ReadOnlySource}; use termdict::{TermDictionary, TermDictionaryImpl}; -use std::io; use postings::{SegmentPostings, BlockSegmentPostings}; use postings::TermInfo; use postings::SegmentPostingsOption; @@ -33,22 +32,21 @@ pub struct InvertedIndexReader { } impl InvertedIndexReader { - pub(crate) fn new( termdict_source: ReadOnlySource, postings_source: ReadOnlySource, positions_source: ReadOnlySource, delete_bitset: DeleteBitSet, schema: Schema, - ) -> io::Result { + ) -> InvertedIndexReader { - Ok(InvertedIndexReader { - termdict: TermDictionaryImpl::from_source(termdict_source)?, + InvertedIndexReader { + termdict: TermDictionaryImpl::from_source(termdict_source), postings_source: postings_source, positions_source: positions_source, delete_bitset: delete_bitset, schema: schema, - }) + } } /// Returns the term info associated with the term. @@ -72,9 +70,11 @@ impl InvertedIndexReader { /// # Warning /// /// This does not reset the positions list. - pub fn reset_block_postings_from_terminfo(&self, - term_info: &TermInfo, - block_postings: &mut BlockSegmentPostings) { + pub fn reset_block_postings_from_terminfo( + &self, + term_info: &TermInfo, + block_postings: &mut BlockSegmentPostings, + ) { let offset = term_info.postings_offset as usize; let end_source = self.postings_source.len(); let postings_slice = self.postings_source.slice(offset, end_source); @@ -88,27 +88,30 @@ impl InvertedIndexReader { /// This method is for an advanced usage only. /// /// Most user should prefer using `read_postings` instead. - pub fn read_block_postings_from_terminfo(&self, - term_info: &TermInfo, - option: SegmentPostingsOption) - -> BlockSegmentPostings { + pub fn read_block_postings_from_terminfo( + &self, + term_info: &TermInfo, + option: SegmentPostingsOption, + ) -> BlockSegmentPostings { let offset = term_info.postings_offset as usize; let postings_data = self.postings_source.slice_from(offset); let has_freq = option.has_freq(); BlockSegmentPostings::from_data( term_info.doc_freq as usize, SourceRead::from(postings_data), - has_freq) + has_freq, + ) } /// Returns a posting object given a `term_info`. /// This method is for an advanced usage only. /// /// Most user should prefer using `read_postings` instead. 
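`read_postings` clamps the caller's request to what the field actually indexed via `cmp::min(maximum_option, option)`. A small self-contained illustration of why a derived `Ord` on the option enum makes that clamp work; `PostingsOption` here is a stand-in for `SegmentPostingsOption`, not the crate's type:

    use std::cmp;

    /// The derived ordering follows declaration order, so "more data
    /// requested" compares greater and `cmp::min` yields the best effort
    /// the field can actually serve.
    #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
    enum PostingsOption {
        NoFreq,
        Freq,
        FreqAndPositions,
    }

    fn best_effort(requested: PostingsOption, indexed: PostingsOption) -> PostingsOption {
        cmp::min(requested, indexed)
    }

    fn main() {
        // Positions were requested but the field only indexed frequencies:
        assert_eq!(
            best_effort(PostingsOption::FreqAndPositions, PostingsOption::Freq),
            PostingsOption::Freq
        );
    }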
- pub fn read_postings_from_terminfo(&self, - term_info: &TermInfo, - option: SegmentPostingsOption) - -> SegmentPostings { + pub fn read_postings_from_terminfo( + &self, + term_info: &TermInfo, + option: SegmentPostingsOption, + ) -> SegmentPostings { let block_postings = self.read_block_postings_from_terminfo(term_info, option); let delete_bitset = self.delete_bitset.clone(); let position_stream = { @@ -118,16 +121,11 @@ impl InvertedIndexReader { let mut stream = CompressedIntStream::wrap(positions_source); stream.skip(term_info.positions_inner_offset as usize); Some(stream) - } - else { + } else { None } }; - SegmentPostings::from_block_postings( - block_postings, - delete_bitset, - position_stream - ) + SegmentPostings::from_block_postings(block_postings, delete_bitset, position_stream) } /// Returns the segment postings associated with the term, and with the given option, @@ -140,16 +138,20 @@ impl InvertedIndexReader { /// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a /// `TextIndexingOptions` that does not index position will return a `SegmentPostings` /// with `DocId`s and frequencies. - pub fn read_postings(&self, - term: &Term, - option: SegmentPostingsOption) - -> Option { + pub fn read_postings( + &self, + term: &Term, + option: SegmentPostingsOption, + ) -> Option { let field = term.field(); let field_entry = self.schema.get_field_entry(field); let term_info = get!(self.get_term_info(term)); let maximum_option = get!(field_entry.field_type().get_segment_postings_option()); let best_effort_option = cmp::min(maximum_option, option); - Some(self.read_postings_from_terminfo(&term_info, best_effort_option)) + Some(self.read_postings_from_terminfo( + &term_info, + best_effort_option, + )) } /// Returns the number of documents containing the term. diff --git a/src/core/pool.rs b/src/core/pool.rs index 805ea3467..1796fc32c 100644 --- a/src/core/pool.rs +++ b/src/core/pool.rs @@ -76,8 +76,11 @@ impl Pool { if former_generation >= generation { break; } - self.freshest_generation - .compare_and_swap(former_generation, generation, Ordering::SeqCst); + self.freshest_generation.compare_and_swap( + former_generation, + generation, + Ordering::SeqCst, + ); } } @@ -91,9 +94,9 @@ impl Pool { let gen_item = self.queue.pop(); if gen_item.generation >= generation { return LeasedItem { - gen_item: Some(gen_item), - recycle_queue: self.queue.clone(), - }; + gen_item: Some(gen_item), + recycle_queue: self.queue.clone(), + }; } else { // this searcher is obsolete, // removing it from the pool. 
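The `Pool` and `LeasedItem` hunks below are formatting-only, but the pattern they touch deserves a plain statement: a guard object borrows an item from a shared queue and hands it back in its `Drop` impl. A simplified sketch with a `Mutex<Vec<T>>` in place of the lock-free queue and without the generation bookkeeping:

    use std::sync::{Arc, Mutex};

    /// Guard that returns the borrowed item to the shared pool on drop,
    /// mirroring `LeasedItem`. (The real pool also tracks generations so
    /// stale searchers are discarded instead of recycled.)
    struct Lease<T> {
        item: Option<T>,
        pool: Arc<Mutex<Vec<T>>>,
    }

    impl<T> std::ops::Deref for Lease<T> {
        type Target = T;
        fn deref(&self) -> &T {
            self.item.as_ref().expect("leased item is present until drop")
        }
    }

    impl<T> Drop for Lease<T> {
        fn drop(&mut self) {
            if let Some(item) = self.item.take() {
                self.pool.lock().unwrap().push(item);
            }
        }
    }

    fn acquire<T>(pool: &Arc<Mutex<Vec<T>>>) -> Option<Lease<T>> {
        let item = pool.lock().unwrap().pop()?;
        Some(Lease { item: Some(item), pool: Arc::clone(pool) })
    }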
@@ -113,25 +116,26 @@ impl Deref for LeasedItem { fn deref(&self) -> &T { &self.gen_item - .as_ref() - .expect("Unwrapping a leased item should never fail") - .item // unwrap is safe here + .as_ref() + .expect("Unwrapping a leased item should never fail") + .item // unwrap is safe here } } impl DerefMut for LeasedItem { fn deref_mut(&mut self) -> &mut T { &mut self.gen_item - .as_mut() - .expect("Unwrapping a mut leased item should never fail") - .item // unwrap is safe here + .as_mut() + .expect("Unwrapping a mut leased item should never fail") + .item // unwrap is safe here } } impl Drop for LeasedItem { fn drop(&mut self) { - let gen_item: GenerationItem = mem::replace(&mut self.gen_item, None) - .expect("Unwrapping a leased item should never fail"); + let gen_item: GenerationItem = mem::replace(&mut self.gen_item, None).expect( + "Unwrapping a leased item should never fail", + ); self.recycle_queue.push(gen_item); } } diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 5afdc4684..14f1cb141 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -47,10 +47,7 @@ impl Searcher { self.segment_readers .iter() .map(|segment_reader| { - segment_reader - .inverted_index(term.field()) - .unwrap() // TODO error handling - .doc_freq(term) + segment_reader.inverted_index(term.field()).doc_freq(term) }) .fold(0u32, |acc, val| acc + val) } @@ -70,16 +67,13 @@ impl Searcher { query.search(self, collector) } - - /// - pub fn field(&self, field: Field) -> Result { + /// Return the field searcher associated to a `Field`. + pub fn field(&self, field: Field) -> FieldSearcher { let inv_index_readers = self.segment_readers .iter() - .map(|segment_reader| { - segment_reader.inverted_index(field) - }) - .collect::>>()?; - Ok(FieldSearcher::new(inv_index_readers)) + .map(|segment_reader| segment_reader.inverted_index(field)) + .collect::>(); + FieldSearcher::new(inv_index_readers) } } @@ -92,11 +86,8 @@ pub struct FieldSearcher { impl FieldSearcher { - fn new(inv_index_readers: Vec>) -> FieldSearcher { - FieldSearcher { - inv_index_readers: inv_index_readers, - } + FieldSearcher { inv_index_readers: inv_index_readers } } @@ -105,9 +96,7 @@ impl FieldSearcher { pub fn terms(&self) -> TermMerger { let term_streamers: Vec<_> = self.inv_index_readers .iter() - .map(|inverted_index| { - inverted_index.terms().stream() - }) + .map(|inverted_index| inverted_index.terms().stream()) .collect(); TermMerger::new(term_streamers) } diff --git a/src/core/segment.rs b/src/core/segment.rs index 16cb214d2..59b5eaa13 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -76,18 +76,20 @@ impl Segment { } /// Open one of the component file for a *regular* read. - pub fn open_read(&self, - component: SegmentComponent) - -> result::Result { + pub fn open_read( + &self, + component: SegmentComponent, + ) -> result::Result { let path = self.relative_path(component); let source = try!(self.index.directory().open_read(&path)); Ok(source) } /// Open one of the component file for *regular* write. 
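`FieldSearcher::terms` above feeds one sorted term stream per segment into a `TermMerger`, which is at heart a heap-based k-way merge. A standalone sketch over plain sorted string vectors; unlike the real merger it does not group identical terms coming from different segments:

    use std::cmp::Reverse;
    use std::collections::BinaryHeap;

    /// Merge several sorted streams by always popping the smallest head,
    /// then refilling the heap from the stream that produced it.
    fn merge_sorted<'a>(streams: Vec<Vec<&'a str>>) -> Vec<&'a str> {
        let mut iters: Vec<_> = streams.into_iter().map(|s| s.into_iter()).collect();
        let mut heap = BinaryHeap::new();
        for (ord, it) in iters.iter_mut().enumerate() {
            if let Some(term) = it.next() {
                heap.push(Reverse((term, ord)));
            }
        }
        let mut merged = Vec::new();
        while let Some(Reverse((term, ord))) = heap.pop() {
            merged.push(term);
            if let Some(next) = iters[ord].next() {
                heap.push(Reverse((next, ord)));
            }
        }
        merged
    }

For example, `merge_sorted(vec![vec!["a", "c"], vec!["b"]])` returns `["a", "b", "c"]`.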
- pub fn open_write(&mut self, - component: SegmentComponent) - -> result::Result { + pub fn open_write( + &mut self, + component: SegmentComponent, + ) -> result::Result { let path = self.relative_path(component); let write = try!(self.index.directory_mut().open_write(&path)); Ok(write) @@ -125,11 +127,11 @@ mod tests { { let _file_protection = segment.protect_from_delete(SegmentComponent::POSTINGS); assert!(directory.exists(&*path)); - directory.garbage_collect(|| { living_files.clone() }); + directory.garbage_collect(|| living_files.clone()); assert!(directory.exists(&*path)); } - directory.garbage_collect(|| { living_files }); + directory.garbage_collect(|| living_files); assert!(!directory.exists(&*path)); } diff --git a/src/core/segment_component.rs b/src/core/segment_component.rs index e4cbc0068..b460258c7 100644 --- a/src/core/segment_component.rs +++ b/src/core/segment_component.rs @@ -28,13 +28,15 @@ pub enum SegmentComponent { impl SegmentComponent { /// Iterates through the components. pub fn iterator() -> impl Iterator { - static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [SegmentComponent::POSTINGS, - SegmentComponent::POSITIONS, - SegmentComponent::FASTFIELDS, - SegmentComponent::FIELDNORMS, - SegmentComponent::TERMS, - SegmentComponent::STORE, - SegmentComponent::DELETE]; + static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [ + SegmentComponent::POSTINGS, + SegmentComponent::POSITIONS, + SegmentComponent::FASTFIELDS, + SegmentComponent::FIELDNORMS, + SegmentComponent::TERMS, + SegmentComponent::STORE, + SegmentComponent::DELETE, + ]; SEGMENT_COMPONENTS.into_iter() } } diff --git a/src/core/segment_meta.rs b/src/core/segment_meta.rs index 623b22442..1abe95652 100644 --- a/src/core/segment_meta.rs +++ b/src/core/segment_meta.rs @@ -64,16 +64,14 @@ impl SegmentMeta { pub fn relative_path(&self, component: SegmentComponent) -> PathBuf { let mut path = self.id().uuid_string(); path.push_str(&*match component { - SegmentComponent::POSITIONS => ".pos".to_string(), - SegmentComponent::POSTINGS => ".idx".to_string(), - SegmentComponent::TERMS => ".term".to_string(), - SegmentComponent::STORE => ".store".to_string(), - SegmentComponent::FASTFIELDS => ".fast".to_string(), - SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(), - SegmentComponent::DELETE => { - format!(".{}.del", self.delete_opstamp().unwrap_or(0)) - } - }); + SegmentComponent::POSITIONS => ".pos".to_string(), + SegmentComponent::POSTINGS => ".idx".to_string(), + SegmentComponent::TERMS => ".term".to_string(), + SegmentComponent::STORE => ".store".to_string(), + SegmentComponent::FASTFIELDS => ".fast".to_string(), + SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(), + SegmentComponent::DELETE => format!(".{}.del", self.delete_opstamp().unwrap_or(0)), + }); PathBuf::from(path) } @@ -111,8 +109,8 @@ impl SegmentMeta { #[doc(hidden)] pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) { self.deletes = Some(DeleteMeta { - num_deleted_docs: num_deleted_docs, - opstamp: opstamp, - }); + num_deleted_docs: num_deleted_docs, + opstamp: opstamp, + }); } } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 77195304e..c77c71a7b 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -4,7 +4,6 @@ use core::SegmentId; use core::SegmentComponent; use std::sync::RwLock; use common::HasLen; -use error::ErrorKind; use core::SegmentMeta; use fastfield::{self, FastFieldNotAvailableError}; use fastfield::DeleteBitSet; @@ -87,17 +86,17 @@ impl SegmentReader { /// /// 
# Panics /// May panic if the index is corrupted. - pub fn get_fast_field_reader - (&self, field: Field) -> fastfield::Result { + pub fn get_fast_field_reader( + &self, + field: Field, + ) -> fastfield::Result { let field_entry = self.schema.get_field_entry(field); if !TFastFieldReader::is_enabled(field_entry.field_type()) { Err(FastFieldNotAvailableError::new(field_entry)) } else { self.fast_fields_composite .open_read(field) - .ok_or_else(|| { - FastFieldNotAvailableError::new(field_entry) - }) + .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) .map(TFastFieldReader::open) } } @@ -111,9 +110,9 @@ impl SegmentReader { /// They are simply stored as a fast field, serialized in /// the `.fieldnorm` file of the segment. pub fn get_fieldnorms_reader(&self, field: Field) -> Option { - self.fieldnorms_composite - .open_read(field) - .map(U64FastFieldReader::open) + self.fieldnorms_composite.open_read(field).map( + U64FastFieldReader::open, + ) } /// Accessor to the segment's `StoreReader`. @@ -131,13 +130,12 @@ impl SegmentReader { let store_reader = StoreReader::from_source(store_source); let postings_source = segment.open_read(SegmentComponent::POSTINGS)?; - let postings_composite = CompositeFile::open(postings_source)?; + let postings_composite = CompositeFile::open(postings_source)?; let positions_composite = { if let Ok(source) = segment.open_read(SegmentComponent::POSITIONS) { CompositeFile::open(source)? - } - else { + } else { CompositeFile::empty() } }; @@ -159,17 +157,17 @@ impl SegmentReader { let schema = segment.schema(); Ok(SegmentReader { - inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())), - segment_meta: segment.meta().clone(), - termdict_composite: termdict_composite, - postings_composite: postings_composite, - fast_fields_composite: fast_fields_composite, - fieldnorms_composite: fieldnorms_composite, - segment_id: segment.id(), - store_reader: store_reader, - delete_bitset: delete_bitset, - positions_composite: positions_composite, - schema: schema, + inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())), + segment_meta: segment.meta().clone(), + termdict_composite: termdict_composite, + postings_composite: postings_composite, + fast_fields_composite: fast_fields_composite, + fieldnorms_composite: fieldnorms_composite, + segment_id: segment.id(), + store_reader: store_reader, + delete_bitset: delete_bitset, + positions_composite: positions_composite, + schema: schema, }) } @@ -179,32 +177,27 @@ impl SegmentReader { /// The field reader is in charge of iterating through the /// term dictionary associated to a specific field, /// and opening the posting list associated to any term. - pub fn inverted_index(&self, field: Field) -> Result> { - if let Some(inv_idx_reader) = self.inv_idx_reader_cache.read() - .expect("Lock poisoned. This should never happen") - .get(&field) { - return Ok(inv_idx_reader.clone()); + pub fn inverted_index(&self, field: Field) -> Arc { + if let Some(inv_idx_reader) = + self.inv_idx_reader_cache + .read() + .expect("Lock poisoned. This should never happen") + .get(&field) + { + inv_idx_reader.clone(); } - let termdict_source: ReadOnlySource = self.termdict_composite - .open_read(field) - .ok_or_else(|| { - ErrorKind::SchemaError( - format!("Could not find {:?} term dictionary", field) - ) - })?; + let termdict_source: ReadOnlySource = self.termdict_composite.open_read(field).expect( + "Index corrupted. 
Failed to open field term dictionary in composite file.", + ); - let postings_source = self.postings_composite - .open_read(field) - .ok_or_else(|| { - ErrorKind::SchemaError(format!("Could not find {:?} postings", field)) - })?; + let postings_source = self.postings_composite.open_read(field).expect( + "Index corrupted. Failed to open field postings in composite file.", + ); - let positions_source = self.positions_composite - .open_read(field) - .ok_or_else(|| { - ErrorKind::SchemaError(format!("Could not find {:?} positions", field)) - })?; + let positions_source = self.positions_composite.open_read(field).expect( + "Index corrupted. Failed to open field positions in composite file.", + ); let inv_idx_reader = Arc::new(InvertedIndexReader::new( termdict_source, @@ -212,15 +205,18 @@ impl SegmentReader { positions_source, self.delete_bitset.clone(), self.schema.clone(), - )?); + )); // by releasing the lock in between, we may end up opening the inverting index // twice, but this is fine. self.inv_idx_reader_cache .write() - .expect("Field reader cache lock poisoned. This should never happen.") + .expect( + "Field reader cache lock poisoned. This should never happen.", + ) .insert(field, inv_idx_reader.clone()); - Ok(inv_idx_reader) + + inv_idx_reader } /// Returns the document (or to be accurate, its stored field) diff --git a/src/datastruct/skip/skiplist_builder.rs b/src/datastruct/skip/skiplist_builder.rs index eaa439d08..af665ab3c 100644 --- a/src/datastruct/skip/skiplist_builder.rs +++ b/src/datastruct/skip/skiplist_builder.rs @@ -39,11 +39,11 @@ impl LayerBuilder { doc_id.serialize(&mut self.buffer)?; value.serialize(&mut self.buffer)?; Ok(if self.remaining == 0 { - self.remaining = self.period; - Some((doc_id, offset)) - } else { - None - }) + self.remaining = self.period; + Some((doc_id, offset)) + } else { + None + }) } } @@ -78,8 +78,10 @@ impl SkipListBuilder { loop { skip_pointer = match skip_pointer { Some((skip_doc_id, skip_offset)) => { - try!(self.get_skip_layer(layer_id) - .insert(skip_doc_id, &skip_offset)) + try!(self.get_skip_layer(layer_id).insert( + skip_doc_id, + &skip_offset, + )) } None => { return Ok(()); diff --git a/src/datastruct/stacker/hashmap.rs b/src/datastruct/stacker/hashmap.rs index 03f18ed51..c9054dff2 100644 --- a/src/datastruct/stacker/hashmap.rs +++ b/src/datastruct/stacker/hashmap.rs @@ -68,9 +68,14 @@ pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) { }; let table_num_bits: usize = (1..) 
.into_iter() - .take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit) + .take_while(|num_bits: &usize| { + compute_table_size(*num_bits) < table_size_limit + }) .last() - .expect(&format!("Per thread memory is too small: {}", per_thread_memory_budget)); + .expect(&format!( + "Per thread memory is too small: {}", + per_thread_memory_budget + )); let table_size = compute_table_size(table_num_bits); let heap_size = per_thread_memory_budget - table_size; (heap_size, table_num_bits) @@ -174,13 +179,10 @@ impl<'a> HashMap<'a> { } pub fn iter<'b: 'a>(&'b self) -> impl Iterator + 'b { - self.occupied - .iter() - .cloned() - .map(move |bucket: usize| { - let kv = self.table[bucket]; - self.get_key_value(kv.key_value_addr) - }) + self.occupied.iter().cloned().map(move |bucket: usize| { + let kv = self.table[bucket]; + self.get_key_value(kv.key_value_addr) + }) } @@ -282,8 +284,10 @@ mod tests { let s1 = "abcdef"; let s2 = "abcdeg"; for i in 0..5 { - assert_eq!(murmurhash2(&s1[i..5].as_bytes()), - murmurhash2(&s2[i..5].as_bytes())); + assert_eq!( + murmurhash2(&s1[i..5].as_bytes()), + murmurhash2(&s2[i..5].as_bytes()) + ); } } @@ -303,13 +307,13 @@ mod tests { let keys: Vec<&'static str> = vec!["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "]; b.iter(|| { - keys.iter() - .map(|&s| s.as_bytes()) - .map(murmurhash2::murmurhash2) - .map(|h| h as u64) - .last() - .unwrap() - }); + keys.iter() + .map(|&s| s.as_bytes()) + .map(murmurhash2::murmurhash2) + .map(|h| h as u64) + .last() + .unwrap() + }); } diff --git a/src/datastruct/stacker/heap.rs b/src/datastruct/stacker/heap.rs index 9d7a8f885..0bfd01fc2 100644 --- a/src/datastruct/stacker/heap.rs +++ b/src/datastruct/stacker/heap.rs @@ -144,7 +144,8 @@ impl InnerHeap { addr } else { if self.next_heap.is_none() { - info!(r#"Exceeded heap size. The segment will be committed right after indexing this document."#,); + info!(r#"Exceeded heap size. 
The segment will be committed right + after indexing this document."#,); self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize))); } self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len @@ -154,10 +155,9 @@ impl InnerHeap { fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] { let start = bytes_ref.0; if start >= self.buffer_len { - self.next_heap - .as_ref() - .unwrap() - .get_slice(BytesRef(start - self.buffer_len)) + self.next_heap.as_ref().unwrap().get_slice(BytesRef( + start - self.buffer_len, + )) } else { let start = start as usize; let len = NativeEndian::read_u16(&self.buffer[start..start + 2]) as usize; @@ -167,10 +167,10 @@ impl InnerHeap { fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] { if start >= self.buffer_len { - self.next_heap - .as_mut() - .unwrap() - .get_mut_slice(start - self.buffer_len, stop - self.buffer_len) + self.next_heap.as_mut().unwrap().get_mut_slice( + start - self.buffer_len, + stop - self.buffer_len, + ) } else { &mut self.buffer[start as usize..stop as usize] } @@ -188,10 +188,9 @@ impl InnerHeap { fn get_mut(&mut self, addr: u32) -> *mut u8 { if addr >= self.buffer_len { - self.next_heap - .as_mut() - .unwrap() - .get_mut(addr - self.buffer_len) + self.next_heap.as_mut().unwrap().get_mut( + addr - self.buffer_len, + ) } else { let addr_isize = addr as isize; unsafe { self.buffer.as_mut_ptr().offset(addr_isize) } @@ -200,10 +199,9 @@ impl InnerHeap { fn get_mut_ref(&mut self, addr: u32) -> &mut Item { if addr >= self.buffer_len { - self.next_heap - .as_mut() - .unwrap() - .get_mut_ref(addr - self.buffer_len) + self.next_heap.as_mut().unwrap().get_mut_ref( + addr - self.buffer_len, + ) } else { let v_ptr_u8 = self.get_mut(addr) as *mut u8; let v_ptr = v_ptr_u8 as *mut Item; @@ -213,10 +211,10 @@ impl InnerHeap { pub fn set(&mut self, addr: u32, val: &Item) { if addr >= self.buffer_len { - self.next_heap - .as_mut() - .unwrap() - .set(addr - self.buffer_len, val); + self.next_heap.as_mut().unwrap().set( + addr - self.buffer_len, + val, + ); } else { let v_ptr: *const Item = val as *const Item; let v_ptr_u8: *const u8 = v_ptr as *const u8; diff --git a/src/directory/error.rs b/src/directory/error.rs index d864012ea..73424f2e0 100644 --- a/src/directory/error.rs +++ b/src/directory/error.rs @@ -103,9 +103,11 @@ impl fmt::Display for OpenWriteError { write!(f, "the file '{:?}' already exists", path) } OpenWriteError::IOError(ref err) => { - write!(f, - "an io error occurred while opening a file for writing: '{}'", - err) + write!( + f, + "an io error occurred while opening a file for writing: '{}'", + err + ) } } } @@ -147,9 +149,11 @@ impl fmt::Display for OpenReadError { write!(f, "the file '{:?}' does not exist", path) } OpenReadError::IOError(ref err) => { - write!(f, - "an io error occurred while opening a file for reading: '{}'", - err) + write!( + f, + "an io error occurred while opening a file for reading: '{}'", + err + ) } } } diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index 5f4e7e773..8005c62b4 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -45,10 +45,9 @@ pub struct FileProtection { } fn unprotect_file_from_delete(directory: &ManagedDirectory, path: &Path) { - let mut meta_informations_wlock = directory - .meta_informations - .write() - .expect("Managed file lock poisoned"); + let mut meta_informations_wlock = directory.meta_informations.write().expect( + "Managed file lock poisoned", + ); if let 
Some(counter_ref_mut) = meta_informations_wlock.protected_files.get_mut(path) { (*counter_ref_mut) -= 1; } @@ -68,9 +67,10 @@ impl Drop for FileProtection { /// Saves the file containing the list of existing files /// that were created by tantivy. -fn save_managed_paths(directory: &mut Directory, - wlock: &RwLockWriteGuard) - -> io::Result<()> { +fn save_managed_paths( + directory: &mut Directory, + wlock: &RwLockWriteGuard, +) -> io::Result<()> { let mut w = serde_json::to_vec(&wlock.managed_paths)?; write!(&mut w, "\n")?; directory.atomic_write(&MANAGED_FILEPATH, &w[..])?; @@ -84,22 +84,22 @@ impl ManagedDirectory { Ok(data) => { let managed_files_json = String::from_utf8_lossy(&data); let managed_files: HashSet = - serde_json::from_str(&managed_files_json) - .chain_err(|| ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone()))?; + serde_json::from_str(&managed_files_json).chain_err(|| { + ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone()) + })?; Ok(ManagedDirectory { - directory: box directory, - meta_informations: Arc::new(RwLock::new(MetaInformation { - managed_paths: managed_files, - protected_files: - HashMap::default(), - })), - }) + directory: box directory, + meta_informations: Arc::new(RwLock::new(MetaInformation { + managed_paths: managed_files, + protected_files: HashMap::default(), + })), + }) } Err(OpenReadError::FileDoesNotExist(_)) => { Ok(ManagedDirectory { - directory: box directory, - meta_informations: Arc::default(), - }) + directory: box directory, + meta_informations: Arc::default(), + }) } Err(OpenReadError::IOError(e)) => Err(From::from(e)), } @@ -116,15 +116,14 @@ impl ManagedDirectory { /// If a file cannot be deleted (for permission reasons for instance) /// an error is simply logged, and the file remains in the list of managed /// files. - pub fn garbage_collect HashSet >(&mut self, get_living_files: L) { + pub fn garbage_collect HashSet>(&mut self, get_living_files: L) { info!("Garbage collect"); let mut files_to_delete = vec![]; { // releasing the lock as .delete() will use it too. - let meta_informations_rlock = - self.meta_informations - .read() - .expect("Managed directory rlock poisoned in garbage collect."); + let meta_informations_rlock = self.meta_informations.read().expect( + "Managed directory rlock poisoned in garbage collect.", + ); // It is crucial to get the living files after acquiring the // read lock of meta informations. That way, we @@ -177,9 +176,9 @@ impl ManagedDirectory { if !deleted_files.is_empty() { // update the list of managed files by removing // the file that were removed. 
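For reference, a minimal standalone Rust sketch of the garbage-collection contract implemented in the hunks around this point, with plain collections standing in for the directory. `ManagedSet`, `gc` and the `delete` closure are illustrative names only, not part of the patch; the real `ManagedDirectory` additionally skips files whose protection counter is non-zero and persists the managed list after the deletions.

use std::collections::HashSet;
use std::path::PathBuf;

struct ManagedSet {
    managed_paths: HashSet<PathBuf>,
}

impl ManagedSet {
    fn gc<L, D>(&mut self, get_living_files: L, mut delete: D)
    where
        L: FnOnce() -> HashSet<PathBuf>,
        D: FnMut(&PathBuf) -> bool, // true if the file could actually be deleted
    {
        let living_files = get_living_files();
        // Candidate files: managed paths that no living segment refers to.
        let files_to_delete: Vec<PathBuf> = self
            .managed_paths
            .iter()
            .filter(|path| !living_files.contains(path.as_path()))
            .cloned()
            .collect();
        for path in files_to_delete {
            // Keep the path in the managed list if deletion failed,
            // so that a later garbage collection can retry it.
            if delete(&path) {
                self.managed_paths.remove(&path);
            }
        }
    }
}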
- let mut meta_informations_wlock = self.meta_informations - .write() - .expect("Managed directory wlock poisoned (2)."); + let mut meta_informations_wlock = self.meta_informations.write().expect( + "Managed directory wlock poisoned (2).", + ); { let managed_paths_write = &mut meta_informations_wlock.managed_paths; for delete_file in &deleted_files { @@ -202,13 +201,13 @@ impl ManagedDirectory { pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection { let pathbuf = path.to_owned(); { - let mut meta_informations_wlock = self.meta_informations - .write() - .expect("Managed file lock poisoned on protect"); + let mut meta_informations_wlock = self.meta_informations.write().expect( + "Managed file lock poisoned on protect", + ); *meta_informations_wlock - .protected_files - .entry(pathbuf.clone()) - .or_insert(0) += 1; + .protected_files + .entry(pathbuf.clone()) + .or_insert(0) += 1; } FileProtection { directory: self.clone(), @@ -224,9 +223,9 @@ impl ManagedDirectory { /// will not lead to garbage files that will /// never get removed. fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> { - let mut meta_wlock = self.meta_informations - .write() - .expect("Managed file lock poisoned"); + let mut meta_wlock = self.meta_informations.write().expect( + "Managed file lock poisoned", + ); let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned()); if has_changed { save_managed_paths(self.directory.as_mut(), &meta_wlock)?; @@ -241,8 +240,9 @@ impl Directory for ManagedDirectory { } fn open_write(&mut self, path: &Path) -> result::Result { - self.register_file_as_managed(path) - .map_err(|e| IOError::with_path(path.to_owned(), e))?; + self.register_file_as_managed(path).map_err(|e| { + IOError::with_path(path.to_owned(), e) + })?; self.directory.open_write(path) } @@ -257,9 +257,9 @@ impl Directory for ManagedDirectory { fn delete(&self, path: &Path) -> result::Result<(), DeleteError> { { - let metas_rlock = self.meta_informations - .read() - .expect("poisoned lock in managed directory meta"); + let metas_rlock = self.meta_informations.read().expect( + "poisoned lock in managed directory meta", + ); if let Some(counter) = metas_rlock.protected_files.get(path) { if *counter > 0 { return Err(DeleteError::FileProtected(path.to_owned())); @@ -327,7 +327,7 @@ mod tests { { let living_files: HashSet = [TEST_PATH1.to_owned()].into_iter().cloned().collect(); - managed_directory.garbage_collect(|| { living_files }); + managed_directory.garbage_collect(|| living_files); } { assert!(managed_directory.exists(*TEST_PATH1)); @@ -343,7 +343,7 @@ mod tests { } { let living_files: HashSet = HashSet::new(); - managed_directory.garbage_collect(|| { living_files }); + managed_directory.garbage_collect(|| living_files); } { assert!(!managed_directory.exists(*TEST_PATH1)); @@ -366,7 +366,7 @@ mod tests { assert!(managed_directory.exists(*TEST_PATH1)); let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap(); - managed_directory.garbage_collect(|| { living_files.clone() }); + managed_directory.garbage_collect(|| living_files.clone()); if cfg!(target_os = "windows") { // On Windows, gc should try and fail the file as it is mmapped. assert!(managed_directory.exists(*TEST_PATH1)); @@ -374,7 +374,7 @@ mod tests { drop(_mmap_read); // The file should still be in the list of managed file and // eventually be deleted once mmap is released. 
- managed_directory.garbage_collect(|| { living_files }); + managed_directory.garbage_collect(|| living_files); assert!(!managed_directory.exists(*TEST_PATH1)); } else { assert!(!managed_directory.exists(*TEST_PATH1)); @@ -398,11 +398,11 @@ mod tests { { let _file_protection = managed_directory.protect_file_from_delete(*TEST_PATH1); - managed_directory.garbage_collect(|| { living_files.clone() }); + managed_directory.garbage_collect(|| living_files.clone()); assert!(managed_directory.exists(*TEST_PATH1)); } - managed_directory.garbage_collect(|| { living_files.clone() }); + managed_directory.garbage_collect(|| living_files.clone()); assert!(!managed_directory.exists(*TEST_PATH1)); diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index a3d5748b8..970b987cb 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -24,15 +24,17 @@ use std::sync::Weak; use tempdir::TempDir; fn open_mmap(full_path: &PathBuf) -> result::Result>, OpenReadError> { - let file = File::open(&full_path) - .map_err(|e| if e.kind() == io::ErrorKind::NotFound { - OpenReadError::FileDoesNotExist(full_path.clone()) - } else { - OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e)) - })?; + let file = File::open(&full_path).map_err(|e| if e.kind() == + io::ErrorKind::NotFound + { + OpenReadError::FileDoesNotExist(full_path.clone()) + } else { + OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e)) + })?; - let meta_data = file.metadata() - .map_err(|e| IOError::with_path(full_path.to_owned(), e))?; + let meta_data = file.metadata().map_err(|e| { + IOError::with_path(full_path.to_owned(), e) + })?; if meta_data.len() == 0 { // if the file size is 0, it will not be possible // to mmap the file, so we return an anonymous mmap_cache @@ -46,7 +48,7 @@ fn open_mmap(full_path: &PathBuf) -> result::Result>, OpenReadE } -#[derive(Default,Clone,Debug,Serialize,Deserialize)] +#[derive(Default, Clone, Debug, Serialize, Deserialize)] pub struct CacheCounters { // Number of time the cache prevents to call `mmap` pub hit: usize, @@ -58,7 +60,7 @@ pub struct CacheCounters { pub miss_weak: usize, } -#[derive(Clone,Debug,Serialize,Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct CacheInfo { pub counters: CacheCounters, pub mmapped: Vec, @@ -113,31 +115,31 @@ impl MmapCache { self.cleanup(); } Ok(match self.cache.entry(full_path.clone()) { - HashMapEntry::Occupied(mut occupied_entry) => { - if let Some(mmap_arc) = occupied_entry.get().upgrade() { - self.counters.hit += 1; - Some(mmap_arc.clone()) - } else { - // The entry exists but the weak ref has been destroyed. - self.counters.miss_weak += 1; - if let Some(mmap_arc) = open_mmap(&full_path)? { - occupied_entry.insert(Arc::downgrade(&mmap_arc)); - Some(mmap_arc) - } else { - None - } - } - } - HashMapEntry::Vacant(vacant_entry) => { - self.counters.miss_empty += 1; - if let Some(mmap_arc) = open_mmap(&full_path)? { - vacant_entry.insert(Arc::downgrade(&mmap_arc)); - Some(mmap_arc) - } else { - None - } - } - }) + HashMapEntry::Occupied(mut occupied_entry) => { + if let Some(mmap_arc) = occupied_entry.get().upgrade() { + self.counters.hit += 1; + Some(mmap_arc.clone()) + } else { + // The entry exists but the weak ref has been destroyed. + self.counters.miss_weak += 1; + if let Some(mmap_arc) = open_mmap(&full_path)? 
{ + occupied_entry.insert(Arc::downgrade(&mmap_arc)); + Some(mmap_arc) + } else { + None + } + } + } + HashMapEntry::Vacant(vacant_entry) => { + self.counters.miss_empty += 1; + if let Some(mmap_arc) = open_mmap(&full_path)? { + vacant_entry.insert(Arc::downgrade(&mmap_arc)); + Some(mmap_arc) + } else { + None + } + } + }) } } @@ -180,15 +182,19 @@ impl MmapDirectory { /// exist or if it is not a directory. pub fn open(directory_path: &Path) -> Result { if !directory_path.exists() { - Err(OpenDirectoryError::DoesNotExist(PathBuf::from(directory_path))) + Err(OpenDirectoryError::DoesNotExist( + PathBuf::from(directory_path), + )) } else if !directory_path.is_dir() { - Err(OpenDirectoryError::NotADirectory(PathBuf::from(directory_path))) + Err(OpenDirectoryError::NotADirectory( + PathBuf::from(directory_path), + )) } else { Ok(MmapDirectory { - root_path: PathBuf::from(directory_path), - mmap_cache: Arc::new(RwLock::new(MmapCache::default())), - _temp_directory: Arc::new(None), - }) + root_path: PathBuf::from(directory_path), + mmap_cache: Arc::new(RwLock::new(MmapCache::default())), + _temp_directory: Arc::new(None), + }) } } @@ -215,9 +221,9 @@ impl MmapDirectory { use std::os::windows::fs::OpenOptionsExt; use winapi::winbase; - open_opts - .write(true) - .custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS); + open_opts.write(true).custom_flags( + winbase::FILE_FLAG_BACKUP_SEMANTICS, + ); } let fd = try!(open_opts.open(&self.root_path)); @@ -270,46 +276,50 @@ impl Directory for MmapDirectory { debug!("Open Read {:?}", path); let full_path = self.resolve_path(path); - let mut mmap_cache = self.mmap_cache - .write() - .map_err(|_| { - let msg = format!("Failed to acquired write lock \ + let mut mmap_cache = self.mmap_cache.write().map_err(|_| { + let msg = format!( + "Failed to acquired write lock \ on mmap cache while reading {:?}", - path); - IOError::with_path(path.to_owned(), make_io_err(msg)) - })?; + path + ); + IOError::with_path(path.to_owned(), make_io_err(msg)) + })?; - Ok(mmap_cache - .get_mmap(full_path)? - .map(MmapReadOnly::from) - .map(ReadOnlySource::Mmap) - .unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty()))) + Ok( + mmap_cache + .get_mmap(full_path)? + .map(MmapReadOnly::from) + .map(ReadOnlySource::Mmap) + .unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())), + ) } fn open_write(&mut self, path: &Path) -> Result { debug!("Open Write {:?}", path); let full_path = self.resolve_path(path); - let open_res = OpenOptions::new() - .write(true) - .create_new(true) - .open(full_path); + let open_res = OpenOptions::new().write(true).create_new(true).open( + full_path, + ); - let mut file = open_res - .map_err(|err| if err.kind() == io::ErrorKind::AlreadyExists { - OpenWriteError::FileAlreadyExists(path.to_owned()) - } else { - IOError::with_path(path.to_owned(), err).into() - })?; + let mut file = open_res.map_err(|err| if err.kind() == + io::ErrorKind::AlreadyExists + { + OpenWriteError::FileAlreadyExists(path.to_owned()) + } else { + IOError::with_path(path.to_owned(), err).into() + })?; // making sure the file is created. - file.flush() - .map_err(|e| IOError::with_path(path.to_owned(), e))?; + file.flush().map_err( + |e| IOError::with_path(path.to_owned(), e), + )?; // Apparetntly, on some filesystem syncing the parent // directory is required. 
- self.sync_directory() - .map_err(|e| IOError::with_path(path.to_owned(), e))?; + self.sync_directory().map_err(|e| { + IOError::with_path(path.to_owned(), e) + })?; let writer = SafeFileWriter::new(file); Ok(BufWriter::new(Box::new(writer))) @@ -318,22 +328,23 @@ impl Directory for MmapDirectory { fn delete(&self, path: &Path) -> result::Result<(), DeleteError> { debug!("Deleting file {:?}", path); let full_path = self.resolve_path(path); - let mut mmap_cache = self.mmap_cache - .write() - .map_err(|_| { - let msg = format!("Failed to acquired write lock \ + let mut mmap_cache = self.mmap_cache.write().map_err(|_| { + let msg = format!( + "Failed to acquired write lock \ on mmap cache while deleting {:?}", - path); - IOError::with_path(path.to_owned(), make_io_err(msg)) - })?; + path + ); + IOError::with_path(path.to_owned(), make_io_err(msg)) + })?; // Removing the entry in the MMap cache. // The munmap will appear on Drop, // when the last reference is gone. mmap_cache.cache.remove(&full_path); match fs::remove_file(&full_path) { Ok(_) => { - self.sync_directory() - .map_err(|e| IOError::with_path(path.to_owned(), e).into()) + self.sync_directory().map_err(|e| { + IOError::with_path(path.to_owned(), e).into() + }) } Err(e) => { if e.kind() == io::ErrorKind::NotFound { @@ -355,8 +366,9 @@ impl Directory for MmapDirectory { let mut buffer = Vec::new(); match File::open(&full_path) { Ok(mut file) => { - file.read_to_end(&mut buffer) - .map_err(|e| IOError::with_path(path.to_owned(), e))?; + file.read_to_end(&mut buffer).map_err(|e| { + IOError::with_path(path.to_owned(), e) + })?; Ok(buffer) } Err(e) => { diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index 0f205c6f1..ca23bc07c 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -41,8 +41,10 @@ impl VecWriter { impl Drop for VecWriter { fn drop(&mut self) { if !self.is_flushed { - panic!("You forgot to flush {:?} before its writter got Drop. Do not rely on drop.", - self.path) + panic!( + "You forgot to flush {:?} before its writter got Drop. 
Do not rely on drop.", + self.path + ) } } } @@ -62,8 +64,10 @@ impl Write for VecWriter { fn flush(&mut self) -> io::Result<()> { self.is_flushed = true; - try!(self.shared_directory - .write(self.path.clone(), self.data.get_ref())); + try!(self.shared_directory.write( + self.path.clone(), + self.data.get_ref(), + )); Ok(()) } } @@ -79,11 +83,11 @@ impl InnerDirectory { } fn write(&self, path: PathBuf, data: &[u8]) -> io::Result { - let mut map = try!(self.0 - .write() - .map_err(|_| { - make_io_err(format!("Failed to lock the directory, when trying to write {:?}", - path)) + let mut map = try!(self.0.write().map_err(|_| { + make_io_err(format!( + "Failed to lock the directory, when trying to write {:?}", + path + )) })); let prev_value = map.insert(path, Arc::new(Vec::from(data))); Ok(prev_value.is_some()) @@ -93,17 +97,21 @@ impl InnerDirectory { self.0 .read() .map_err(|_| { - let msg = format!("Failed to acquire read lock for the \ + let msg = format!( + "Failed to acquire read lock for the \ directory when trying to read {:?}", - path); - let io_err = make_io_err(msg); - OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err)) - }) + path + ); + let io_err = make_io_err(msg); + OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err)) + }) .and_then(|readable_map| { readable_map .get(path) .ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path))) - .map(|data| ReadOnlySource::Anonymous(SharedVecSlice::new(data.clone()))) + .map(|data| { + ReadOnlySource::Anonymous(SharedVecSlice::new(data.clone())) + }) }) } @@ -111,16 +119,18 @@ impl InnerDirectory { self.0 .write() .map_err(|_| { - let msg = format!("Failed to acquire write lock for the \ + let msg = format!( + "Failed to acquire write lock for the \ directory when trying to delete {:?}", - path); - let io_err = make_io_err(msg); - DeleteError::IOError(IOError::with_path(path.to_owned(), io_err)) - }) + path + ); + let io_err = make_io_err(msg); + DeleteError::IOError(IOError::with_path(path.to_owned(), io_err)) + }) .and_then(|mut writable_map| match writable_map.remove(path) { - Some(_) => Ok(()), - None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))), - }) + Some(_) => Ok(()), + None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))), + }) } fn exists(&self, path: &Path) -> bool { @@ -164,9 +174,11 @@ impl Directory for RAMDirectory { let path_buf = PathBuf::from(path); let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone()); - let exists = self.fs - .write(path_buf.clone(), &Vec::new()) - .map_err(|err| IOError::with_path(path.to_owned(), err))?; + let exists = self.fs.write(path_buf.clone(), &Vec::new()).map_err( + |err| { + IOError::with_path(path.to_owned(), err) + }, + )?; // force the creation of the file to mimic the MMap directory. 
if exists { diff --git a/src/directory/read_only_source.rs b/src/directory/read_only_source.rs index 32423ff96..9b1506217 100644 --- a/src/directory/read_only_source.rs +++ b/src/directory/read_only_source.rs @@ -114,7 +114,7 @@ impl From> for ReadOnlySource { /// Acts as a owning cursor over the data backed up by a ReadOnlySource pub(crate) struct SourceRead { _data_owner: ReadOnlySource, - cursor: &'static [u8] + cursor: &'static [u8], } impl SourceRead { @@ -131,7 +131,6 @@ impl AsRef<[u8]> for SourceRead { } impl From for SourceRead { - // Creates a new `SourceRead` from a given `ReadOnlySource` fn from(source: ReadOnlySource) -> SourceRead { let len = source.len(); diff --git a/src/error.rs b/src/error.rs index 8b345717d..7a2db9d2b 100644 --- a/src/error.rs +++ b/src/error.rs @@ -112,12 +112,9 @@ impl From for Error { impl From for Error { fn from(error: OpenWriteError) -> Error { match error { - OpenWriteError::FileAlreadyExists(filepath) => { - ErrorKind::FileAlreadyExists(filepath) - } - OpenWriteError::IOError(io_error) => ErrorKind::IOError(io_error), - } - .into() + OpenWriteError::FileAlreadyExists(filepath) => ErrorKind::FileAlreadyExists(filepath), + OpenWriteError::IOError(io_error) => ErrorKind::IOError(io_error), + }.into() } } diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index cce503f21..3e83f239d 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -100,8 +100,7 @@ mod tests { { let composite_file = CompositeFile::open(source).unwrap(); let field_source = composite_file.open_read(*FIELD).unwrap(); - let fast_field_reader: U64FastFieldReader = - U64FastFieldReader::open(field_source); + let fast_field_reader: U64FastFieldReader = U64FastFieldReader::open(field_source); assert_eq!(fast_field_reader.get(0), 13u64); assert_eq!(fast_field_reader.get(1), 14u64); assert_eq!(fast_field_reader.get(2), 2u64); @@ -190,9 +189,11 @@ mod tests { // forcing the amplitude to be high add_single_field_doc(&mut fast_field_writers, *FIELD, 0u64); for i in 0u64..10_000u64 { - add_single_field_doc(&mut fast_field_writers, - *FIELD, - 5_000_000_000_000_000_000u64 + i); + add_single_field_doc( + &mut fast_field_writers, + *FIELD, + 5_000_000_000_000_000_000u64 + i, + ); } fast_field_writers.serialize(&mut serializer).unwrap(); serializer.close().unwrap(); @@ -208,8 +209,10 @@ mod tests { assert_eq!(fast_field_reader.get(0), 0u64); for doc in 1..10_001 { - assert_eq!(fast_field_reader.get(doc), - 5_000_000_000_000_000_000u64 + doc as u64 - 1u64); + assert_eq!( + fast_field_reader.get(doc), + 5_000_000_000_000_000_000u64 + doc as u64 - 1u64 + ); } } } @@ -339,13 +342,13 @@ mod tests { fn bench_intfastfield_veclookup(b: &mut Bencher) { let permutation = generate_permutation(); b.iter(|| { - let n = test::black_box(1000u32); - let mut a = 0u64; - for _ in 0u32..n { - a = permutation[a as usize]; - } - a - }); + let n = test::black_box(1000u32); + let mut a = 0u64; + for _ in 0u32..n { + a = permutation[a as usize]; + } + a + }); } #[bench] @@ -403,13 +406,13 @@ mod tests { U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap()); b.iter(|| { - let n = test::black_box(1000u32); - let mut a = 0u32; - for _ in 0u32..n { - a = fast_field_reader.get(a) as u32; - } - a - }); + let n = test::black_box(1000u32); + let mut a = 0u32; + for _ in 0u32..n { + a = fast_field_reader.get(a) as u32; + } + a + }); } } } diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 0c59cba05..8e37688e5 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs 
@@ -2,7 +2,7 @@ use directory::ReadOnlySource; use common::{self, BinarySerializable}; use common::bitpacker::{compute_num_bits, BitUnpacker}; use DocId; -use schema::{SchemaBuilder}; +use schema::SchemaBuilder; use std::path::Path; use schema::FAST; use directory::{WritePtr, RAMDirectory, Directory}; @@ -106,10 +106,10 @@ impl FastFieldReader for U64FastFieldReader { let amplitude: u64; { let mut cursor = data.as_slice(); - min_value = u64::deserialize(&mut cursor) - .expect("Failed to read the min_value of fast field."); - amplitude = u64::deserialize(&mut cursor) - .expect("Failed to read the amplitude of fast field."); + min_value = + u64::deserialize(&mut cursor).expect("Failed to read the min_value of fast field."); + amplitude = + u64::deserialize(&mut cursor).expect("Failed to read the amplitude of fast field."); } let max_value = min_value + amplitude; @@ -130,15 +130,14 @@ impl From> for U64FastFieldReader { let mut schema_builder = SchemaBuilder::default(); let field = schema_builder.add_u64_field("field", FAST); let schema = schema_builder.build(); - let path = Path::new("test"); + let path = Path::new("__dummy__"); let mut directory: RAMDirectory = RAMDirectory::create(); { - let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::from_write(write).unwrap(); + let write: WritePtr = directory.open_write(path).expect("With a RAMDirectory, this should never fail."); + let mut serializer = FastFieldSerializer::from_write(write).expect("With a RAMDirectory, this should never fail."); let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); - // TODO Error not unwrap { - let fast_field_writer = fast_field_writers.get_field_writer(field).unwrap(); + let fast_field_writer = fast_field_writers.get_field_writer(field).expect("With a RAMDirectory, this should never fail."); for val in vals { fast_field_writer.add_val(val); } @@ -147,13 +146,12 @@ impl From> for U64FastFieldReader { serializer.close().unwrap(); } - let source = directory - .open_read(path) - .expect("Failed to open the file"); - let composite_file = CompositeFile::open(source) - .expect("Failed to read the composite file"); + let source = directory.open_read(path).expect("Failed to open the file"); + let composite_file = + CompositeFile::open(source).expect("Failed to read the composite file"); - let field_source = composite_file.open_read(field) + let field_source = composite_file + .open_read(field) .expect("File component not found"); U64FastFieldReader::open(field_source) } diff --git a/src/fastfield/serializer.rs b/src/fastfield/serializer.rs index ce2184fde..d26366de0 100644 --- a/src/fastfield/serializer.rs +++ b/src/fastfield/serializer.rs @@ -31,30 +31,22 @@ pub struct FastFieldSerializer { } impl FastFieldSerializer { - - /// Constructor pub fn from_write(write: WritePtr) -> io::Result { // just making room for the pointer to header. 
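As a rough sketch, with illustrative names only: the fast field layout that the reader and serializer in these hunks agree on stores a minimum value and an amplitude up front, and each document's value as an offset from that minimum (bit-packed in the real code, a plain `Vec<u64>` here). The amplitude bounds the largest offset and therefore the number of bits each packed value needs.

/// Encode a column of values as (min_value, amplitude, per-document offsets).
fn encode_column(vals: &[u64]) -> (u64, u64, Vec<u64>) {
    let min_value = vals.iter().copied().min().unwrap_or(0);
    let max_value = vals.iter().copied().max().unwrap_or(0);
    let amplitude = max_value - min_value;
    let offsets = vals.iter().map(|&val| val - min_value).collect();
    (min_value, amplitude, offsets)
}

/// Reading a value back is then just `min_value + offset`.
fn get_value(min_value: u64, offsets: &[u64], doc: usize) -> u64 {
    min_value + offsets[doc]
}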
let composite_write = CompositeWrite::wrap(write); - Ok(FastFieldSerializer { - composite_write: composite_write, - }) + Ok(FastFieldSerializer { composite_write: composite_write }) } /// Start serializing a new u64 fast field - pub fn new_u64_fast_field(&mut self, - field: Field, - min_value: u64, - max_value: u64) - -> io::Result>> { - let field_write = self - .composite_write - .for_field(field); - FastSingleFieldSerializer::open( - field_write, - min_value, - max_value) + pub fn new_u64_fast_field( + &mut self, + field: Field, + min_value: u64, + max_value: u64, + ) -> io::Result>> { + let field_write = self.composite_write.for_field(field); + FastSingleFieldSerializer::open(field_write, min_value, max_value) } @@ -73,10 +65,11 @@ pub struct FastSingleFieldSerializer<'a, W: Write + 'a> { } impl<'a, W: Write> FastSingleFieldSerializer<'a, W> { - - fn open(write: &'a mut W, - min_value: u64, - max_value: u64) -> io::Result> { + fn open( + write: &'a mut W, + min_value: u64, + max_value: u64, + ) -> io::Result> { min_value.serialize(write)?; let amplitude = max_value - min_value; amplitude.serialize(write)?; diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 1427a7b36..1750f90ca 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -58,9 +58,9 @@ impl FastFieldsWriter { /// Get the `FastFieldWriter` associated to a field. pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> { // TODO optimize - self.field_writers - .iter_mut() - .find(|field_writer| field_writer.field == field) + self.field_writers.iter_mut().find(|field_writer| { + field_writer.field == field + }) } @@ -155,9 +155,9 @@ impl IntFastFieldWriter { /// associated to the document with the `DocId` n. /// (Well, `n-1` actually because of 0-indexing) pub fn add_val(&mut self, val: u64) { - VInt(val) - .serialize(&mut self.vals) - .expect("unable to serialize VInt to Vec"); + VInt(val).serialize(&mut self.vals).expect( + "unable to serialize VInt to Vec", + ); if val > self.val_max { self.val_max = val; diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index fc22dbc84..da09c49c9 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -40,9 +40,9 @@ impl DeleteQueue { { let mut delete_queue_wlock = delete_queue.inner.write().unwrap(); delete_queue_wlock.last_block = Some(Arc::new(Block { - operations: Arc::default(), - next: next_block, - })); + operations: Arc::default(), + next: next_block, + })); } delete_queue @@ -59,9 +59,11 @@ impl DeleteQueue { .expect("Read lock poisoned when opening delete queue cursor") .last_block .clone() - .expect("Failed to unwrap last_block. This should never happen + .expect( + "Failed to unwrap last_block. This should never happen as the Option<> is only here to make - initialization possible"); + initialization possible", + ); let operations_len = last_block.operations.len(); DeleteCursor { block: last_block, @@ -92,9 +94,9 @@ impl DeleteQueue { // be some unflushed operations. 
// fn flush(&self) -> Option> { - let mut self_wlock = self.inner - .write() - .expect("Failed to acquire write lock on delete queue writer"); + let mut self_wlock = self.inner.write().expect( + "Failed to acquire write lock on delete queue writer", + ); let delete_operations; { @@ -108,9 +110,9 @@ impl DeleteQueue { let next_block = NextBlock::from(self.clone()); { self_wlock.last_block = Some(Arc::new(Block { - operations: Arc::new(delete_operations), - next: next_block, - })); + operations: Arc::new(delete_operations), + next: next_block, + })); } self_wlock.last_block.clone() } @@ -132,18 +134,18 @@ impl From for NextBlock { impl NextBlock { fn next_block(&self) -> Option> { { - let next_read_lock = self.0 - .read() - .expect("Failed to acquire write lock in delete queue"); + let next_read_lock = self.0.read().expect( + "Failed to acquire write lock in delete queue", + ); if let InnerNextBlock::Closed(ref block) = *next_read_lock { return Some(block.clone()); } } let next_block; { - let mut next_write_lock = self.0 - .write() - .expect("Failed to acquire write lock in delete queue"); + let mut next_write_lock = self.0.write().expect( + "Failed to acquire write lock in delete queue", + ); match *next_write_lock { InnerNextBlock::Closed(ref block) => { return Some(block.clone()); diff --git a/src/indexer/doc_opstamp_mapping.rs b/src/indexer/doc_opstamp_mapping.rs index 03556ef17..e7d277f00 100644 --- a/src/indexer/doc_opstamp_mapping.rs +++ b/src/indexer/doc_opstamp_mapping.rs @@ -56,8 +56,10 @@ mod tests { #[test] fn test_doc_to_opstamp_mapping_none() { let doc_to_opstamp_mapping = DocToOpstampMapping::None; - assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(1), - u32::max_value()); + assert_eq!( + doc_to_opstamp_mapping.compute_doc_limit(1), + u32::max_value() + ); } #[test] diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 26e6c6330..57acc00a7 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -102,14 +102,17 @@ impl !Sync for IndexWriter {} /// If the lockfile already exists, returns `Error::FileAlreadyExists`. /// # Panics /// If the heap size per thread is too small, panics. -pub fn open_index_writer(index: &Index, - num_threads: usize, - heap_size_in_bytes_per_thread: usize) - -> Result { +pub fn open_index_writer( + index: &Index, + num_threads: usize, + heap_size_in_bytes_per_thread: usize, +) -> Result { if heap_size_in_bytes_per_thread <= HEAP_SIZE_LIMIT as usize { - panic!(format!("The heap size per thread needs to be at least {}.", - HEAP_SIZE_LIMIT)); + panic!(format!( + "The heap size per thread needs to be at least {}.", + HEAP_SIZE_LIMIT + )); } let directory_lock = DirectoryLock::lock(index.directory().box_clone())?; @@ -156,12 +159,13 @@ pub fn open_index_writer(index: &Index, -pub fn compute_deleted_bitset(delete_bitset: &mut BitSet, - segment_reader: &SegmentReader, - delete_cursor: &mut DeleteCursor, - doc_opstamps: &DocToOpstampMapping, - target_opstamp: u64) - -> Result { +pub fn compute_deleted_bitset( + delete_bitset: &mut BitSet, + segment_reader: &SegmentReader, + delete_cursor: &mut DeleteCursor, + doc_opstamps: &DocToOpstampMapping, + target_opstamp: u64, +) -> Result { let mut might_have_changed = false; @@ -177,9 +181,12 @@ pub fn compute_deleted_bitset(delete_bitset: &mut BitSet, // Limit doc helps identify the first document // that may be affected by the delete operation. 
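A minimal sketch of the delete-application logic around this point, with plain types in place of the delete cursor, the posting list and the bit set (all names below are illustrative): a delete operation may only affect documents that were indexed before it, which is what the `limit_doc` derived from the opstamp mapping enforces.

struct DeleteOp {
    opstamp: u64,
    matching_docs: Vec<u32>, // doc ids containing the deleted term
}

/// Returns the sorted, deduplicated list of deleted doc ids.
fn compute_deleted_docs(
    delete_ops: &[DeleteOp],
    target_opstamp: u64,
    doc_limit_for_opstamp: impl Fn(u64) -> u32,
) -> Vec<u32> {
    let mut deleted = Vec::new();
    for op in delete_ops
        .iter()
        .take_while(|op| op.opstamp <= target_opstamp)
    {
        // Only documents added before this delete operation may be affected.
        let limit_doc = doc_limit_for_opstamp(op.opstamp);
        for &doc in &op.matching_docs {
            if doc < limit_doc {
                deleted.push(doc);
            }
        }
    }
    deleted.sort_unstable();
    deleted.dedup();
    deleted
}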
let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp); - let inverted_index = segment_reader.inverted_index(delete_op.term.field())?; - if let Some(mut docset) = - inverted_index.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) { + let inverted_index = segment_reader.inverted_index(delete_op.term.field()); + if let Some(mut docset) = inverted_index.read_postings( + &delete_op.term, + SegmentPostingsOption::NoFreq, + ) + { while docset.advance() { let deleted_doc = docset.doc(); if deleted_doc < limit_doc { @@ -199,10 +206,11 @@ pub fn compute_deleted_bitset(delete_bitset: &mut BitSet, /// Advance delete for the given segment up /// to the target opstamp. -pub fn advance_deletes(mut segment: Segment, - segment_entry: &mut SegmentEntry, - target_opstamp: u64) - -> Result> { +pub fn advance_deletes( + mut segment: Segment, + segment_entry: &mut SegmentEntry, + target_opstamp: u64, +) -> Result> { let mut file_protect: Option = None; @@ -223,11 +231,13 @@ pub fn advance_deletes(mut segment: Segment, let delete_cursor = segment_entry.delete_cursor(); - compute_deleted_bitset(&mut delete_bitset, - &segment_reader, - delete_cursor, - &DocToOpstampMapping::None, - target_opstamp)?; + compute_deleted_bitset( + &mut delete_bitset, + &segment_reader, + delete_cursor, + &DocToOpstampMapping::None, + target_opstamp, + )?; for doc in 0u32..max_doc { if segment_reader.is_deleted(doc) { @@ -248,15 +258,16 @@ pub fn advance_deletes(mut segment: Segment, Ok(file_protect) } -fn index_documents(heap: &mut Heap, - table_size: usize, - segment: Segment, - schema: &Schema, - generation: usize, - document_iterator: &mut Iterator, - segment_updater: &mut SegmentUpdater, - mut delete_cursor: DeleteCursor) - -> Result { +fn index_documents( + heap: &mut Heap, + table_size: usize, + segment: Segment, + schema: &Schema, + generation: usize, + document_iterator: &mut Iterator, + segment_updater: &mut SegmentUpdater, + mut delete_cursor: DeleteCursor, +) -> Result { heap.clear(); let segment_id = segment.id(); let mut segment_writer = SegmentWriter::for_segment(heap, table_size, segment.clone(), schema)?; @@ -266,8 +277,10 @@ fn index_documents(heap: &mut Heap, // One is the memory arena dedicated to the segment is // getting full. if segment_writer.is_buffer_full() { - info!("Buffer limit reached, flushing segment with maxdoc={}.", - segment_writer.max_doc()); + info!( + "Buffer limit reached, flushing segment with maxdoc={}.", + segment_writer.max_doc() + ); break; } // The second is the term dictionary hash table @@ -276,8 +289,10 @@ fn index_documents(heap: &mut Heap, // Tantivy does not resize its hashtable. When it reaches // capacity, we just stop indexing new document. 
if segment_writer.is_term_saturated() { - info!("Term dic saturated, flushing segment with maxdoc={}.", - segment_writer.max_doc()); + info!( + "Term dic saturated, flushing segment with maxdoc={}.", + segment_writer.max_doc() + ); break; } } @@ -297,11 +312,13 @@ fn index_documents(heap: &mut Heap, let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps); let segment_reader = SegmentReader::open(segment)?; let mut deleted_bitset = BitSet::with_capacity(num_docs as usize); - let may_have_deletes = compute_deleted_bitset(&mut deleted_bitset, - &segment_reader, - &mut delete_cursor, - &doc_to_opstamps, - last_docstamp)?; + let may_have_deletes = compute_deleted_bitset( + &mut deleted_bitset, + &segment_reader, + &mut delete_cursor, + &doc_to_opstamps, + last_docstamp, + )?; let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, { if may_have_deletes { @@ -328,14 +345,15 @@ impl IndexWriter { join_handle .join() .expect("Indexing Worker thread panicked") - .chain_err(|| ErrorKind::ErrorInThread("Error in indexing worker thread.".into()))?; + .chain_err(|| { + ErrorKind::ErrorInThread("Error in indexing worker thread.".into()) + })?; } drop(self.workers_join_handle); - let result = - self.segment_updater - .wait_merging_thread() - .chain_err(|| ErrorKind::ErrorInThread("Failed to join merging thread.".into())); + let result = self.segment_updater.wait_merging_thread().chain_err(|| { + ErrorKind::ErrorInThread("Failed to join merging thread.".into()) + }); if let Err(ref e) = result { error!("Some merging thread failed {:?}", e); @@ -348,8 +366,10 @@ impl IndexWriter { pub fn add_segment(&mut self, segment_meta: SegmentMeta) { let delete_cursor = self.delete_queue.cursor(); let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None); - self.segment_updater - .add_segment(self.generation, segment_entry); + self.segment_updater.add_segment( + self.generation, + segment_entry, + ); } #[doc(hidden)] @@ -373,7 +393,11 @@ impl IndexWriter { let mut delete_cursor = self.delete_queue.cursor(); let join_handle: JoinHandle> = thread::Builder::new() - .name(format!("indexing thread {} for gen {}", self.worker_id, generation)) + .name(format!( + "indexing thread {} for gen {}", + self.worker_id, + generation + )) .spawn(move || { loop { @@ -397,14 +421,16 @@ impl IndexWriter { return Ok(()); } let segment = segment_updater.new_segment(); - index_documents(&mut heap, - table_size, - segment, - &schema, - generation, - &mut document_iterator, - &mut segment_updater, - delete_cursor.clone())?; + index_documents( + &mut heap, + table_size, + segment, + &schema, + generation, + &mut document_iterator, + &mut segment_updater, + delete_cursor.clone(), + )?; } })?; @@ -437,9 +463,10 @@ impl IndexWriter { } /// Merges a given list of segments - pub fn merge(&mut self, - segment_ids: &[SegmentId]) - -> impl Future { + pub fn merge( + &mut self, + segment_ids: &[SegmentId], + ) -> impl Future { self.segment_updater.start_merge(segment_ids) } @@ -523,14 +550,15 @@ impl IndexWriter { self.recreate_document_channel(); let mut former_workers_join_handle = Vec::new(); - swap(&mut former_workers_join_handle, - &mut self.workers_join_handle); + swap( + &mut former_workers_join_handle, + &mut self.workers_join_handle, + ); for worker_handle in former_workers_join_handle { - let indexing_worker_result = - worker_handle - .join() - .map_err(|e| Error::from_kind(ErrorKind::ErrorInThread(format!("{:?}", e))))?; + let indexing_worker_result = worker_handle.join().map_err(|e| { + 
Error::from_kind(ErrorKind::ErrorInThread(format!("{:?}", e))) + })?; indexing_worker_result?; // add a new worker for the next generation. @@ -624,13 +652,17 @@ mod tests { let schema_builder = schema::SchemaBuilder::default(); let index = Index::create_in_ram(schema_builder.build()); let index_writer = index.writer(40_000_000).unwrap(); - assert_eq!(format!("{:?}", index_writer.get_merge_policy()), - "LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \ - level_log_size: 0.75 }"); + assert_eq!( + format!("{:?}", index_writer.get_merge_policy()), + "LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \ + level_log_size: 0.75 }" + ); let merge_policy = box NoMergePolicy::default(); index_writer.set_merge_policy(merge_policy); - assert_eq!(format!("{:?}", index_writer.get_merge_policy()), - "NoMergePolicy"); + assert_eq!( + format!("{:?}", index_writer.get_merge_policy()), + "NoMergePolicy" + ); } #[test] @@ -720,9 +752,9 @@ mod tests { } // this should create 8 segments and trigger a merge. index_writer.commit().expect("commit failed"); - index_writer - .wait_merging_threads() - .expect("waiting merging thread failed"); + index_writer.wait_merging_threads().expect( + "waiting merging thread failed", + ); index.load_searchers().unwrap(); assert_eq!(num_docs_containing("a"), 200); diff --git a/src/indexer/log_merge_policy.rs b/src/indexer/log_merge_policy.rs index 47f496998..c5e55d41c 100644 --- a/src/indexer/log_merge_policy.rs +++ b/src/indexer/log_merge_policy.rs @@ -62,7 +62,9 @@ impl MergePolicy for LogMergePolicy { let size_sorted_log_tuples: Vec<_> = size_sorted_tuples .into_iter() - .map(|(ind, num_docs)| (ind, (self.clip_min_size(num_docs) as f64).log2())) + .map(|(ind, num_docs)| { + (ind, (self.clip_min_size(num_docs) as f64).log2()) + }) .collect(); let (first_ind, first_score) = size_sorted_log_tuples[0]; @@ -79,7 +81,9 @@ impl MergePolicy for LogMergePolicy { levels .iter() .filter(|level| level.len() >= self.min_merge_size) - .map(|ind_vec| MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect())) + .map(|ind_vec| { + MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect()) + }) .collect() } @@ -134,12 +138,14 @@ mod tests { #[test] fn test_log_merge_policy_levels() { // multiple levels all get merged correctly - let test_input = vec![seg_meta(10), - seg_meta(10), - seg_meta(10), - seg_meta(1000), - seg_meta(1000), - seg_meta(1000)]; + let test_input = vec![ + seg_meta(10), + seg_meta(10), + seg_meta(10), + seg_meta(1000), + seg_meta(1000), + seg_meta(1000), + ]; let result_list = test_merge_policy().compute_merge_candidates(&test_input); assert_eq!(result_list.len(), 2); } @@ -147,24 +153,28 @@ mod tests { #[test] fn test_log_merge_policy_within_levels() { // multiple levels all get merged correctly - let test_input = vec![seg_meta(10), - seg_meta(11), - seg_meta(12), - seg_meta(1000), - seg_meta(1000), - seg_meta(1000)]; + let test_input = vec![ + seg_meta(10), + seg_meta(11), + seg_meta(12), + seg_meta(1000), + seg_meta(1000), + seg_meta(1000), + ]; let result_list = test_merge_policy().compute_merge_candidates(&test_input); assert_eq!(result_list.len(), 2); } #[test] fn test_log_merge_policy_small_segments() { // multiple levels all get merged correctly - let test_input = vec![seg_meta(1), - seg_meta(1), - seg_meta(1), - seg_meta(2), - seg_meta(2), - seg_meta(2)]; + let test_input = vec![ + seg_meta(1), + seg_meta(1), + seg_meta(1), + seg_meta(2), + seg_meta(2), + seg_meta(2), + ]; let result_list = 
test_merge_policy().compute_merge_candidates(&test_input); assert_eq!(result_list.len(), 1); } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 8e26784da..594f952d5 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -28,10 +28,11 @@ pub struct IndexMerger { } -fn compute_min_max_val(u64_reader: &U64FastFieldReader, - max_doc: DocId, - delete_bitset: &DeleteBitSet) - -> Option<(u64, u64)> { +fn compute_min_max_val( + u64_reader: &U64FastFieldReader, + max_doc: DocId, + delete_bitset: &DeleteBitSet, +) -> Option<(u64, u64)> { if max_doc == 0 { None } else if !delete_bitset.has_deletes() { @@ -49,17 +50,18 @@ fn compute_min_max_val(u64_reader: &U64FastFieldReader, } } -fn extract_fieldnorm_reader(segment_reader: &SegmentReader, - field: Field) - -> Option { +fn extract_fieldnorm_reader( + segment_reader: &SegmentReader, + field: Field, +) -> Option { segment_reader.get_fieldnorms_reader(field) } -fn extract_fast_field_reader(segment_reader: &SegmentReader, - field: Field) - -> Option { - segment_reader.get_fast_field_reader(field) - .ok() +fn extract_fast_field_reader( + segment_reader: &SegmentReader, + field: Field, +) -> Option { + segment_reader.get_fast_field_reader(field).ok() } @@ -100,10 +102,10 @@ impl IndexMerger { } } Ok(IndexMerger { - schema: schema, - readers: readers, - max_doc: max_doc, - }) + schema: schema, + readers: readers, + max_doc: max_doc, + }) } fn write_fieldnorms(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> { @@ -114,9 +116,11 @@ impl IndexMerger { .filter(|&(_, field_entry)| field_entry.is_indexed()) .map(|(field_id, _)| Field(field_id as u32)) .collect(); - self.generic_write_fast_field(fieldnorm_fastfields, - &extract_fieldnorm_reader, - fast_field_serializer) + self.generic_write_fast_field( + fieldnorm_fastfields, + &extract_fieldnorm_reader, + fast_field_serializer, + ) } fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> { @@ -127,19 +131,21 @@ impl IndexMerger { .filter(|&(_, field_entry)| field_entry.is_int_fast()) .map(|(field_id, _)| Field(field_id as u32)) .collect(); - self.generic_write_fast_field(fast_fields, - &extract_fast_field_reader, - fast_field_serializer) + self.generic_write_fast_field( + fast_fields, + &extract_fast_field_reader, + fast_field_serializer, + ) } // used both to merge field norms and regular u64 fast fields. 
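A minimal sketch of the `compute_min_max_val` contract defined above, with closures standing in for the fast field reader and the delete bitset: the bounds are taken over surviving documents only, and `None` is returned when no document survives. In the merge code that follows, these per-segment bounds are folded into a global minimum and maximum before `new_u64_fast_field` is called.

fn min_max_non_deleted(
    max_doc: u32,
    get: impl Fn(u32) -> u64,
    is_deleted: impl Fn(u32) -> bool,
) -> Option<(u64, u64)> {
    (0..max_doc)
        .filter(|&doc| !is_deleted(doc))
        .map(|doc| get(doc))
        .fold(None, |acc, val| match acc {
            None => Some((val, val)),
            Some((min_val, max_val)) => Some((min_val.min(val), max_val.max(val))),
        })
}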
- fn generic_write_fast_field(&self, - fields: Vec, - field_reader_extractor: &Fn(&SegmentReader, Field) - -> Option, - fast_field_serializer: &mut FastFieldSerializer) - -> Result<()> { + fn generic_write_fast_field( + &self, + fields: Vec, + field_reader_extractor: &Fn(&SegmentReader, Field) -> Option, + fast_field_serializer: &mut FastFieldSerializer, + ) -> Result<()> { for field in fields { @@ -151,19 +157,25 @@ impl IndexMerger { match field_reader_extractor(reader, field) { Some(u64_reader) => { if let Some((seg_min_val, seg_max_val)) = - compute_min_max_val(&u64_reader, - reader.max_doc(), - reader.delete_bitset()) { + compute_min_max_val( + &u64_reader, + reader.max_doc(), + reader.delete_bitset(), + ) + { // the segment has some non-deleted documents min_val = min(min_val, seg_min_val); max_val = max(max_val, seg_max_val); - u64_readers - .push((reader.max_doc(), u64_reader, reader.delete_bitset())); + u64_readers.push(( + reader.max_doc(), + u64_reader, + reader.delete_bitset(), + )); } } None => { - let error_msg = format!("Failed to find a u64_reader for field {:?}", - field); + let error_msg = + format!("Failed to find a u64_reader for field {:?}", field); error!("{}", error_msg); bail!(ErrorKind::SchemaError(error_msg)); } @@ -179,8 +191,11 @@ impl IndexMerger { assert!(min_val <= max_val); - let mut fast_single_field_serializer = fast_field_serializer - .new_u64_fast_field(field, min_val, max_val)?; + let mut fast_single_field_serializer = fast_field_serializer.new_u64_fast_field( + field, + min_val, + max_val, + )?; for (max_doc, u64_reader, delete_bitset) in u64_readers { for doc_id in 0..max_doc { if !delete_bitset.is_deleted(doc_id) { @@ -199,9 +214,8 @@ impl IndexMerger { let mut delta_computer = DeltaComputer::new(); - let mut indexed_fields = vec!(); + let mut indexed_fields = vec![]; for (field_ord, field_entry) in self.schema.fields().iter().enumerate() { - // if field_entry if field_entry.is_indexed() { indexed_fields.push(Field(field_ord as u32)); } @@ -211,9 +225,8 @@ impl IndexMerger { let field_readers = self.readers .iter() - .map(|reader| - reader.inverted_index(indexed_field)) - .collect::>>()?; + .map(|reader| reader.inverted_index(indexed_field)) + .collect::>(); let field_term_streams = field_readers .iter() @@ -224,7 +237,8 @@ impl IndexMerger { let mut max_doc = 0; // map from segment doc ids to the resulting merged segment doc id. - let mut merged_doc_id_map: Vec>> = Vec::with_capacity(self.readers.len()); + let mut merged_doc_id_map: Vec>> = + Vec::with_capacity(self.readers.len()); for reader in &self.readers { let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize); @@ -258,8 +272,10 @@ impl IndexMerger { let segment_postings_option = field_entry .field_type() .get_segment_postings_option() - .expect("Encountered a field that is not supposed to be - indexed. Have you modified the schema?"); + .expect( + "Encountered a field that is not supposed to be + indexed. 
Have you modified the schema?", + ); while merged_terms.advance() { @@ -273,9 +289,11 @@ impl IndexMerger { let segment_ord = heap_item.segment_ord; let term_info = heap_item.streamer.value(); let segment_reader = &self.readers[heap_item.segment_ord]; - let inverted_index = segment_reader.inverted_index(term.field()).unwrap(); // TODO fix unwrap - let mut segment_postings = inverted_index - .read_postings_from_terminfo(term_info, segment_postings_option); + let inverted_index = segment_reader.inverted_index(term.field()); + let mut segment_postings = inverted_index.read_postings_from_terminfo( + term_info, + segment_postings_option, + ); if segment_postings.advance() { Some((segment_ord, segment_postings)) } else { @@ -304,14 +322,18 @@ impl IndexMerger { // `.advance()` has been called once before the loop. // Hence we cannot use a `while segment_postings.advance()` loop. if let Some(remapped_doc_id) = - old_to_new_doc_id[segment_postings.doc() as usize] { + old_to_new_doc_id[segment_postings.doc() as usize] + { // we make sure to only write the term iff // there is at least one document. let positions: &[u32] = segment_postings.positions(); let term_freq = segment_postings.term_freq(); let delta_positions = delta_computer.compute_delta(positions); - field_serializer - .write_doc(remapped_doc_id, term_freq, delta_positions)?; + field_serializer.write_doc( + remapped_doc_id, + term_freq, + delta_positions, + )?; } if !segment_postings.advance() { break; @@ -349,8 +371,12 @@ impl IndexMerger { impl SerializableSegment for IndexMerger { fn write(&self, mut serializer: SegmentSerializer) -> Result { self.write_postings(serializer.get_postings_serializer())?; - self.write_fieldnorms(serializer.get_fieldnorms_serializer())?; - self.write_fast_fields(serializer.get_fast_field_serializer())?; + self.write_fieldnorms( + serializer.get_fieldnorms_serializer(), + )?; + self.write_fast_fields( + serializer.get_fast_field_serializer(), + )?; self.write_storable_fields(serializer.get_store_writer())?; serializer.close()?; Ok(self.max_doc) @@ -429,14 +455,13 @@ mod tests { } } { - let segment_ids = index - .searchable_segment_ids() - .expect("Searchable segments failed."); + let segment_ids = index.searchable_segment_ids().expect( + "Searchable segments failed.", + ); let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); - index_writer - .merge(&segment_ids) - .wait() - .expect("Merging failed"); + index_writer.merge(&segment_ids).wait().expect( + "Merging failed", + ); index_writer.wait_merging_threads().unwrap(); } { @@ -449,14 +474,22 @@ mod tests { collector.docs() }; { - assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]), - vec![1, 2, 4]); - assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]), - vec![0, 3]); - assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "g")]), - vec![4]); - assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]), - vec![0, 1, 2, 3, 4]); + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "a")]), + vec![1, 2, 4] + ); + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "af")]), + vec![0, 3] + ); + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "g")]), + vec![4] + ); + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "b")]), + vec![0, 1, 2, 3, 4] + ); } { let doc = searcher.doc(&DocAddress(0, 0)).unwrap(); @@ -485,8 +518,10 @@ mod tests { assert!(searcher.search(&query, &mut collector).is_ok()); collector.vals() }; - 
assert_eq!(get_fast_vals(vec![Term::from_field_text(text_field, "a")]), - vec![5, 7, 13]); + assert_eq!( + get_fast_vals(vec![Term::from_field_text(text_field, "a")]), + vec![5, 7, 13] + ); } } } @@ -533,14 +568,22 @@ mod tests { assert_eq!(searcher.num_docs(), 2); assert_eq!(searcher.segment_readers()[0].num_docs(), 2); assert_eq!(searcher.segment_readers()[0].max_doc(), 3); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), - vec![1]); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), - vec![1]); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), - vec![3]); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), - vec![1, 3]); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "a")), + vec![1] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "b")), + vec![1] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "c")), + vec![3] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "d")), + vec![1, 3] + ); } { // a second commit @@ -572,20 +615,34 @@ mod tests { assert_eq!(searcher.segment_readers()[0].max_doc(), 3); assert_eq!(searcher.segment_readers()[1].num_docs(), 2); assert_eq!(searcher.segment_readers()[1].max_doc(), 4); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), - vec![3]); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), - vec![3]); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), - vec![6_000]); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), - vec![6_000, 7_000]); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "a")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "b")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "c")), + vec![3] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "d")), + vec![3] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "e")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "f")), + vec![6_000] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "g")), + vec![6_000, 7_000] + ); let score_field_reader: U64FastFieldReader = searcher .segment_reader(0) @@ -603,33 +660,46 @@ mod tests { } { // merging the segments - let segment_ids = index - .searchable_segment_ids() - .expect("Searchable segments failed."); - index_writer - .merge(&segment_ids) - .wait() - .expect("Merging failed"); + let segment_ids = index.searchable_segment_ids().expect( + "Searchable segments failed.", + ); + index_writer.merge(&segment_ids).wait().expect( + "Merging failed", + ); index.load_searchers().unwrap(); let ref searcher = *index.searcher(); assert_eq!(searcher.segment_readers().len(), 1); assert_eq!(searcher.num_docs(), 3); assert_eq!(searcher.segment_readers()[0].num_docs(), 3); assert_eq!(searcher.segment_readers()[0].max_doc(), 3); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), - empty_vec); - assert_eq!(search_term(&searcher, 
Term::from_field_text(text_field, "b")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), - vec![3]); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), - vec![3]); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), - vec![6_000]); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), - vec![6_000, 7_000]); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "a")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "b")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "c")), + vec![3] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "d")), + vec![3] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "e")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "f")), + vec![6_000] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "g")), + vec![6_000, 7_000] + ); let score_field_reader: U64FastFieldReader = searcher .segment_reader(0) .get_fast_field_reader(score_field) @@ -648,20 +718,34 @@ mod tests { assert_eq!(searcher.num_docs(), 2); assert_eq!(searcher.segment_readers()[0].num_docs(), 2); assert_eq!(searcher.segment_readers()[0].max_doc(), 3); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), - vec![6_000]); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), - vec![6_000, 7_000]); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "a")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "b")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "c")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "d")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "e")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "f")), + vec![6_000] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "g")), + vec![6_000, 7_000] + ); let score_field_reader: U64FastFieldReader = searcher .segment_reader(0) .get_fast_field_reader(score_field) @@ -671,13 +755,12 @@ mod tests { } { // Test merging a single segment in order to remove deletes. 
- let segment_ids = index - .searchable_segment_ids() - .expect("Searchable segments failed."); - index_writer - .merge(&segment_ids) - .wait() - .expect("Merging failed"); + let segment_ids = index.searchable_segment_ids().expect( + "Searchable segments failed.", + ); + index_writer.merge(&segment_ids).wait().expect( + "Merging failed", + ); index.load_searchers().unwrap(); let ref searcher = *index.searcher(); @@ -685,20 +768,34 @@ mod tests { assert_eq!(searcher.num_docs(), 2); assert_eq!(searcher.segment_readers()[0].num_docs(), 2); assert_eq!(searcher.segment_readers()[0].max_doc(), 2); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), - empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), - vec![6_000]); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), - vec![6_000, 7_000]); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "a")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "b")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "c")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "d")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "e")), + empty_vec + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "f")), + vec![6_000] + ); + assert_eq!( + search_term(&searcher, Term::from_field_text(text_field, "g")), + vec![6_000, 7_000] + ); let score_field_reader: U64FastFieldReader = searcher .segment_reader(0) .get_fast_field_reader(score_field) @@ -710,13 +807,12 @@ mod tests { { // Test removing all docs index_writer.delete_term(Term::from_field_text(text_field, "g")); - let segment_ids = index - .searchable_segment_ids() - .expect("Searchable segments failed."); - index_writer - .merge(&segment_ids) - .wait() - .expect("Merging failed"); + let segment_ids = index.searchable_segment_ids().expect( + "Searchable segments failed.", + ); + index_writer.merge(&segment_ids).wait().expect( + "Merging failed", + ); index.load_searchers().unwrap(); let ref searcher = *index.searcher(); diff --git a/src/indexer/segment_entry.rs b/src/indexer/segment_entry.rs index 082f9e1c1..9e8ad74a5 100644 --- a/src/indexer/segment_entry.rs +++ b/src/indexer/segment_entry.rs @@ -44,10 +44,11 @@ pub struct SegmentEntry { impl SegmentEntry { /// Create a new `SegmentEntry` - pub fn new(segment_meta: SegmentMeta, - delete_cursor: DeleteCursor, - delete_bitset: Option) - -> SegmentEntry { + pub fn new( + segment_meta: SegmentMeta, + delete_cursor: DeleteCursor, + delete_bitset: Option, + ) -> SegmentEntry { SegmentEntry { meta: segment_meta, state: SegmentState::Ready, diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 7a37f3574..a040c2ed5 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -32,31 +32,36 @@ pub struct SegmentManager { impl Debug for SegmentManager { fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { let lock = self.read(); - write!(f, - "{{ uncommitted: {:?}, 
committed: {:?} }}", - lock.uncommitted, - lock.committed) + write!( + f, + "{{ uncommitted: {:?}, committed: {:?} }}", + lock.uncommitted, + lock.committed + ) } } -pub fn get_mergeable_segments(segment_manager: &SegmentManager) - -> (Vec, Vec) { +pub fn get_mergeable_segments( + segment_manager: &SegmentManager, +) -> (Vec, Vec) { let registers_lock = segment_manager.read(); - (registers_lock.committed.get_mergeable_segments(), - registers_lock.uncommitted.get_mergeable_segments()) + ( + registers_lock.committed.get_mergeable_segments(), + registers_lock.uncommitted.get_mergeable_segments(), + ) } impl SegmentManager { - pub fn from_segments(segment_metas: Vec, - delete_cursor: DeleteCursor) - -> SegmentManager { + pub fn from_segments( + segment_metas: Vec, + delete_cursor: DeleteCursor, + ) -> SegmentManager { SegmentManager { registers: RwLock::new(SegmentRegisters { - uncommitted: SegmentRegister::default(), - committed: SegmentRegister::new(segment_metas, - delete_cursor), - writing: HashSet::new(), - }), + uncommitted: SegmentRegister::default(), + committed: SegmentRegister::new(segment_metas, delete_cursor), + writing: HashSet::new(), + }), } } @@ -94,25 +99,24 @@ impl SegmentManager { pub fn segment_entry(&self, segment_id: &SegmentId) -> Option { let registers = self.read(); - registers - .committed - .segment_entry(segment_id) - .or_else(|| registers.uncommitted.segment_entry(segment_id)) + registers.committed.segment_entry(segment_id).or_else(|| { + registers.uncommitted.segment_entry(segment_id) + }) } // Lock poisoning should never happen : // The lock is acquired and released within this class, // and the operations cannot panic. fn read(&self) -> RwLockReadGuard { - self.registers - .read() - .expect("Failed to acquire read lock on SegmentManager.") + self.registers.read().expect( + "Failed to acquire read lock on SegmentManager.", + ) } fn write(&self) -> RwLockWriteGuard { - self.registers - .write() - .expect("Failed to acquire write lock on SegmentManager.") + self.registers.write().expect( + "Failed to acquire write lock on SegmentManager.", + ) } pub fn commit(&self, segment_entries: Vec) { @@ -140,9 +144,11 @@ impl SegmentManager { } - pub fn cancel_merge(&self, - before_merge_segment_ids: &[SegmentId], - after_merge_segment_id: SegmentId) { + pub fn cancel_merge( + &self, + before_merge_segment_ids: &[SegmentId], + after_merge_segment_id: SegmentId, + ) { let mut registers_lock = self.write(); @@ -150,13 +156,15 @@ impl SegmentManager { { let target_segment_register: &mut SegmentRegister; target_segment_register = { - if registers_lock - .uncommitted - .contains_all(before_merge_segment_ids) { + if registers_lock.uncommitted.contains_all( + before_merge_segment_ids, + ) + { &mut registers_lock.uncommitted - } else if registers_lock - .committed - .contains_all(before_merge_segment_ids) { + } else if registers_lock.committed.contains_all( + before_merge_segment_ids, + ) + { &mut registers_lock.committed } else { warn!("couldn't find segment in SegmentManager"); @@ -185,23 +193,26 @@ impl SegmentManager { registers_lock.uncommitted.add_segment_entry(segment_entry); } - pub fn end_merge(&self, - before_merge_segment_ids: &[SegmentId], - after_merge_segment_entry: SegmentEntry) { + pub fn end_merge( + &self, + before_merge_segment_ids: &[SegmentId], + after_merge_segment_entry: SegmentEntry, + ) { let mut registers_lock = self.write(); - registers_lock - .writing - .remove(&after_merge_segment_entry.segment_id()); + 
registers_lock.writing.remove(&after_merge_segment_entry + .segment_id()); let target_register: &mut SegmentRegister = { - if registers_lock - .uncommitted - .contains_all(before_merge_segment_ids) { + if registers_lock.uncommitted.contains_all( + before_merge_segment_ids, + ) + { &mut registers_lock.uncommitted - } else if registers_lock - .committed - .contains_all(before_merge_segment_ids) { + } else if registers_lock.committed.contains_all( + before_merge_segment_ids, + ) + { &mut registers_lock.committed } else { warn!("couldn't find segment in SegmentManager"); diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index af7e778d1..97be73c85 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -24,7 +24,12 @@ impl Debug for SegmentRegister { fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { try!(write!(f, "SegmentRegister(")); for (k, v) in &self.segment_states { - try!(write!(f, "{}:{}, ", k.short_uuid_string(), v.state().letter_code())); + try!(write!( + f, + "{}:{}, ", + k.short_uuid_string(), + v.state().letter_code() + )); } try!(write!(f, ")")); Ok(()) @@ -74,9 +79,9 @@ impl SegmentRegister { } pub fn contains_all(&mut self, segment_ids: &[SegmentId]) -> bool { - segment_ids - .iter() - .all(|segment_id| self.segment_states.contains_key(segment_id)) + segment_ids.iter().all(|segment_id| { + self.segment_states.contains_key(segment_id) + }) } pub fn add_segment_entry(&mut self, segment_entry: SegmentEntry) { @@ -91,14 +96,18 @@ impl SegmentRegister { pub fn cancel_merge(&mut self, segment_id: &SegmentId) { self.segment_states .get_mut(segment_id) - .expect("Received a merge notification for a segment that is not registered") + .expect( + "Received a merge notification for a segment that is not registered", + ) .cancel_merge(); } pub fn start_merge(&mut self, segment_id: &SegmentId) { self.segment_states .get_mut(segment_id) - .expect("Received a merge notification for a segment that is not registered") + .expect( + "Received a merge notification for a segment that is not registered", + ) .start_merge(); } @@ -144,34 +153,42 @@ mod tests { let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None); segment_register.add_segment_entry(segment_entry); } - assert_eq!(segment_register - .segment_entry(&segment_id_a) - .unwrap() - .state(), - SegmentState::Ready); + assert_eq!( + segment_register + .segment_entry(&segment_id_a) + .unwrap() + .state(), + SegmentState::Ready + ); assert_eq!(segment_ids(&segment_register), vec![segment_id_a]); { let segment_meta = SegmentMeta::new(segment_id_b); let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None); segment_register.add_segment_entry(segment_entry); } - assert_eq!(segment_register - .segment_entry(&segment_id_b) - .unwrap() - .state(), - SegmentState::Ready); + assert_eq!( + segment_register + .segment_entry(&segment_id_b) + .unwrap() + .state(), + SegmentState::Ready + ); segment_register.start_merge(&segment_id_a); segment_register.start_merge(&segment_id_b); - assert_eq!(segment_register - .segment_entry(&segment_id_a) - .unwrap() - .state(), - SegmentState::InMerge); - assert_eq!(segment_register - .segment_entry(&segment_id_b) - .unwrap() - .state(), - SegmentState::InMerge); + assert_eq!( + segment_register + .segment_entry(&segment_id_a) + .unwrap() + .state(), + SegmentState::InMerge + ); + assert_eq!( + segment_register + .segment_entry(&segment_id_b) + .unwrap() + .state(), + SegmentState::InMerge + ); 
segment_register.remove_segment(&segment_id_a); segment_register.remove_segment(&segment_id_b); { diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs index 719c98c14..c2aa4bcae 100644 --- a/src/indexer/segment_serializer.rs +++ b/src/indexer/segment_serializer.rs @@ -28,11 +28,11 @@ impl SegmentSerializer { let postings_serializer = try!(InvertedIndexSerializer::open(segment)); Ok(SegmentSerializer { - postings_serializer: postings_serializer, - store_writer: StoreWriter::new(store_write), - fast_field_serializer: fast_field_serializer, - fieldnorms_serializer: fieldnorms_serializer, - }) + postings_serializer: postings_serializer, + store_writer: StoreWriter::new(store_write), + fast_field_serializer: fast_field_serializer, + fieldnorms_serializer: fieldnorms_serializer, + }) } /// Accessor to the `PostingsSerializer`. diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index feeb33d03..db7add226 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -61,11 +61,12 @@ pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) - /// and flushed. /// /// This method is not part of tantivy's public API -pub fn save_metas(segment_metas: Vec, - schema: Schema, - opstamp: u64, - directory: &mut Directory) - -> Result<()> { +pub fn save_metas( + segment_metas: Vec, + schema: Schema, + opstamp: u64, + directory: &mut Directory, +) -> Result<()> { let metas = IndexMeta { segments: segment_metas, schema: schema, @@ -89,11 +90,12 @@ pub struct SegmentUpdater(Arc); -fn perform_merge(segment_ids: &[SegmentId], - segment_updater: &SegmentUpdater, - mut merged_segment: Segment, - target_opstamp: u64) - -> Result { +fn perform_merge( + segment_ids: &[SegmentId], + segment_updater: &SegmentUpdater, + mut merged_segment: Segment, + target_opstamp: u64, +) -> Result { // first we need to apply deletes to our segment. info!("Start merge: {:?}", segment_ids); @@ -105,17 +107,21 @@ fn perform_merge(segment_ids: &[SegmentId], for segment_id in segment_ids { if let Some(mut segment_entry) = - segment_updater.0.segment_manager.segment_entry(segment_id) { + segment_updater.0.segment_manager.segment_entry(segment_id) + { let segment = index.segment(segment_entry.meta().clone()); if let Some(file_protection) = - advance_deletes(segment, &mut segment_entry, target_opstamp)? { + advance_deletes(segment, &mut segment_entry, target_opstamp)? + { file_protections.push(file_protection); } segment_entries.push(segment_entry); } else { error!("Error, had to abort merge as some of the segment is not managed anymore."); - let msg = format!("Segment {:?} requested for merge is not managed.", - segment_id); + let msg = format!( + "Segment {:?} requested for merge is not managed.", + segment_id + ); bail!(ErrorKind::InvalidArgument(msg)); } } @@ -134,12 +140,13 @@ fn perform_merge(segment_ids: &[SegmentId], // ... we just serialize this index merger in our new segment // to merge the two segments. 
- let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment) - .expect("Creating index serializer failed"); + let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment).expect( + "Creating index serializer failed", + ); - let num_docs = merger - .write(segment_serializer) - .expect("Serializing merged index failed"); + let num_docs = merger.write(segment_serializer).expect( + "Serializing merged index failed", + ); let mut segment_meta = SegmentMeta::new(merged_segment.id()); segment_meta.set_max_doc(num_docs); @@ -161,23 +168,24 @@ struct InnerSegmentUpdater { } impl SegmentUpdater { - pub fn new(index: Index, - stamper: Stamper, - delete_cursor: DeleteCursor) - -> Result { + pub fn new( + index: Index, + stamper: Stamper, + delete_cursor: DeleteCursor, + ) -> Result { let segments = index.searchable_segment_metas()?; let segment_manager = SegmentManager::from_segments(segments, delete_cursor); Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater { - pool: CpuPool::new(1), - index: index, - segment_manager: segment_manager, - merge_policy: RwLock::new(box DefaultMergePolicy::default()), - merging_thread_id: AtomicUsize::default(), - merging_threads: RwLock::new(HashMap::new()), - generation: AtomicUsize::default(), - killed: AtomicBool::new(false), - stamper: stamper, - }))) + pool: CpuPool::new(1), + index: index, + segment_manager: segment_manager, + merge_policy: RwLock::new(box DefaultMergePolicy::default()), + merging_thread_id: AtomicUsize::default(), + merging_threads: RwLock::new(HashMap::new()), + generation: AtomicUsize::default(), + killed: AtomicBool::new(false), + stamper: stamper, + }))) } pub fn new_segment(&self) -> Segment { @@ -199,10 +207,10 @@ impl SegmentUpdater { self.0.merging_thread_id.fetch_add(1, Ordering::SeqCst) } - fn run_async T> - (&self, - f: F) - -> CpuFuture { + fn run_async T>( + &self, + f: F, + ) -> CpuFuture { let me_clone = self.clone(); self.0.pool.spawn_fn(move || Ok(f(me_clone))) } @@ -211,11 +219,10 @@ impl SegmentUpdater { pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) -> bool { if generation >= self.0.generation.load(Ordering::Acquire) { self.run_async(|segment_updater| { - segment_updater.0.segment_manager.add_segment(segment_entry); - segment_updater.consider_merge_options(); - true - }) - .forget(); + segment_updater.0.segment_manager.add_segment(segment_entry); + segment_updater.consider_merge_options(); + true + }).forget(); true } else { false @@ -249,46 +256,46 @@ impl SegmentUpdater { if self.is_alive() { let index = &self.0.index; let directory = index.directory(); - save_metas(self.0.segment_manager.committed_segment_metas(), - index.schema(), - opstamp, - directory.box_clone().borrow_mut()) - .expect("Could not save metas."); + save_metas( + self.0.segment_manager.committed_segment_metas(), + index.schema(), + opstamp, + directory.box_clone().borrow_mut(), + ).expect("Could not save metas."); } } pub fn garbage_collect_files(&self) -> Result<()> { self.run_async(move |segment_updater| { segment_updater.garbage_collect_files_exec(); - }) - .wait() + }).wait() } fn garbage_collect_files_exec(&self) { info!("Running garbage collection"); let mut index = self.0.index.clone(); - index.directory_mut().garbage_collect(|| { - self.0.segment_manager.list_files() - }); + index.directory_mut().garbage_collect( + || self.0.segment_manager.list_files(), + ); } pub fn commit(&self, opstamp: u64) -> Result<()> { self.run_async(move |segment_updater| if segment_updater.is_alive() { - let 
segment_entries = segment_updater - .purge_deletes(opstamp) - .expect("Failed purge deletes"); - segment_updater.0.segment_manager.commit(segment_entries); - segment_updater.save_metas(opstamp); - segment_updater.garbage_collect_files_exec(); - segment_updater.consider_merge_options(); - }) - .wait() + let segment_entries = segment_updater.purge_deletes(opstamp).expect( + "Failed purge deletes", + ); + segment_updater.0.segment_manager.commit(segment_entries); + segment_updater.save_metas(opstamp); + segment_updater.garbage_collect_files_exec(); + segment_updater.consider_merge_options(); + }).wait() } - pub fn start_merge(&self, - segment_ids: &[SegmentId]) - -> impl Future { + pub fn start_merge( + &self, + segment_ids: &[SegmentId], + ) -> impl Future { self.0.segment_manager.start_merge(segment_ids); let segment_updater_clone = self.clone(); @@ -308,10 +315,12 @@ impl SegmentUpdater { // first we need to apply deletes to our segment. let merged_segment = segment_updater_clone.new_segment(); let merged_segment_id = merged_segment.id(); - let merge_result = perform_merge(&segment_ids_vec, - &segment_updater_clone, - merged_segment, - target_opstamp); + let merge_result = perform_merge( + &segment_ids_vec, + &segment_updater_clone, + merged_segment, + target_opstamp, + ); match merge_result { Ok(after_merge_segment_entry) => { @@ -345,11 +354,10 @@ impl SegmentUpdater { .remove(&merging_thread_id); Ok(()) }); - self.0 - .merging_threads - .write() - .unwrap() - .insert(merging_thread_id, merging_join_handle); + self.0.merging_threads.write().unwrap().insert( + merging_thread_id, + merging_join_handle, + ); merging_future_recv } @@ -368,19 +376,23 @@ impl SegmentUpdater { } } - fn cancel_merge(&self, - before_merge_segment_ids: &[SegmentId], - after_merge_segment_entry: SegmentId) { - self.0 - .segment_manager - .cancel_merge(before_merge_segment_ids, after_merge_segment_entry); + fn cancel_merge( + &self, + before_merge_segment_ids: &[SegmentId], + after_merge_segment_entry: SegmentId, + ) { + self.0.segment_manager.cancel_merge( + before_merge_segment_ids, + after_merge_segment_entry, + ); } - fn end_merge(&self, - before_merge_segment_ids: Vec, - mut after_merge_segment_entry: SegmentEntry) - -> Result<()> { + fn end_merge( + &self, + before_merge_segment_ids: Vec, + mut after_merge_segment_entry: SegmentEntry, + ) -> Result<()> { self.run_async(move |segment_updater| { info!("End merge {:?}", after_merge_segment_entry.meta()); @@ -391,28 +403,37 @@ impl SegmentUpdater { if delete_operation.opstamp < committed_opstamp { let index = &segment_updater.0.index; let segment = index.segment(after_merge_segment_entry.meta().clone()); - match advance_deletes(segment, - &mut after_merge_segment_entry, - committed_opstamp) { + match advance_deletes( + segment, + &mut after_merge_segment_entry, + committed_opstamp, + ) { Ok(file_protection_opt_res) => { _file_protection_opt = file_protection_opt_res; } Err(e) => { - error!("Merge of {:?} was cancelled (advancing deletes failed): {:?}", - before_merge_segment_ids, e); + error!( + "Merge of {:?} was cancelled (advancing deletes failed): {:?}", + before_merge_segment_ids, + e + ); // ... 
cancel merge if cfg!(test) { panic!("Merge failed."); } - segment_updater.cancel_merge(&before_merge_segment_ids, - after_merge_segment_entry.segment_id()); + segment_updater.cancel_merge( + &before_merge_segment_ids, + after_merge_segment_entry.segment_id(), + ); return; } } } } - segment_updater.0.segment_manager.end_merge(&before_merge_segment_ids, - after_merge_segment_entry); + segment_updater.0.segment_manager.end_merge( + &before_merge_segment_ids, + after_merge_segment_entry, + ); segment_updater.consider_merge_options(); info!("save metas"); segment_updater.save_metas(segment_updater.0.index.opstamp()); @@ -450,10 +471,9 @@ impl SegmentUpdater { } debug!("wait merging thread {}", new_merging_threads.len()); for (_, merging_thread_handle) in new_merging_threads { - merging_thread_handle - .join() - .map(|_| ()) - .map_err(|_| ErrorKind::ErrorInThread("Merging thread failed.".into()))?; + merging_thread_handle.join().map(|_| ()).map_err(|_| { + ErrorKind::ErrorInThread("Merging thread failed.".into()) + })?; } // Our merging thread may have queued their completed self.run_async(move |_| {}).wait()?; @@ -522,9 +542,9 @@ mod tests { assert_eq!(index.searcher().num_docs(), 302); { - index_writer - .wait_merging_threads() - .expect("waiting for merging threads"); + index_writer.wait_merging_threads().expect( + "waiting for merging threads", + ); } index.load_searchers().unwrap(); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index bbafb37ef..93c5ee5ee 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -54,22 +54,23 @@ impl<'a> SegmentWriter<'a> { /// the flushing behavior as a buffer limit /// - segment: The segment being written /// - schema - pub fn for_segment(heap: &'a Heap, - table_bits: usize, - mut segment: Segment, - schema: &Schema) - -> Result> { + pub fn for_segment( + heap: &'a Heap, + table_bits: usize, + mut segment: Segment, + schema: &Schema, + ) -> Result> { let segment_serializer = SegmentSerializer::for_segment(&mut segment)?; let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits, heap); Ok(SegmentWriter { - heap: heap, - max_doc: 0, - multifield_postings: multifield_postings, - fieldnorms_writer: create_fieldnorms_writer(schema), - segment_serializer: segment_serializer, - fast_field_writers: FastFieldsWriter::from_schema(schema), - doc_opstamps: Vec::with_capacity(1_000), - }) + heap: heap, + max_doc: 0, + multifield_postings: multifield_postings, + fieldnorms_writer: create_fieldnorms_writer(schema), + segment_serializer: segment_serializer, + fast_field_writers: FastFieldsWriter::from_schema(schema), + doc_opstamps: Vec::with_capacity(1_000), + }) } /// Lay on disk the current content of the `SegmentWriter` @@ -77,10 +78,12 @@ impl<'a> SegmentWriter<'a> { /// Finalize consumes the `SegmentWriter`, so that it cannot /// be used afterwards. pub fn finalize(self) -> Result> { - write(&self.multifield_postings, - &self.fast_field_writers, - &self.fieldnorms_writer, - self.segment_serializer)?; + write( + &self.multifield_postings, + &self.fast_field_writers, + &self.fieldnorms_writer, + self.segment_serializer, + )?; Ok(self.doc_opstamps) } @@ -107,10 +110,11 @@ impl<'a> SegmentWriter<'a> { /// Indexes a new document /// /// As a user, you should rather use `IndexWriter`'s add_document. 
- pub fn add_document(&mut self, - add_operation: &AddOperation, - schema: &Schema) - -> io::Result<()> { + pub fn add_document( + &mut self, + add_operation: &AddOperation, + schema: &Schema, + ) -> io::Result<()> { let doc_id = self.max_doc; let doc = &add_operation.document; self.doc_opstamps.push(add_operation.opstamp); @@ -122,8 +126,11 @@ impl<'a> SegmentWriter<'a> { match *field_options.field_type() { FieldType::Str(ref text_options) => { let num_tokens: u32 = if text_options.get_indexing_options().is_tokenized() { - self.multifield_postings - .index_text(doc_id, field, &field_values) + self.multifield_postings.index_text( + doc_id, + field, + &field_values, + ) } else { let num_field_values = field_values.len() as u32; for field_value in field_values { @@ -132,15 +139,17 @@ impl<'a> SegmentWriter<'a> { } num_field_values }; - self.fieldnorms_writer - .get_field_writer(field) - .map(|field_norms_writer| field_norms_writer.add_val(num_tokens as u64)); + self.fieldnorms_writer.get_field_writer(field).map( + |field_norms_writer| field_norms_writer.add_val(num_tokens as u64), + ); } FieldType::U64(ref int_option) => { if int_option.is_indexed() { for field_value in field_values { - let term = Term::from_field_u64(field_value.field(), - field_value.value().u64_value()); + let term = Term::from_field_u64( + field_value.field(), + field_value.value().u64_value(), + ); self.multifield_postings.suscribe(doc_id, &term); } } @@ -148,8 +157,10 @@ impl<'a> SegmentWriter<'a> { FieldType::I64(ref int_option) => { if int_option.is_indexed() { for field_value in field_values { - let term = Term::from_field_i64(field_value.field(), - field_value.value().i64_value()); + let term = Term::from_field_i64( + field_value.field(), + field_value.value().i64_value(), + ); self.multifield_postings.suscribe(doc_id, &term); } } @@ -160,7 +171,9 @@ impl<'a> SegmentWriter<'a> { self.fast_field_writers.add_document(doc); let stored_fieldvalues: Vec<&FieldValue> = doc.field_values() .iter() - .filter(|field_value| schema.get_field_entry(field_value.field()).is_stored()) + .filter(|field_value| { + schema.get_field_entry(field_value.field()).is_stored() + }) .collect(); let doc_writer = self.segment_serializer.get_store_writer(); try!(doc_writer.store(&stored_fieldvalues)); @@ -191,15 +204,22 @@ impl<'a> SegmentWriter<'a> { } // This method is used as a trick to workaround the borrow checker -fn write(multifield_postings: &MultiFieldPostingsWriter, - fast_field_writers: &FastFieldsWriter, - fieldnorms_writer: &FastFieldsWriter, - mut serializer: SegmentSerializer) - -> Result<()> { +fn write( + multifield_postings: &MultiFieldPostingsWriter, + fast_field_writers: &FastFieldsWriter, + fieldnorms_writer: &FastFieldsWriter, + mut serializer: SegmentSerializer, +) -> Result<()> { - try!(multifield_postings.serialize(serializer.get_postings_serializer())); - try!(fast_field_writers.serialize(serializer.get_fast_field_serializer())); - try!(fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer())); + try!(multifield_postings.serialize( + serializer.get_postings_serializer(), + )); + try!(fast_field_writers.serialize( + serializer.get_fast_field_serializer(), + )); + try!(fieldnorms_writer.serialize( + serializer.get_fieldnorms_serializer(), + )); try!(serializer.close()); Ok(()) @@ -208,10 +228,12 @@ fn write(multifield_postings: &MultiFieldPostingsWriter, impl<'a> SerializableSegment for SegmentWriter<'a> { fn write(&self, serializer: SegmentSerializer) -> Result { let max_doc = self.max_doc; - 
write(&self.multifield_postings, - &self.fast_field_writers, - &self.fieldnorms_writer, - serializer)?; + write( + &self.multifield_postings, + &self.fast_field_writers, + &self.fieldnorms_writer, + serializer, + )?; Ok(max_doc) } } diff --git a/src/lib.rs b/src/lib.rs index 0b26c6197..2279ca687 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -68,7 +68,7 @@ extern crate stable_deref_trait; #[cfg(test)] extern crate env_logger; -#[cfg(feature="simdcompression")] +#[cfg(feature = "simdcompression")] extern crate libc; #[cfg(windows)] @@ -391,16 +391,24 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); - let inverted_index = reader.inverted_index(text_field).unwrap(); - assert!(inverted_index.read_postings(&term_abcd, FreqAndPositions).is_none()); + let inverted_index = reader.inverted_index(text_field); + assert!( + inverted_index + .read_postings(&term_abcd, FreqAndPositions) + .is_none() + ); { - let mut postings = inverted_index.read_postings(&term_a, FreqAndPositions).unwrap(); + let mut postings = inverted_index + .read_postings(&term_a, FreqAndPositions) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 5); assert!(!postings.advance()); } { - let mut postings = inverted_index.read_postings(&term_b, FreqAndPositions).unwrap(); + let mut postings = inverted_index + .read_postings(&term_b, FreqAndPositions) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); @@ -426,17 +434,25 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); - let inverted_index = reader.inverted_index(term_abcd.field()).unwrap(); + let inverted_index = reader.inverted_index(term_abcd.field()); - assert!(inverted_index.read_postings(&term_abcd, FreqAndPositions).is_none()); + assert!( + inverted_index + .read_postings(&term_abcd, FreqAndPositions) + .is_none() + ); { - let mut postings = inverted_index.read_postings(&term_a, FreqAndPositions).unwrap(); + let mut postings = inverted_index + .read_postings(&term_a, FreqAndPositions) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 5); assert!(!postings.advance()); } { - let mut postings = inverted_index.read_postings(&term_b, FreqAndPositions).unwrap(); + let mut postings = inverted_index + .read_postings(&term_b, FreqAndPositions) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); @@ -462,14 +478,22 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); - let inverted_index = reader.inverted_index(term_abcd.field()).unwrap(); - assert!(inverted_index.read_postings(&term_abcd, FreqAndPositions).is_none()); + let inverted_index = reader.inverted_index(term_abcd.field()); + assert!( + inverted_index + .read_postings(&term_abcd, FreqAndPositions) + .is_none() + ); { - let mut postings = inverted_index.read_postings(&term_a, FreqAndPositions).unwrap(); + let mut postings = inverted_index + .read_postings(&term_a, FreqAndPositions) + .unwrap(); assert!(!postings.advance()); } { - let mut postings = inverted_index.read_postings(&term_b, FreqAndPositions).unwrap(); + let mut postings = inverted_index + .read_postings(&term_b, FreqAndPositions) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); @@ -477,7 +501,9 @@ mod tests { assert!(!postings.advance()); } { - let mut postings = 
inverted_index.read_postings(&term_c, FreqAndPositions).unwrap(); + let mut postings = inverted_index + .read_postings(&term_c, FreqAndPositions) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 4); assert!(!postings.advance()); @@ -501,7 +527,7 @@ mod tests { let term = Term::from_field_u64(field, 1u64); let mut postings = searcher .segment_reader(0) - .inverted_index(term.field()).unwrap() + .inverted_index(term.field()) .read_postings(&term, SegmentPostingsOption::NoFreq) .unwrap(); assert!(postings.advance()); @@ -525,7 +551,7 @@ mod tests { let term = Term::from_field_i64(value_field, negative_val); let mut postings = searcher .segment_reader(0) - .inverted_index(term.field()).unwrap() + .inverted_index(term.field()) .read_postings(&term, SegmentPostingsOption::NoFreq) .unwrap(); assert!(postings.advance()); @@ -588,11 +614,17 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); - let inverted_index = reader.inverted_index(text_field).unwrap(); + let inverted_index = reader.inverted_index(text_field); let term_abcd = Term::from_field_text(text_field, "abcd"); - assert!(inverted_index.read_postings(&term_abcd, FreqAndPositions).is_none()); + assert!( + inverted_index + .read_postings(&term_abcd, FreqAndPositions) + .is_none() + ); let term_af = Term::from_field_text(text_field, "af"); - let mut postings = inverted_index.read_postings(&term_af, FreqAndPositions).unwrap(); + let mut postings = inverted_index + .read_postings(&term_af, FreqAndPositions) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 0); assert_eq!(postings.term_freq(), 3); @@ -634,29 +666,43 @@ mod tests { collector.docs() }; { - assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]), - vec![1, 2]); + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "a")]), + vec![1, 2] + ); } { - assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]), - vec![0]); + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "af")]), + vec![0] + ); } { - assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]), - vec![0, 1, 2]); + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "b")]), + vec![0, 1, 2] + ); } { - assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "c")]), - vec![1, 2]); + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "c")]), + vec![1, 2] + ); } { - assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "d")]), - vec![2]); + assert_eq!( + get_doc_ids(vec![Term::from_field_text(text_field, "d")]), + vec![2] + ); } { - assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b"), - Term::from_field_text(text_field, "a")]), - vec![0, 1, 2]); + assert_eq!( + get_doc_ids(vec![ + Term::from_field_text(text_field, "b"), + Term::from_field_text(text_field, "a"), + ]), + vec![0, 1, 2] + ); } } } @@ -693,7 +739,8 @@ mod tests { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); let other_text_field = schema_builder.add_text_field("text2", TEXT); - let document = doc!(text_field => "tantivy", + let document = + doc!(text_field => "tantivy", text_field => "some other value", other_text_field => "short"); assert_eq!(document.len(), 3); diff --git a/src/postings/docset.rs b/src/postings/docset.rs index 4b1ea3c7a..8aa665f53 100644 --- a/src/postings/docset.rs +++ b/src/postings/docset.rs @@ -72,8 +72,7 @@ pub trait DocSet { for (i, buffer_val) in 
buffer.iter_mut().enumerate() { if self.advance() { *buffer_val = self.doc(); - } - else { + } else { return i; } } diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 9b725cd86..d672077b2 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -65,7 +65,9 @@ mod tests { field_serializer.new_term("abc".as_bytes()).unwrap(); for doc_id in 0u32..120u32 { let delta_positions = vec![1, 2, 3, 2]; - field_serializer.write_doc(doc_id, 2, &delta_positions).unwrap(); + field_serializer + .write_doc(doc_id, 2, &delta_positions) + .unwrap(); } field_serializer.close_term().unwrap(); } @@ -84,8 +86,8 @@ mod tests { let heap = Heap::with_capacity(10_000_000); { - let mut segment_writer = SegmentWriter::for_segment(&heap, 18, segment.clone(), &schema) - .unwrap(); + let mut segment_writer = + SegmentWriter::for_segment(&heap, 18, segment.clone(), &schema).unwrap(); { let mut doc = Document::default(); // checking that position works if the field has two values @@ -131,15 +133,17 @@ mod tests { } { let term_a = Term::from_field_text(text_field, "abcdef"); - assert!(segment_reader - .inverted_index(term_a.field()).unwrap() + assert!( + segment_reader + .inverted_index(term_a.field()) .read_postings(&term_a, FreqAndPositions) - .is_none()); + .is_none() + ); } { let term_a = Term::from_field_text(text_field, "a"); let mut postings_a = segment_reader - .inverted_index(term_a.field()).unwrap() + .inverted_index(term_a.field()) .read_postings(&term_a, FreqAndPositions) .unwrap(); assert_eq!(postings_a.len(), 1000); @@ -162,7 +166,7 @@ mod tests { { let term_e = Term::from_field_text(text_field, "e"); let mut postings_e = segment_reader - .inverted_index(term_e.field()).unwrap() + .inverted_index(term_e.field()) .read_postings(&term_e, FreqAndPositions) .unwrap(); assert_eq!(postings_e.len(), 1000 - 2); @@ -202,8 +206,10 @@ mod tests { assert!(index_writer.commit().is_ok()); } index.load_searchers().unwrap(); - let term_query = TermQuery::new(Term::from_field_text(text_field, "a"), - SegmentPostingsOption::NoFreq); + let term_query = TermQuery::new( + Term::from_field_text(text_field, "a"), + SegmentPostingsOption::NoFreq, + ); let searcher = index.searcher(); let mut term_weight = term_query.specialized_weight(&*searcher); term_weight.segment_postings_options = SegmentPostingsOption::FreqAndPositions; @@ -250,7 +256,7 @@ mod tests { for i in 0..num_docs - 1 { for j in i + 1..num_docs { let mut segment_postings = segment_reader - .inverted_index(term_2.field()).unwrap() + .inverted_index(term_2.field()) .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); @@ -264,7 +270,7 @@ mod tests { { let mut segment_postings = segment_reader - .inverted_index(term_2.field()).unwrap() + .inverted_index(term_2.field()) .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); @@ -285,7 +291,7 @@ mod tests { // check that filtering works { let mut segment_postings = segment_reader - .inverted_index(term_0.field()).unwrap() + .inverted_index(term_0.field()) .read_postings(&term_0, SegmentPostingsOption::NoFreq) .unwrap(); @@ -295,7 +301,7 @@ mod tests { } let mut segment_postings = segment_reader - .inverted_index(term_0.field()).unwrap() + .inverted_index(term_0.field()) .read_postings(&term_0, SegmentPostingsOption::NoFreq) .unwrap(); @@ -320,7 +326,7 @@ mod tests { // make sure seeking still works for i in 0..num_docs { let mut segment_postings = segment_reader - .inverted_index(term_2.field()).unwrap() + .inverted_index(term_2.field()) .read_postings(&term_2, 
SegmentPostingsOption::NoFreq) .unwrap(); @@ -336,7 +342,7 @@ mod tests { // now try with a longer sequence { let mut segment_postings = segment_reader - .inverted_index(term_2.field()).unwrap() + .inverted_index(term_2.field()) .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); @@ -372,14 +378,14 @@ mod tests { // finally, check that it's empty { let mut segment_postings = segment_reader - .inverted_index(term_2.field()).unwrap() + .inverted_index(term_2.field()) .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); assert_eq!(segment_postings.skip_next(0), SkipResult::End); let mut segment_postings = segment_reader - .inverted_index(term_2.field()).unwrap() + .inverted_index(term_2.field()) .read_postings(&term_2, SegmentPostingsOption::NoFreq) .unwrap(); @@ -446,12 +452,12 @@ mod tests { let segment_reader = searcher.segment_reader(0); b.iter(|| { - let mut segment_postings = segment_reader - .inverted_index(TERM_A.field()).unwrap() - .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) - .unwrap(); - while segment_postings.advance() {} - }); + let mut segment_postings = segment_reader + .inverted_index(TERM_A.field()) + .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) + .unwrap(); + while segment_postings.advance() {} + }); } #[bench] @@ -460,25 +466,27 @@ mod tests { let segment_reader = searcher.segment_reader(0); b.iter(|| { let segment_postings_a = segment_reader - .inverted_index(TERM_A.field()).unwrap() + .inverted_index(TERM_A.field()) .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); let segment_postings_b = segment_reader - .inverted_index(TERM_B.field()).unwrap() + .inverted_index(TERM_B.field()) .read_postings(&*TERM_B, SegmentPostingsOption::NoFreq) .unwrap(); let segment_postings_c = segment_reader - .inverted_index(TERM_C.field()).unwrap() + .inverted_index(TERM_C.field()) .read_postings(&*TERM_C, SegmentPostingsOption::NoFreq) .unwrap(); let segment_postings_d = segment_reader - .inverted_index(TERM_D.field()).unwrap() + .inverted_index(TERM_D.field()) .read_postings(&*TERM_D, SegmentPostingsOption::NoFreq) .unwrap(); - let mut intersection = IntersectionDocSet::from(vec![segment_postings_a, - segment_postings_b, - segment_postings_c, - segment_postings_d]); + let mut intersection = IntersectionDocSet::from(vec![ + segment_postings_a, + segment_postings_b, + segment_postings_c, + segment_postings_d, + ]); while intersection.advance() {} }); } @@ -489,7 +497,7 @@ mod tests { let docs = tests::sample(segment_reader.num_docs(), p); let mut segment_postings = segment_reader - .inverted_index(TERM_A.field()).unwrap() + .inverted_index(TERM_A.field()) .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); @@ -506,7 +514,7 @@ mod tests { b.iter(|| { let mut segment_postings = segment_reader - .inverted_index(TERM_A.field()).unwrap() + .inverted_index(TERM_A.field()) .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); for doc in &existing_docs { @@ -544,7 +552,7 @@ mod tests { b.iter(|| { let n: u32 = test::black_box(17); let mut segment_postings = segment_reader - .inverted_index(TERM_A.field()).unwrap() + .inverted_index(TERM_A.field()) .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); let mut s = 0u32; diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 4e1f770c7..1b62942c5 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -16,9 +16,10 @@ use schema::FieldEntry; use schema::FieldType; use 
schema::TextIndexingOptions; -fn posting_from_field_entry<'a>(field_entry: &FieldEntry, - heap: &'a Heap) - -> Box { +fn posting_from_field_entry<'a>( + field_entry: &FieldEntry, + heap: &'a Heap, +) -> Box { match *field_entry.field_type() { FieldType::Str(ref text_options) => { match text_options.get_indexing_options() { @@ -51,9 +52,7 @@ impl<'a> MultiFieldPostingsWriter<'a> { let per_field_postings_writers: Vec<_> = schema .fields() .iter() - .map(|field_entry| { - posting_from_field_entry(field_entry, heap) - }) + .map(|field_entry| posting_from_field_entry(field_entry, heap)) .collect(); MultiFieldPostingsWriter { @@ -102,7 +101,11 @@ impl<'a> MultiFieldPostingsWriter<'a> { let (_, stop) = offsets[i + 1]; let postings_writer = &self.per_field_postings_writers[field.0 as usize]; let mut field_serializer = serializer.new_field(field)?; - postings_writer.serialize(&term_offsets[start..stop], &mut field_serializer, self.heap)?; + postings_writer.serialize( + &term_offsets[start..stop], + &mut field_serializer, + self.heap, + )?; field_serializer.close()?; } Ok(()) @@ -127,29 +130,33 @@ pub trait PostingsWriter { /// * term - the term /// * heap - heap used to store the postings informations as well as the terms /// in the hashmap. - fn suscribe(&mut self, - term_index: &mut HashMap, - doc: DocId, - pos: u32, - term: &Term, - heap: &Heap); + fn suscribe( + &mut self, + term_index: &mut HashMap, + doc: DocId, + pos: u32, + term: &Term, + heap: &Heap, + ); /// Serializes the postings on disk. /// The actual serialization format is handled by the `PostingsSerializer`. - fn serialize(&self, - term_addrs: &[(&[u8], u32)], - serializer: &mut FieldSerializer, - heap: &Heap) - -> io::Result<()>; + fn serialize( + &self, + term_addrs: &[(&[u8], u32)], + serializer: &mut FieldSerializer, + heap: &Heap, + ) -> io::Result<()>; /// Tokenize a text and suscribe all of its token. 
- fn index_text<'a>(&mut self, - term_index: &mut HashMap, - doc_id: DocId, - field: Field, - field_values: &[&'a FieldValue], - heap: &Heap) - -> u32 { + fn index_text<'a>( + &mut self, + term_index: &mut HashMap, + doc_id: DocId, + field: Field, + field_values: &[&'a FieldValue], + heap: &Heap, + ) -> u32 { let mut pos = 0u32; let mut num_tokens: u32 = 0u32; let mut term = unsafe { Term::with_capacity(100) }; @@ -195,12 +202,14 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> { } impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> { - fn suscribe(&mut self, - term_index: &mut HashMap, - doc: DocId, - position: u32, - term: &Term, - heap: &Heap) { + fn suscribe( + &mut self, + term_index: &mut HashMap, + doc: DocId, + position: u32, + term: &Term, + heap: &Heap, + ) { debug_assert!(term.as_slice().len() >= 4); let recorder: &mut Rec = term_index.get_or_create(term); let current_doc = recorder.current_doc(); @@ -213,11 +222,12 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<' recorder.record_position(position, heap); } - fn serialize(&self, - term_addrs: &[(&[u8], u32)], - serializer: &mut FieldSerializer, - heap: &Heap) - -> io::Result<()> { + fn serialize( + &self, + term_addrs: &[(&[u8], u32)], + serializer: &mut FieldSerializer, + heap: &Heap, + ) -> io::Result<()> { for &(term_bytes, addr) in term_addrs { let recorder: &mut Rec = self.heap.get_mut_ref(addr); serializer.new_term(term_bytes)?; @@ -227,4 +237,3 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<' Ok(()) } } - diff --git a/src/postings/recorder.rs b/src/postings/recorder.rs index dde85d66c..07c0c4e19 100644 --- a/src/postings/recorder.rs +++ b/src/postings/recorder.rs @@ -27,11 +27,12 @@ pub trait Recorder: HeapAllocable { /// Close the document. It will help record the term frequency. fn close_doc(&mut self, heap: &Heap); /// Pushes the postings information to the serializer. - fn serialize(&self, - self_addr: u32, - serializer: &mut FieldSerializer, - heap: &Heap) - -> io::Result<()>; + fn serialize( + &self, + self_addr: u32, + serializer: &mut FieldSerializer, + heap: &Heap, + ) -> io::Result<()>; } /// Only records the doc ids @@ -64,11 +65,12 @@ impl Recorder for NothingRecorder { fn close_doc(&mut self, _heap: &Heap) {} - fn serialize(&self, - self_addr: u32, - serializer: &mut FieldSerializer, - heap: &Heap) - -> io::Result<()> { + fn serialize( + &self, + self_addr: u32, + serializer: &mut FieldSerializer, + heap: &Heap, + ) -> io::Result<()> { for doc in self.stack.iter(self_addr, heap) { serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?; } @@ -116,21 +118,23 @@ impl Recorder for TermFrequencyRecorder { } - fn serialize(&self, - self_addr: u32, - serializer: &mut FieldSerializer, - heap: &Heap) - -> io::Result<()> { + fn serialize( + &self, + self_addr: u32, + serializer: &mut FieldSerializer, + heap: &Heap, + ) -> io::Result<()> { // the last document has not been closed... // its term freq is self.current_tf. 
- let mut doc_iter = self.stack - .iter(self_addr, heap) - .chain(Some(self.current_tf).into_iter()); + let mut doc_iter = self.stack.iter(self_addr, heap).chain( + Some(self.current_tf) + .into_iter(), + ); while let Some(doc) = doc_iter.next() { - let term_freq = doc_iter - .next() - .expect("The IndexWriter recorded a doc without a term freq."); + let term_freq = doc_iter.next().expect( + "The IndexWriter recorded a doc without a term freq.", + ); serializer.write_doc(doc, term_freq, &EMPTY_ARRAY)?; } Ok(()) @@ -171,11 +175,12 @@ impl Recorder for TFAndPositionRecorder { self.stack.push(POSITION_END, heap); } - fn serialize(&self, - self_addr: u32, - serializer: &mut FieldSerializer, - heap: &Heap) - -> io::Result<()> { + fn serialize( + &self, + self_addr: u32, + serializer: &mut FieldSerializer, + heap: &Heap, + ) -> io::Result<()> { let mut doc_positions = Vec::with_capacity(100); let mut positions_iter = self.stack.iter(self_addr, heap); while let Some(doc) = positions_iter.next() { @@ -189,7 +194,11 @@ impl Recorder for TFAndPositionRecorder { prev_position = position; } } - serializer.write_doc(doc, doc_positions.len() as u32, &doc_positions)?; + serializer.write_doc( + doc, + doc_positions.len() as u32, + &doc_positions, + )?; } Ok(()) } diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index d8d08e40b..cadc85401 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -25,11 +25,10 @@ struct PositionComputer { } impl PositionComputer { - pub fn new(positions_stream: CompressedIntStream) -> PositionComputer { PositionComputer { position_to_skip: None, - positions: vec!(), + positions: vec![], positions_stream: positions_stream, } } @@ -38,9 +37,9 @@ impl PositionComputer { self.position_to_skip = Some( self.position_to_skip .map(|prev_skip| prev_skip + num_skip) - .unwrap_or(0) - ); - } + .unwrap_or(0), + ); + } pub fn positions(&mut self, term_freq: usize) -> &[u32] { if let Some(num_skip) = self.position_to_skip { @@ -83,13 +82,13 @@ impl SegmentPostings { /// * `data` - data array. The complete data is not necessarily used. 
/// * `freq_handler` - the freq handler is in charge of decoding /// frequencies and/or positions - pub fn from_block_postings(segment_block_postings: BlockSegmentPostings, - delete_bitset: DeleteBitSet, - positions_stream_opt: Option) - -> SegmentPostings { - let position_computer = positions_stream_opt.map(|stream| { - UnsafeCell::new(PositionComputer::new(stream)) - }); + pub fn from_block_postings( + segment_block_postings: BlockSegmentPostings, + delete_bitset: DeleteBitSet, + positions_stream_opt: Option, + ) -> SegmentPostings { + let position_computer = + positions_stream_opt.map(|stream| UnsafeCell::new(PositionComputer::new(stream))); SegmentPostings { block_cursor: segment_block_postings, cur: COMPRESSION_BLOCK_SIZE, // cursor within the block @@ -110,7 +109,7 @@ impl SegmentPostings { } - fn position_add_skipusize>(&self, num_skips_fn: F) { + fn position_add_skip usize>(&self, num_skips_fn: F) { if let Some(ref position_computer) = self.position_computer.as_ref() { let num_skips = num_skips_fn(); unsafe { @@ -135,7 +134,7 @@ impl DocSet for SegmentPostings { return false; } } - self.position_add_skip(|| { self.term_freq() as usize }); + self.position_add_skip(|| self.term_freq() as usize); if !self.delete_bitset.is_deleted(self.doc()) { return true; } @@ -257,8 +256,10 @@ impl DocSet for SegmentPostings { #[inline] fn doc(&self) -> DocId { let docs = self.block_cursor.docs(); - debug_assert!(self.cur < docs.len(), - "Have you forgotten to call `.advance()` at least once before calling .doc()."); + debug_assert!( + self.cur < docs.len(), + "Have you forgotten to call `.advance()` at least once before calling .doc()." + ); docs[self.cur] } } @@ -278,16 +279,11 @@ impl Postings for SegmentPostings { let term_freq = self.term_freq(); self.position_computer .as_ref() - .map(|position_computer| { - unsafe { - (&mut *position_computer.get()).positions(term_freq as usize) - } + .map(|position_computer| unsafe { + (&mut *position_computer.get()).positions(term_freq as usize) }) .unwrap_or(&EMPTY_POSITIONS[..]) } - - - } @@ -311,10 +307,11 @@ pub struct BlockSegmentPostings { } impl BlockSegmentPostings { - pub(crate) fn from_data(doc_freq: usize, - data: SourceRead, - has_freq: bool) - -> BlockSegmentPostings { + pub(crate) fn from_data( + doc_freq: usize, + data: SourceRead, + has_freq: bool, + ) -> BlockSegmentPostings { let num_binpacked_blocks: usize = (doc_freq as usize) / COMPRESSION_BLOCK_SIZE; let num_vint_docs = (doc_freq as usize) - COMPRESSION_BLOCK_SIZE * num_binpacked_blocks; BlockSegmentPostings { @@ -402,15 +399,16 @@ impl BlockSegmentPostings { /// Returns false iff there was no remaining blocks. pub fn advance(&mut self) -> bool { if self.num_binpacked_blocks > 0 { - // TODO could self.doc_offset be just a local variable? - - let num_consumed_bytes = self - .doc_decoder - .uncompress_block_sorted(self.remaining_data.as_ref(), self.doc_offset); + let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted( + self.remaining_data.as_ref(), + self.doc_offset, + ); self.remaining_data.advance(num_consumed_bytes); if self.has_freq { - let num_consumed_bytes = self.freq_decoder.uncompress_block_unsorted(self.remaining_data.as_ref()); + let num_consumed_bytes = self.freq_decoder.uncompress_block_unsorted( + self.remaining_data.as_ref(), + ); self.remaining_data.advance(num_consumed_bytes); } // it will be used as the next offset. 
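The hunks around this point decode the posting layout that `from_data` sets up above: `doc_freq / COMPRESSION_BLOCK_SIZE` full blocks of sorted, bitpacked doc ids (each followed by a bitpacked block of term frequencies when `has_freq` is set), then a variable-int encoded tail for the remaining documents. A minimal sketch of that split, mirroring the arithmetic in `from_data`; the value 128 for `COMPRESSION_BLOCK_SIZE` is an assumption and is not taken from this patch:

// Sketch only; COMPRESSION_BLOCK_SIZE is assumed to be 128 here.
const COMPRESSION_BLOCK_SIZE: usize = 128;

/// Returns (number of full bitpacked blocks, number of vint-encoded tail docs)
/// for a posting list covering `doc_freq` documents.
fn block_layout(doc_freq: usize) -> (usize, usize) {
    let num_bitpacked_blocks = doc_freq / COMPRESSION_BLOCK_SIZE;
    let num_vint_docs = doc_freq - num_bitpacked_blocks * COMPRESSION_BLOCK_SIZE;
    (num_bitpacked_blocks, num_vint_docs)
}

// For instance, a term with doc_freq == 300 is stored as two full blocks of
// 128 doc ids plus a vint-encoded tail of 44 doc ids: block_layout(300) == (2, 44).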
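Positions rely on the same small-integer trick: `TFAndPositionRecorder::serialize` above records each position as its gap from the previous one (hence the `prev_position` bookkeeping), and the gaps are summed back into absolute positions when they are read. A self-contained sketch of that round trip, with illustrative helper names that do not appear in the patch:

// Sketch only; these helpers are illustrative and not part of the codebase.
fn delta_encode_positions(positions: &[u32]) -> Vec<u32> {
    // Store each position as the gap from the previous one within the document.
    let mut prev = 0u32;
    positions
        .iter()
        .map(|&pos| {
            let delta = pos - prev;
            prev = pos;
            delta
        })
        .collect()
}

fn delta_decode_positions(deltas: &[u32]) -> Vec<u32> {
    // Reconstruct absolute positions with a running sum.
    let mut acc = 0u32;
    deltas
        .iter()
        .map(|&d| {
            acc += d;
            acc
        })
        .collect()
}

// Example: positions [3, 7, 12] are recorded as [3, 4, 5] and decoded back to [3, 7, 12].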
@@ -418,15 +416,17 @@ impl BlockSegmentPostings { self.num_binpacked_blocks -= 1; true } else if self.num_vint_docs > 0 { - let num_compressed_bytes = - self.doc_decoder - .uncompress_vint_sorted(self.remaining_data.as_ref(), - self.doc_offset, - self.num_vint_docs); + let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted( + self.remaining_data.as_ref(), + self.doc_offset, + self.num_vint_docs, + ); self.remaining_data.advance(num_compressed_bytes); if self.has_freq { - self.freq_decoder - .uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs); + self.freq_decoder.uncompress_vint_unsorted( + self.remaining_data.as_ref(), + self.num_vint_docs, + ); } self.num_vint_docs = 0; true @@ -508,12 +508,13 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let segment_reader = searcher.segment_reader(0); - let inverted_index = segment_reader.inverted_index(int_field).unwrap(); + let inverted_index = segment_reader.inverted_index(int_field); let term = Term::from_field_u64(int_field, 0u64); let term_info = inverted_index.get_term_info(&term).unwrap(); - let mut block_segments = - inverted_index - .read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq); + let mut block_segments = inverted_index.read_block_postings_from_terminfo( + &term_info, + SegmentPostingsOption::NoFreq, + ); let mut offset: u32 = 0u32; // checking that the block before calling advance is empty assert!(block_segments.docs().is_empty()); @@ -549,17 +550,18 @@ mod tests { let mut block_segments; { let term = Term::from_field_u64(int_field, 0u64); - let inverted_index = segment_reader.inverted_index(int_field).unwrap(); + let inverted_index = segment_reader.inverted_index(int_field); let term_info = inverted_index.get_term_info(&term).unwrap(); - block_segments = - inverted_index - .read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq); + block_segments = inverted_index.read_block_postings_from_terminfo( + &term_info, + SegmentPostingsOption::NoFreq, + ); } assert!(block_segments.advance()); assert!(block_segments.docs() == &[0, 2, 4]); { let term = Term::from_field_u64(int_field, 1u64); - let inverted_index = segment_reader.inverted_index(int_field).unwrap(); + let inverted_index = segment_reader.inverted_index(int_field); let term_info = inverted_index.get_term_info(&term).unwrap(); inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments); } diff --git a/src/postings/segment_postings_option.rs b/src/postings/segment_postings_option.rs index 2aba4ec8e..b50e2eee4 100644 --- a/src/postings/segment_postings_option.rs +++ b/src/postings/segment_postings_option.rs @@ -17,7 +17,6 @@ pub enum SegmentPostingsOption { } impl SegmentPostingsOption { - /// Returns true iff this option includes encoding /// term frequencies. 
pub fn has_freq(&self) -> bool { diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 14a22ccea..4c37e015d 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -57,11 +57,12 @@ pub struct InvertedIndexSerializer { impl InvertedIndexSerializer { /// Open a new `PostingsSerializer` for the given segment - fn new(terms_write: CompositeWrite, - postings_write: CompositeWrite, - positions_write: CompositeWrite, - schema: Schema) - -> Result { + fn new( + terms_write: CompositeWrite, + postings_write: CompositeWrite, + positions_write: CompositeWrite, + schema: Schema, + ) -> Result { Ok(InvertedIndexSerializer { terms_write: terms_write, postings_write: postings_write, @@ -78,7 +79,8 @@ impl InvertedIndexSerializer { CompositeWrite::wrap(segment.open_write(TERMS)?), CompositeWrite::wrap(segment.open_write(POSTINGS)?), CompositeWrite::wrap(segment.open_write(POSITIONS)?), - segment.schema()) + segment.schema(), + ) } /// Must be called before starting pushing terms of @@ -94,7 +96,7 @@ impl InvertedIndexSerializer { field_entry.field_type().clone(), term_dictionary_write, postings_write, - positions_write + positions_write, ) } @@ -120,7 +122,6 @@ pub struct FieldSerializer<'a> { impl<'a> FieldSerializer<'a> { - fn new( field_type: FieldType, term_dictionary_write: &'a mut CountingWriter, @@ -128,25 +129,24 @@ impl<'a> FieldSerializer<'a> { positions_write: &'a mut CountingWriter, ) -> io::Result> { - let (term_freq_enabled, position_enabled): (bool, bool) = - match field_type { - FieldType::Str(ref text_options) => { - let text_indexing_options = text_options.get_indexing_options(); - (text_indexing_options.is_termfreq_enabled(), text_indexing_options.is_position_enabled()) - }, - _ => { - (false, false) - } - }; - let term_dictionary_builder = TermDictionaryBuilderImpl::new(term_dictionary_write, field_type)?; - let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled); - let positions_serializer_opt = - if position_enabled { - Some(PositionSerializer::new(positions_write)) + let (term_freq_enabled, position_enabled): (bool, bool) = match field_type { + FieldType::Str(ref text_options) => { + let text_indexing_options = text_options.get_indexing_options(); + ( + text_indexing_options.is_termfreq_enabled(), + text_indexing_options.is_position_enabled(), + ) } - else { - None - }; + _ => (false, false), + }; + let term_dictionary_builder = + TermDictionaryBuilderImpl::new(term_dictionary_write, field_type)?; + let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled); + let positions_serializer_opt = if position_enabled { + Some(PositionSerializer::new(positions_write)) + } else { + None + }; Ok(FieldSerializer { term_dictionary_builder: term_dictionary_builder, @@ -159,9 +159,9 @@ impl<'a> FieldSerializer<'a> { fn current_term_info(&self) -> TermInfo { let (filepos, offset) = self.positions_serializer_opt - .as_ref() - .map(|positions_serializer| positions_serializer.addr()) - .unwrap_or((0u32, 0u8)); + .as_ref() + .map(|positions_serializer| positions_serializer.addr()) + .unwrap_or((0u32, 0u8)); TermInfo { doc_freq: 0, postings_offset: self.postings_serializer.addr(), @@ -194,11 +194,12 @@ impl<'a> FieldSerializer<'a> { /// /// Term frequencies and positions may be ignored by the serializer depending /// on the configuration of the field in the `Schema`. 
- pub fn write_doc(&mut self, - doc_id: DocId, - term_freq: u32, - position_deltas: &[u32]) - -> io::Result<()> { + pub fn write_doc( + &mut self, + doc_id: DocId, + term_freq: u32, + position_deltas: &[u32], + ) -> io::Result<()> { self.current_term_info.doc_freq += 1; self.postings_serializer.write_doc(doc_id, term_freq)?; if let Some(ref mut positions_serializer) = self.positions_serializer_opt.as_mut() { @@ -213,7 +214,9 @@ impl<'a> FieldSerializer<'a> { /// using `VInt` encoding. pub fn close_term(&mut self) -> io::Result<()> { if self.term_open { - self.term_dictionary_builder.insert_value(&self.current_term_info)?; + self.term_dictionary_builder.insert_value( + &self.current_term_info, + )?; self.postings_serializer.close_term()?; self.term_open = false; } @@ -251,8 +254,8 @@ impl PostingsSerializer { postings_write: CountingWriter::wrap(write), block_encoder: BlockEncoder::new(), - doc_ids: vec!(), - term_freqs: vec!(), + doc_ids: vec![], + term_freqs: vec![], last_doc_id_encoded: 0u32, termfreq_enabled: termfreq_enabled, @@ -267,16 +270,17 @@ impl PostingsSerializer { if self.doc_ids.len() == COMPRESSION_BLOCK_SIZE { { // encode the doc ids - let block_encoded: &[u8] = - self.block_encoder - .compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded); + let block_encoded: &[u8] = self.block_encoder.compress_block_sorted( + &self.doc_ids, + self.last_doc_id_encoded, + ); self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1]; self.postings_write.write_all(block_encoded)?; } if self.termfreq_enabled { // encode the term_freqs - let block_encoded: &[u8] = self.block_encoder - .compress_block_unsorted(&self.term_freqs); + let block_encoded: &[u8] = + self.block_encoder.compress_block_unsorted(&self.term_freqs); self.postings_write.write_all(block_encoded)?; self.term_freqs.clear(); } @@ -294,16 +298,18 @@ impl PostingsSerializer { // In that case, the remaining part is encoded // using variable int encoding. { - let block_encoded = - self.block_encoder - .compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded); + let block_encoded = self.block_encoder.compress_vint_sorted( + &self.doc_ids, + self.last_doc_id_encoded, + ); self.postings_write.write_all(block_encoded)?; self.doc_ids.clear(); } // ... Idem for term frequencies if self.termfreq_enabled { - let block_encoded = self.block_encoder - .compress_vint_unsorted(&self.term_freqs[..]); + let block_encoded = self.block_encoder.compress_vint_unsorted( + &self.term_freqs[..], + ); self.postings_write.write_all(block_encoded)?; self.term_freqs.clear(); } @@ -373,4 +379,3 @@ impl PositionSerializer { self.write.flush() } } - diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs index 51ae7083a..375f73202 100644 --- a/src/postings/term_info.rs +++ b/src/postings/term_info.rs @@ -12,7 +12,7 @@ use std::io; /// * `postings_offset` : an offset in the `.idx` file /// addressing the start of the posting list associated /// to this term. 
-#[derive(Debug,Default,Ord,PartialOrd,Eq,PartialEq,Clone)] +#[derive(Debug, Default, Ord, PartialOrd, Eq, PartialEq, Clone)] pub struct TermInfo { /// Number of documents in the segment containing the term pub doc_freq: u32, diff --git a/src/query/boolean_query/boolean_query.rs b/src/query/boolean_query/boolean_query.rs index b471da320..ba9f93b19 100644 --- a/src/query/boolean_query/boolean_query.rs +++ b/src/query/boolean_query/boolean_query.rs @@ -37,10 +37,12 @@ impl Query for BooleanQuery { } fn weight(&self, searcher: &Searcher) -> Result> { - let sub_weights = try!(self.subqueries - .iter() - .map(|&(ref _occur, ref subquery)| subquery.weight(searcher)) - .collect()); + let sub_weights = try!( + self.subqueries + .iter() + .map(|&(ref _occur, ref subquery)| subquery.weight(searcher)) + .collect() + ); let occurs: Vec = self.subqueries .iter() .map(|&(ref occur, ref _subquery)| *occur) @@ -57,10 +59,9 @@ impl BooleanQuery { let occur_term_queries: Vec<(Occur, Box)> = terms .into_iter() .map(|term| { - let term_query: Box = box TermQuery::new(term, - SegmentPostingsOption::Freq); - (Occur::Should, term_query) - }) + let term_query: Box = box TermQuery::new(term, SegmentPostingsOption::Freq); + (Occur::Should, term_query) + }) .collect(); BooleanQuery::from(occur_term_queries) } diff --git a/src/query/boolean_query/boolean_scorer.rs b/src/query/boolean_query/boolean_scorer.rs index 595f54219..723e4a92d 100644 --- a/src/query/boolean_query/boolean_scorer.rs +++ b/src/query/boolean_query/boolean_scorer.rs @@ -55,11 +55,11 @@ impl BooleanScorer { .map(|posting| posting.doc()) .enumerate() .map(|(ord, doc)| { - HeapItem { - doc: doc, - ord: ord as u32, - } - }) + HeapItem { + doc: doc, + ord: ord as u32, + } + }) .collect(); BooleanScorer { scorers: non_empty_scorers, diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index 04f22595c..0ff49cbde 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -22,11 +22,12 @@ impl BooleanWeight { impl Weight for BooleanWeight { fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result> { - let sub_scorers: Vec> = - try!(self.weights - .iter() - .map(|weight| weight.scorer(reader)) - .collect()); + let sub_scorers: Vec> = try!( + self.weights + .iter() + .map(|weight| weight.scorer(reader)) + .collect() + ); let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter); Ok(box boolean_scorer) } diff --git a/src/query/boolean_query/mod.rs b/src/query/boolean_query/mod.rs index 01ef9e824..73f659a03 100644 --- a/src/query/boolean_query/mod.rs +++ b/src/query/boolean_query/mod.rs @@ -64,8 +64,10 @@ mod tests { } let make_term_query = |text: &str| { - let term_query = TermQuery::new(Term::from_field_text(text_field, text), - SegmentPostingsOption::NoFreq); + let term_query = TermQuery::new( + Term::from_field_text(text_field, text), + SegmentPostingsOption::NoFreq, + ); let query: Box = box term_query; query }; @@ -87,19 +89,25 @@ mod tests { assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]); } { - let boolean_query = BooleanQuery::from(vec![(Occur::Should, make_term_query("a")), - (Occur::Should, make_term_query("b"))]); + let boolean_query = BooleanQuery::from(vec![ + (Occur::Should, make_term_query("a")), + (Occur::Should, make_term_query("b")), + ]); assert_eq!(matching_docs(&boolean_query), vec![0, 1, 2, 3]); } { - let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")), - (Occur::Should, 
make_term_query("b"))]); + let boolean_query = BooleanQuery::from(vec![ + (Occur::Must, make_term_query("a")), + (Occur::Should, make_term_query("b")), + ]); assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]); } { - let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")), - (Occur::Should, make_term_query("b")), - (Occur::MustNot, make_term_query("d"))]); + let boolean_query = BooleanQuery::from(vec![ + (Occur::Must, make_term_query("a")), + (Occur::Should, make_term_query("b")), + (Occur::MustNot, make_term_query("d")), + ]); assert_eq!(matching_docs(&boolean_query), vec![0, 1]); } { diff --git a/src/query/phrase_query/mod.rs b/src/query/phrase_query/mod.rs index 4ad89a3b2..8adc4728b 100644 --- a/src/query/phrase_query/mod.rs +++ b/src/query/phrase_query/mod.rs @@ -61,9 +61,9 @@ mod tests { .map(|text| Term::from_field_text(text_field, text)) .collect(); let phrase_query = PhraseQuery::from(terms); - searcher - .search(&phrase_query, &mut test_collector) - .expect("search should succeed"); + searcher.search(&phrase_query, &mut test_collector).expect( + "search should succeed", + ); test_collector.docs() }; diff --git a/src/query/phrase_query/phrase_weight.rs b/src/query/phrase_query/phrase_weight.rs index 119f32dbe..1a85342b9 100644 --- a/src/query/phrase_query/phrase_weight.rs +++ b/src/query/phrase_query/phrase_weight.rs @@ -22,7 +22,7 @@ impl Weight for PhraseWeight { fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result> { let mut term_postings_list = Vec::new(); for term in &self.phrase_terms { - let inverted_index = reader.inverted_index(term.field())?; + let inverted_index = reader.inverted_index(term.field()); let term_postings_option = inverted_index.read_postings(term, SegmentPostingsOption::FreqAndPositions); if let Some(term_postings) = term_postings_option { @@ -31,6 +31,8 @@ impl Weight for PhraseWeight { return Ok(box EmptyScorer); } } - Ok(box PhraseScorer { intersection_docset: IntersectionDocSet::from(term_postings_list) }) + Ok(box PhraseScorer { + intersection_docset: IntersectionDocSet::from(term_postings_list), + }) } } diff --git a/src/query/query.rs b/src/query/query.rs index 683281dc6..59e1f2cbf 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -66,7 +66,10 @@ pub trait Query: fmt::Debug { let mut segment_search_timer = search_timer.open("segment_search"); { let _ = segment_search_timer.open("set_segment"); - try!(collector.set_segment(segment_ord as SegmentLocalId, segment_reader)); + try!(collector.set_segment( + segment_ord as SegmentLocalId, + segment_reader, + )); } let mut scorer = try!(weight.scorer(segment_reader)); { diff --git a/src/query/query_parser/query_grammar.rs b/src/query/query_parser/query_grammar.rs index 08f167b25..8fa2a3c11 100644 --- a/src/query/query_parser/query_grammar.rs +++ b/src/query/query_parser/query_grammar.rs @@ -3,7 +3,8 @@ use combine::char::*; use super::user_input_ast::*; fn literal(input: I) -> ParseResult - where I: Stream +where + I: Stream, { let term_val = || { let word = many1(satisfy(|c: char| c.is_alphanumeric())); @@ -11,27 +12,29 @@ fn literal(input: I) -> ParseResult phrase.or(word) }; - let negative_numbers = (char('-'), many1(satisfy(|c: char| c.is_numeric()))) - .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2)); + let negative_numbers = (char('-'), many1(satisfy(|c: char| c.is_numeric()))).map( + |(s1, s2): (char, String)| format!("{}{}", s1, s2), + ); - let field = (letter(), many(satisfy(|c: char| c.is_alphanumeric() || c == '_'))) - .map(|(s1, s2): (char, 
String)| format!("{}{}", s1, s2)); + let field = ( + letter(), + many(satisfy(|c: char| c.is_alphanumeric() || c == '_')), + ).map(|(s1, s2): (char, String)| format!("{}{}", s1, s2)); let term_val_with_field = negative_numbers.or(term_val()); let term_query = (field, char(':'), term_val_with_field).map(|(field_name, _, phrase)| { - UserInputLiteral { - field_name: - Some(field_name), - phrase: phrase, - } - }); + UserInputLiteral { + field_name: Some(field_name), + phrase: phrase, + } + }); let term_default_field = term_val().map(|phrase| { - UserInputLiteral { - field_name: None, - phrase: phrase, - } - }); + UserInputLiteral { + field_name: None, + phrase: phrase, + } + }); try(term_query) .or(term_default_field) .map(UserInputAST::from) @@ -40,25 +43,29 @@ fn literal(input: I) -> ParseResult fn leaf(input: I) -> ParseResult - where I: Stream +where + I: Stream, { (char('-'), parser(literal)) .map(|(_, expr)| UserInputAST::Not(box expr)) - .or((char('+'), parser(literal)).map(|(_, expr)| UserInputAST::Must(box expr))) + .or((char('+'), parser(literal)).map(|(_, expr)| { + UserInputAST::Must(box expr) + })) .or(parser(literal)) .parse_stream(input) } pub fn parse_to_ast(input: I) -> ParseResult - where I: Stream +where + I: Stream, { sep_by(parser(leaf), spaces()) .map(|subqueries: Vec| if subqueries.len() == 1 { - subqueries.into_iter().next().unwrap() - } else { - UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect()) - }) + subqueries.into_iter().next().unwrap() + } else { + UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect()) + }) .parse_stream(input) } diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 0b6b43efe..5beb42745 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -117,20 +117,22 @@ impl QueryParser { /// Parse the user query into an AST. 
fn parse_query_to_logical_ast(&self, query: &str) -> Result { - let (user_input_ast, _remaining) = parse_to_ast(query) - .map_err(|_| QueryParserError::SyntaxError)?; + let (user_input_ast, _remaining) = parse_to_ast(query).map_err( + |_| QueryParserError::SyntaxError, + )?; self.compute_logical_ast(user_input_ast) } fn resolve_field_name(&self, field_name: &str) -> Result { - self.schema - .get_field(field_name) - .ok_or_else(|| QueryParserError::FieldDoesNotExist(String::from(field_name))) + self.schema.get_field(field_name).ok_or_else(|| { + QueryParserError::FieldDoesNotExist(String::from(field_name)) + }) } - fn compute_logical_ast(&self, - user_input_ast: UserInputAST) - -> Result { + fn compute_logical_ast( + &self, + user_input_ast: UserInputAST, + ) -> Result { let (occur, ast) = self.compute_logical_ast_with_occur(user_input_ast)?; if occur == Occur::MustNot { return Err(QueryParserError::AllButQueryForbidden); @@ -138,10 +140,11 @@ impl QueryParser { Ok(ast) } - fn compute_logical_ast_for_leaf(&self, - field: Field, - phrase: &str) - -> Result, QueryParserError> { + fn compute_logical_ast_for_leaf( + &self, + field: Field, + phrase: &str, + ) -> Result, QueryParserError> { let field_entry = self.schema.get_field_entry(field); let field_type = field_entry.field_type(); @@ -174,7 +177,9 @@ impl QueryParser { if terms.is_empty() { Ok(None) } else if terms.len() == 1 { - Ok(Some(LogicalLiteral::Term(terms.into_iter().next().unwrap()))) + Ok(Some( + LogicalLiteral::Term(terms.into_iter().next().unwrap()), + )) } else { Ok(Some(LogicalLiteral::Phrase(terms))) } @@ -191,18 +196,24 @@ impl QueryParser { } } - fn compute_logical_ast_with_occur(&self, - user_input_ast: UserInputAST) - -> Result<(Occur, LogicalAST), QueryParserError> { + fn compute_logical_ast_with_occur( + &self, + user_input_ast: UserInputAST, + ) -> Result<(Occur, LogicalAST), QueryParserError> { match user_input_ast { UserInputAST::Clause(sub_queries) => { let default_occur = self.default_occur(); - let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!(sub_queries.into_iter() - .map(|sub_query| self.compute_logical_ast_with_occur(*sub_query)) - .map(|res| { - res.map(|(occur, sub_ast)| (compose_occur(default_occur, occur), sub_ast)) - }) - .collect()); + let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!( + sub_queries + .into_iter() + .map(|sub_query| self.compute_logical_ast_with_occur(*sub_query)) + .map(|res| { + res.map(|(occur, sub_ast)| { + (compose_occur(default_occur, occur), sub_ast) + }) + }) + .collect() + ); Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries))) } UserInputAST::Not(subquery) => { @@ -320,9 +331,10 @@ mod test { } - fn parse_query_to_logical_ast(query: &str, - default_conjunction: bool) - -> Result { + fn parse_query_to_logical_ast( + query: &str, + default_conjunction: bool, + ) -> Result { let mut query_parser = make_query_parser(); if default_conjunction { query_parser.set_conjunction_by_default(); @@ -330,9 +342,11 @@ mod test { query_parser.parse_query_to_logical_ast(query) } - fn test_parse_query_to_logical_ast_helper(query: &str, - expected: &str, - default_conjunction: bool) { + fn test_parse_query_to_logical_ast_helper( + query: &str, + expected: &str, + default_conjunction: bool, + ) { let query = parse_query_to_logical_ast(query, default_conjunction).unwrap(); let query_str = format!("{:?}", query); assert_eq!(query_str, expected); @@ -358,21 +372,29 @@ mod test { } }; - assert_eq!(is_not_indexed_err("notindexed_text:titi"), - 
Some(String::from("notindexed_text"))); - assert_eq!(is_not_indexed_err("notindexed_u64:23424"), - Some(String::from("notindexed_u64"))); - assert_eq!(is_not_indexed_err("notindexed_i64:-234324"), - Some(String::from("notindexed_i64"))); + assert_eq!( + is_not_indexed_err("notindexed_text:titi"), + Some(String::from("notindexed_text")) + ); + assert_eq!( + is_not_indexed_err("notindexed_u64:23424"), + Some(String::from("notindexed_u64")) + ); + assert_eq!( + is_not_indexed_err("notindexed_i64:-234324"), + Some(String::from("notindexed_i64")) + ); } #[test] pub fn test_parse_query_untokenized() { - test_parse_query_to_logical_ast_helper("nottokenized:\"wordone wordtwo\"", - "Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, \ + test_parse_query_to_logical_ast_helper( + "nottokenized:\"wordone wordtwo\"", + "Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, \ 101, 32, 119, 111, 114, 100, 116, 119, 111])", - false); + false, + ); } #[test] @@ -381,82 +403,115 @@ mod test { assert!(query_parser.parse_query("signed:2324").is_ok()); assert!(query_parser.parse_query("signed:\"22\"").is_ok()); assert!(query_parser.parse_query("signed:\"-2234\"").is_ok()); - assert!(query_parser - .parse_query("signed:\"-9999999999999\"") - .is_ok()); + assert!( + query_parser + .parse_query("signed:\"-9999999999999\"") + .is_ok() + ); assert!(query_parser.parse_query("signed:\"a\"").is_err()); assert!(query_parser.parse_query("signed:\"2a\"").is_err()); - assert!(query_parser - .parse_query("signed:\"18446744073709551615\"") - .is_err()); + assert!( + query_parser + .parse_query("signed:\"18446744073709551615\"") + .is_err() + ); assert!(query_parser.parse_query("unsigned:\"2\"").is_ok()); assert!(query_parser.parse_query("unsigned:\"-2\"").is_err()); - assert!(query_parser - .parse_query("unsigned:\"18446744073709551615\"") - .is_ok()); - test_parse_query_to_logical_ast_helper("unsigned:2324", - "Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])", - false); + assert!( + query_parser + .parse_query("unsigned:\"18446744073709551615\"") + .is_ok() + ); + test_parse_query_to_logical_ast_helper( + "unsigned:2324", + "Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])", + false, + ); - test_parse_query_to_logical_ast_helper("signed:-2324", - &format!("{:?}", - Term::from_field_i64(Field(2u32), -2324)), - false); + test_parse_query_to_logical_ast_helper( + "signed:-2324", + &format!("{:?}", Term::from_field_i64(Field(2u32), -2324)), + false, + ); } #[test] pub fn test_parse_query_to_ast_disjunction() { - test_parse_query_to_logical_ast_helper("title:toto", - "Term([0, 0, 0, 0, 116, 111, 116, 111])", - false); - test_parse_query_to_logical_ast_helper("+title:toto", - "Term([0, 0, 0, 0, 116, 111, 116, 111])", - false); - test_parse_query_to_logical_ast_helper("+title:toto -titi", - "(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \ + test_parse_query_to_logical_ast_helper( + "title:toto", + "Term([0, 0, 0, 0, 116, 111, 116, 111])", + false, + ); + test_parse_query_to_logical_ast_helper( + "+title:toto", + "Term([0, 0, 0, 0, 116, 111, 116, 111])", + false, + ); + test_parse_query_to_logical_ast_helper( + "+title:toto -titi", + "(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \ -(Term([0, 0, 0, 0, 116, 105, 116, 105]) \ Term([0, 0, 0, 1, 116, 105, 116, 105])))", - false); - assert_eq!(parse_query_to_logical_ast("-title:toto", false) - .err() - .unwrap(), - QueryParserError::AllButQueryForbidden); - test_parse_query_to_logical_ast_helper("title:a b", - "(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) \ + false, + ); + assert_eq!( + 
parse_query_to_logical_ast("-title:toto", false) + .err() + .unwrap(), + QueryParserError::AllButQueryForbidden + ); + test_parse_query_to_logical_ast_helper( + "title:a b", + "(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) \ Term([0, 0, 0, 1, 98])))", - false); - test_parse_query_to_logical_ast_helper("title:\"a b\"", - "\"[Term([0, 0, 0, 0, 97]), \ + false, + ); + test_parse_query_to_logical_ast_helper( + "title:\"a b\"", + "\"[Term([0, 0, 0, 0, 97]), \ Term([0, 0, 0, 0, 98])]\"", - false); + false, + ); } #[test] pub fn test_parse_query_to_ast_conjunction() { - test_parse_query_to_logical_ast_helper("title:toto", - "Term([0, 0, 0, 0, 116, 111, 116, 111])", - true); - test_parse_query_to_logical_ast_helper("+title:toto", - "Term([0, 0, 0, 0, 116, 111, 116, 111])", - true); - test_parse_query_to_logical_ast_helper("+title:toto -titi", - "(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \ + test_parse_query_to_logical_ast_helper( + "title:toto", + "Term([0, 0, 0, 0, 116, 111, 116, 111])", + true, + ); + test_parse_query_to_logical_ast_helper( + "+title:toto", + "Term([0, 0, 0, 0, 116, 111, 116, 111])", + true, + ); + test_parse_query_to_logical_ast_helper( + "+title:toto -titi", + "(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \ -(Term([0, 0, 0, 0, 116, 105, 116, 105]) \ Term([0, 0, 0, 1, 116, 105, 116, 105])))", - true); - assert_eq!(parse_query_to_logical_ast("-title:toto", true) - .err() - .unwrap(), - QueryParserError::AllButQueryForbidden); - test_parse_query_to_logical_ast_helper("title:a b", - "(+Term([0, 0, 0, 0, 97]) \ + true, + ); + assert_eq!( + parse_query_to_logical_ast("-title:toto", true) + .err() + .unwrap(), + QueryParserError::AllButQueryForbidden + ); + test_parse_query_to_logical_ast_helper( + "title:a b", + "(+Term([0, 0, 0, 0, 97]) \ +(Term([0, 0, 0, 0, 98]) \ Term([0, 0, 0, 1, 98])))", - true); - test_parse_query_to_logical_ast_helper("title:\"a b\"", - "\"[Term([0, 0, 0, 0, 97]), \ + true, + ); + test_parse_query_to_logical_ast_helper( + "title:\"a b\"", + "\"[Term([0, 0, 0, 0, 97]), \ Term([0, 0, 0, 0, 98])]\"", - true); + true, + ); } } diff --git a/src/query/term_query/mod.rs b/src/query/term_query/mod.rs index 9670e73e2..bbc751c5e 100644 --- a/src/query/term_query/mod.rs +++ b/src/query/term_query/mod.rs @@ -44,8 +44,10 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); - let term_query = TermQuery::new(Term::from_field_text(text_field, "a"), - SegmentPostingsOption::NoFreq); + let term_query = TermQuery::new( + Term::from_field_text(text_field, "a"), + SegmentPostingsOption::NoFreq, + ); let term_weight = term_query.weight(&searcher).unwrap(); let segment_reader = searcher.segment_reader(0); let mut term_scorer = term_weight.scorer(segment_reader).unwrap(); diff --git a/src/query/term_query/term_scorer.rs b/src/query/term_query/term_scorer.rs index 73ea46b4b..95787a030 100644 --- a/src/query/term_query/term_scorer.rs +++ b/src/query/term_query/term_scorer.rs @@ -7,7 +7,8 @@ use postings::Postings; use fastfield::FastFieldReader; pub struct TermScorer - where TPostings: Postings +where + TPostings: Postings, { pub idf: Score, pub fieldnorm_reader_opt: Option, @@ -15,7 +16,8 @@ pub struct TermScorer } impl TermScorer - where TPostings: Postings +where + TPostings: Postings, { pub fn postings(&self) -> &TPostings { &self.postings @@ -23,7 +25,8 @@ impl TermScorer } impl DocSet for TermScorer - where TPostings: Postings +where + TPostings: Postings, { fn advance(&mut self) -> bool { self.postings.advance() @@ -40,7 +43,8 @@ impl DocSet for 
TermScorer } impl Scorer for TermScorer - where TPostings: Postings +where + TPostings: Postings, { fn score(&self) -> Score { let doc = self.postings.doc(); diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index a755a2921..d837a63fd 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -28,21 +28,22 @@ impl TermWeight { } /// If the field is not found, returns an empty `DocSet`. - pub fn specialized_scorer(&self, - reader: &SegmentReader) - -> Result> { + pub fn specialized_scorer( + &self, + reader: &SegmentReader, + ) -> Result> { let field = self.term.field(); - let inverted_index = reader.inverted_index(field)?; + let inverted_index = reader.inverted_index(field); let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field); - let postings_opt: Option = inverted_index.read_postings(&self.term, self.segment_postings_options); + let postings_opt: Option = + inverted_index.read_postings(&self.term, self.segment_postings_options); if let Some(segment_postings) = postings_opt { Ok(TermScorer { idf: self.idf(), fieldnorm_reader_opt: fieldnorm_reader_opt, postings: segment_postings, }) - } - else { + } else { Ok(TermScorer { idf: 1f32, fieldnorm_reader_opt: None, diff --git a/src/schema/field.rs b/src/schema/field.rs index 9df8e149b..b7ecc3737 100644 --- a/src/schema/field.rs +++ b/src/schema/field.rs @@ -10,7 +10,7 @@ use common::BinarySerializable; /// /// Because the field id is a `u8`, tantivy can only have at most `255` fields. /// Value 255 is reserved. -#[derive(Copy, Clone, Debug, PartialEq,PartialOrd,Eq,Ord,Hash, Serialize, Deserialize)] +#[derive(Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash, Serialize, Deserialize)] pub struct Field(pub u32); impl BinarySerializable for Field { diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index 883dc49ff..7487ff7c1 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -89,7 +89,8 @@ impl FieldEntry { impl Serialize for FieldEntry { fn serialize(&self, serializer: S) -> Result - where S: Serializer + where + S: Serializer, { let mut s = serializer.serialize_struct("field_entry", 3)?; s.serialize_field("name", &self.name)?; @@ -115,7 +116,8 @@ impl Serialize for FieldEntry { impl<'de> Deserialize<'de> for FieldEntry { fn deserialize(deserializer: D) -> Result - where D: Deserializer<'de> + where + D: Deserializer<'de>, { #[derive(Deserialize)] #[serde(field_identifier, rename_all = "lowercase")] @@ -137,7 +139,8 @@ impl<'de> Deserialize<'de> for FieldEntry { } fn visit_map(self, mut map: V) -> Result - where V: MapAccess<'de> + where + V: MapAccess<'de>, { let mut name = None; let mut ty = None; @@ -187,13 +190,14 @@ impl<'de> Deserialize<'de> for FieldEntry { let name = name.ok_or_else(|| de::Error::missing_field("name"))?; ty.ok_or_else(|| de::Error::missing_field("ty"))?; - let field_type = field_type - .ok_or_else(|| de::Error::missing_field("options"))?; + let field_type = field_type.ok_or_else( + || de::Error::missing_field("options"), + )?; Ok(FieldEntry { - name: name, - field_type: field_type, - }) + name: name, + field_type: field_type, + }) } } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 7a494c9e4..f31c6e9da 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -80,8 +80,9 @@ impl FieldType { FieldType::Str(_) => Ok(Value::Str(field_text.clone())), FieldType::U64(_) | FieldType::I64(_) => { - Err(ValueParsingError::TypeError(format!("Expected an integer, got 
{:?}", - json))) + Err(ValueParsingError::TypeError( + format!("Expected an integer, got {:?}", json), + )) } } } @@ -110,9 +111,11 @@ impl FieldType { } } _ => { - let msg = format!("Json value not supported error {:?}. Expected {:?}", - json, - self); + let msg = format!( + "Json value not supported error {:?}. Expected {:?}", + json, + self + ); Err(ValueParsingError::TypeError(msg)) } } diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 7c5f480dc..93f50ff48 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -105,9 +105,9 @@ impl SchemaBuilder { /// This will consume your `SchemaBuilder` pub fn build(self) -> Schema { Schema(Arc::new(InnerSchema { - fields: self.fields, - fields_map: self.fields_map, - })) + fields: self.fields, + fields_map: self.fields_map, + })) } } @@ -206,15 +206,14 @@ impl Schema { /// Build a document object from a json-object. pub fn parse_document(&self, doc_json: &str) -> Result { - let json_obj: JsonObject = serde_json::from_str(doc_json) - .map_err(|_| { - let doc_json_sample: String = if doc_json.len() < 20 { - String::from(doc_json) - } else { - format!("{:?}...", &doc_json[0..20]) - }; - DocParsingError::NotJSON(doc_json_sample) - })?; + let json_obj: JsonObject = serde_json::from_str(doc_json).map_err(|_| { + let doc_json_sample: String = if doc_json.len() < 20 { + String::from(doc_json) + } else { + format!("{:?}...", &doc_json[0..20]) + }; + DocParsingError::NotJSON(doc_json_sample) + })?; let mut doc = Document::default(); for (field_name, json_value) in json_obj.iter() { @@ -225,18 +224,15 @@ impl Schema { match *json_value { JsonValue::Array(ref json_items) => { for json_item in json_items { - let value = try!(field_type - .value_from_json(json_item) - .map_err(|e| { - DocParsingError::ValueError(field_name.clone(), e) - })); + let value = + try!(field_type.value_from_json(json_item).map_err(|e| { + DocParsingError::ValueError(field_name.clone(), e) + })); doc.add(FieldValue::new(field, value)); } } _ => { - let value = try!(field_type - .value_from_json(json_value) - .map_err(|e| { + let value = try!(field_type.value_from_json(json_value).map_err(|e| { DocParsingError::ValueError(field_name.clone(), e) })); doc.add(FieldValue::new(field, value)); @@ -259,7 +255,8 @@ impl fmt::Debug for Schema { impl Serialize for Schema { fn serialize(&self, serializer: S) -> Result - where S: Serializer + where + S: Serializer, { let mut seq = serializer.serialize_seq(Some(self.0.fields.len()))?; for e in &self.0.fields { @@ -271,7 +268,8 @@ impl Serialize for Schema { impl<'de> Deserialize<'de> for Schema { fn deserialize(deserializer: D) -> Result - where D: Deserializer<'de> + where + D: Deserializer<'de>, { struct SchemaVisitor; @@ -283,7 +281,8 @@ impl<'de> Deserialize<'de> for Schema { } fn visit_seq(self, mut seq: A) -> Result - where A: SeqAccess<'de> + where + A: SeqAccess<'de>, { let mut schema = SchemaBuilder { fields: Vec::with_capacity(seq.size_hint().unwrap_or(0)), @@ -430,12 +429,14 @@ mod tests { } { let doc = schema - .parse_document(r#"{ + .parse_document( + r#"{ "title": "my title", "author": "fulmicoton", "count": 4, "popularity": 10 - }"#) + }"#, + ) .unwrap(); assert_eq!(doc.get_first(title_field).unwrap().text(), "my title"); assert_eq!(doc.get_first(author_field).unwrap().text(), "fulmicoton"); @@ -443,13 +444,15 @@ mod tests { assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10); } { - let json_err = schema.parse_document(r#"{ + let json_err = schema.parse_document( + r#"{ "title": "my title", 
"author": "fulmicoton", "count": 4, "popularity": 10, "jambon": "bayonne" - }"#); + }"#, + ); match json_err { Err(DocParsingError::NoSuchFieldInSchema(field_name)) => { assert_eq!(field_name, "jambon"); @@ -460,13 +463,15 @@ mod tests { } } { - let json_err = schema.parse_document(r#"{ + let json_err = schema.parse_document( + r#"{ "title": "my title", "author": "fulmicoton", "count": "5", "popularity": "10", "jambon": "bayonne" - }"#); + }"#, + ); match json_err { Err(DocParsingError::ValueError(_, ValueParsingError::TypeError(_))) => { assert!(true); @@ -477,12 +482,14 @@ mod tests { } } { - let json_err = schema.parse_document(r#"{ + let json_err = schema.parse_document( + r#"{ "title": "my title", "author": "fulmicoton", "count": -5, "popularity": 10 - }"#); + }"#, + ); match json_err { Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => { assert!(true); @@ -493,12 +500,14 @@ mod tests { } } { - let json_err = schema.parse_document(r#"{ + let json_err = schema.parse_document( + r#"{ "title": "my title", "author": "fulmicoton", "count": 9223372036854775808, "popularity": 10 - }"#); + }"#, + ); match json_err { Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => { panic!("expected 9223372036854775808 to fit into u64, but it didn't"); @@ -509,12 +518,14 @@ mod tests { } } { - let json_err = schema.parse_document(r#"{ + let json_err = schema.parse_document( + r#"{ "title": "my title", "author": "fulmicoton", "count": 50, "popularity": 9223372036854775808 - }"#); + }"#, + ); match json_err { Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => { assert!(true); @@ -525,11 +536,13 @@ mod tests { } } { - let json_err = schema.parse_document(r#"{ + let json_err = schema.parse_document( + r#"{ "title": "my title", "author": "fulmicoton", "count": 50, - }"#); + }"#, + ); match json_err { Err(NotJSON(_)) => { assert!(true); diff --git a/src/schema/term.rs b/src/schema/term.rs index f66144b07..197f4975a 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -13,7 +13,9 @@ const INT_TERM_LEN: usize = 4 + 8; /// /// It actually wraps a `Vec`. #[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)] -pub struct Term>(B) where B: AsRef<[u8]>; +pub struct Term>(B) +where + B: AsRef<[u8]>; impl Term { /// Builds a term given a field, and a u64-value @@ -109,7 +111,8 @@ impl Term { } impl Term - where B: AsRef<[u8]> +where + B: AsRef<[u8]>, { /// Wraps a source of data pub fn wrap(data: B) -> Term { @@ -166,7 +169,8 @@ impl Term } impl AsRef<[u8]> for Term - where B: AsRef<[u8]> +where + B: AsRef<[u8]>, { fn as_ref(&self) -> &[u8] { self.0.as_ref() diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index 472bd3e1e..ddcd9948e 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -2,7 +2,7 @@ use std::ops::BitOr; /// Define how a text field should be handled by tantivy. -#[derive(Clone,Debug,PartialEq,Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct TextOptions { indexing: TextIndexingOptions, stored: bool, @@ -45,10 +45,10 @@ impl Default for TextOptions { /// Describe how a field should be indexed -#[derive(Clone,Copy,Debug,PartialEq,PartialOrd,Eq,Hash, Serialize, Deserialize)] +#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Hash, Serialize, Deserialize)] pub enum TextIndexingOptions { /// Unindexed fields will not generate any postings. They will not be searchable either. 
- #[serde(rename="unindexed")] + #[serde(rename = "unindexed")] Unindexed, /// Untokenized means that the field text will not be split into tokens before being indexed. /// A field with the value "Hello world", will have the document suscribe to one single @@ -56,23 +56,23 @@ pub enum TextIndexingOptions { /// /// It will **not** be searchable if the user enter "hello" for instance. /// This can be useful for tags, or ids for instance. - #[serde(rename="untokenized")] + #[serde(rename = "untokenized")] Untokenized, /// TokenizedNoFreq will tokenize the field value, and append the document doc id /// to the posting lists associated to all of the tokens. /// The frequence of appearance of the term in the document however will be lost. /// The term frequency used in the TfIdf formula will always be 1. - #[serde(rename="tokenize")] + #[serde(rename = "tokenize")] TokenizedNoFreq, /// TokenizedWithFreq will tokenize the field value, and encode /// both the docid and the term frequency in the posting lists associated to all - #[serde(rename="freq")] + #[serde(rename = "freq")] TokenizedWithFreq, /// Like TokenizedWithFreq, but also encodes the positions of the /// terms in a separate file. This option is required for phrase queries. /// Don't use this if you are certain you won't need it, the term positions file /// can be very big. - #[serde(rename="position")] + #[serde(rename = "position")] TokenizedWithFreqAndPosition, } diff --git a/src/schema/value.rs b/src/schema/value.rs index ad24688ee..828822a8e 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -16,7 +16,8 @@ pub enum Value { impl Serialize for Value { fn serialize(&self, serializer: S) -> Result - where S: Serializer + where + S: Serializer, { match *self { Value::Str(ref v) => serializer.serialize_str(v), @@ -28,7 +29,8 @@ impl Serialize for Value { impl<'de> Deserialize<'de> for Value { fn deserialize(deserializer: D) -> Result - where D: Deserializer<'de> + where + D: Deserializer<'de>, { struct ValueVisitor; @@ -162,9 +164,13 @@ mod binary_serialize { Ok(Value::I64(value)) } _ => { - Err(io::Error::new(io::ErrorKind::InvalidData, - format!("No field type is associated with code {:?}", - type_code))) + Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "No field type is associated with code {:?}", + type_code + ), + )) } } } diff --git a/src/store/mod.rs b/src/store/mod.rs index 59e0558d1..46138d556 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -54,17 +54,19 @@ mod tests { fn write_lorem_ipsum_store(writer: WritePtr, num_docs: usize) -> Schema { let mut schema_builder = SchemaBuilder::default(); let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored()); - let field_title = schema_builder - .add_text_field("title", TextOptions::default().set_stored()); + let field_title = + schema_builder.add_text_field("title", TextOptions::default().set_stored()); let schema = schema_builder.build(); - let lorem = String::from("Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \ + let lorem = String::from( + "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \ do eiusmod tempor incididunt ut labore et dolore magna aliqua. \ Ut enim ad minim veniam, quis nostrud exercitation ullamco \ laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \ dolor in reprehenderit in voluptate velit esse cillum dolore eu \ fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non \ proident, sunt in culpa qui officia deserunt mollit anim id est \ - laborum."); + laborum.", + ); { let mut store_writer = StoreWriter::new(writer); for i in 0..num_docs { @@ -96,8 +98,10 @@ mod tests { let store_source = directory.open_read(path).unwrap(); let store = StoreReader::from_source(store_source); for i in 0..1_000 { - assert_eq!(*store.get(i).unwrap().get_first(field_title).unwrap().text(), - format!("Doc {}", i)); + assert_eq!( + *store.get(i).unwrap().get_first(field_title).unwrap().text(), + format!("Doc {}", i) + ); } } @@ -106,9 +110,9 @@ mod tests { let mut directory = MmapDirectory::create_from_tempdir().unwrap(); let path = Path::new("store"); b.iter(|| { - write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000); - directory.delete(path).unwrap(); - }); + write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000); + directory.delete(path).unwrap(); + }); } diff --git a/src/store/reader.rs b/src/store/reader.rs index 05781a583..72f9b2da7 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -49,7 +49,7 @@ impl StoreReader { let mut cursor = &total_buffer[block_offset..]; let block_length = u32::deserialize(&mut cursor).unwrap(); let block_array: &[u8] = &total_buffer[(block_offset + 4 as usize).. - (block_offset + 4 + block_length as usize)]; + (block_offset + 4 + block_length as usize)]; let mut lz4_decoder = try!(lz4::Decoder::new(block_array)); *self.current_block_offset.borrow_mut() = usize::max_value(); try!(lz4_decoder.read_to_end(&mut current_block_mut).map(|_| ())); @@ -94,5 +94,9 @@ fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId) let offset = u64::deserialize(&mut serialized_offset_buf).unwrap(); let offset = offset as usize; let max_doc = u32::deserialize(&mut serialized_offset_buf).unwrap(); - (data.slice(0, offset), data.slice(offset, footer_offset), max_doc) + ( + data.slice(0, offset), + data.slice(offset, footer_offset), + max_doc, + ) } diff --git a/src/store/writer.rs b/src/store/writer.rs index 28befa7af..2b7aacb19 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -49,12 +49,15 @@ impl StoreWriter { /// pub fn store<'a>(&mut self, field_values: &[&'a FieldValue]) -> io::Result<()> { self.intermediary_buffer.clear(); - try!((field_values.len() as u32).serialize(&mut self.intermediary_buffer)); + try!((field_values.len() as u32).serialize( + &mut self.intermediary_buffer, + )); for field_value in field_values { try!((*field_value).serialize(&mut self.intermediary_buffer)); } - (self.intermediary_buffer.len() as u32) - .serialize(&mut self.current_block)?; + (self.intermediary_buffer.len() as u32).serialize( + &mut self.current_block, + )?; self.current_block.write_all(&self.intermediary_buffer[..])?; self.doc += 1; if self.current_block.len() > BLOCK_SIZE { @@ -66,16 +69,22 @@ impl StoreWriter { fn write_and_compress_block(&mut self) -> io::Result<()> { self.intermediary_buffer.clear(); { - let mut encoder = try!(lz4::EncoderBuilder::new().build(&mut self.intermediary_buffer)); + let mut encoder = try!(lz4::EncoderBuilder::new().build( + &mut self.intermediary_buffer, + )); try!(encoder.write_all(&self.current_block)); let (_, encoder_result) = encoder.finish(); try!(encoder_result); } - (self.intermediary_buffer.len() as u32) - .serialize(&mut self.writer)?; + (self.intermediary_buffer.len() as u32).serialize( + &mut self.writer, + )?; self.writer.write_all(&self.intermediary_buffer)?; - self.offset_index_writer - .insert(self.doc, 
&(self.writer.written_bytes() as u64))?; + self.offset_index_writer.insert( + self.doc, + &(self.writer.written_bytes() as + u64), + )?; self.current_block.clear(); Ok(()) } @@ -90,8 +99,7 @@ impl StoreWriter { try!(self.write_and_compress_block()); } let header_offset: u64 = self.writer.written_bytes() as u64; - try!(self.offset_index_writer - .write(&mut self.writer)); + try!(self.offset_index_writer.write(&mut self.writer)); try!(header_offset.serialize(&mut self.writer)); try!(self.doc.serialize(&mut self.writer)); self.writer.flush() diff --git a/src/termdict/fstdict/streamer.rs b/src/termdict/fstdict/streamer.rs index 823e5cdc4..1d90fe9c1 100644 --- a/src/termdict/fstdict/streamer.rs +++ b/src/termdict/fstdict/streamer.rs @@ -5,17 +5,13 @@ use super::TermDictionaryImpl; use termdict::{TermStreamerBuilder, TermStreamer}; /// See [`TermStreamerBuilder`](./trait.TermStreamerBuilder.html) -pub struct TermStreamerBuilderImpl<'a> -{ +pub struct TermStreamerBuilderImpl<'a> { fst_map: &'a TermDictionaryImpl, stream_builder: StreamBuilder<'a>, } -impl<'a> TermStreamerBuilderImpl<'a> -{ - pub(crate) fn new(fst_map: &'a TermDictionaryImpl, - stream_builder: StreamBuilder<'a>) - -> Self { +impl<'a> TermStreamerBuilderImpl<'a> { + pub(crate) fn new(fst_map: &'a TermDictionaryImpl, stream_builder: StreamBuilder<'a>) -> Self { TermStreamerBuilderImpl { fst_map: fst_map, stream_builder: stream_builder, @@ -23,8 +19,7 @@ impl<'a> TermStreamerBuilderImpl<'a> } } -impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> -{ +impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> { type Streamer = TermStreamerImpl<'a>; fn ge>(mut self, bound: T) -> Self { @@ -60,8 +55,7 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> /// See [`TermStreamer`](./trait.TermStreamer.html) -pub struct TermStreamerImpl<'a> -{ +pub struct TermStreamerImpl<'a> { fst_map: &'a TermDictionaryImpl, stream: Stream<'a>, offset: u64, @@ -69,17 +63,15 @@ pub struct TermStreamerImpl<'a> current_value: TermInfo, } -impl<'a> TermStreamer for TermStreamerImpl<'a> -{ +impl<'a> TermStreamer for TermStreamerImpl<'a> { fn advance(&mut self) -> bool { if let Some((term, offset)) = self.stream.next() { self.current_key.clear(); self.current_key.extend_from_slice(term); self.offset = offset; - self.current_value = - self.fst_map - .read_value(self.offset) - .expect("Fst data is corrupted. Failed to deserialize a value."); + self.current_value = self.fst_map.read_value(self.offset).expect( + "Fst data is corrupted. 
Failed to deserialize a value.", + ); true } else { false diff --git a/src/termdict/fstdict/termdict.rs b/src/termdict/fstdict/termdict.rs index 253e70b2a..ce608113b 100644 --- a/src/termdict/fstdict/termdict.rs +++ b/src/termdict/fstdict/termdict.rs @@ -13,14 +13,14 @@ fn convert_fst_error(e: fst::Error) -> io::Error { } /// See [`TermDictionaryBuilder`](./trait.TermDictionaryBuilder.html) -pub struct TermDictionaryBuilderImpl -{ +pub struct TermDictionaryBuilderImpl { fst_builder: fst::MapBuilder, data: Vec, } impl TermDictionaryBuilderImpl - where W: Write +where + W: Write, { /// # Warning /// Horribly dangerous internal API @@ -46,14 +46,15 @@ impl TermDictionaryBuilderImpl } impl TermDictionaryBuilder for TermDictionaryBuilderImpl - where W: Write +where + W: Write, { fn new(w: W, _field_type: FieldType) -> io::Result { let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?; Ok(TermDictionaryBuilderImpl { - fst_builder: fst_builder, - data: Vec::new(), - }) + fst_builder: fst_builder, + data: Vec::new(), + }) } fn insert>(&mut self, key_ref: K, value: &TermInfo) -> io::Result<()> { @@ -75,28 +76,25 @@ impl TermDictionaryBuilder for TermDictionaryBuilderImpl } } -fn open_fst_index(source: ReadOnlySource) -> io::Result { +fn open_fst_index(source: ReadOnlySource) -> fst::Map { let fst = match source { ReadOnlySource::Anonymous(data) => { - Fst::from_shared_bytes(data.data, data.start, data.len) - .map_err(convert_fst_error)? + Fst::from_shared_bytes(data.data, data.start, data.len).expect("FST data is corrupted") } ReadOnlySource::Mmap(mmap_readonly) => { - Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)? + Fst::from_mmap(mmap_readonly).expect("FST data is corrupted") } }; - Ok(fst::Map::from(fst)) + fst::Map::from(fst) } /// See [`TermDictionary`](./trait.TermDictionary.html) -pub struct TermDictionaryImpl -{ +pub struct TermDictionaryImpl { fst_index: fst::Map, values_mmap: ReadOnlySource, } -impl TermDictionaryImpl -{ +impl TermDictionaryImpl { /// Deserialize and returns the value at address `offset` pub(crate) fn read_value(&self, offset: u64) -> io::Result { let buffer = self.values_mmap.as_slice(); @@ -106,34 +104,34 @@ impl TermDictionaryImpl } -impl<'a> TermDictionary<'a> for TermDictionaryImpl -{ +impl<'a> TermDictionary<'a> for TermDictionaryImpl { type Streamer = TermStreamerImpl<'a>; type StreamBuilder = TermStreamerBuilderImpl<'a>; - fn from_source(source: ReadOnlySource) -> io::Result { + fn from_source(source: ReadOnlySource) -> Self { let total_len = source.len(); let length_offset = total_len - 4; let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..]; - let footer_size = u32::deserialize(&mut split_len_buffer)? as usize; + let footer_size = u32::deserialize(&mut split_len_buffer).expect( + "Deserializing 4 bytes should always work", + ) as usize; let split_len = length_offset - footer_size; let fst_source = source.slice(0, split_len); let values_source = source.slice(split_len, length_offset); - let fst_index = open_fst_index(fst_source)?; - Ok(TermDictionaryImpl { - fst_index: fst_index, - values_mmap: values_source, - }) + let fst_index = open_fst_index(fst_source); + TermDictionaryImpl { + fst_index: fst_index, + values_mmap: values_source, + } } fn get>(&self, key: K) -> Option { - self.fst_index - .get(key) - .map(|offset| { - self.read_value(offset) - .expect("The fst is corrupted. Failed to deserialize a value.") - }) + self.fst_index.get(key).map(|offset| { + self.read_value(offset).expect( + "The fst is corrupted. 
Failed to deserialize a value.", + ) + }) } fn range(&self) -> TermStreamerBuilderImpl { diff --git a/src/termdict/merger.rs b/src/termdict/merger.rs index 1e0dde82f..517f9589a 100644 --- a/src/termdict/merger.rs +++ b/src/termdict/merger.rs @@ -4,30 +4,26 @@ use std::cmp::Ordering; use termdict::TermStreamer; use schema::Term; -pub struct HeapItem<'a> -{ +pub struct HeapItem<'a> { pub streamer: TermStreamerImpl<'a>, pub segment_ord: usize, } -impl<'a> PartialEq for HeapItem<'a> -{ +impl<'a> PartialEq for HeapItem<'a> { fn eq(&self, other: &Self) -> bool { self.segment_ord == other.segment_ord } } -impl<'a> Eq for HeapItem<'a> {} +impl<'a> Eq for HeapItem<'a> {} -impl<'a> PartialOrd for HeapItem<'a> -{ +impl<'a> PartialOrd for HeapItem<'a> { fn partial_cmp(&self, other: &HeapItem<'a>) -> Option { Some(self.cmp(other)) } } -impl<'a> Ord for HeapItem<'a> -{ +impl<'a> Ord for HeapItem<'a> { fn cmp(&self, other: &HeapItem<'a>) -> Ordering { (&other.streamer.key(), &other.segment_ord).cmp(&(&self.streamer.key(), &self.segment_ord)) } @@ -40,15 +36,12 @@ impl<'a> Ord for HeapItem<'a> /// - the term /// - a slice with the ordinal of the segments containing /// the terms. -pub struct TermMerger<'a> -{ +pub struct TermMerger<'a> { heap: BinaryHeap>, current_streamers: Vec>, } -impl<'a> TermMerger<'a> -{ - +impl<'a> TermMerger<'a> { /// Stream of merged term dictionary /// /// @@ -59,11 +52,11 @@ impl<'a> TermMerger<'a> .into_iter() .enumerate() .map(|(ord, streamer)| { - HeapItem { - streamer: streamer, - segment_ord: ord, - } - }) + HeapItem { + streamer: streamer, + segment_ord: ord, + } + }) .collect(), } } @@ -133,5 +126,3 @@ impl<'a> TermMerger<'a> } } } - - diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index 13a31b6d7..9150b8f85 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -54,16 +54,16 @@ use postings::TermInfo; pub use self::merger::TermMerger; -#[cfg(not(feature="streamdict"))] +#[cfg(not(feature = "streamdict"))] mod fstdict; -#[cfg(not(feature="streamdict"))] +#[cfg(not(feature = "streamdict"))] pub use self::fstdict::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl, TermStreamerBuilderImpl}; -#[cfg(feature="streamdict")] +#[cfg(feature = "streamdict")] mod streamdict; -#[cfg(feature="streamdict")] +#[cfg(feature = "streamdict")] pub use self::streamdict::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl, TermStreamerBuilderImpl}; @@ -72,7 +72,9 @@ use std::io; /// Dictionary associating sorted `&[u8]` to values -pub trait TermDictionary<'a> where Self: Sized +pub trait TermDictionary<'a> +where + Self: Sized, { /// Streamer type associated to the term dictionary type Streamer: TermStreamer + 'a; @@ -81,7 +83,7 @@ pub trait TermDictionary<'a> where Self: Sized type StreamBuilder: TermStreamerBuilder + 'a; /// Opens a `TermDictionary` given a data source. - fn from_source(source: ReadOnlySource) -> io::Result; + fn from_source(source: ReadOnlySource) -> Self; /// Lookups the value corresponding to the key. fn get>(&self, target_key: K) -> Option; @@ -110,7 +112,8 @@ pub trait TermDictionary<'a> where Self: Sized /// /// Inserting must be done in the order of the `keys`. pub trait TermDictionaryBuilder: Sized - where W: io::Write +where + W: io::Write, { /// Creates a new `TermDictionaryBuilder` fn new(write: W, field_type: FieldType) -> io::Result; @@ -170,8 +173,7 @@ pub trait TermStreamer: Sized { /// `TermStreamerBuilder` is an helper object used to define /// a range of terms that should be streamed. 
-pub trait TermStreamerBuilder -{ +pub trait TermStreamerBuilder { /// Associated `TermStreamer` type that this builder is building. type Streamer: TermStreamer; @@ -226,7 +228,8 @@ mod tests { { let write = directory.open_write(&path).unwrap(); let field_type = FieldType::Str(TEXT); - let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(write, field_type).unwrap(); + let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(write, field_type) + .unwrap(); term_dictionary_builder .insert("abc".as_bytes(), &make_term_info(34u32)) .unwrap(); @@ -236,7 +239,7 @@ mod tests { term_dictionary_builder.finish().unwrap(); } let source = directory.open_read(&path).unwrap(); - let term_dict: TermDictionaryImpl = TermDictionaryImpl::from_source(source).unwrap(); + let term_dict: TermDictionaryImpl = TermDictionaryImpl::from_source(source); assert_eq!(term_dict.get("abc").unwrap().doc_freq, 34u32); assert_eq!(term_dict.get("abcd").unwrap().doc_freq, 346u32); let mut stream = term_dict.stream(); @@ -296,7 +299,7 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); - let field_searcher = searcher.field(text_field).unwrap(); + let field_searcher = searcher.field(text_field); let mut term_it = field_searcher.terms(); let mut term_string = String::new(); while term_it.advance() { @@ -314,15 +317,17 @@ mod tests { .collect(); let field_type = FieldType::Str(TEXT); let buffer: Vec = { - let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap(); + let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type) + .unwrap(); for &(ref id, ref i) in &ids { - term_dictionary_builder.insert(id.as_bytes(), &make_term_info(*i)).unwrap(); + term_dictionary_builder + .insert(id.as_bytes(), &make_term_info(*i)) + .unwrap(); } term_dictionary_builder.finish().unwrap() }; let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source) - .unwrap(); + let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source); { let mut streamer = term_dictionary.stream(); let mut i = 0; @@ -343,16 +348,22 @@ mod tests { fn test_stream_high_range_prefix_suffix() { let field_type = FieldType::Str(TEXT); let buffer: Vec = { - let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap(); + let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type) + .unwrap(); // term requires more than 16bits - term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1)).unwrap(); - term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2)).unwrap(); - term_dictionary_builder.insert("abr", &make_term_info(2)).unwrap(); + term_dictionary_builder + .insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1)) + .unwrap(); + term_dictionary_builder + .insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2)) + .unwrap(); + term_dictionary_builder + .insert("abr", &make_term_info(2)) + .unwrap(); term_dictionary_builder.finish().unwrap() }; let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source) - .unwrap(); + let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source); let mut kv_stream = term_dictionary.stream(); assert!(kv_stream.advance()); assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes()); @@ -372,17 +383,19 @@ mod tests { .collect(); let field_type = 
FieldType::Str(TEXT); let buffer: Vec = { - let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap(); + let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type) + .unwrap(); for &(ref id, ref i) in &ids { - term_dictionary_builder.insert(id.as_bytes(), &make_term_info(*i)).unwrap(); + term_dictionary_builder + .insert(id.as_bytes(), &make_term_info(*i)) + .unwrap(); } term_dictionary_builder.finish().unwrap() }; let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source) - .unwrap(); + let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source); { for i in (0..20).chain(6000..8_000) { let &(ref target_key, _) = &ids[i]; @@ -440,16 +453,18 @@ mod tests { fn test_stream_range_boundaries() { let field_type = FieldType::Str(TEXT); let buffer: Vec = { - let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap(); + let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type) + .unwrap(); for i in 0u8..10u8 { let number_arr = [i; 1]; - term_dictionary_builder.insert(&number_arr, &make_term_info(i as u32)).unwrap(); + term_dictionary_builder + .insert(&number_arr, &make_term_info(i as u32)) + .unwrap(); } term_dictionary_builder.finish().unwrap() }; let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source) - .unwrap(); + let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source); let value_list = |mut streamer: TermStreamerImpl| { let mut res: Vec = vec![]; @@ -460,12 +475,17 @@ mod tests { }; { let range = term_dictionary.range().ge([2u8]).into_stream(); - assert_eq!(value_list(range), - vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]); + assert_eq!( + value_list(range), + vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32] + ); } { let range = term_dictionary.range().gt([2u8]).into_stream(); - assert_eq!(value_list(range), vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]); + assert_eq!( + value_list(range), + vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32] + ); } { let range = term_dictionary.range().lt([6u8]).into_stream(); @@ -473,7 +493,10 @@ mod tests { } { let range = term_dictionary.range().le([6u8]).into_stream(); - assert_eq!(value_list(range), vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]); + assert_eq!( + value_list(range), + vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32] + ); } { let range = term_dictionary.range().ge([0u8]).lt([5u8]).into_stream(); diff --git a/src/termdict/streamdict/delta_encoder.rs b/src/termdict/streamdict/delta_encoder.rs index 9a2a4173e..5ba466203 100644 --- a/src/termdict/streamdict/delta_encoder.rs +++ b/src/termdict/streamdict/delta_encoder.rs @@ -11,7 +11,7 @@ use common::BinarySerializable; fn common_prefix_len(s1: &[u8], s2: &[u8]) -> usize { s1.iter() .zip(s2.iter()) - .take_while(|&(a, b)| a==b) + .take_while(|&(a, b)| a == b) .count() } @@ -45,32 +45,28 @@ pub struct TermDeltaDecoder { impl TermDeltaDecoder { pub fn with_previous_term(term: Vec) -> TermDeltaDecoder { - TermDeltaDecoder { - term: Vec::from(term) - } + TermDeltaDecoder { term: Vec::from(term) } } #[inline(always)] pub fn decode<'a>(&mut self, code: u8, mut cursor: &'a [u8]) -> &'a [u8] { - let (prefix_len, suffix_len): (usize, usize) = - if (code & 1u8) == 1u8 { - let b = cursor[0]; - cursor = &cursor[1..]; - let prefix_len = (b & 15u8) as usize; - let suffix_len = (b >> 4u8) as 
usize; - (prefix_len, suffix_len) - } - else { - let prefix_len = u32::deserialize(&mut cursor).unwrap(); - let suffix_len = u32::deserialize(&mut cursor).unwrap(); - (prefix_len as usize, suffix_len as usize) - }; + let (prefix_len, suffix_len): (usize, usize) = if (code & 1u8) == 1u8 { + let b = cursor[0]; + cursor = &cursor[1..]; + let prefix_len = (b & 15u8) as usize; + let suffix_len = (b >> 4u8) as usize; + (prefix_len, suffix_len) + } else { + let prefix_len = u32::deserialize(&mut cursor).unwrap(); + let suffix_len = u32::deserialize(&mut cursor).unwrap(); + (prefix_len as usize, suffix_len as usize) + }; unsafe { self.term.set_len(prefix_len) }; self.term.extend_from_slice(&(*cursor)[..suffix_len]); &cursor[suffix_len..] } - pub fn term(&self) -> &[u8] { + pub fn term(&self) -> &[u8] { &self.term[..] } } @@ -89,7 +85,6 @@ pub struct TermInfoDeltaEncoder { } impl TermInfoDeltaEncoder { - pub fn new(has_positions: bool) -> Self { TermInfoDeltaEncoder { term_info: TermInfo::default(), @@ -109,7 +104,8 @@ impl TermInfoDeltaEncoder { positions_inner_offset: 0, }; if self.has_positions { - delta_term_info.delta_positions_offset = term_info.positions_offset - self.term_info.positions_offset; + delta_term_info.delta_positions_offset = term_info.positions_offset - + self.term_info.positions_offset; delta_term_info.positions_inner_offset = term_info.positions_inner_offset; } mem::replace(&mut self.term_info, term_info); @@ -131,7 +127,6 @@ pub fn make_mask(num_bytes: usize) -> u32 { } impl TermInfoDeltaDecoder { - pub fn from_term_info(term_info: TermInfo, has_positions: bool) -> TermInfoDeltaDecoder { TermInfoDeltaDecoder { term_info: term_info, @@ -147,7 +142,7 @@ impl TermInfoDeltaDecoder { positions_offset: checkpoint.positions_offset, positions_inner_offset: 0u8, }, - has_positions: has_positions + has_positions: has_positions, } } @@ -164,12 +159,12 @@ impl TermInfoDeltaDecoder { self.term_info.postings_offset += delta_postings_offset; if self.has_positions { let num_bytes_positions_offset = ((code >> 5) & 3) as usize + 1; - let delta_positions_offset: u32 = unsafe { *(cursor.as_ptr() as *const u32) } & make_mask(num_bytes_positions_offset); + let delta_positions_offset: u32 = unsafe { *(cursor.as_ptr() as *const u32) } & + make_mask(num_bytes_positions_offset); self.term_info.positions_offset += delta_positions_offset; self.term_info.positions_inner_offset = cursor[num_bytes_positions_offset]; &cursor[num_bytes_positions_offset + 1..] 
- } - else { + } else { cursor } } diff --git a/src/termdict/streamdict/mod.rs b/src/termdict/streamdict/mod.rs index f9c01529e..176f63377 100644 --- a/src/termdict/streamdict/mod.rs +++ b/src/termdict/streamdict/mod.rs @@ -22,7 +22,6 @@ pub struct CheckPoint { } impl BinarySerializable for CheckPoint { - fn serialize(&self, writer: &mut W) -> io::Result<()> { self.stream_offset.serialize(writer)?; self.postings_offset.serialize(writer)?; @@ -40,4 +39,4 @@ impl BinarySerializable for CheckPoint { positions_offset: positions_offset, }) } -} \ No newline at end of file +} diff --git a/src/termdict/streamdict/streamer.rs b/src/termdict/streamdict/streamer.rs index dcb4b8bdb..22f687da1 100644 --- a/src/termdict/streamdict/streamer.rs +++ b/src/termdict/streamdict/streamer.rs @@ -7,11 +7,11 @@ use postings::TermInfo; use super::delta_encoder::{TermInfoDeltaDecoder, TermDeltaDecoder}; -fn stream_before<'a>(term_dictionary: &'a TermDictionaryImpl, - target_key: &[u8], - has_positions: bool) - -> TermStreamerImpl<'a> -{ +fn stream_before<'a>( + term_dictionary: &'a TermDictionaryImpl, + target_key: &[u8], + has_positions: bool, +) -> TermStreamerImpl<'a> { let (prev_key, checkpoint) = term_dictionary.strictly_previous_key(target_key.as_ref()); let stream_data: &'a [u8] = &term_dictionary.stream_data()[checkpoint.stream_offset as usize..]; @@ -24,8 +24,7 @@ fn stream_before<'a>(term_dictionary: &'a TermDictionaryImpl, /// See [`TermStreamerBuilder`](./trait.TermStreamerBuilder.html) -pub struct TermStreamerBuilderImpl<'a> -{ +pub struct TermStreamerBuilderImpl<'a> { term_dictionary: &'a TermDictionaryImpl, origin: usize, offset_from: usize, @@ -35,14 +34,17 @@ pub struct TermStreamerBuilderImpl<'a> has_positions: bool, } -impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> -{ +impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> { type Streamer = TermStreamerImpl<'a>; /// Limit the range to terms greater or equal to the bound fn ge>(mut self, bound: T) -> Self { let target_key = bound.as_ref(); - let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.has_positions); + let streamer = stream_before( + self.term_dictionary, + target_key.as_ref(), + self.has_positions, + ); let smaller_than = |k: &[u8]| k.lt(target_key); let (offset_before, current_key, term_info) = get_offset(smaller_than, streamer); self.current_key = current_key; @@ -54,7 +56,11 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> /// Limit the range to terms strictly greater than the bound fn gt>(mut self, bound: T) -> Self { let target_key = bound.as_ref(); - let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.has_positions); + let streamer = stream_before( + self.term_dictionary, + target_key.as_ref(), + self.has_positions, + ); let smaller_than = |k: &[u8]| k.le(target_key); let (offset_before, current_key, term_info) = get_offset(smaller_than, streamer); self.current_key = current_key; @@ -66,7 +72,11 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> /// Limit the range to terms lesser or equal to the bound fn lt>(mut self, bound: T) -> Self { let target_key = bound.as_ref(); - let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.has_positions); + let streamer = stream_before( + self.term_dictionary, + target_key.as_ref(), + self.has_positions, + ); let smaller_than = |k: &[u8]| k.lt(target_key); let (offset_before, _, _) = get_offset(smaller_than, streamer); self.offset_to = offset_before - self.origin; @@ -76,7 
+86,11 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> /// Limit the range to terms lesser or equal to the bound fn le>(mut self, bound: T) -> Self { let target_key = bound.as_ref(); - let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.has_positions); + let streamer = stream_before( + self.term_dictionary, + target_key.as_ref(), + self.has_positions, + ); let smaller_than = |k: &[u8]| k.le(target_key); let (offset_before, _, _) = get_offset(smaller_than, streamer); self.offset_to = offset_before - self.origin; @@ -88,10 +102,13 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> let data: &[u8] = self.term_dictionary.stream_data(); let start = self.offset_from; let stop = max(self.offset_to, start); + let term_delta_decoder = TermDeltaDecoder::with_previous_term(self.current_key); + let term_info_decoder = + TermInfoDeltaDecoder::from_term_info(self.term_info, self.has_positions); TermStreamerImpl { cursor: &data[start..stop], - term_delta_decoder: TermDeltaDecoder::with_previous_term(self.current_key), - term_info_decoder: TermInfoDeltaDecoder::from_term_info(self.term_info, self.has_positions), // TODO checkpoint + term_delta_decoder: term_delta_decoder, + term_info_decoder: term_info_decoder, } } } @@ -103,10 +120,10 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> /// - the block start /// - the index within this block /// - the term_buffer state to initialize the block) -fn get_offset<'a, P: Fn(&[u8]) -> bool>(predicate: P, - mut streamer: TermStreamerImpl<'a>) - -> (usize, Vec, TermInfo) -{ +fn get_offset<'a, P: Fn(&[u8]) -> bool>( + predicate: P, + mut streamer: TermStreamerImpl<'a>, +) -> (usize, Vec, TermInfo) { let mut prev: &[u8] = streamer.cursor; let mut term_info = streamer.value().clone(); @@ -124,11 +141,8 @@ fn get_offset<'a, P: Fn(&[u8]) -> bool>(predicate: P, (prev.as_ptr() as usize, prev_data, term_info) } -impl<'a> TermStreamerBuilderImpl<'a> -{ - pub(crate) fn new( - term_dictionary: &'a TermDictionaryImpl, - has_positions: bool) -> Self { +impl<'a> TermStreamerBuilderImpl<'a> { + pub(crate) fn new(term_dictionary: &'a TermDictionaryImpl, has_positions: bool) -> Self { let data = term_dictionary.stream_data(); let origin = data.as_ptr() as usize; TermStreamerBuilderImpl { @@ -146,8 +160,7 @@ impl<'a> TermStreamerBuilderImpl<'a> /// See [`TermStreamer`](./trait.TermStreamer.html) -pub struct TermStreamerImpl<'a> -{ +pub struct TermStreamerImpl<'a> { cursor: &'a [u8], term_delta_decoder: TermDeltaDecoder, term_info_decoder: TermInfoDeltaDecoder, @@ -156,8 +169,7 @@ pub struct TermStreamerImpl<'a> -impl<'a> TermStreamer for TermStreamerImpl<'a> -{ +impl<'a> TermStreamer for TermStreamerImpl<'a> { fn advance(&mut self) -> bool { if self.cursor.is_empty() { return false; @@ -178,4 +190,3 @@ impl<'a> TermStreamer for TermStreamerImpl<'a> &self.term_info_decoder.term_info() } } - diff --git a/src/termdict/streamdict/termdict.rs b/src/termdict/streamdict/termdict.rs index bf4d899fd..f0f7c618f 100644 --- a/src/termdict/streamdict/termdict.rs +++ b/src/termdict/streamdict/termdict.rs @@ -30,20 +30,16 @@ fn has_positions(field_type: &FieldType) -> bool { let indexing_options = text_options.get_indexing_options(); if indexing_options.is_position_enabled() { true - } - else { + } else { false } } - _ => { - false - } + _ => false, } } /// See [`TermDictionaryBuilder`](./trait.TermDictionaryBuilder.html) -pub struct TermDictionaryBuilderImpl -{ +pub struct TermDictionaryBuilderImpl { write: CountingWriter, 
term_delta_encoder: TermDeltaEncoder, term_info_encoder: TermInfoDeltaEncoder, @@ -61,7 +57,8 @@ fn fill_last<'a>(fst: &'a Fst, mut node: Node<'a>, buffer: &mut Vec) { } impl TermDictionaryBuilderImpl - where W: Write +where + W: Write, { fn add_index_entry(&mut self) { let stream_offset = self.write.written_bytes() as u32; @@ -74,10 +71,17 @@ impl TermDictionaryBuilderImpl positions_offset: positions_offset, }; self.block_index - .insert(&self.term_delta_encoder.term(), self.checkpoints.len() as u64) - .expect("Serializing fst on a Vec should never fail. Where your terms not in order maybe?"); - checkpoint.serialize(&mut self.checkpoints) - .expect("Serializing checkpoint on a Vec should never fail."); + .insert( + &self.term_delta_encoder.term(), + self.checkpoints.len() as u64, + ) + .expect( + "Serializing fst on a Vec should never fail. \ + Where your terms not in order maybe?", + ); + checkpoint.serialize(&mut self.checkpoints).expect( + "Serializing checkpoint on a Vec should never fail.", + ); } /// # Warning @@ -98,7 +102,13 @@ impl TermDictionaryBuilderImpl pub(crate) fn insert_value(&mut self, term_info: &TermInfo) -> io::Result<()> { let delta_term_info = self.term_info_encoder.encode(term_info.clone()); let (prefix_len, suffix) = self.term_delta_encoder.prefix_suffix(); - write_term_kv(prefix_len, suffix, &delta_term_info, self.term_info_encoder.has_positions, &mut self.write)?; + write_term_kv( + prefix_len, + suffix, + &delta_term_info, + self.term_info_encoder.has_positions, + &mut self.write, + )?; self.len += 1; Ok(()) } @@ -108,19 +118,20 @@ fn num_bytes_required(mut n: u32) -> u8 { for i in 1u8..5u8 { if n < 256u32 { return i; - } - else { + } else { n /= 256; } } 0u8 } -fn write_term_kv(prefix_len: usize, - suffix: &[u8], - delta_term_info: &DeltaTermInfo, - has_positions: bool, - write: &mut W) -> io::Result<()> { +fn write_term_kv( + prefix_len: usize, + suffix: &[u8], + delta_term_info: &DeltaTermInfo, + has_positions: bool, + write: &mut W, +) -> io::Result<()> { let suffix_len = suffix.len(); let mut code = 0u8; let num_bytes_docfreq = num_bytes_required(delta_term_info.doc_freq); @@ -131,9 +142,13 @@ fn write_term_kv(prefix_len: usize, code |= (num_bytes_positions_offset - 1) << 5u8; if (prefix_len < 16) && (suffix_len < 16) { code |= 1u8; - write.write_all(&[code, (prefix_len as u8) | ((suffix_len as u8) << 4u8)])?; - } - else { + write.write_all( + &[ + code, + (prefix_len as u8) | ((suffix_len as u8) << 4u8), + ], + )?; + } else { write.write_all(&[code])?; (prefix_len as u32).serialize(write)?; (suffix_len as u32).serialize(write)?; @@ -145,11 +160,15 @@ fn write_term_kv(prefix_len: usize, } { let bytes: [u8; 4] = unsafe { transmute(delta_term_info.delta_postings_offset) }; - write.write_all(&bytes[0..num_bytes_postings_offset as usize])?; + write.write_all( + &bytes[0..num_bytes_postings_offset as usize], + )?; } if has_positions { let bytes: [u8; 4] = unsafe { transmute(delta_term_info.delta_positions_offset) }; - write.write_all(&bytes[0..num_bytes_positions_offset as usize])?; + write.write_all( + &bytes[0..num_bytes_positions_offset as usize], + )?; write.write_all(&[delta_term_info.positions_inner_offset])?; } Ok(()) @@ -157,7 +176,8 @@ fn write_term_kv(prefix_len: usize, } impl TermDictionaryBuilder for TermDictionaryBuilderImpl - where W: Write +where + W: Write, { /// Creates a new `TermDictionaryBuilder` fn new(mut write: W, field_type: FieldType) -> io::Result { @@ -169,7 +189,7 @@ impl TermDictionaryBuilder for TermDictionaryBuilderImpl 
term_delta_encoder: TermDeltaEncoder::default(), term_info_encoder: TermInfoDeltaEncoder::new(has_positions), block_index: fst::MapBuilder::new(vec![]).expect("This cannot fail"), - checkpoints: vec!(), + checkpoints: vec![], len: 0, }) } @@ -206,28 +226,22 @@ impl TermDictionaryBuilder for TermDictionaryBuilderImpl fn open_fst_index(source: ReadOnlySource) -> io::Result { use self::ReadOnlySource::*; let fst_result = match source { - Anonymous(data) => { - Fst::from_shared_bytes(data.data, data.start, data.len) - } - Mmap(mmap_readonly) => { - Fst::from_mmap(mmap_readonly) - } + Anonymous(data) => Fst::from_shared_bytes(data.data, data.start, data.len), + Mmap(mmap_readonly) => Fst::from_mmap(mmap_readonly), }; let fst = fst_result.map_err(convert_fst_error)?; Ok(fst::Map::from(fst)) } /// See [`TermDictionary`](./trait.TermDictionary.html) -pub struct TermDictionaryImpl -{ +pub struct TermDictionaryImpl { stream_data: ReadOnlySource, fst_index: fst::Map, checkpoints_data: ReadOnlySource, has_positions: bool, } -impl TermDictionaryImpl -{ +impl TermDictionaryImpl { pub(crate) fn stream_data(&self) -> &[u8] { self.stream_data.as_slice() } @@ -235,8 +249,8 @@ impl TermDictionaryImpl pub(crate) fn strictly_previous_key(&self, key: &[u8]) -> (Vec, CheckPoint) { let (term, checkpoint_offset) = self.strictly_previous_key_checkpoint_offset(key); let mut checkpoint_data = &self.checkpoints_data.as_slice()[checkpoint_offset..]; - let checkpoint = CheckPoint::deserialize(&mut checkpoint_data) - .expect("Checkpoint data is corrupted"); + let checkpoint = + CheckPoint::deserialize(&mut checkpoint_data).expect("Checkpoint data is corrupted"); (term, checkpoint) } @@ -288,47 +302,47 @@ impl TermDictionaryImpl -impl<'a> TermDictionary<'a> for TermDictionaryImpl -{ +impl<'a> TermDictionary<'a> for TermDictionaryImpl { type Streamer = TermStreamerImpl<'a>; type StreamBuilder = TermStreamerBuilderImpl<'a>; /// Opens a `TermDictionary` given a data source. - fn from_source(mut source: ReadOnlySource) -> io::Result { - let has_positions = source.slice(0, 1).as_ref()[0] == 255u8; + fn from_source(mut source: ReadOnlySource) -> Self { + let has_positions = source.slice(0, 1)[0] == 255u8; source = source.slice_from(1); let total_len = source.len(); let (body, footer) = source.split(total_len - 16); let mut footer_buffer: &[u8] = footer.as_slice(); - let fst_addr: usize = u64::deserialize(&mut footer_buffer)? as usize; - let checkpoints_addr: usize = u64::deserialize(&mut footer_buffer)? as usize; + let fst_addr = u64::deserialize(&mut footer_buffer).expect( + "deserializing 8 byte should never fail", + ) as usize; + let checkpoints_addr = u64::deserialize(&mut footer_buffer).expect( + "deserializing 8 byte should never fail", + ) as usize; let stream_data = body.slice(0, fst_addr - PADDING_SIZE); let fst_data = body.slice(fst_addr, checkpoints_addr); let checkpoints_data = body.slice_from(checkpoints_addr); - let fst_index = open_fst_index(fst_data)?; + let fst_index = open_fst_index(fst_data).expect("Index FST data corrupted"); - Ok(TermDictionaryImpl { + TermDictionaryImpl { has_positions: has_positions, stream_data: stream_data, checkpoints_data: checkpoints_data, fst_index: fst_index, - }) + } } /// Lookups the value corresponding to the key. 
fn get<K: AsRef<[u8]>>(&self, target_key: K) -> Option<TermInfo> { - let mut streamer = self.range() - .ge(&target_key) - .into_stream(); + let mut streamer = self.range().ge(&target_key).into_stream(); if streamer.advance() && streamer.key() == target_key.as_ref() { Some(streamer.value().clone()) - } - else { + } else { None } } @@ -353,4 +367,4 @@ mod tests { assert_eq!(num_bytes_required(256), 2); assert_eq!(num_bytes_required(u32::max_value()), 4); } -} \ No newline at end of file +}
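
A note for readers following the streamdict patches above: each term is front-coded against the previous term (only the shared-prefix length and the new suffix are stored), and each numeric delta is written on the minimal number of little-endian bytes, with the widths recorded in a per-entry code byte whose bit 0 flags the compact form where both the prefix and suffix lengths fit in one nibble each. The standalone sketch below restates that layout in safe Rust. It is illustrative only: the function names, the single `delta` field and the `main` driver are not part of the patch, which packs three separate widths (doc freq, postings offset, positions offset) into the code byte and uses `BinarySerializable`/`transmute` rather than `to_le_bytes`.

    /// Length of the prefix shared by two keys (mirrors `common_prefix_len`).
    fn common_prefix_len(s1: &[u8], s2: &[u8]) -> usize {
        s1.iter().zip(s2.iter()).take_while(|&(a, b)| a == b).count()
    }

    /// Minimal number of little-endian bytes needed to store `n` (1..=4).
    fn num_bytes_required(mut n: u32) -> u8 {
        for i in 1u8..5u8 {
            if n < 256 {
                return i;
            }
            n /= 256;
        }
        4 // unreachable for a u32; kept for symmetry with the patch
    }

    /// Encodes one term relative to the previous one, plus a single u32 delta
    /// (standing in for the postings/positions offset deltas of the patch).
    fn encode_entry(prev: &[u8], term: &[u8], delta: u32, out: &mut Vec<u8>) {
        let prefix_len = common_prefix_len(prev, term);
        let suffix = &term[prefix_len..];
        let num_bytes_delta = num_bytes_required(delta);

        // Code byte: bit 0 = "both lengths fit in a nibble",
        // bits 1..=2 = width of the delta minus one.
        let mut code = (num_bytes_delta - 1) << 1;
        if prefix_len < 16 && suffix.len() < 16 {
            code |= 1;
            out.push(code);
            out.push(prefix_len as u8 | ((suffix.len() as u8) << 4));
        } else {
            out.push(code);
            out.extend_from_slice(&(prefix_len as u32).to_le_bytes());
            out.extend_from_slice(&(suffix.len() as u32).to_le_bytes());
        }
        out.extend_from_slice(suffix);
        // Only the significant bytes of the delta are written.
        out.extend_from_slice(&delta.to_le_bytes()[..num_bytes_delta as usize]);
    }

    fn main() {
        let mut out = Vec::new();
        encode_entry(b"abcdef", b"abcdeg", 300, &mut out);
        // prefix "abcde" (5), suffix "g" (1), delta 300 stored on 2 bytes
        assert_eq!(out, vec![0b11, 5 | (1 << 4), b'g', 44, 1]);
        println!("{:?}", out);
    }

Decoding reverses these steps in the same way `TermDeltaDecoder::decode` does: read the code byte, recover the prefix and suffix lengths, truncate the previously decoded term to the prefix length, then append the suffix and the delta bytes that follow.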