From e486495cb8aeeddbdc5a8ec9e78bbb53313891c1 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 31 Jul 2016 15:34:32 +0900 Subject: [PATCH] Code cleaning. --- Cargo.toml | 10 ---------- src/common/serialize.rs | 2 +- src/compression/simdcomp.rs | 16 +++++++--------- src/core/merger.rs | 6 +++--- src/core/searcher.rs | 4 ++-- src/core/segment_reader.rs | 2 +- src/datastruct/skip/skiplist_builder.rs | 4 ++-- src/fastfield/mod.rs | 12 ++++++------ src/fastfield/writer.rs | 12 ++++++------ src/lib.rs | 9 +++++---- src/postings/chained_postings.rs | 8 ++++---- src/postings/docset.rs | 15 ++++++++++++++- src/postings/intersection.rs | 20 +------------------- src/postings/segment_postings.rs | 23 +---------------------- src/postings/union_postings.rs | 23 +++++------------------ src/postings/writer.rs | 2 +- src/schema/field.rs | 2 -- src/store/mod.rs | 4 ++-- src/store/reader.rs | 10 +++++----- src/store/writer.rs | 4 ++-- 20 files changed, 68 insertions(+), 120 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9a90c3ed0..e749e0e65 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,6 @@ version = "0.1.0" authors = ["Paul Masurel "] build = "build.rs" - [dependencies] byteorder = "0.4" memmap = "0.2" @@ -34,12 +33,3 @@ gcc = "0.3.24" [[bin]] name = "tantivy-merge" path = "src/cli/merge.rs" - - -# [profile.release] -# opt-level = 3 -# debug = true -# rpath = false -# lto = false -# debug-assertions = false -# codegen-units = 1 diff --git a/src/common/serialize.rs b/src/common/serialize.rs index 91c752fbe..fb37bd552 100644 --- a/src/common/serialize.rs +++ b/src/common/serialize.rs @@ -31,7 +31,7 @@ impl BinarySerializable for () { impl BinarySerializable for Vec { fn serialize(&self, writer: &mut Write) -> io::Result { let mut total_size = try!(VInt(self.len() as u64).serialize(writer)); - for it in self.iter() { + for it in self { total_size += try!(it.serialize(writer)); } Ok(total_size) diff --git a/src/compression/simdcomp.rs b/src/compression/simdcomp.rs index 60a1b8719..b113175ab 100644 --- a/src/compression/simdcomp.rs +++ b/src/compression/simdcomp.rs @@ -50,9 +50,9 @@ impl SIMDBlockEncoder { pub fn compress_vint_sorted(&mut self, input: &[u32], mut offset: u32) -> &[u8] { let mut byte_written = 0; - for v in input.iter() { - let mut to_encode: u32 = *v - offset; - offset = *v; + for &v in input { + let mut to_encode: u32 = v - offset; + offset = v; loop { let next_byte: u8 = (to_encode % 128u32) as u8; to_encode /= 128u32; @@ -72,8 +72,8 @@ impl SIMDBlockEncoder { pub fn compress_vint_unsorted(&mut self, input: &[u32]) -> &[u8] { let mut byte_written = 0; - for &i in input.iter() { - let mut to_encode: u32 = i; + for &v in input { + let mut to_encode: u32 = v; loop { let next_byte: u8 = (to_encode % 128u32) as u8; to_encode /= 128u32; @@ -267,15 +267,13 @@ mod tests { .map(|i| 4 + i * 7 / 2) .into_iter() .collect(); - for offset in [0u32, 1u32, 2u32].iter() { + for offset in &[0u32, 1u32, 2u32] { let encoded_data = encoder.compress_vint_sorted(&input, *offset); assert_eq!(encoded_data.len(), expected_length); let mut decoder = SIMDBlockDecoder::new(); let remaining_data = decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len()); assert_eq!(0, remaining_data.len()); - for (&decoded, &expected) in decoder.output_array().iter().zip(input.iter()) { - assert_eq!(decoded, expected); - } + assert_eq!(input, decoder.output_array()); } } { diff --git a/src/core/merger.rs b/src/core/merger.rs index c2d6d21c3..95041462a 100644 --- a/src/core/merger.rs +++ b/src/core/merger.rs @@ -55,7 +55,7 @@ impl<'a> PostingsMerger<'a> { fn new(readers: &'a Vec) -> PostingsMerger<'a> { let mut doc_offsets: Vec = Vec::new(); let mut max_doc = 0; - for reader in readers.iter() { + for reader in readers { doc_offsets.push(max_doc); max_doc += reader.max_doc(); }; @@ -142,7 +142,7 @@ impl IndexMerger { pub fn open(schema: Schema, segments: &Vec) -> io::Result { let mut readers = Vec::new(); let mut max_doc = 0; - for segment in segments.iter() { + for segment in segments { let reader = try!(SegmentReader::open(segment.clone())); max_doc += reader.max_doc(); readers.push(reader); @@ -166,7 +166,7 @@ impl IndexMerger { let mut u32_readers = Vec::new(); let mut min_val = u32::min_value(); let mut max_val = 0; - for reader in self.readers.iter() { + for reader in &self.readers { let u32_reader = try!(reader.get_fast_field_reader(field)); min_val = min(min_val, u32_reader.min_val()); max_val = max(max_val, u32_reader.max_val()); diff --git a/src/core/searcher.rs b/src/core/searcher.rs index f638bfc5a..a22e59754 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -18,8 +18,8 @@ impl Searcher { pub fn doc(&self, doc_address: &DocAddress) -> io::Result { // TODO err - let DocAddress(ref segment_local_id, ref doc_id) = *doc_address; - let segment_reader = &self.segments[*segment_local_id as usize]; + let DocAddress(segment_local_id, doc_id) = *doc_address; + let segment_reader = &self.segments[segment_local_id as usize]; segment_reader.doc(doc_id) } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 9888f4c6f..afcf5f6b6 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -101,7 +101,7 @@ impl SegmentReader { /// bearing the given doc id. /// This method is slow and should seldom be called from /// within a collector. - pub fn doc(&self, doc_id: &DocId) -> io::Result { + pub fn doc(&self, doc_id: DocId) -> io::Result { self.store_reader.get(doc_id) } diff --git a/src/datastruct/skip/skiplist_builder.rs b/src/datastruct/skip/skiplist_builder.rs index 766676868..41969ff68 100644 --- a/src/datastruct/skip/skiplist_builder.rs +++ b/src/datastruct/skip/skiplist_builder.rs @@ -97,13 +97,13 @@ impl SkipListBuilder { let mut layer_sizes: Vec = Vec::new(); size += self.data_layer.buffer.len() as u32; layer_sizes.push(size); - for layer in self.skip_layers.iter() { + for layer in &self.skip_layers { size += layer.buffer.len() as u32; layer_sizes.push(size); } try!(layer_sizes.serialize(output)); try!(self.data_layer.write(output)); - for layer in self.skip_layers.iter() { + for layer in &self.skip_layers { try!(layer.write(output)); } Ok(()) diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index c33a2e764..ea98fb8da 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -141,8 +141,8 @@ mod tests { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); let mut serializer = FastFieldSerializer::new(write).unwrap(); let mut fast_field_writers = U32FastFieldsWriter::from_schema(&schema); - for x in permutation.iter() { - add_single_field_doc(&mut fast_field_writers, field, x.clone()); + for x in &permutation { + add_single_field_doc(&mut fast_field_writers, field, *x); } fast_field_writers.serialize(&mut serializer).unwrap(); serializer.close().unwrap(); @@ -196,8 +196,8 @@ mod tests { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); let mut serializer = FastFieldSerializer::new(write).unwrap(); let mut fast_field_writers = U32FastFieldsWriter::from_schema(&schema); - for x in permutation.iter() { - add_single_field_doc(&mut fast_field_writers, field, x.clone()); + for x in &permutation { + add_single_field_doc(&mut fast_field_writers, field, *x); } fast_field_writers.serialize(&mut serializer).unwrap(); serializer.close().unwrap(); @@ -228,8 +228,8 @@ mod tests { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); let mut serializer = FastFieldSerializer::new(write).unwrap(); let mut fast_field_writers = U32FastFieldsWriter::from_schema(&schema); - for x in permutation.iter() { - add_single_field_doc(&mut fast_field_writers, field, x.clone()); + for x in &permutation { + add_single_field_doc(&mut fast_field_writers, field, *x); } fast_field_writers.serialize(&mut serializer).unwrap(); serializer.close().unwrap(); diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index e0e08ada7..329ea7708 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -35,7 +35,7 @@ impl U32FastFieldsWriter { } pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> { - for field_writer in self.field_writers.iter() { + for field_writer in &self.field_writers { try!(field_writer.serialize(serializer)); } Ok(()) @@ -83,11 +83,11 @@ impl U32FastFieldWriter { pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> { let zero = 0; - let min = self.vals.iter().min().unwrap_or(&zero).clone(); - let max = self.vals.iter().max().unwrap_or(&min).clone(); - try!(serializer.new_u32_fast_field(self.field.clone(), min, max)); - for val in self.vals.iter() { - try!(serializer.add_val(val.clone())); + let min = *self.vals.iter().min().unwrap_or(&zero); + let max = *self.vals.iter().max().unwrap_or(&min); + try!(serializer.new_u32_fast_field(self.field, min, max)); + for &val in &self.vals { + try!(serializer.add_val(val)); } serializer.close_field() } diff --git a/src/lib.rs b/src/lib.rs index 2f0669ec8..25d2012d9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -35,15 +35,16 @@ mod macros { } mod core; -mod datastruct; -mod postings; -mod directory; + mod compression; mod fastfield; mod store; mod common; -pub mod query; +pub mod postings; +pub mod query; +pub mod directory; +pub mod datastruct; pub mod analyzer; pub mod collector; diff --git a/src/postings/chained_postings.rs b/src/postings/chained_postings.rs index 446302279..e0ee0e85e 100644 --- a/src/postings/chained_postings.rs +++ b/src/postings/chained_postings.rs @@ -12,10 +12,10 @@ pub struct ChainedPostings<'a> { impl<'a> ChainedPostings<'a> { pub fn new(chained_postings: Vec>) -> ChainedPostings { - let mut doc_freq: usize = 0; - for segment_postings in chained_postings.iter() { - doc_freq += segment_postings.doc_freq(); - } + let doc_freq: usize = chained_postings + .iter() + .map(|segment_postings| segment_postings.doc_freq()) + .fold(0, |sum, addition| sum + addition); ChainedPostings { chained_postings: chained_postings, posting_id: 0, diff --git a/src/postings/docset.rs b/src/postings/docset.rs index 326729cf8..25e5b71b3 100644 --- a/src/postings/docset.rs +++ b/src/postings/docset.rs @@ -1,6 +1,7 @@ use DocId; use std::borrow::Borrow; use std::borrow::BorrowMut; +use std::cmp::Ordering; #[derive(PartialEq, Eq, Debug)] pub enum SkipResult { @@ -18,7 +19,19 @@ pub trait DocSet { // after skipping position // the iterator in such a way that doc() will return a // value greater or equal to target. - fn skip_next(&mut self, target: DocId) -> SkipResult; + fn skip_next(&mut self, target: DocId) -> SkipResult { + loop { + match self.doc().cmp(&target) { + Ordering::Less => { + if !self.next() { + return SkipResult::End; + } + }, + Ordering::Equal => { return SkipResult::Reached }, + Ordering::Greater => { return SkipResult::OverStep }, + } + } + } fn doc(&self,) -> DocId; diff --git a/src/postings/intersection.rs b/src/postings/intersection.rs index a8f4154f6..a381aebef 100644 --- a/src/postings/intersection.rs +++ b/src/postings/intersection.rs @@ -1,5 +1,4 @@ use postings::DocSet; -use postings::SkipResult; use std::cmp::Ordering; use DocId; @@ -76,26 +75,9 @@ impl<'a> DocSet for IntersectionDocSet<'a> { fn doc_freq(&self,) -> usize { // TODO not a great idea. - panic!("intersectiond does not implement doc freq"); + panic!("intersection does not implement doc freq"); } - - fn skip_next(&mut self, target: DocId) -> SkipResult { - loop { - match self.doc().cmp(&target) { - Ordering::Equal => { - return SkipResult::Reached; - } - Ordering::Greater => { - return SkipResult::OverStep; - } - Ordering::Less => {} - } - if !self.next() { - return SkipResult::End; - } - } - } } #[inline(never)] diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 4b23c239f..5934cb249 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -1,7 +1,6 @@ use compression::{NUM_DOCS_PER_BLOCK, SIMDBlockDecoder}; use DocId; -use std::cmp::Ordering; -use postings::{Postings, FreqHandler, SkipResult, DocSet}; +use postings::{Postings, FreqHandler, DocSet}; use std::num::Wrapping; @@ -82,26 +81,6 @@ impl<'a> DocSet for SegmentPostings<'a> { self.block_decoder.output(self.index_within_block()) } - // after skipping position - // the iterator in such a way that doc() will return a - // value greater or equal to target. - fn skip_next(&mut self, target: DocId) -> SkipResult { - loop { - match self.doc().cmp(&target) { - Ordering::Equal => { - return SkipResult::Reached; - } - Ordering::Greater => { - return SkipResult::OverStep; - } - Ordering::Less => {} - } - if !self.next() { - return SkipResult::End; - } - } - } - fn doc_freq(&self,) -> usize { self.doc_freq } diff --git a/src/postings/union_postings.rs b/src/postings/union_postings.rs index 964e86a4d..404fd0bb7 100644 --- a/src/postings/union_postings.rs +++ b/src/postings/union_postings.rs @@ -1,9 +1,8 @@ use DocId; use postings::{Postings, DocSet}; -use std::collections::BinaryHeap; -use postings::SkipResult; use std::cmp::Ordering; +use std::collections::BinaryHeap; use query::MultiTermScorer; use postings::ScoredDocSet; use query::Scorer; @@ -13,7 +12,7 @@ struct HeapItem(DocId, usize, u32); impl PartialOrd for HeapItem { fn partial_cmp(&self, other:&Self) -> Option { - (self.0, self.1).partial_cmp(&(other.0, other.1)).map(|o| o.reverse()) + Some(self.cmp(&other)) } } @@ -87,21 +86,9 @@ impl DocSet for UnionPostings { } } - fn skip_next(&mut self, target: DocId) -> SkipResult { - // TODO skip the underlying posting object. - loop { - match self.doc.cmp(&target) { - Ordering::Less => { - if !self.next() { - return SkipResult::End; - } - }, - Ordering::Equal => { return SkipResult::Reached }, - Ordering::Greater => { return SkipResult::OverStep }, - } - } - } - + + // TODO implement a faster skip_next + fn doc(&self,) -> DocId { self.doc } diff --git a/src/postings/writer.rs b/src/postings/writer.rs index e73ddfb12..c762130a6 100644 --- a/src/postings/writer.rs +++ b/src/postings/writer.rs @@ -94,7 +94,7 @@ impl PostingsWriter { } pub fn serialize(&self, serializer: &mut PostingsSerializer) -> io::Result<()> { - for (term, postings_id) in self.term_index.iter() { + for (term, postings_id) in &self.term_index { let term_postings_writer = &self.postings[postings_id.clone()]; let term_docfreq = term_postings_writer.doc_freq(); try!(serializer.new_term(&term, term_docfreq)); diff --git a/src/schema/field.rs b/src/schema/field.rs index d67b30bda..0c991f187 100644 --- a/src/schema/field.rs +++ b/src/schema/field.rs @@ -3,8 +3,6 @@ use std::io::Write; use std::io::Read; use common::BinarySerializable; -// TODO impl Copy trait - #[derive(Copy,Clone,Debug,PartialEq,PartialOrd,Eq,Hash)] pub struct Field(pub u8); diff --git a/src/store/mod.rs b/src/store/mod.rs index 5bb0e9b10..64ad695e8 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -56,7 +56,7 @@ mod tests { let store_source = directory.open_read(&path).unwrap(); let store = StoreReader::new(store_source); for i in (0..10).map(|i| i * 3 / 2) { - assert_eq!(*store.get(&i).unwrap().get_first(field_title).unwrap().text(), format!("Doc {}", i)); + assert_eq!(*store.get(i).unwrap().get_first(field_title).unwrap().text(), format!("Doc {}", i)); } } @@ -78,7 +78,7 @@ mod tests { let store_source = directory.open_read(&path).unwrap(); let store = StoreReader::new(store_source); b.iter(|| { - store.get(&12).unwrap(); + store.get(12).unwrap(); }); } diff --git a/src/store/reader.rs b/src/store/reader.rs index 99c29ba76..d8567f839 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -35,14 +35,14 @@ impl StoreReader { offsets } - fn block_offset(&self, seek: &DocId) -> OffsetIndex { - fn search(offsets: &[OffsetIndex], seek: &DocId) -> OffsetIndex { + fn block_offset(&self, seek: DocId) -> OffsetIndex { + fn search(offsets: &[OffsetIndex], seek: DocId) -> OffsetIndex { let m = offsets.len() / 2; let pivot_offset = &offsets[m]; if offsets.len() <= 1 { return pivot_offset.clone() } - match pivot_offset.0.cmp(seek) { + match pivot_offset.0.cmp(&seek) { Ordering::Less => search(&offsets[m..], seek), Ordering::Equal => pivot_offset.clone(), Ordering::Greater => search(&offsets[..m], seek), @@ -62,12 +62,12 @@ impl StoreReader { lz4_decoder.read_to_end(&mut current_block_mut).map(|_| ()) } - pub fn get(&self, doc_id: &DocId) -> io::Result { + pub fn get(&self, doc_id: DocId) -> io::Result { let OffsetIndex(first_doc_id, block_offset) = self.block_offset(doc_id); try!(self.read_block(block_offset as usize)); let mut current_block_mut = self.current_block.borrow_mut(); let mut cursor = Cursor::new(&mut current_block_mut[..]); - for _ in first_doc_id..*doc_id { + for _ in first_doc_id..doc_id { let block_length = try!(u32::deserialize(&mut cursor)); try!(cursor.seek(SeekFrom::Current(block_length as i64))); } diff --git a/src/store/writer.rs b/src/store/writer.rs index 64073fa86..5a968c2bd 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -52,7 +52,7 @@ impl StoreWriter { match reader.offsets.last() { Some(&OffsetIndex(ref num_docs, ref body_size)) => { try!(self.writer.write_all(&reader.data.as_slice()[0..*body_size as usize])); - for &OffsetIndex(doc, offset) in reader.offsets.iter() { + for &OffsetIndex(doc, offset) in &reader.offsets { self.offsets.push(OffsetIndex(self.doc + doc, self.written + offset)); } self.written += *body_size; @@ -68,7 +68,7 @@ impl StoreWriter { pub fn store<'a>(&mut self, field_values: &Vec<&'a FieldValue>) -> io::Result<()> { self.intermediary_buffer.clear(); try!((field_values.len() as u32).serialize(&mut self.intermediary_buffer)); - for field_value in field_values.iter() { + for field_value in field_values { try!((*field_value).serialize(&mut self.intermediary_buffer)); } try!((self.intermediary_buffer.len() as u32).serialize(&mut self.current_block));