From 4152de6c0d01f898f57ba35dfcf2b3cf7fccb774 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 24 Feb 2016 08:57:39 +0900 Subject: [PATCH] bugfix --- src/core/codec.rs | 9 +++--- src/core/mod.rs | 1 - src/core/reader.rs | 6 ++-- src/core/simdcompression.rs | 18 ++++++----- src/core/store.rs | 5 ++- tests/core.rs | 62 ++++++++++++++++++++++++++----------- 6 files changed, 63 insertions(+), 38 deletions(-) diff --git a/src/core/codec.rs b/src/core/codec.rs index 70f55b997..554ba5836 100644 --- a/src/core/codec.rs +++ b/src/core/codec.rs @@ -10,6 +10,7 @@ use core::schema::Term; use core::schema::DocId; use core::store::StoreWriter; use std::fs::File; +use core::serialize::BinarySerializable; use fst; use core::simdcompression; use std::convert::From; @@ -49,18 +50,16 @@ impl SegmentSerializer<()> for SimpleSegmentSerializer { self.term_fst_builder.insert(term.as_slice(), self.written_bytes_postings as u64); self.cur_term_num_docs = doc_freq; // writing the size of the posting list - try!(self.postings_write.write_u32::(doc_freq)); - self.written_bytes_postings += 4; + self.written_bytes_postings += try!((doc_freq as u32).serialize(&mut self.postings_write)); Ok(()) } fn write_docs(&mut self, doc_ids: &[DocId]) -> Result<(), IOError> { // TODO write_all transmuted [u8] let docs_data = self.encoder.encode(doc_ids); - try!(self.postings_write.write_u32::(docs_data.len() as u32)); - self.written_bytes_postings += 4; + self.written_bytes_postings += try!((docs_data.len() as u32).serialize(&mut self.postings_write)); for num in docs_data { - try!(self.postings_write.write_u32::(num.clone() as u32)); + self.written_bytes_postings += try!(num.serialize(&mut self.postings_write)); } Ok(()) } diff --git a/src/core/mod.rs b/src/core/mod.rs index f1303ccea..a337402b1 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -6,7 +6,6 @@ pub mod analyzer; pub mod serial; pub mod reader; pub mod codec; -pub mod error; pub mod searcher; pub mod collector; pub mod skip; diff --git a/src/core/reader.rs b/src/core/reader.rs index a191e2557..d59bd067f 100644 --- a/src/core/reader.rs +++ b/src/core/reader.rs @@ -5,6 +5,7 @@ use core::schema::Document; use fst; use core::postings::IntersectionPostings; use byteorder::{BigEndian, ReadBytesExt}; +use core::serialize::BinarySerializable; use std::io::Cursor; use core::schema::DocId; use core::directory::SegmentComponent; @@ -39,14 +40,14 @@ impl SegmentPostings { pub fn from_data(data: &[u8]) -> SegmentPostings { let mut cursor = Cursor::new(data); - let doc_freq = cursor.read_u32::().unwrap() as usize; + let doc_freq: u32 = u32::deserialize(&mut cursor).unwrap(); let data_size = cursor.read_u32::().unwrap() as usize; // TODO remove allocs let mut data = Vec::with_capacity(data_size); for _ in 0..data_size { data.push(cursor.read_u32::().unwrap()); } - let mut doc_ids: Vec = (0..doc_freq as u32 ).collect(); + let mut doc_ids: Vec = (0..doc_freq as u32).collect(); let decoder = Decoder::new(); decoder.decode(&data, &mut doc_ids); SegmentPostings { @@ -135,7 +136,6 @@ impl SegmentReader { segment_postings.push(segment_posting); } None => { - println!("not found {:?}", term); segment_postings.clear(); segment_postings.push(SegmentPostings::empty()); break; diff --git a/src/core/simdcompression.rs b/src/core/simdcompression.rs index 7a93651c6..734a79ef1 100644 --- a/src/core/simdcompression.rs +++ b/src/core/simdcompression.rs @@ -70,13 +70,15 @@ impl Decoder { #[test] fn test_encode_big() { let mut encoder = Encoder::new(); - let input: Vec = (0..10000).into_iter().collect(); - let data = encoder.encode(&input); - assert_eq!(data.len(), 962); + let num_ints = 10000 as usize; + let expected_length = 1274; + let input: Vec = (0..num_ints as u32) + .map(|i| i * 7 / 2) + .into_iter().collect(); + let encoded_data = encoder.encode(&input); + assert_eq!(encoded_data.len(), expected_length); let decoder = Decoder::new(); - let mut data_output: Vec = (0..10000).collect(); - assert_eq!(10000, decoder.decode(&data[0..962], &mut data_output)); - for i in 0..10000 { - assert_eq!(data_output[i], input[i]) ; - } + let mut decoded_data: Vec = (0..num_ints as u32).collect(); + assert_eq!(num_ints, decoder.decode(&encoded_data[..], &mut decoded_data)); + assert_eq!(decoded_data, input); } diff --git a/src/core/store.rs b/src/core/store.rs index 7b04ca23a..844d99ada 100644 --- a/src/core/store.rs +++ b/src/core/store.rs @@ -4,7 +4,6 @@ use std::cell::RefCell; use core::schema::DocId; use core::schema::Document; use core::schema::FieldValue; -use core::error; use core::serialize::BinarySerializable; use std::io::Write; use std::io::Read; @@ -193,7 +192,7 @@ mod tests { let lorem = String::from("Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."); { let mut store_writer = StoreWriter::new(store_file.reopen().unwrap()); - for i in 0..10000 { + for i in 0..1000 { let mut fields: Vec = Vec::new(); { let field_value = FieldValue { @@ -219,7 +218,7 @@ mod tests { let store_mmap = MmapReadOnly::open(&store_file).unwrap(); let store = StoreReader::new(store_mmap); assert_eq!(offsets, store.offsets); - for i in 0..1000 { + for i in (0..10).map(|i| i * 3 / 2) { assert_eq!(*store.get(&i).get_one(&field_title).unwrap(), format!("Doc {}", i)); } } diff --git a/tests/core.rs b/tests/core.rs index 44ddaccc0..9a31011ba 100644 --- a/tests/core.rs +++ b/tests/core.rs @@ -12,36 +12,35 @@ use regex::Regex; -pub struct TestCollector { - docs: Vec, - current_segment: Option, +// only make sense for a single segment +struct TestCollector { + docs: Vec, } impl TestCollector { pub fn new() -> TestCollector { TestCollector { docs: Vec::new(), - current_segment: None, } } - pub fn docs(self,) -> Vec { + pub fn docs(self,) -> Vec { self.docs } } impl Collector for TestCollector { - fn set_segment(&mut self, segment: &SegmentReader) { - self.current_segment = Some(segment.id()); - } + fn set_segment(&mut self, segment: &SegmentReader) {} fn collect(&mut self, doc_id: DocId) { - self.docs.push(DocAddress(self.current_segment.clone().unwrap(), doc_id)); + self.docs.push(doc_id); } } + + #[test] fn test_indexing() { let mut schema = Schema::new(); @@ -111,16 +110,43 @@ fn test_searcher() { let segment = commit_result.unwrap(); } { + let searcher = Searcher::for_directory(directory); - let terms = vec!(Term::from_field_text(&text_field, "b"), Term::from_field_text(&text_field, "a"), ); - let mut collector = TestCollector::new(); - searcher.search(&terms, &mut collector); - let vals: Vec = collector - .docs() - .iter() - .map(|doc| doc.1) - .collect::>(); - assert_eq!(vals, [1, 2]); + let get_doc_ids = |terms: Vec| { + let mut collector = TestCollector::new(); + searcher.search(&terms, &mut collector); + collector.docs() + }; + { + assert_eq!( + get_doc_ids(vec!(Term::from_field_text(&text_field, "a"))), + vec!(1, 2)); + } + { + assert_eq!( + get_doc_ids(vec!(Term::from_field_text(&text_field, "af"))), + vec!(0)); + } + { + assert_eq!( + get_doc_ids(vec!(Term::from_field_text(&text_field, "b"))), + vec!(0, 1, 2)); + } + { + assert_eq!( + get_doc_ids(vec!(Term::from_field_text(&text_field, "c"))), + vec!(1, 2)); + } + { + assert_eq!( + get_doc_ids(vec!(Term::from_field_text(&text_field, "d"))), + vec!(2)); + } + { + assert_eq!( + get_doc_ids(vec!(Term::from_field_text(&text_field, "b"), Term::from_field_text(&text_field, "a"), )), + vec!(1, 2)); + } } }