From 5d3b00a7b7143dbb713eaf6bd9ec15575f70f3be Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 18 Feb 2016 22:11:56 +0900 Subject: [PATCH 1/2] Removed println! --- src/core/reader.rs | 4 ---- src/core/writer.rs | 1 - 2 files changed, 5 deletions(-) diff --git a/src/core/reader.rs b/src/core/reader.rs index db0c11f78..7c6cae8a5 100644 --- a/src/core/reader.rs +++ b/src/core/reader.rs @@ -37,7 +37,6 @@ impl SegmentPostings { pub fn from_data(data: &[u8]) -> SegmentPostings { let mut cursor = Cursor::new(data); let doc_freq = cursor.read_u32::().unwrap() as usize; - println!("doc_freq {}", doc_freq); let data_size = cursor.read_u32::().unwrap() as usize; // TODO remove allocs let mut data = Vec::with_capacity(data_size); @@ -47,9 +46,6 @@ impl SegmentPostings { let mut doc_ids: Vec = (0..doc_freq as u32 ).collect(); let decoder = Decoder::new(); decoder.decode(&data, &mut doc_ids); - for a in doc_ids.iter() { - println!("uncompressed {}", a); - } SegmentPostings { doc_ids: doc_ids, doc_id: 0, diff --git a/src/core/writer.rs b/src/core/writer.rs index 9f613b757..23b254eb2 100644 --- a/src/core/writer.rs +++ b/src/core/writer.rs @@ -115,7 +115,6 @@ impl SegmentWriter { let mut tokens = self.tokenizer.tokenize(&field_value.text); while tokens.read_one(&mut term_buffer) { let term = Term::from_field_text(&field_value.field, term_buffer.as_ref()); - println!("token {:?}", term); self.suscribe(doc_id, term); self.num_tokens += 1; } From ee1141f3da6244a6a732a5e7cd57be59913a38c8 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 19 Feb 2016 10:37:49 +0900 Subject: [PATCH 2/2] bugfix and lowercase --- cpp/encode.cpp | 15 --------------- src/core/analyzer.rs | 13 +++++++++++-- src/core/reader.rs | 19 +++++++++++++------ src/core/searcher.rs | 19 +++++++++++++++---- src/core/simdcompression.rs | 23 +++++++++++++---------- 5 files changed, 52 insertions(+), 37 deletions(-) diff --git a/cpp/encode.cpp b/cpp/encode.cpp index 462f81a38..933ab51d2 100644 --- a/cpp/encode.cpp +++ b/cpp/encode.cpp @@ -1,7 +1,3 @@ - - -// /usr/bin/c++ -Wall -Wcast-align -O3 -DNDEBUG -std=c++11 -DHAVE_CXX0X -msse4.1 -march=native -isysroot /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk -I/Users/pmasurel/github/FastPFor/headers -o CMakeFiles/example.dir/example.cpp.o -c /Users/pmasurel/github/FastPFor/example.cpp - #include #include @@ -15,10 +11,6 @@ static shared_ptr codec = CODECFactory::getFromName("s4-bp128-dm" extern "C" { - - - - size_t encode_native( uint32_t* begin, const size_t num_els, @@ -29,13 +21,6 @@ extern "C" { num_els, output, output_length); - { - size_t num_ints = output_length; - uint32_t* uncompressed = new uint32_t[100]; - codec -> decodeArray(output, output_length, uncompressed, num_ints); - delete uncompressed; - } - return output_length; } diff --git a/src/core/analyzer.rs b/src/core/analyzer.rs index b7977b75a..75f6460cb 100644 --- a/src/core/analyzer.rs +++ b/src/core/analyzer.rs @@ -12,14 +12,23 @@ pub struct TokenIter<'a> { chars: Chars<'a>, } + +fn append_char(c: char, term_buffer: &mut String) { + for c_lower in c.to_lowercase() { + term_buffer.push(c_lower); + } +} + impl<'a> TokenIter<'a> { + + pub fn read_one(&mut self, term_buffer: &mut String) -> bool { term_buffer.clear(); loop { match self.chars.next() { Some(c) => { if c.is_alphanumeric() { - term_buffer.push(c); + append_char(c, term_buffer); break; } else { @@ -35,7 +44,7 @@ impl<'a> TokenIter<'a> { match self.chars.next() { Some(c) => { if c.is_alphanumeric() { - term_buffer.push(c); + append_char(c, term_buffer); } else { break; diff --git a/src/core/reader.rs b/src/core/reader.rs index 7c6cae8a5..693e0c74d 100644 --- a/src/core/reader.rs +++ b/src/core/reader.rs @@ -121,10 +121,8 @@ impl SegmentReader { } pub fn get_term<'a>(&'a self, term: &Term) -> Option { - println!("Term {:?}", term); match self.term_offsets.get(term.as_slice()) { Some(offset) => { - println!("offset {}", offset); Some(self.read_postings(offset as usize)) }, None => None, @@ -132,10 +130,19 @@ impl SegmentReader { } pub fn search(&self, terms: &Vec) -> IntersectionPostings { - let segment_postings: Vec = terms - .iter() - .map(|term| self.get_term(term).unwrap()) - .collect(); + + let mut segment_postings: Vec = Vec::new(); + for term in terms.iter() { + match self.get_term(term) { + Some(segment_posting) => { + segment_postings.push(segment_posting); + } + None => { + segment_postings.clear(); + break; + } + } + } IntersectionPostings::from_postings(segment_postings) } diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 39cf105c2..5390fb746 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -10,11 +10,22 @@ pub struct Searcher { impl Searcher { pub fn for_directory(directory: Directory) -> Searcher { + let mut segment_readers: Vec = Vec::new(); + for segment in directory.segments().into_iter() { + println!("{:?}", segment); + match SegmentReader::open(segment.clone()) { + Ok(segment_reader) => { + segment_readers.push(segment_reader); + println!("opened {:?}", segment); + } + Err(err) => { + // TODO return err + println!("Error while opening {:?}, {:?}", segment, err); + } + } + } Searcher { - segments: directory.segments() - .into_iter() - .map(|segment| SegmentReader::open(segment).unwrap() ) // TODO error handling - .collect() + segments: segment_readers } } } diff --git a/src/core/simdcompression.rs b/src/core/simdcompression.rs index b09dcbe38..7186323a4 100644 --- a/src/core/simdcompression.rs +++ b/src/core/simdcompression.rs @@ -24,16 +24,17 @@ impl Encoder { } pub fn encode(&mut self, input: &[u32]) -> &[u32] { - unsafe { + self.input_buffer.clear(); let input_len = input.len(); - if input_len > self.input_buffer.len() { - self.input_buffer = (0..input_len as u32 + 10 ).collect(); - self.output_buffer = (0..input_len as u32 + 10).collect(); + if input_len >= self.input_buffer.len() { + self.input_buffer = (0..input_len as u32).collect(); + self.output_buffer = (0..input_len as u32 + 1000).collect(); // TODO use resize when available } - ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len); // TODO use clone_from when available + unsafe { + ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len); let written_size = encode_native( self.input_buffer.as_mut_ptr(), input_len as size_t, @@ -70,16 +71,18 @@ impl Decoder { } + + #[test] -fn test_encode_decode() { +fn test_encode_big() { let mut encoder = Encoder::new(); - let input: Vec = vec!(2,3,5,7,11,13,17,19,23); + let input: Vec = (0..100000).into_iter().collect(); let data = encoder.encode(&input); assert_eq!(data.len(), 4); let decoder = Decoder::new(); - let mut data_output: Vec = (0..100).collect(); - assert_eq!(9, decoder.decode(&data[0..4], &mut data_output)); - for i in 0..9 { + let mut data_output: Vec = (0..10000).collect(); + assert_eq!(10000, decoder.decode(&data[0..4], &mut data_output)); + for i in 0..10000 { assert_eq!(data_output[i], input[i]) ; } }