From 45746e4175ecf5dd0366b1135b656daf212efa72 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 6 May 2016 16:46:03 +0900 Subject: [PATCH] test passing again --- TODO.md | 1 + cpp/encode.cpp | 2 +- src/common/vint.rs | 8 ++++---- src/compression/block128.rs | 4 ++-- src/compression/s4bp128.rs | 4 ++-- src/compression/vints.rs | 25 ++++++++++++++----------- src/core/merger.rs | 1 - src/datastruct/skip/mod.rs | 6 +++--- src/directory/ram_directory.rs | 7 ++----- src/postings/segment_postings.rs | 30 +++++++----------------------- src/postings/serializer.rs | 10 ++++++---- src/schema/text_field.rs | 1 - src/schema/u32_field.rs | 1 - 13 files changed, 42 insertions(+), 58 deletions(-) diff --git a/TODO.md b/TODO.md index a0dffa614..72b741276 100644 --- a/TODO.md +++ b/TODO.md @@ -1,3 +1,4 @@ +pass over offset from previous block error management split directory add doc values diff --git a/cpp/encode.cpp b/cpp/encode.cpp index 872805095..647699b07 100644 --- a/cpp/encode.cpp +++ b/cpp/encode.cpp @@ -49,7 +49,7 @@ extern "C" { size_t output_capacity = 128; const uint32_t* pointer_end = simd_pack.decodeArray(reinterpret_cast(compressed_data), compressed_size / 4, uncompressed, output_capacity); const uint8_t* pointer_end_u8 = reinterpret_cast(pointer_end); - return static_cast(pointer_end_u8 - compressed_data) * 4; + return static_cast(pointer_end_u8 - compressed_data); } diff --git a/src/common/vint.rs b/src/common/vint.rs index 5c3eeabe9..48e442818 100644 --- a/src/common/vint.rs +++ b/src/common/vint.rs @@ -18,15 +18,14 @@ impl BinarySerializable for VInt { let mut written: usize = 0; let mut buffer = [0u8; 10]; loop { - let mut next_byte: u8 = (remaining % 128u64) as u8; + let next_byte: u8 = (remaining % 128u64) as u8; remaining /= 128u64; if remaining == 0u64 { - buffer[written] = next_byte; + buffer[written] = next_byte | 128u8; written += 1; break; } else { - next_byte |= 128u8; buffer[written] = next_byte; written += 1; } @@ -43,7 +42,7 @@ impl BinarySerializable for VInt { match bytes.next() { Some(Ok(b)) => { result += ((b % 128u8) as u64) << shift; - if b & 128 == 0u8 { + if b & 128u8 != 0u8 { break; } shift += 7; @@ -56,3 +55,4 @@ impl BinarySerializable for VInt { Ok(VInt(result)) } } + diff --git a/src/compression/block128.rs b/src/compression/block128.rs index 6e93e5ae5..57321a792 100644 --- a/src/compression/block128.rs +++ b/src/compression/block128.rs @@ -130,7 +130,7 @@ mod tests { input[i as usize] = i * 7 / 2; } let mut encoded_vec: Vec = encoder.encode_sorted(&input).to_vec(); - assert_eq!(encoded_vec.len(), 21); + assert_eq!(encoded_vec.len(), 84); for i in 0u8..*num_extra_values as u8 { encoded_vec.push(i); } @@ -142,7 +142,7 @@ mod tests { assert_eq!(uncompressed_values[i], input[i]); } for i in 0..*num_extra_values { - assert_eq!(remaining_input[i], i as u32); + assert_eq!(remaining_input[i], i as u8); } } } diff --git a/src/compression/s4bp128.rs b/src/compression/s4bp128.rs index d17067a91..4a9a19f97 100644 --- a/src/compression/s4bp128.rs +++ b/src/compression/s4bp128.rs @@ -121,7 +121,7 @@ mod tests { fn test_encode_sorted_big() { let mut encoder = S4BP128Encoder::new(); let num_ints = 10_000 as usize; - let expected_length = 1_274; + let expected_length = 5_096; let input: Vec = (0..num_ints as u32) .map(|i| i * 7 / 2) .into_iter().collect(); @@ -137,7 +137,7 @@ mod tests { fn test_encode_unsorted_big() { let mut encoder = S4BP128Encoder::new(); let num_ints = 10_000 as usize; - let expected_length = 1_897; + let expected_length = 7_588; let input: Vec = (0..num_ints as u32) .map(|i| i * 7 % 37) .into_iter().collect(); diff --git a/src/compression/vints.rs b/src/compression/vints.rs index 797d98e61..c3a95965c 100644 --- a/src/compression/vints.rs +++ b/src/compression/vints.rs @@ -3,13 +3,13 @@ use std::ptr; use std::iter; extern { - fn encode_sorted_vint_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; - fn decode_sorted_vint_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; + fn encode_sorted_vint_native(data: *mut u32, num_els: size_t, output: *mut u8, output_capacity: size_t) -> size_t; + fn decode_sorted_vint_native(compressed_data: *const u8, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; } pub struct VIntsEncoder { input_buffer: Vec, - output_buffer: Vec, + output_buffer: Vec, } impl VIntsEncoder { @@ -17,11 +17,11 @@ impl VIntsEncoder { pub fn new() -> VIntsEncoder { VIntsEncoder { input_buffer: Vec::with_capacity(128), - output_buffer: iter::repeat(0u32).take(256).collect(), + output_buffer: iter::repeat(0u8).take(256 * 4).collect(), } } - pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] { + pub fn encode_sorted(&mut self, input: &[u32]) -> &[u8] { assert!(input.len() < 128); let input_len = input.len(); let written_size: usize; @@ -32,7 +32,7 @@ impl VIntsEncoder { self.input_buffer.as_mut_ptr(), input_len as size_t, self.output_buffer.as_mut_ptr(), - 256, + 256 * 4, ); } return &self.output_buffer[0..written_size]; @@ -54,7 +54,7 @@ impl VIntsDecoder { } pub fn decode_sorted(&mut self, - compressed_data: &[u32]) -> &[u32] { + compressed_data: &[u8]) -> &[u32] { unsafe { let num_uncompressed = decode_sorted_vint_native( compressed_data.as_ptr(), @@ -76,7 +76,7 @@ mod tests { fn test_encode_vint() { { let mut encoder = VIntsEncoder::new(); - let expected_length = 31; + let expected_length = 124; let input: Vec = (0u32..123u32) .map(|i| i * 7 / 2) .into_iter() @@ -90,10 +90,13 @@ mod tests { } { let mut encoder = VIntsEncoder::new(); - let input = vec!(3, 17u32, 187); + let input = vec!(3u32, 17u32, 187u32); let encoded_data = encoder.encode_sorted(&input); - assert_eq!(encoded_data.len(), 1); - assert_eq!(encoded_data[0], 2167049859u32); + assert_eq!(encoded_data.len(), 4); + assert_eq!(encoded_data[0], 3u8 + 128u8); + assert_eq!(encoded_data[1], (17u8 - 3u8) + 128u8); + assert_eq!(encoded_data[2], (187u8 - 17u8 - 128u8)); + assert_eq!(encoded_data[3], (1u8 + 128u8)); } { let mut encoder = VIntsEncoder::new(); diff --git a/src/core/merger.rs b/src/core/merger.rs index 8f4c4e77d..a98ad7505 100644 --- a/src/core/merger.rs +++ b/src/core/merger.rs @@ -19,7 +19,6 @@ use core::index::SegmentInfo; use std::cmp::{min, max, Ordering}; struct PostingsMerger<'a> { - // doc_ids: Vec, doc_offsets: Vec, heap: BinaryHeap, term_streams: Vec>, diff --git a/src/datastruct/skip/mod.rs b/src/datastruct/skip/mod.rs index ce5c71933..f81abc5cd 100644 --- a/src/datastruct/skip/mod.rs +++ b/src/datastruct/skip/mod.rs @@ -129,7 +129,7 @@ mod tests { skip_list_builder.insert(2, &3).unwrap(); skip_list_builder.write::>(&mut output).unwrap(); assert_eq!(output.len(), 13); - assert_eq!(output[0], 1); + assert_eq!(output[0], 1u8 + 128u8); } #[test] @@ -141,7 +141,7 @@ mod tests { } skip_list_builder.write::>(&mut output).unwrap(); assert_eq!(output.len(), 117); - assert_eq!(output[0], 3); + assert_eq!(output[0], 3u8 + 128u8); } #[test] @@ -154,7 +154,7 @@ mod tests { } skip_list_builder.write::>(&mut output).unwrap(); assert_eq!(output.len(), 81); - assert_eq!(output[0], 3); + assert_eq!(output[0], 128u8 + 3u8); } } diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index a467f27ce..acb769e11 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -1,7 +1,6 @@ use directory::{Directory, ReadOnlySource}; use std::io::{Cursor, Write, Seek, SeekFrom}; use std::io; -use atomicwrites; use std::fmt; use std::sync::{Arc, RwLock}; use std::collections::HashMap; @@ -71,10 +70,8 @@ impl Directory for RAMDirectory { } fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> { - let meta_file = atomicwrites::AtomicFile::new(PathBuf::from(path), atomicwrites::AllowOverwrite); - meta_file.write(|f| { - f.write_all(data) - }) + let mut write = try!(self.open_write(path)); + write.write_all(data) } fn sync(&self, _: &Path) -> io::Result<()> { diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index c44e151cb..e351d2c17 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -2,9 +2,11 @@ use postings::Postings; use compression::{NUM_DOCS_PER_BLOCK, Block128Decoder}; use DocId; use std::cmp::Ordering; -use std::mem; use postings::SkipResult; +use std::io::Cursor; +use common::VInt; use std::num::Wrapping; +use common::BinarySerializable; // No Term Frequency, no postings. pub struct SegmentPostings<'a> { @@ -32,6 +34,10 @@ impl<'a> SegmentPostings<'a> { self.remaining_data = self.block_decoder.decode_sorted(self.remaining_data); } else { + let mut cursor = Cursor::new(self.remaining_data); + let remaining_len: usize = VInt::deserialize(&mut cursor).unwrap().0 as usize; + let position = cursor.position() as usize; + self.remaining_data = &self.remaining_data[position..position+remaining_len]; self.block_decoder.decode_sorted_remaining(self.remaining_data); } } @@ -43,28 +49,6 @@ impl<'a> SegmentPostings<'a> { remaining_data: data, cur: Wrapping(usize::max_value()), } - // let mut data_u32: &[u32] = unsafe { mem::transmute(data) }; - // let mut doc_ids: Vec = Vec::with_capacity(doc_freq as usize); - // { - // let mut block_decoder = Block128Decoder::new(); - // let num_blocks = doc_freq / (NUM_DOCS_PER_BLOCK as u32); - // for _ in 0..num_blocks { - // let (remaining = block_decoder.decode_sorted(data_u32); - // doc_ids.extend_from_slice(uncompressed); - // data_u32 = remaining; - // } - // if doc_freq % 128 != 0 { - // let data_u8: &[u8] = unsafe { mem::transmute(data_u32) }; - // let mut cursor = Cursor::new(data_u8); - // let vint_len: usize = VInt::deserialize(&mut cursor).unwrap().val() as usize; - // let cursor_pos = cursor.position() as usize; - // let vint_data: &[u32] = unsafe { mem::transmute(&data_u8[cursor_pos..]) }; - // let mut vints_decoder = VIntsDecoder::new(); - // doc_ids.extend_from_slice(vints_decoder.decode_sorted(&vint_data[..vint_len])); - // } - // } - // SegmentPostings(doc_ids) - } } diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index c7cb1b0fd..630d72fed 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -69,6 +69,7 @@ impl PostingsSerializer { { let block_encoded = self.vints_encoder.encode_sorted(&self.doc_ids[..]); self.written_bytes_postings += try!(VInt(block_encoded.len() as u64).serialize(&mut self.postings_write)); + for num in block_encoded { self.written_bytes_postings += try!(num.serialize(&mut self.postings_write)); } @@ -84,7 +85,8 @@ impl PostingsSerializer { } if self.is_positions_enabled { let positions_encoded: &[u8] = self.positions_encoder.encode(&self.position_deltas[..]); - self.positions_write.write_all(positions_encoded); + self.written_bytes_positions += try!(VInt(positions_encoded.len() as u64).serialize(&mut self.positions_write)); + try!(self.positions_write.write_all(positions_encoded)); self.written_bytes_positions += positions_encoded.len(); self.position_deltas.clear(); } @@ -106,17 +108,17 @@ impl PostingsSerializer { { // encode the positions let block_encoded: &[u8] = self.block_encoder.encode_sorted(&self.doc_ids); - self.postings_write.write_all(block_encoded); + try!(self.postings_write.write_all(block_encoded)); self.written_bytes_postings += block_encoded.len(); } if self.is_termfreq_enabled { // encode the term_freqs let block_encoded: &[u8] = self.block_encoder.encode_sorted(&self.term_freqs); - self.postings_write.write_all(block_encoded); + try!(self.postings_write.write_all(block_encoded)); self.written_bytes_postings += block_encoded.len(); if self.is_positions_enabled { let positions_encoded: &[u8] = self.positions_encoder.encode(&self.position_deltas[..]); - self.positions_write.write_all(positions_encoded); + try!(self.positions_write.write_all(positions_encoded)); self.written_bytes_positions += positions_encoded.len(); self.position_deltas.clear(); } diff --git a/src/schema/text_field.rs b/src/schema/text_field.rs index 8915b7648..ecd8340a0 100644 --- a/src/schema/text_field.rs +++ b/src/schema/text_field.rs @@ -3,7 +3,6 @@ use std::io; use std::io::Read; use common::BinarySerializable; -use rustc_serialize::Encodable; use rustc_serialize::Decoder; use rustc_serialize::Encoder; use std::ops::BitOr; diff --git a/src/schema/u32_field.rs b/src/schema/u32_field.rs index 06a214bb2..11164bc24 100644 --- a/src/schema/u32_field.rs +++ b/src/schema/u32_field.rs @@ -3,7 +3,6 @@ use std::io::Write; use std::io::Read; use common::BinarySerializable; -use rustc_serialize::Encodable; use rustc_serialize::Decoder; use rustc_serialize::Encoder;