test passing again

This commit is contained in:
Paul Masurel
2016-05-06 16:46:03 +09:00
parent e66b70051d
commit 45746e4175
13 changed files with 42 additions and 58 deletions

View File

@@ -1,3 +1,4 @@
pass over offset from previous block
error management
split directory
add doc values

View File

@@ -49,7 +49,7 @@ extern "C" {
size_t output_capacity = 128;
const uint32_t* pointer_end = simd_pack.decodeArray(reinterpret_cast<const uint32_t*>(compressed_data), compressed_size / 4, uncompressed, output_capacity);
const uint8_t* pointer_end_u8 = reinterpret_cast<const uint8_t*>(pointer_end);
return static_cast<size_t>(pointer_end_u8 - compressed_data) * 4;
return static_cast<size_t>(pointer_end_u8 - compressed_data);
}

View File

@@ -18,15 +18,14 @@ impl BinarySerializable for VInt {
let mut written: usize = 0;
let mut buffer = [0u8; 10];
loop {
let mut next_byte: u8 = (remaining % 128u64) as u8;
let next_byte: u8 = (remaining % 128u64) as u8;
remaining /= 128u64;
if remaining == 0u64 {
buffer[written] = next_byte;
buffer[written] = next_byte | 128u8;
written += 1;
break;
}
else {
next_byte |= 128u8;
buffer[written] = next_byte;
written += 1;
}
@@ -43,7 +42,7 @@ impl BinarySerializable for VInt {
match bytes.next() {
Some(Ok(b)) => {
result += ((b % 128u8) as u64) << shift;
if b & 128 == 0u8 {
if b & 128u8 != 0u8 {
break;
}
shift += 7;
@@ -56,3 +55,4 @@ impl BinarySerializable for VInt {
Ok(VInt(result))
}
}

View File

@@ -130,7 +130,7 @@ mod tests {
input[i as usize] = i * 7 / 2;
}
let mut encoded_vec: Vec<u8> = encoder.encode_sorted(&input).to_vec();
assert_eq!(encoded_vec.len(), 21);
assert_eq!(encoded_vec.len(), 84);
for i in 0u8..*num_extra_values as u8 {
encoded_vec.push(i);
}
@@ -142,7 +142,7 @@ mod tests {
assert_eq!(uncompressed_values[i], input[i]);
}
for i in 0..*num_extra_values {
assert_eq!(remaining_input[i], i as u32);
assert_eq!(remaining_input[i], i as u8);
}
}
}

View File

@@ -121,7 +121,7 @@ mod tests {
fn test_encode_sorted_big() {
let mut encoder = S4BP128Encoder::new();
let num_ints = 10_000 as usize;
let expected_length = 1_274;
let expected_length = 5_096;
let input: Vec<u32> = (0..num_ints as u32)
.map(|i| i * 7 / 2)
.into_iter().collect();
@@ -137,7 +137,7 @@ mod tests {
fn test_encode_unsorted_big() {
let mut encoder = S4BP128Encoder::new();
let num_ints = 10_000 as usize;
let expected_length = 1_897;
let expected_length = 7_588;
let input: Vec<u32> = (0..num_ints as u32)
.map(|i| i * 7 % 37)
.into_iter().collect();

View File

@@ -3,13 +3,13 @@ use std::ptr;
use std::iter;
extern {
fn encode_sorted_vint_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t;
fn decode_sorted_vint_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
fn encode_sorted_vint_native(data: *mut u32, num_els: size_t, output: *mut u8, output_capacity: size_t) -> size_t;
fn decode_sorted_vint_native(compressed_data: *const u8, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
}
pub struct VIntsEncoder {
input_buffer: Vec<u32>,
output_buffer: Vec<u32>,
output_buffer: Vec<u8>,
}
impl VIntsEncoder {
@@ -17,11 +17,11 @@ impl VIntsEncoder {
pub fn new() -> VIntsEncoder {
VIntsEncoder {
input_buffer: Vec::with_capacity(128),
output_buffer: iter::repeat(0u32).take(256).collect(),
output_buffer: iter::repeat(0u8).take(256 * 4).collect(),
}
}
pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] {
pub fn encode_sorted(&mut self, input: &[u32]) -> &[u8] {
assert!(input.len() < 128);
let input_len = input.len();
let written_size: usize;
@@ -32,7 +32,7 @@ impl VIntsEncoder {
self.input_buffer.as_mut_ptr(),
input_len as size_t,
self.output_buffer.as_mut_ptr(),
256,
256 * 4,
);
}
return &self.output_buffer[0..written_size];
@@ -54,7 +54,7 @@ impl VIntsDecoder {
}
pub fn decode_sorted(&mut self,
compressed_data: &[u32]) -> &[u32] {
compressed_data: &[u8]) -> &[u32] {
unsafe {
let num_uncompressed = decode_sorted_vint_native(
compressed_data.as_ptr(),
@@ -76,7 +76,7 @@ mod tests {
fn test_encode_vint() {
{
let mut encoder = VIntsEncoder::new();
let expected_length = 31;
let expected_length = 124;
let input: Vec<u32> = (0u32..123u32)
.map(|i| i * 7 / 2)
.into_iter()
@@ -90,10 +90,13 @@ mod tests {
}
{
let mut encoder = VIntsEncoder::new();
let input = vec!(3, 17u32, 187);
let input = vec!(3u32, 17u32, 187u32);
let encoded_data = encoder.encode_sorted(&input);
assert_eq!(encoded_data.len(), 1);
assert_eq!(encoded_data[0], 2167049859u32);
assert_eq!(encoded_data.len(), 4);
assert_eq!(encoded_data[0], 3u8 + 128u8);
assert_eq!(encoded_data[1], (17u8 - 3u8) + 128u8);
assert_eq!(encoded_data[2], (187u8 - 17u8 - 128u8));
assert_eq!(encoded_data[3], (1u8 + 128u8));
}
{
let mut encoder = VIntsEncoder::new();

View File

@@ -19,7 +19,6 @@ use core::index::SegmentInfo;
use std::cmp::{min, max, Ordering};
struct PostingsMerger<'a> {
// doc_ids: Vec<DocId>,
doc_offsets: Vec<DocId>,
heap: BinaryHeap<HeapItem>,
term_streams: Vec<FstMapIter<'a, TermInfo>>,

View File

@@ -129,7 +129,7 @@ mod tests {
skip_list_builder.insert(2, &3).unwrap();
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
assert_eq!(output.len(), 13);
assert_eq!(output[0], 1);
assert_eq!(output[0], 1u8 + 128u8);
}
#[test]
@@ -141,7 +141,7 @@ mod tests {
}
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
assert_eq!(output.len(), 117);
assert_eq!(output[0], 3);
assert_eq!(output[0], 3u8 + 128u8);
}
#[test]
@@ -154,7 +154,7 @@ mod tests {
}
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
assert_eq!(output.len(), 81);
assert_eq!(output[0], 3);
assert_eq!(output[0], 128u8 + 3u8);
}
}

View File

@@ -1,7 +1,6 @@
use directory::{Directory, ReadOnlySource};
use std::io::{Cursor, Write, Seek, SeekFrom};
use std::io;
use atomicwrites;
use std::fmt;
use std::sync::{Arc, RwLock};
use std::collections::HashMap;
@@ -71,10 +70,8 @@ impl Directory for RAMDirectory {
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
let meta_file = atomicwrites::AtomicFile::new(PathBuf::from(path), atomicwrites::AllowOverwrite);
meta_file.write(|f| {
f.write_all(data)
})
let mut write = try!(self.open_write(path));
write.write_all(data)
}
fn sync(&self, _: &Path) -> io::Result<()> {

View File

@@ -2,9 +2,11 @@ use postings::Postings;
use compression::{NUM_DOCS_PER_BLOCK, Block128Decoder};
use DocId;
use std::cmp::Ordering;
use std::mem;
use postings::SkipResult;
use std::io::Cursor;
use common::VInt;
use std::num::Wrapping;
use common::BinarySerializable;
// No Term Frequency, no postings.
pub struct SegmentPostings<'a> {
@@ -32,6 +34,10 @@ impl<'a> SegmentPostings<'a> {
self.remaining_data = self.block_decoder.decode_sorted(self.remaining_data);
}
else {
let mut cursor = Cursor::new(self.remaining_data);
let remaining_len: usize = VInt::deserialize(&mut cursor).unwrap().0 as usize;
let position = cursor.position() as usize;
self.remaining_data = &self.remaining_data[position..position+remaining_len];
self.block_decoder.decode_sorted_remaining(self.remaining_data);
}
}
@@ -43,28 +49,6 @@ impl<'a> SegmentPostings<'a> {
remaining_data: data,
cur: Wrapping(usize::max_value()),
}
// let mut data_u32: &[u32] = unsafe { mem::transmute(data) };
// let mut doc_ids: Vec<u32> = Vec::with_capacity(doc_freq as usize);
// {
// let mut block_decoder = Block128Decoder::new();
// let num_blocks = doc_freq / (NUM_DOCS_PER_BLOCK as u32);
// for _ in 0..num_blocks {
// let (remaining = block_decoder.decode_sorted(data_u32);
// doc_ids.extend_from_slice(uncompressed);
// data_u32 = remaining;
// }
// if doc_freq % 128 != 0 {
// let data_u8: &[u8] = unsafe { mem::transmute(data_u32) };
// let mut cursor = Cursor::new(data_u8);
// let vint_len: usize = VInt::deserialize(&mut cursor).unwrap().val() as usize;
// let cursor_pos = cursor.position() as usize;
// let vint_data: &[u32] = unsafe { mem::transmute(&data_u8[cursor_pos..]) };
// let mut vints_decoder = VIntsDecoder::new();
// doc_ids.extend_from_slice(vints_decoder.decode_sorted(&vint_data[..vint_len]));
// }
// }
// SegmentPostings(doc_ids)
}
}

View File

@@ -69,6 +69,7 @@ impl PostingsSerializer {
{
let block_encoded = self.vints_encoder.encode_sorted(&self.doc_ids[..]);
self.written_bytes_postings += try!(VInt(block_encoded.len() as u64).serialize(&mut self.postings_write));
for num in block_encoded {
self.written_bytes_postings += try!(num.serialize(&mut self.postings_write));
}
@@ -84,7 +85,8 @@ impl PostingsSerializer {
}
if self.is_positions_enabled {
let positions_encoded: &[u8] = self.positions_encoder.encode(&self.position_deltas[..]);
self.positions_write.write_all(positions_encoded);
self.written_bytes_positions += try!(VInt(positions_encoded.len() as u64).serialize(&mut self.positions_write));
try!(self.positions_write.write_all(positions_encoded));
self.written_bytes_positions += positions_encoded.len();
self.position_deltas.clear();
}
@@ -106,17 +108,17 @@ impl PostingsSerializer {
{
// encode the doc ids
let block_encoded: &[u8] = self.block_encoder.encode_sorted(&self.doc_ids);
self.postings_write.write_all(block_encoded);
try!(self.postings_write.write_all(block_encoded));
self.written_bytes_postings += block_encoded.len();
}
if self.is_termfreq_enabled {
// encode the term_freqs
let block_encoded: &[u8] = self.block_encoder.encode_sorted(&self.term_freqs);
self.postings_write.write_all(block_encoded);
try!(self.postings_write.write_all(block_encoded));
self.written_bytes_postings += block_encoded.len();
if self.is_positions_enabled {
let positions_encoded: &[u8] = self.positions_encoder.encode(&self.position_deltas[..]);
self.positions_write.write_all(positions_encoded);
try!(self.positions_write.write_all(positions_encoded));
self.written_bytes_positions += positions_encoded.len();
self.position_deltas.clear();
}

View File

@@ -3,7 +3,6 @@ use std::io;
use std::io::Read;
use common::BinarySerializable;
use rustc_serialize::Encodable;
use rustc_serialize::Decoder;
use rustc_serialize::Encoder;
use std::ops::BitOr;

View File

@@ -3,7 +3,6 @@ use std::io::Write;
use std::io::Read;
use common::BinarySerializable;
use rustc_serialize::Encodable;
use rustc_serialize::Decoder;
use rustc_serialize::Encoder;