mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-14 21:12:54 +00:00
test passing again
This commit is contained in:
1
TODO.md
1
TODO.md
@@ -1,3 +1,4 @@
|
||||
pass over offset from previous block
|
||||
error management
|
||||
split directory
|
||||
add doc values
|
||||
|
||||
@@ -49,7 +49,7 @@ extern "C" {
|
||||
size_t output_capacity = 128;
|
||||
const uint32_t* pointer_end = simd_pack.decodeArray(reinterpret_cast<const uint32_t*>(compressed_data), compressed_size / 4, uncompressed, output_capacity);
|
||||
const uint8_t* pointer_end_u8 = reinterpret_cast<const uint8_t*>(pointer_end);
|
||||
return static_cast<size_t>(pointer_end_u8 - compressed_data) * 4;
|
||||
return static_cast<size_t>(pointer_end_u8 - compressed_data);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -18,15 +18,14 @@ impl BinarySerializable for VInt {
|
||||
let mut written: usize = 0;
|
||||
let mut buffer = [0u8; 10];
|
||||
loop {
|
||||
let mut next_byte: u8 = (remaining % 128u64) as u8;
|
||||
let next_byte: u8 = (remaining % 128u64) as u8;
|
||||
remaining /= 128u64;
|
||||
if remaining == 0u64 {
|
||||
buffer[written] = next_byte;
|
||||
buffer[written] = next_byte | 128u8;
|
||||
written += 1;
|
||||
break;
|
||||
}
|
||||
else {
|
||||
next_byte |= 128u8;
|
||||
buffer[written] = next_byte;
|
||||
written += 1;
|
||||
}
|
||||
@@ -43,7 +42,7 @@ impl BinarySerializable for VInt {
|
||||
match bytes.next() {
|
||||
Some(Ok(b)) => {
|
||||
result += ((b % 128u8) as u64) << shift;
|
||||
if b & 128 == 0u8 {
|
||||
if b & 128u8 != 0u8 {
|
||||
break;
|
||||
}
|
||||
shift += 7;
|
||||
@@ -56,3 +55,4 @@ impl BinarySerializable for VInt {
|
||||
Ok(VInt(result))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -130,7 +130,7 @@ mod tests {
|
||||
input[i as usize] = i * 7 / 2;
|
||||
}
|
||||
let mut encoded_vec: Vec<u8> = encoder.encode_sorted(&input).to_vec();
|
||||
assert_eq!(encoded_vec.len(), 21);
|
||||
assert_eq!(encoded_vec.len(), 84);
|
||||
for i in 0u8..*num_extra_values as u8 {
|
||||
encoded_vec.push(i);
|
||||
}
|
||||
@@ -142,7 +142,7 @@ mod tests {
|
||||
assert_eq!(uncompressed_values[i], input[i]);
|
||||
}
|
||||
for i in 0..*num_extra_values {
|
||||
assert_eq!(remaining_input[i], i as u32);
|
||||
assert_eq!(remaining_input[i], i as u8);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -121,7 +121,7 @@ mod tests {
|
||||
fn test_encode_sorted_big() {
|
||||
let mut encoder = S4BP128Encoder::new();
|
||||
let num_ints = 10_000 as usize;
|
||||
let expected_length = 1_274;
|
||||
let expected_length = 5_096;
|
||||
let input: Vec<u32> = (0..num_ints as u32)
|
||||
.map(|i| i * 7 / 2)
|
||||
.into_iter().collect();
|
||||
@@ -137,7 +137,7 @@ mod tests {
|
||||
fn test_encode_unsorted_big() {
|
||||
let mut encoder = S4BP128Encoder::new();
|
||||
let num_ints = 10_000 as usize;
|
||||
let expected_length = 1_897;
|
||||
let expected_length = 7_588;
|
||||
let input: Vec<u32> = (0..num_ints as u32)
|
||||
.map(|i| i * 7 % 37)
|
||||
.into_iter().collect();
|
||||
|
||||
@@ -3,13 +3,13 @@ use std::ptr;
|
||||
use std::iter;
|
||||
|
||||
extern {
|
||||
fn encode_sorted_vint_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t;
|
||||
fn decode_sorted_vint_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
|
||||
fn encode_sorted_vint_native(data: *mut u32, num_els: size_t, output: *mut u8, output_capacity: size_t) -> size_t;
|
||||
fn decode_sorted_vint_native(compressed_data: *const u8, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
|
||||
}
|
||||
|
||||
pub struct VIntsEncoder {
|
||||
input_buffer: Vec<u32>,
|
||||
output_buffer: Vec<u32>,
|
||||
output_buffer: Vec<u8>,
|
||||
}
|
||||
|
||||
impl VIntsEncoder {
|
||||
@@ -17,11 +17,11 @@ impl VIntsEncoder {
|
||||
pub fn new() -> VIntsEncoder {
|
||||
VIntsEncoder {
|
||||
input_buffer: Vec::with_capacity(128),
|
||||
output_buffer: iter::repeat(0u32).take(256).collect(),
|
||||
output_buffer: iter::repeat(0u8).take(256 * 4).collect(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] {
|
||||
pub fn encode_sorted(&mut self, input: &[u32]) -> &[u8] {
|
||||
assert!(input.len() < 128);
|
||||
let input_len = input.len();
|
||||
let written_size: usize;
|
||||
@@ -32,7 +32,7 @@ impl VIntsEncoder {
|
||||
self.input_buffer.as_mut_ptr(),
|
||||
input_len as size_t,
|
||||
self.output_buffer.as_mut_ptr(),
|
||||
256,
|
||||
256 * 4,
|
||||
);
|
||||
}
|
||||
return &self.output_buffer[0..written_size];
|
||||
@@ -54,7 +54,7 @@ impl VIntsDecoder {
|
||||
}
|
||||
|
||||
pub fn decode_sorted(&mut self,
|
||||
compressed_data: &[u32]) -> &[u32] {
|
||||
compressed_data: &[u8]) -> &[u32] {
|
||||
unsafe {
|
||||
let num_uncompressed = decode_sorted_vint_native(
|
||||
compressed_data.as_ptr(),
|
||||
@@ -76,7 +76,7 @@ mod tests {
|
||||
fn test_encode_vint() {
|
||||
{
|
||||
let mut encoder = VIntsEncoder::new();
|
||||
let expected_length = 31;
|
||||
let expected_length = 124;
|
||||
let input: Vec<u32> = (0u32..123u32)
|
||||
.map(|i| i * 7 / 2)
|
||||
.into_iter()
|
||||
@@ -90,10 +90,13 @@ mod tests {
|
||||
}
|
||||
{
|
||||
let mut encoder = VIntsEncoder::new();
|
||||
let input = vec!(3, 17u32, 187);
|
||||
let input = vec!(3u32, 17u32, 187u32);
|
||||
let encoded_data = encoder.encode_sorted(&input);
|
||||
assert_eq!(encoded_data.len(), 1);
|
||||
assert_eq!(encoded_data[0], 2167049859u32);
|
||||
assert_eq!(encoded_data.len(), 4);
|
||||
assert_eq!(encoded_data[0], 3u8 + 128u8);
|
||||
assert_eq!(encoded_data[1], (17u8 - 3u8) + 128u8);
|
||||
assert_eq!(encoded_data[2], (187u8 - 17u8 - 128u8));
|
||||
assert_eq!(encoded_data[3], (1u8 + 128u8));
|
||||
}
|
||||
{
|
||||
let mut encoder = VIntsEncoder::new();
|
||||
|
||||
@@ -19,7 +19,6 @@ use core::index::SegmentInfo;
|
||||
use std::cmp::{min, max, Ordering};
|
||||
|
||||
struct PostingsMerger<'a> {
|
||||
// doc_ids: Vec<DocId>,
|
||||
doc_offsets: Vec<DocId>,
|
||||
heap: BinaryHeap<HeapItem>,
|
||||
term_streams: Vec<FstMapIter<'a, TermInfo>>,
|
||||
|
||||
@@ -129,7 +129,7 @@ mod tests {
|
||||
skip_list_builder.insert(2, &3).unwrap();
|
||||
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
|
||||
assert_eq!(output.len(), 13);
|
||||
assert_eq!(output[0], 1);
|
||||
assert_eq!(output[0], 1u8 + 128u8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -141,7 +141,7 @@ mod tests {
|
||||
}
|
||||
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
|
||||
assert_eq!(output.len(), 117);
|
||||
assert_eq!(output[0], 3);
|
||||
assert_eq!(output[0], 3u8 + 128u8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -154,7 +154,7 @@ mod tests {
|
||||
}
|
||||
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
|
||||
assert_eq!(output.len(), 81);
|
||||
assert_eq!(output[0], 3);
|
||||
assert_eq!(output[0], 128u8 + 3u8);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use directory::{Directory, ReadOnlySource};
|
||||
use std::io::{Cursor, Write, Seek, SeekFrom};
|
||||
use std::io;
|
||||
use atomicwrites;
|
||||
use std::fmt;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::collections::HashMap;
|
||||
@@ -71,10 +70,8 @@ impl Directory for RAMDirectory {
|
||||
}
|
||||
|
||||
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
|
||||
let meta_file = atomicwrites::AtomicFile::new(PathBuf::from(path), atomicwrites::AllowOverwrite);
|
||||
meta_file.write(|f| {
|
||||
f.write_all(data)
|
||||
})
|
||||
let mut write = try!(self.open_write(path));
|
||||
write.write_all(data)
|
||||
}
|
||||
|
||||
fn sync(&self, _: &Path) -> io::Result<()> {
|
||||
|
||||
@@ -2,9 +2,11 @@ use postings::Postings;
|
||||
use compression::{NUM_DOCS_PER_BLOCK, Block128Decoder};
|
||||
use DocId;
|
||||
use std::cmp::Ordering;
|
||||
use std::mem;
|
||||
use postings::SkipResult;
|
||||
use std::io::Cursor;
|
||||
use common::VInt;
|
||||
use std::num::Wrapping;
|
||||
use common::BinarySerializable;
|
||||
|
||||
// No Term Frequency, no postings.
|
||||
pub struct SegmentPostings<'a> {
|
||||
@@ -32,6 +34,10 @@ impl<'a> SegmentPostings<'a> {
|
||||
self.remaining_data = self.block_decoder.decode_sorted(self.remaining_data);
|
||||
}
|
||||
else {
|
||||
let mut cursor = Cursor::new(self.remaining_data);
|
||||
let remaining_len: usize = VInt::deserialize(&mut cursor).unwrap().0 as usize;
|
||||
let position = cursor.position() as usize;
|
||||
self.remaining_data = &self.remaining_data[position..position+remaining_len];
|
||||
self.block_decoder.decode_sorted_remaining(self.remaining_data);
|
||||
}
|
||||
}
|
||||
@@ -43,28 +49,6 @@ impl<'a> SegmentPostings<'a> {
|
||||
remaining_data: data,
|
||||
cur: Wrapping(usize::max_value()),
|
||||
}
|
||||
// let mut data_u32: &[u32] = unsafe { mem::transmute(data) };
|
||||
// let mut doc_ids: Vec<u32> = Vec::with_capacity(doc_freq as usize);
|
||||
// {
|
||||
// let mut block_decoder = Block128Decoder::new();
|
||||
// let num_blocks = doc_freq / (NUM_DOCS_PER_BLOCK as u32);
|
||||
// for _ in 0..num_blocks {
|
||||
// let (remaining = block_decoder.decode_sorted(data_u32);
|
||||
// doc_ids.extend_from_slice(uncompressed);
|
||||
// data_u32 = remaining;
|
||||
// }
|
||||
// if doc_freq % 128 != 0 {
|
||||
// let data_u8: &[u8] = unsafe { mem::transmute(data_u32) };
|
||||
// let mut cursor = Cursor::new(data_u8);
|
||||
// let vint_len: usize = VInt::deserialize(&mut cursor).unwrap().val() as usize;
|
||||
// let cursor_pos = cursor.position() as usize;
|
||||
// let vint_data: &[u32] = unsafe { mem::transmute(&data_u8[cursor_pos..]) };
|
||||
// let mut vints_decoder = VIntsDecoder::new();
|
||||
// doc_ids.extend_from_slice(vints_decoder.decode_sorted(&vint_data[..vint_len]));
|
||||
// }
|
||||
// }
|
||||
// SegmentPostings(doc_ids)
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -69,6 +69,7 @@ impl PostingsSerializer {
|
||||
{
|
||||
let block_encoded = self.vints_encoder.encode_sorted(&self.doc_ids[..]);
|
||||
self.written_bytes_postings += try!(VInt(block_encoded.len() as u64).serialize(&mut self.postings_write));
|
||||
|
||||
for num in block_encoded {
|
||||
self.written_bytes_postings += try!(num.serialize(&mut self.postings_write));
|
||||
}
|
||||
@@ -84,7 +85,8 @@ impl PostingsSerializer {
|
||||
}
|
||||
if self.is_positions_enabled {
|
||||
let positions_encoded: &[u8] = self.positions_encoder.encode(&self.position_deltas[..]);
|
||||
self.positions_write.write_all(positions_encoded);
|
||||
self.written_bytes_positions += try!(VInt(positions_encoded.len() as u64).serialize(&mut self.positions_write));
|
||||
try!(self.positions_write.write_all(positions_encoded));
|
||||
self.written_bytes_positions += positions_encoded.len();
|
||||
self.position_deltas.clear();
|
||||
}
|
||||
@@ -106,17 +108,17 @@ impl PostingsSerializer {
|
||||
{
|
||||
// encode the positions
|
||||
let block_encoded: &[u8] = self.block_encoder.encode_sorted(&self.doc_ids);
|
||||
self.postings_write.write_all(block_encoded);
|
||||
try!(self.postings_write.write_all(block_encoded));
|
||||
self.written_bytes_postings += block_encoded.len();
|
||||
}
|
||||
if self.is_termfreq_enabled {
|
||||
// encode the term_freqs
|
||||
let block_encoded: &[u8] = self.block_encoder.encode_sorted(&self.term_freqs);
|
||||
self.postings_write.write_all(block_encoded);
|
||||
try!(self.postings_write.write_all(block_encoded));
|
||||
self.written_bytes_postings += block_encoded.len();
|
||||
if self.is_positions_enabled {
|
||||
let positions_encoded: &[u8] = self.positions_encoder.encode(&self.position_deltas[..]);
|
||||
self.positions_write.write_all(positions_encoded);
|
||||
try!(self.positions_write.write_all(positions_encoded));
|
||||
self.written_bytes_positions += positions_encoded.len();
|
||||
self.position_deltas.clear();
|
||||
}
|
||||
|
||||
@@ -3,7 +3,6 @@ use std::io;
|
||||
|
||||
use std::io::Read;
|
||||
use common::BinarySerializable;
|
||||
use rustc_serialize::Encodable;
|
||||
use rustc_serialize::Decoder;
|
||||
use rustc_serialize::Encoder;
|
||||
use std::ops::BitOr;
|
||||
|
||||
@@ -3,7 +3,6 @@ use std::io::Write;
|
||||
use std::io::Read;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use rustc_serialize::Encodable;
|
||||
use rustc_serialize::Decoder;
|
||||
use rustc_serialize::Encoder;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user