mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-01 08:00:41 +00:00
blop
This commit is contained in:
@@ -14,7 +14,7 @@ use core::store::StoreWriter;
|
||||
use core::serialize::BinarySerializable;
|
||||
use core::simdcompression;
|
||||
use core::schema::FieldValue;
|
||||
|
||||
use core::convert_to_ioerror;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct TermInfo {
|
||||
@@ -22,6 +22,7 @@ pub struct TermInfo {
|
||||
pub postings_offset: u32,
|
||||
}
|
||||
|
||||
|
||||
impl BinarySerializable for TermInfo {
|
||||
|
||||
const SIZE: Size = Size::Constant(8);
|
||||
@@ -89,9 +90,9 @@ impl SegmentSerializer<()> for SimpleSegmentSerializer {
|
||||
|
||||
fn write_segment_info(&self, segment_info: &SegmentInfo) -> io::Result<()> {
|
||||
let mut write = try!(self.segment.open_write(SegmentComponent::INFO));
|
||||
let json_data = json::encode(segment_info).unwrap();
|
||||
write.write_all(json_data.as_bytes());
|
||||
write.flush();
|
||||
let json_data = try!(json::encode(segment_info).map_err(convert_to_ioerror));
|
||||
try!(write.write_all(json_data.as_bytes()));
|
||||
try!(write.flush());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -215,7 +215,7 @@ impl Directory for RAMDirectory {
|
||||
}
|
||||
fn open_write(&mut self, path: &Path) -> io::Result<WritePtr> {
|
||||
let full_path = PathBuf::from(&path);
|
||||
let mut data = SharedVec::new();
|
||||
let data = SharedVec::new();
|
||||
self.fs.insert(full_path, data.clone());
|
||||
Ok(Box::new(data))
|
||||
}
|
||||
|
||||
@@ -32,12 +32,6 @@ pub struct IndexMeta {
|
||||
}
|
||||
|
||||
impl IndexMeta {
|
||||
fn new() -> IndexMeta {
|
||||
IndexMeta {
|
||||
segments: Vec::new(),
|
||||
schema: Schema::new(),
|
||||
}
|
||||
}
|
||||
fn with_schema(schema: Schema) -> IndexMeta {
|
||||
IndexMeta {
|
||||
segments: Vec::new(),
|
||||
|
||||
@@ -8,9 +8,19 @@ pub mod reader;
|
||||
pub mod codec;
|
||||
pub mod searcher;
|
||||
pub mod collector;
|
||||
pub mod skip;
|
||||
pub mod serialize;
|
||||
pub mod store;
|
||||
pub mod simdcompression;
|
||||
pub mod fstmap;
|
||||
pub mod index;
|
||||
|
||||
|
||||
use std::error;
|
||||
use std::io;
|
||||
|
||||
pub fn convert_to_ioerror<E: 'static + error::Error + Send + Sync>(err: E) -> io::Error {
|
||||
io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
err
|
||||
)
|
||||
}
|
||||
|
||||
@@ -17,56 +17,6 @@ pub trait Postings: Iterator<Item=DocId> {
|
||||
// impl<T: Iterator<Item=DocId>> Postings for T {}
|
||||
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct VecPostings {
|
||||
doc_ids: Vec<DocId>,
|
||||
cursor: usize,
|
||||
}
|
||||
|
||||
impl VecPostings {
|
||||
pub fn new(vals: Vec<DocId>) -> VecPostings {
|
||||
VecPostings {
|
||||
doc_ids: vals,
|
||||
cursor: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Postings for VecPostings {
|
||||
// after skipping position
|
||||
// the iterator in such a way that the
|
||||
// next call to next() will return a
|
||||
// value greater or equal to target.
|
||||
fn skip_next(&mut self, target: DocId) -> Option<DocId> {
|
||||
loop {
|
||||
match Iterator::next(self) {
|
||||
Some(val) if val >= target => {
|
||||
return Some(val);
|
||||
},
|
||||
None => {
|
||||
return None;
|
||||
},
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for VecPostings {
|
||||
type Item = DocId;
|
||||
fn next(&mut self,) -> Option<DocId> {
|
||||
if self.cursor >= self.doc_ids.len() {
|
||||
None
|
||||
}
|
||||
else {
|
||||
self.cursor += 1;
|
||||
Some(self.doc_ids[self.cursor - 1])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
pub struct IntersectionPostings<T: Postings> {
|
||||
@@ -127,6 +77,55 @@ mod tests {
|
||||
use test::Bencher;
|
||||
use core::schema::DocId;
|
||||
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct VecPostings {
|
||||
doc_ids: Vec<DocId>,
|
||||
cursor: usize,
|
||||
}
|
||||
|
||||
impl VecPostings {
|
||||
pub fn new(vals: Vec<DocId>) -> VecPostings {
|
||||
VecPostings {
|
||||
doc_ids: vals,
|
||||
cursor: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Postings for VecPostings {
|
||||
// after skipping position
|
||||
// the iterator in such a way that the
|
||||
// next call to next() will return a
|
||||
// value greater or equal to target.
|
||||
fn skip_next(&mut self, target: DocId) -> Option<DocId> {
|
||||
loop {
|
||||
match Iterator::next(self) {
|
||||
Some(val) if val >= target => {
|
||||
return Some(val);
|
||||
},
|
||||
None => {
|
||||
return None;
|
||||
},
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for VecPostings {
|
||||
type Item = DocId;
|
||||
fn next(&mut self,) -> Option<DocId> {
|
||||
if self.cursor >= self.doc_ids.len() {
|
||||
None
|
||||
}
|
||||
else {
|
||||
self.cursor += 1;
|
||||
Some(self.doc_ids[self.cursor - 1])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intersection() {
|
||||
{
|
||||
|
||||
@@ -14,12 +14,11 @@ use std::io;
|
||||
use std::str;
|
||||
use core::codec::TermInfo;
|
||||
use core::fstmap::FstMap;
|
||||
use std::error;
|
||||
use rustc_serialize::json;
|
||||
use core::serial::SegmentSerializer;
|
||||
use core::serial::SerializableSegment;
|
||||
use std::str::Utf8Error;
|
||||
use core::index::SegmentInfo;
|
||||
use core::convert_to_ioerror;
|
||||
|
||||
// TODO file structure should be in codec
|
||||
|
||||
@@ -99,14 +98,6 @@ impl Iterator for SegmentPostings {
|
||||
}
|
||||
|
||||
|
||||
fn convert_to_ioerror<E: 'static + error::Error + Send + Sync>(err: E) -> io::Error {
|
||||
io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
err
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
|
||||
impl SegmentReader {
|
||||
|
||||
|
||||
@@ -4,12 +4,12 @@ use std::ptr;
|
||||
// use std::iter;
|
||||
|
||||
extern {
|
||||
fn encode_unsorted_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t;
|
||||
fn decode_unsorted_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
|
||||
|
||||
// fn encode_unsorted_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t;
|
||||
// fn decode_unsorted_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
|
||||
// fn intersection_native(left_data: *const u32, left_size: size_t, right_data: *const u32, right_size: size_t, output: *mut u32) -> size_t;
|
||||
fn encode_sorted_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t;
|
||||
fn decode_sorted_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
|
||||
fn intersection_native(left_data: *const u32, left_size: size_t, right_data: *const u32, right_size: size_t, output: *mut u32) -> size_t;
|
||||
|
||||
}
|
||||
|
||||
pub struct Encoder {
|
||||
@@ -48,26 +48,26 @@ impl Encoder {
|
||||
}
|
||||
|
||||
|
||||
pub fn encode_unsorted(&mut self, input: &[u32]) -> &[u32] {
|
||||
self.input_buffer.clear();
|
||||
let input_len = input.len();
|
||||
if input_len + 10000 >= self.input_buffer.len() {
|
||||
let target_length = input_len + 1024;
|
||||
self.input_buffer.resize(target_length, 0);
|
||||
self.output_buffer.resize(target_length, 0);
|
||||
}
|
||||
// TODO use clone_from when available
|
||||
unsafe {
|
||||
ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len);
|
||||
let written_size = encode_unsorted_native(
|
||||
self.input_buffer.as_mut_ptr(),
|
||||
input_len as size_t,
|
||||
self.output_buffer.as_mut_ptr(),
|
||||
self.output_buffer.len() as size_t,
|
||||
);
|
||||
return &self.output_buffer[0..written_size];
|
||||
}
|
||||
}
|
||||
// pub fn encode_unsorted(&mut self, input: &[u32]) -> &[u32] {
|
||||
// self.input_buffer.clear();
|
||||
// let input_len = input.len();
|
||||
// if input_len + 10000 >= self.input_buffer.len() {
|
||||
// let target_length = input_len + 1024;
|
||||
// self.input_buffer.resize(target_length, 0);
|
||||
// self.output_buffer.resize(target_length, 0);
|
||||
// }
|
||||
// // TODO use clone_from when available
|
||||
// unsafe {
|
||||
// ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len);
|
||||
// let written_size = encode_unsorted_native(
|
||||
// self.input_buffer.as_mut_ptr(),
|
||||
// input_len as size_t,
|
||||
// self.output_buffer.as_mut_ptr(),
|
||||
// self.output_buffer.len() as size_t,
|
||||
// );
|
||||
// return &self.output_buffer[0..written_size];
|
||||
// }
|
||||
// }
|
||||
}
|
||||
|
||||
|
||||
@@ -92,17 +92,17 @@ impl Decoder {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_unsorted(&self,
|
||||
compressed_data: &[u32],
|
||||
uncompressed_values: &mut [u32]) -> size_t {
|
||||
unsafe {
|
||||
return decode_unsorted_native(
|
||||
compressed_data.as_ptr(),
|
||||
compressed_data.len() as size_t,
|
||||
uncompressed_values.as_mut_ptr(),
|
||||
uncompressed_values.len() as size_t);
|
||||
}
|
||||
}
|
||||
// pub fn decode_unsorted(&self,
|
||||
// compressed_data: &[u32],
|
||||
// uncompressed_values: &mut [u32]) -> size_t {
|
||||
// unsafe {
|
||||
// return decode_unsorted_native(
|
||||
// compressed_data.as_ptr(),
|
||||
// compressed_data.len() as size_t,
|
||||
// uncompressed_values.as_mut_ptr(),
|
||||
// uncompressed_values.len() as size_t);
|
||||
// }
|
||||
// }
|
||||
}
|
||||
|
||||
|
||||
@@ -176,22 +176,22 @@ mod tests {
|
||||
assert_eq!(decoded_data, input);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_encode_unsorted() {
|
||||
let mut encoder = Encoder::new();
|
||||
let num_ints = 10_000 as usize;
|
||||
let expected_length = 4361;
|
||||
let input: Vec<u32> = (0..num_ints as u32)
|
||||
.map(|i| i * 213_127 % 501)
|
||||
.into_iter().collect();
|
||||
assert_eq!(input.len(), 10_000);
|
||||
let encoded_data = encoder.encode_unsorted(&input);
|
||||
assert_eq!(encoded_data.len(), expected_length);
|
||||
let decoder = Decoder::new();
|
||||
let mut decoded_data: Vec<u32> = (0..num_ints as u32).collect();
|
||||
assert_eq!(num_ints, decoder.decode_unsorted(&encoded_data[..], &mut decoded_data));
|
||||
assert_eq!(decoded_data, input);
|
||||
}
|
||||
// #[test]
|
||||
// fn test_encode_unsorted() {
|
||||
// let mut encoder = Encoder::new();
|
||||
// let num_ints = 10_000 as usize;
|
||||
// let expected_length = 4361;
|
||||
// let input: Vec<u32> = (0..num_ints as u32)
|
||||
// .map(|i| i * 213_127 % 501)
|
||||
// .into_iter().collect();
|
||||
// assert_eq!(input.len(), 10_000);
|
||||
// let encoded_data = encoder.encode_unsorted(&input);
|
||||
// assert_eq!(encoded_data.len(), expected_length);
|
||||
// let decoder = Decoder::new();
|
||||
// let mut decoded_data: Vec<u32> = (0..num_ints as u32).collect();
|
||||
// assert_eq!(num_ints, decoder.decode_unsorted(&encoded_data[..], &mut decoded_data));
|
||||
// assert_eq!(decoded_data, input);
|
||||
// }
|
||||
//
|
||||
// #[test]
|
||||
// fn test_simd_intersection() {
|
||||
|
||||
@@ -139,7 +139,7 @@ impl StoreReader {
|
||||
return offset;
|
||||
}
|
||||
|
||||
fn read_block(&self, block_offset: usize) {
|
||||
fn read_block(&self, block_offset: usize) -> io::Result<()> {
|
||||
let mut current_block_mut = self.current_block.borrow_mut();
|
||||
current_block_mut.clear();
|
||||
let total_buffer = self.data.as_slice();
|
||||
@@ -147,21 +147,21 @@ impl StoreReader {
|
||||
let block_length = u32::deserialize(&mut cursor).unwrap();
|
||||
let block_array: &[u8] = &total_buffer[(block_offset + 4 as usize)..(block_offset + 4 + block_length as usize)];
|
||||
let mut lz4_decoder = lz4::Decoder::new(Cursor::new(block_array)).unwrap();
|
||||
lz4_decoder.read_to_end(&mut current_block_mut);
|
||||
lz4_decoder.read_to_end(&mut current_block_mut).map(|_| ())
|
||||
}
|
||||
|
||||
pub fn get(&self, doc_id: &DocId) -> io::Result<Document> {
|
||||
let OffsetIndex(first_doc_id, block_offset) = self.block_offset(doc_id);
|
||||
self.read_block(block_offset as usize);
|
||||
try!(self.read_block(block_offset as usize));
|
||||
let mut current_block_mut = self.current_block.borrow_mut();
|
||||
let mut cursor = Cursor::new(&mut current_block_mut[..]);
|
||||
for _ in first_doc_id..*doc_id {
|
||||
let block_length = try!(u32::deserialize(&mut cursor));
|
||||
cursor.seek(SeekFrom::Current(block_length as i64));
|
||||
try!(cursor.seek(SeekFrom::Current(block_length as i64)));
|
||||
}
|
||||
try!(u32::deserialize(&mut cursor));
|
||||
let mut field_values = Vec::new();
|
||||
let num_fields = u32::deserialize(&mut cursor).unwrap();
|
||||
let num_fields = try!(u32::deserialize(&mut cursor));
|
||||
for _ in 0..num_fields {
|
||||
let field_value = try!(FieldValue::deserialize(&mut cursor));
|
||||
field_values.push(field_value);
|
||||
|
||||
@@ -35,6 +35,10 @@ pub use core::schema::Document;
|
||||
pub use core::collector;
|
||||
pub use core::reader::SegmentReader;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
|
||||
Reference in New Issue
Block a user