mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
skip
This commit is contained in:
192
src/core/skip.rs
192
src/core/skip.rs
@@ -4,16 +4,31 @@ use std::io::Read;
|
||||
use std::io::Cursor;
|
||||
use std::io::SeekFrom;
|
||||
use std::io::Seek;
|
||||
use std::marker;
|
||||
use core::DocId;
|
||||
use std::ops::DerefMut;
|
||||
use serde::Serialize;
|
||||
use serde;
|
||||
use bincode;
|
||||
use byteorder;
|
||||
use core::error;
|
||||
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
|
||||
use std::fmt;
|
||||
|
||||
|
||||
pub trait BinarySerializable : fmt::Debug + Sized {
|
||||
// TODO move Result from Error.
|
||||
fn serialize(&self, writer: &mut Write) -> error::Result<usize>;
|
||||
fn deserialize(reader: &mut Read) -> error::Result<Self>;
|
||||
}
|
||||
|
||||
impl BinarySerializable for () {
|
||||
fn serialize(&self, writer: &mut Write) -> error::Result<usize> {
|
||||
Ok(0)
|
||||
}
|
||||
fn deserialize(reader: &mut Read) -> error::Result<Self> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
struct LayerBuilder {
|
||||
period: usize,
|
||||
buffer: Vec<u8>,
|
||||
@@ -47,18 +62,17 @@ impl LayerBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
fn insert<S: Serialize>(&mut self, doc_id: DocId, dest: S) -> InsertResult {
|
||||
fn insert<S: BinarySerializable>(&mut self, doc_id: DocId, value: &S) -> InsertResult {
|
||||
self.remaining -= 1;
|
||||
self.len += 1;
|
||||
let offset = self.written_size(); // TODO not sure if we want after or here
|
||||
self.buffer.write_u32::<BigEndian>(doc_id);
|
||||
value.serialize(&mut self.buffer);
|
||||
if self.remaining == 0 {
|
||||
let offset = self.written_size();
|
||||
dest.serialize(&mut bincode::serde::Serializer::new(&mut self.buffer));
|
||||
self.remaining = self.period;
|
||||
InsertResult::SkipPointer(offset)
|
||||
InsertResult::SkipPointer(offset as u32)
|
||||
}
|
||||
else {
|
||||
doc_id.serialize(&mut bincode::serde::Serializer::new(&mut self.buffer));
|
||||
dest.serialize(&mut bincode::serde::Serializer::new(&mut self.buffer));
|
||||
InsertResult::NoNeedForSkip
|
||||
}
|
||||
}
|
||||
@@ -70,8 +84,9 @@ pub struct SkipListBuilder {
|
||||
layers: Vec<LayerBuilder>,
|
||||
}
|
||||
|
||||
|
||||
enum InsertResult {
|
||||
SkipPointer(usize),
|
||||
SkipPointer(u32),
|
||||
NoNeedForSkip,
|
||||
}
|
||||
|
||||
@@ -93,14 +108,14 @@ impl SkipListBuilder {
|
||||
&mut self.layers[layer_id]
|
||||
}
|
||||
|
||||
pub fn insert<S: Serialize>(&mut self, doc_id: DocId, dest: S) {
|
||||
pub fn insert<S: BinarySerializable>(&mut self, doc_id: DocId, dest: &S) {
|
||||
let mut layer_id = 0;
|
||||
match self.get_layer(0).insert(doc_id, dest) {
|
||||
InsertResult::SkipPointer(mut offset) => {
|
||||
loop {
|
||||
layer_id += 1;
|
||||
let skip_result = self.get_layer(layer_id)
|
||||
.insert(doc_id, offset);
|
||||
.insert(doc_id, &offset);
|
||||
match skip_result {
|
||||
InsertResult::SkipPointer(next_offset) => {
|
||||
offset = next_offset;
|
||||
@@ -130,45 +145,156 @@ impl SkipListBuilder {
|
||||
}
|
||||
|
||||
|
||||
// ---------------------------
|
||||
// the lower layer contains only the list of doc ids.
|
||||
// A docset is represented
|
||||
// SkipList<'a, Void>
|
||||
|
||||
|
||||
struct Layer<R: Read + Seek> {
|
||||
reader: R,
|
||||
struct SkipLayer<'a, T> {
|
||||
cursor: Cursor<&'a [u8]>,
|
||||
num_items: u32,
|
||||
next_item: Option<T>,
|
||||
}
|
||||
|
||||
impl<R: Read + Seek + Clone> Layer<R> {
|
||||
fn read(reader: &mut R) -> Layer<R> {
|
||||
|
||||
fn rebase_cursor<'a>(cursor: &Cursor<&'a [u8]>) -> Cursor<&'a [u8]>{
|
||||
let data: &'a[u8] = *cursor.get_ref();
|
||||
let from_idx = cursor.position() as usize;
|
||||
let rebased_data = &data[from_idx..];
|
||||
Cursor::new(rebased_data)
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_rebase_cursor() {
|
||||
{
|
||||
let a: Vec<u8> = vec!(1, 2, 3);
|
||||
let mut cur: Cursor<&[u8]> = Cursor::new(&a);
|
||||
assert_eq!(cur.read_u8().unwrap(), 1);
|
||||
let mut rebased_cursor = rebase_cursor(&cur);
|
||||
assert_eq!(cur.read_u8().unwrap(), 2);
|
||||
assert_eq!(rebased_cursor.read_u8().unwrap(), 2);
|
||||
assert_eq!(cur.position(), 2);
|
||||
assert_eq!(rebased_cursor.position(), 1);
|
||||
cur.seek(SeekFrom::Start(0));
|
||||
assert_eq!(cur.read_u8().unwrap(), 1);
|
||||
rebased_cursor.seek(SeekFrom::Start(0));
|
||||
assert_eq!(rebased_cursor.read_u8().unwrap(), 2);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
struct Layer<'a, T> {
|
||||
_phantom_: marker::PhantomData<T>,
|
||||
cursor: Cursor<&'a [u8]>,
|
||||
item_idx: usize,
|
||||
num_items: usize,
|
||||
cur_id: u32,
|
||||
next_id: Option<u32>,
|
||||
}
|
||||
|
||||
|
||||
impl<'a, T: BinarySerializable> Iterator for Layer<'a, T> {
|
||||
|
||||
type Item = (DocId, T);
|
||||
|
||||
fn next(&mut self,)-> Option<(DocId, T)> {
|
||||
if self.item_idx >= self.num_items {
|
||||
None
|
||||
}
|
||||
else {
|
||||
let cur_val = T::deserialize(&mut self.cursor).unwrap();
|
||||
let cur_id = self.next_id;
|
||||
self.item_idx += 1;
|
||||
if self.item_idx < self.num_items - 1 {
|
||||
self.next_id = Some(u32::deserialize(&mut self.cursor).unwrap());
|
||||
}
|
||||
else {
|
||||
self.next_id = None;
|
||||
}
|
||||
Some((self.cur_id.clone(), cur_val))
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for u32 {
|
||||
fn serialize(&self, writer: &mut Write) -> error::Result<usize> {
|
||||
// TODO error handling
|
||||
writer.write_u32::<BigEndian>(self.clone());
|
||||
Ok(4)
|
||||
}
|
||||
|
||||
fn deserialize(reader: &mut Read) -> error::Result<Self> {
|
||||
// TODO error handling
|
||||
Ok(reader.read_u32::<BigEndian>().unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
impl<'a, T: BinarySerializable> Layer<'a, T> {
|
||||
fn read(cursor: &mut Cursor<&'a [u8]>) -> Layer<'a, T> {
|
||||
// TODO error handling?
|
||||
let num_items = reader.read_u32::<BigEndian>().unwrap() as u32;
|
||||
let num_bytes = reader.read_u32::<BigEndian>().unwrap() as u32;
|
||||
let reader_clone = reader.clone();
|
||||
reader.seek(SeekFrom::Current(num_bytes as i64));
|
||||
let num_items = cursor.read_u32::<BigEndian>().unwrap() as u32;
|
||||
println!("{} items ", num_items);
|
||||
let num_bytes = cursor.read_u32::<BigEndian>().unwrap() as u32;
|
||||
println!("{} bytes ", num_bytes);
|
||||
let mut rebased_cursor = rebase_cursor(cursor);
|
||||
cursor.seek(SeekFrom::Current(num_bytes as i64));
|
||||
// println!("cur val {:?}", cur_val);
|
||||
let next_id: Option<u32> = match rebased_cursor.read_u32::<BigEndian>() {
|
||||
Ok(val) => Some(val),
|
||||
Err(_) => None
|
||||
};
|
||||
Layer {
|
||||
reader: reader_clone,
|
||||
num_items: num_items,
|
||||
cursor: rebased_cursor,
|
||||
item_idx: 0,
|
||||
num_items: num_items as usize,
|
||||
next_id: next_id,
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
fn seek(doc_id: DocId) {
|
||||
// while self.next_doc_id < doc_id {
|
||||
// self.next_doc_id = cursor.read_u32::<BigEndian>();
|
||||
// self.cur_val = self.next_val;
|
||||
// self.next_doc_id = bincode::Deserializer::new(self.cursor, 8).read_u32();
|
||||
// self.next_val = bincode::Deserializer::new(self.cursor, 8).read_u32();
|
||||
// }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct SkipList<R: Read + Seek> {
|
||||
layers: Vec<Layer<R>>,
|
||||
pub struct SkipList<'a, T: BinarySerializable> {
|
||||
data_layer: Layer<'a, T>,
|
||||
skip_layers: Vec<Layer<'a, u32>>,
|
||||
}
|
||||
|
||||
impl<R: Read + Seek> SkipList<R> {
|
||||
impl<'a, T: BinarySerializable> Iterator for SkipList<'a, T> {
|
||||
|
||||
pub fn read(data: &[u8]) -> SkipList<Cursor<&[u8]>> {
|
||||
|
||||
type Item = (DocId, T);
|
||||
|
||||
fn next(&mut self,)-> Option<(DocId, T)> {
|
||||
self.data_layer.next()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: BinarySerializable> SkipList<'a, T> {
|
||||
|
||||
pub fn read(data: &'a [u8]) -> SkipList<'a, T> {
|
||||
let mut cursor = Cursor::new(data);
|
||||
// TODO error handling?
|
||||
let num_layers = cursor.read_u8().unwrap();
|
||||
let mut layers = Vec::new();
|
||||
for _ in (0..num_layers) {
|
||||
layers.push(Layer::read(&mut cursor));
|
||||
println!("{} layers ", num_layers);
|
||||
let mut skip_layers = Vec::new();
|
||||
for _ in (0..num_layers - 1) {
|
||||
let skip_layer: Layer<'a, u32> = Layer::read(&mut cursor);
|
||||
skip_layers.push(skip_layer);
|
||||
}
|
||||
let data_layer: Layer<'a, T> = Layer::read(&mut cursor);
|
||||
SkipList {
|
||||
layers: layers
|
||||
skip_layers: skip_layers,
|
||||
data_layer: data_layer,
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,14 +1,17 @@
|
||||
extern crate tantivy;
|
||||
|
||||
use std::io::Write;
|
||||
use tantivy::core::skip::SkipListBuilder;
|
||||
extern crate byteorder;
|
||||
use std::io::{Write, Seek};
|
||||
use std::io::SeekFrom;
|
||||
use tantivy::core::skip::{SkipListBuilder, SkipList};
|
||||
use std::io::Cursor;
|
||||
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
|
||||
|
||||
#[test]
|
||||
fn test_skip_list_builder() {
|
||||
{
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let mut skip_list_builder: SkipListBuilder = SkipListBuilder::new(10);
|
||||
skip_list_builder.insert(2, 3);
|
||||
skip_list_builder.insert(2, &3);
|
||||
skip_list_builder.write::<Vec<u8>>(&mut output);
|
||||
assert_eq!(output.len(), 17);
|
||||
assert_eq!(output[0], 1);
|
||||
@@ -17,7 +20,7 @@ fn test_skip_list_builder() {
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let mut skip_list_builder: SkipListBuilder = SkipListBuilder::new(3);
|
||||
for i in (0..9) {
|
||||
skip_list_builder.insert(i, i);
|
||||
skip_list_builder.insert(i, &i);
|
||||
}
|
||||
skip_list_builder.write::<Vec<u8>>(&mut output);
|
||||
assert_eq!(output.len(), 129);
|
||||
@@ -28,7 +31,7 @@ fn test_skip_list_builder() {
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let mut skip_list_builder: SkipListBuilder = SkipListBuilder::new(3);
|
||||
for i in (0..9) {
|
||||
skip_list_builder.insert(i, ());
|
||||
skip_list_builder.insert(i, &());
|
||||
}
|
||||
skip_list_builder.write::<Vec<u8>>(&mut output);
|
||||
assert_eq!(output.len(), 93);
|
||||
@@ -36,15 +39,13 @@ fn test_skip_list_builder() {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_skip_list_builder() {
|
||||
{
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let mut skip_list_builder: SkipListBuilder = SkipListBuilder::new(10);
|
||||
skip_list_builder.insert(2, 3);
|
||||
skip_list_builder.write::<Vec<u8>>(&mut output);
|
||||
let skip_list = SkipList::read(output.as_slice());
|
||||
}
|
||||
|
||||
fn test_skip_list_reader() {
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let mut skip_list_builder: SkipListBuilder = SkipListBuilder::new(10);
|
||||
skip_list_builder.insert(2, &3);
|
||||
skip_list_builder.write::<Vec<u8>>(&mut output);
|
||||
let skip_list: SkipList<u32> = SkipList::read(&mut output);
|
||||
// assert_eq!(output.len(), 17);
|
||||
// assert_eq!(output[0], 1);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user