This commit is contained in:
Paul Masurel
2016-02-03 22:33:16 +09:00
parent 52f601b1b9
commit f790425679
10 changed files with 188 additions and 25 deletions

View File

@@ -17,3 +17,5 @@ rustc-serialize = "0.3.16"
log = "0.3.5"
combine = "1.2.0"
tempdir = "0.3.4"
bincode = "0.4.0"
serde = "0.6.11"

View File

@@ -3,7 +3,7 @@ use core::serial::*;
use std::io::Write;
use fst::MapBuilder;
use core::error::*;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
use core::directory::Segment;
use core::directory::SegmentComponent;
use core::reader::*;
@@ -13,6 +13,9 @@ use std::fs::File;
pub struct SimpleCodec;
// TODO should we vint?
pub struct SimpleSegmentSerializer {
written_bytes_postings: usize,
postings_write: File,
@@ -25,7 +28,7 @@ impl SegmentSerializer<()> for SimpleSegmentSerializer {
self.term_fst_builder.insert(term.as_slice(), self.written_bytes_postings as u64);
self.cur_term_num_docs = doc_freq;
// writing the size of the posting list
match self.postings_write.write_u32::<LittleEndian>(doc_freq) {
match self.postings_write.write_u32::<BigEndian>(doc_freq) {
Ok(_) => {},
Err(_) => {
let msg = String::from("Failed writing posting list length");
@@ -37,7 +40,7 @@ impl SegmentSerializer<()> for SimpleSegmentSerializer {
}
fn add_doc(&mut self, doc_id: DocId) -> Result<()> {
match self.postings_write.write_u32::<LittleEndian>(doc_id as u32) {
match self.postings_write.write_u32::<BigEndian>(doc_id as u32) {
Ok(_) => {},
Err(_) => {
let msg = String::from("Failed while writing posting list");

View File

@@ -9,7 +9,7 @@ pub trait Collector {
}
#[derive(Debug)]
pub struct DocAddress(SegmentId, DocId);
pub struct DocAddress(pub SegmentId, pub DocId);
pub struct TestCollector {
docs: Vec<DocAddress>,
@@ -33,7 +33,6 @@ impl TestCollector {
impl Collector for TestCollector {
fn set_segment(&mut self, segment: &SegmentReader) {
println!("eee");
self.current_segment = Some(segment.id());
}

View File

@@ -11,4 +11,5 @@ pub mod codec;
pub mod error;
pub mod searcher;
pub mod collector;
pub mod skip;
pub use core::global::DocId;

View File

@@ -8,7 +8,7 @@ use std::io;
use core::postings::IntersectionPostings;
use fst::raw::Fst;
use std::cmp::{Eq,PartialEq,Ord,PartialOrd,Ordering};
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
use std::borrow::Borrow;
use std::io::Cursor;
use core::global::DocId;
@@ -35,7 +35,7 @@ pub struct SegmentPostings<'a> {
impl<'a> SegmentPostings<'a> {
pub fn from_data(data: &[u8]) -> SegmentPostings {
let mut cursor = Cursor::new(data);
let doc_freq = cursor.read_u32::<LittleEndian>().unwrap() as usize;
let doc_freq = cursor.read_u32::<BigEndian>().unwrap() as usize;
SegmentPostings {
cursor: cursor,
num_docs_remaining: doc_freq,
@@ -71,7 +71,7 @@ impl<'a> Iterator for SegmentPostings<'a> {
}
else {
self.num_docs_remaining -= 1;
Some(self.cursor.read_u32::<LittleEndian>().unwrap() as DocId)
Some(self.cursor.read_u32::<BigEndian>().unwrap() as DocId)
}
}
}
@@ -129,7 +129,7 @@ impl SegmentReader {
fn write_postings<R: io::Read, Output, SegSer: SegmentSerializer<Output>>(mut cursor: R, num_docs: DocId, serializer: &mut SegSer) -> Result<()> {
for i in 0..num_docs {
let doc_id = cursor.read_u32::<LittleEndian>().unwrap();
let doc_id = cursor.read_u32::<BigEndian>().unwrap();
try!(serializer.add_doc(doc_id));
}
Ok(())
@@ -146,7 +146,7 @@ impl SerializableSegment for SegmentReader {
let offset = offset_u64 as usize;
let data = unsafe { &self.postings_data.as_slice()[offset..] };
let mut cursor = Cursor::new(data);
let num_docs = cursor.read_u32::<LittleEndian>().unwrap() as DocId;
let num_docs = cursor.read_u32::<BigEndian>().unwrap() as DocId;
try!(serializer.new_term(&term, num_docs));
try!(write_postings(cursor, num_docs, &mut serializer));
},

View File

@@ -20,7 +20,6 @@ impl Searcher {
}
impl Searcher {
pub fn search(&self, terms: &Vec<Term>, collector: &mut Collector) {
@@ -30,7 +29,6 @@ impl Searcher {
for doc_id in postings {
collector.collect(doc_id);
}
}
}

130
src/core/skip.rs Normal file
View File

@@ -0,0 +1,130 @@
use std::io::Write;
use std::io::BufWriter;
use core::DocId;
use std::ops::DerefMut;
use serde::Serialize;
use serde;
use bincode;
use byteorder;
use core::error;
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
// writer
/// In-memory accumulator for the serialized entries of one skip-list layer.
struct LayerBuilder {
    /// Number of inserts between two consecutive skip pointers.
    period: usize,
    /// Serialized entries appended to this layer so far.
    buffer: Vec<u8>,
    /// Inserts left before the next skip pointer is emitted.
    remaining: usize,
    /// Total number of inserts performed on this layer.
    len: usize,
}
impl LayerBuilder {
    /// Number of bytes serialized into this layer's buffer so far.
    fn written_size(&self) -> usize {
        self.buffer.len()
    }

    /// Writes the layer to `output`: the entry count and the buffer size,
    /// both as big-endian `u32`, followed by the raw buffer bytes.
    fn write(&self, output: &mut Write) -> Result<(), byteorder::Error> {
        let num_entries = self.len() as u32;
        let payload_size = self.buffer.len() as u32;
        try!(output.write_u32::<BigEndian>(num_entries));
        try!(output.write_u32::<BigEndian>(payload_size));
        try!(output.write_all(&self.buffer));
        Ok(())
    }

    /// Number of inserts performed on this layer.
    fn len(&self) -> usize {
        self.len
    }

    /// Creates an empty layer that emits a skip pointer every `period` inserts.
    fn with_period(period: usize) -> LayerBuilder {
        LayerBuilder {
            period: period,
            buffer: Vec::new(),
            remaining: period,
            len: 0,
        }
    }

    /// Appends one entry to the layer.
    ///
    /// Regular inserts serialize `doc_id` followed by `dest` and return
    /// `InsertResult::NoNeedForSkip`. Every `period`-th insert serializes
    /// only `dest`, resets the countdown and returns
    /// `InsertResult::SkipPointer` carrying the buffer offset at which that
    /// entry starts.
    ///
    /// NOTE(review): the `Result` values returned by `serialize` are
    /// discarded here; confirm that serializing into a `Vec<u8>` cannot fail.
    /// NOTE(review): `doc_id` is not written on the skip-pointer path;
    /// confirm this is the intended layout.
    fn insert<S: Serialize>(&mut self, doc_id: DocId, dest: S) -> InsertResult {
        // NOTE(review): this subtraction underflows if the layer was built
        // with a period of 0, since `remaining` starts at `period`.
        self.remaining -= 1;
        self.len += 1;
        if self.remaining != 0 {
            doc_id.serialize(&mut bincode::serde::Serializer::new(&mut self.buffer));
            dest.serialize(&mut bincode::serde::Serializer::new(&mut self.buffer));
            return InsertResult::NoNeedForSkip;
        }
        // The upper layer points at the position where this entry begins.
        let entry_offset = self.written_size();
        dest.serialize(&mut bincode::serde::Serializer::new(&mut self.buffer));
        self.remaining = self.period;
        InsertResult::SkipPointer(entry_offset)
    }
}
/// Builds a multi-layer skip list in memory, one `LayerBuilder` per layer.
pub struct SkipListBuilder {
    /// Period handed to every layer created by this builder.
    period: usize,
    /// Layers created so far; index 0 receives every inserted entry.
    layers: Vec<LayerBuilder>,
}
/// Outcome of inserting one entry into a `LayerBuilder`.
enum InsertResult {
    /// The layer completed a period; the payload is the byte offset, within
    /// that layer's buffer, of the entry that was just written.
    SkipPointer(usize),
    /// The entry was stored and no skip pointer is required.
    NoNeedForSkip,
}
impl SkipListBuilder {
pub fn new(period: usize) -> SkipListBuilder {
SkipListBuilder {
period: period,
layers: Vec::new(),
}
}
fn get_layer<'a>(&'a mut self, layer_id: usize) -> &mut LayerBuilder {
if layer_id == self.layers.len() {
let layer_builder = LayerBuilder::with_period(self.period);
self.layers.push(layer_builder);
}
&mut self.layers[layer_id]
}
pub fn insert<S: Serialize>(&mut self, doc_id: DocId, dest: S) {
let mut layer_id = 0;
match self.get_layer(0).insert(doc_id, dest) {
InsertResult::SkipPointer(mut offset) => {
loop {
layer_id += 1;
let skip_result = self.get_layer(layer_id)
.insert(doc_id, offset);
match skip_result {
InsertResult::SkipPointer(next_offset) => {
offset = next_offset;
},
InsertResult::NoNeedForSkip => {
return;
}
}
}
},
InsertResult::NoNeedForSkip => {
return;
}
}
}
pub fn write<W: Write>(self, output: &mut Write) -> error::Result<()> {
output.write_u8(self.layers.len() as u8);
for layer in self.layers.iter() {
match layer.write(output) {
Ok(())=> {},
Err(someerr)=> { return Err(error::Error::WriteError(format!("Could not write skiplist {:?}", someerr) )) }
}
}
Ok(())
}
}

View File

@@ -15,5 +15,7 @@ extern crate rustc_serialize;
extern crate combine;
extern crate atomicwrites;
extern crate tempdir;
extern crate bincode;
extern crate serde;
pub mod core;

View File

@@ -118,20 +118,11 @@ fn test_searcher() {
let terms = vec!(Term::from_field_text(Field(1), "a"), Term::from_field_text(Field(1), "b"), );
let mut collector = TestCollector::new();
searcher.search(&terms, &mut collector);
let vals = format!("{:?}", collector.docs());
println!("{}",vals);
assert_eq!(vals, "");
let vals: Vec<DocId> = collector.docs().iter()
.map(|doc| doc.1)
.collect::<Vec<DocId>>();
assert_eq!(vals, [1, 2]);
}
//
// let debug_serializer = DebugSegmentSerializer::new();
// let segment_str_before_writing = DebugSegmentSerializer::debug_string(index_writer.current_segment_writer());
// let commit_result = index_writer.commit();
// assert!(commit_result.is_ok());
// let segment = commit_result.unwrap();
// let segment_reader = SegmentReader::open(segment).unwrap();
// let segment_str_after_reading = DebugSegmentSerializer::debug_string(&segment_reader);
// assert_eq!(segment_str_before_writing, segment_str_after_reading);
}

37
tests/skip.rs Normal file
View File

@@ -0,0 +1,37 @@
extern crate tantivy;
use std::io::Write;
use tantivy::core::skip::SkipListBuilder;
// Checks the serialized size and layer count produced by `SkipListBuilder`.
//
// Every output starts with one byte holding the number of layers, and every
// layer starts with two `u32` headers (8 bytes). The payload sizes below
// assume bincode's fixed-width integer encoding (4 bytes per 32-bit integer,
// 8 bytes per `usize`) -- TODO confirm against the bincode version in use.
#[test]
fn test_skip_list_builder() {
    {
        // One entry with period 10: a single layer and no skip pointer.
        // Expected size: 1 + 8 (headers) + 8 (doc id + value) = 17.
        let mut output: Vec<u8> = Vec::new();
        let mut skip_list_builder: SkipListBuilder = SkipListBuilder::new(10);
        skip_list_builder.insert(2, 3);
        assert!(skip_list_builder.write::<Vec<u8>>(&mut output).is_ok());
        assert_eq!(output.len(), 17);
        assert_eq!(output[0], 1);
    }
    {
        // Nine entries with period 3: skip pointers cascade up to a third layer.
        let mut output: Vec<u8> = Vec::new();
        let mut skip_list_builder: SkipListBuilder = SkipListBuilder::new(3);
        for i in 0..9 {
            skip_list_builder.insert(i, i);
        }
        assert!(skip_list_builder.write::<Vec<u8>>(&mut output).is_ok());
        assert_eq!(output.len(), 129);
        assert_eq!(output[0], 3);
    }
    {
        // Checking that the unit type `()` gets serialized to nothing.
        let mut output: Vec<u8> = Vec::new();
        let mut skip_list_builder: SkipListBuilder = SkipListBuilder::new(3);
        for i in 0..9 {
            skip_list_builder.insert(i, ());
        }
        assert!(skip_list_builder.write::<Vec<u8>>(&mut output).is_ok());
        assert_eq!(output.len(), 93);
        assert_eq!(output[0], 3);
    }
}