From f790425679a4c78dc4af70997ffcec3c8fbe5579 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 3 Feb 2016 22:33:16 +0900 Subject: [PATCH] beeeee --- Cargo.toml | 2 + src/core/codec.rs | 9 ++- src/core/collector.rs | 3 +- src/core/mod.rs | 1 + src/core/reader.rs | 10 ++-- src/core/searcher.rs | 2 - src/core/skip.rs | 130 ++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 2 + tests/core.rs | 17 ++---- tests/skip.rs | 37 ++++++++++++ 10 files changed, 188 insertions(+), 25 deletions(-) create mode 100644 src/core/skip.rs create mode 100644 tests/skip.rs diff --git a/Cargo.toml b/Cargo.toml index d037e43d5..a1d353ac5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,3 +17,5 @@ rustc-serialize = "0.3.16" log = "0.3.5" combine = "1.2.0" tempdir = "0.3.4" +bincode = "0.4.0" +serde = "0.6.11" diff --git a/src/core/codec.rs b/src/core/codec.rs index 65db2fec2..7322ce34a 100644 --- a/src/core/codec.rs +++ b/src/core/codec.rs @@ -3,7 +3,7 @@ use core::serial::*; use std::io::Write; use fst::MapBuilder; use core::error::*; -use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use core::directory::Segment; use core::directory::SegmentComponent; use core::reader::*; @@ -13,6 +13,9 @@ use std::fs::File; pub struct SimpleCodec; + +// TODO should we vint? + pub struct SimpleSegmentSerializer { written_bytes_postings: usize, postings_write: File, @@ -25,7 +28,7 @@ impl SegmentSerializer<()> for SimpleSegmentSerializer { self.term_fst_builder.insert(term.as_slice(), self.written_bytes_postings as u64); self.cur_term_num_docs = doc_freq; // writing the size of the posting list - match self.postings_write.write_u32::(doc_freq) { + match self.postings_write.write_u32::(doc_freq) { Ok(_) => {}, Err(_) => { let msg = String::from("Failed writing posting list length"); @@ -37,7 +40,7 @@ impl SegmentSerializer<()> for SimpleSegmentSerializer { } fn add_doc(&mut self, doc_id: DocId) -> Result<()> { - match self.postings_write.write_u32::(doc_id as u32) { + match self.postings_write.write_u32::(doc_id as u32) { Ok(_) => {}, Err(_) => { let msg = String::from("Failed while writing posting list"); diff --git a/src/core/collector.rs b/src/core/collector.rs index 5dfd1594e..90728727a 100644 --- a/src/core/collector.rs +++ b/src/core/collector.rs @@ -9,7 +9,7 @@ pub trait Collector { } #[derive(Debug)] -pub struct DocAddress(SegmentId, DocId); +pub struct DocAddress(pub SegmentId, pub DocId); pub struct TestCollector { docs: Vec, @@ -33,7 +33,6 @@ impl TestCollector { impl Collector for TestCollector { fn set_segment(&mut self, segment: &SegmentReader) { - println!("eee"); self.current_segment = Some(segment.id()); } diff --git a/src/core/mod.rs b/src/core/mod.rs index adf307dea..d1e74071a 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -11,4 +11,5 @@ pub mod codec; pub mod error; pub mod searcher; pub mod collector; +pub mod skip; pub use core::global::DocId; diff --git a/src/core/reader.rs b/src/core/reader.rs index aea0e45ae..a50b45dbf 100644 --- a/src/core/reader.rs +++ b/src/core/reader.rs @@ -8,7 +8,7 @@ use std::io; use core::postings::IntersectionPostings; use fst::raw::Fst; use std::cmp::{Eq,PartialEq,Ord,PartialOrd,Ordering}; -use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use std::borrow::Borrow; use std::io::Cursor; use core::global::DocId; @@ -35,7 +35,7 @@ pub struct SegmentPostings<'a> { impl<'a> SegmentPostings<'a> { pub fn from_data(data: &[u8]) -> SegmentPostings { let mut cursor = Cursor::new(data); - let doc_freq = cursor.read_u32::().unwrap() as usize; + let doc_freq = cursor.read_u32::().unwrap() as usize; SegmentPostings { cursor: cursor, num_docs_remaining: doc_freq, @@ -71,7 +71,7 @@ impl<'a> Iterator for SegmentPostings<'a> { } else { self.num_docs_remaining -= 1; - Some(self.cursor.read_u32::().unwrap() as DocId) + Some(self.cursor.read_u32::().unwrap() as DocId) } } } @@ -129,7 +129,7 @@ impl SegmentReader { fn write_postings>(mut cursor: R, num_docs: DocId, serializer: &mut SegSer) -> Result<()> { for i in 0..num_docs { - let doc_id = cursor.read_u32::().unwrap(); + let doc_id = cursor.read_u32::().unwrap(); try!(serializer.add_doc(doc_id)); } Ok(()) @@ -146,7 +146,7 @@ impl SerializableSegment for SegmentReader { let offset = offset_u64 as usize; let data = unsafe { &self.postings_data.as_slice()[offset..] }; let mut cursor = Cursor::new(data); - let num_docs = cursor.read_u32::().unwrap() as DocId; + let num_docs = cursor.read_u32::().unwrap() as DocId; try!(serializer.new_term(&term, num_docs)); try!(write_postings(cursor, num_docs, &mut serializer)); }, diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 316caecf8..39cf105c2 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -20,7 +20,6 @@ impl Searcher { } - impl Searcher { pub fn search(&self, terms: &Vec, collector: &mut Collector) { @@ -30,7 +29,6 @@ impl Searcher { for doc_id in postings { collector.collect(doc_id); } - } } diff --git a/src/core/skip.rs b/src/core/skip.rs new file mode 100644 index 000000000..716168c4d --- /dev/null +++ b/src/core/skip.rs @@ -0,0 +1,130 @@ +use std::io::Write; +use std::io::BufWriter; +use core::DocId; +use std::ops::DerefMut; +use serde::Serialize; +use serde; +use bincode; +use byteorder; +use core::error; +use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; + + + +// writer + +struct LayerBuilder { + period: usize, + buffer: Vec, + remaining: usize, + len: usize, +} + +impl LayerBuilder { + + fn written_size(&self,) -> usize { + self.buffer.len() + } + + fn write(&self, output: &mut Write) -> Result<(), byteorder::Error> { + try!(output.write_u32::(self.len() as u32)); + try!(output.write_u32::(self.buffer.len() as u32)); + try!(output.write_all(&self.buffer)); + Ok(()) + } + + fn len(&self,) -> usize { + self.len + } + + fn with_period(period: usize) -> LayerBuilder { + LayerBuilder { + period: period, + buffer: Vec::new(), + remaining: period, + len: 0, + } + } + + fn insert(&mut self, doc_id: DocId, dest: S) -> InsertResult { + self.remaining -= 1; + self.len += 1; + if self.remaining == 0 { + let offset = self.written_size(); + dest.serialize(&mut bincode::serde::Serializer::new(&mut self.buffer)); + self.remaining = self.period; + InsertResult::SkipPointer(offset) + } + else { + doc_id.serialize(&mut bincode::serde::Serializer::new(&mut self.buffer)); + dest.serialize(&mut bincode::serde::Serializer::new(&mut self.buffer)); + InsertResult::NoNeedForSkip + } + } +} + + +pub struct SkipListBuilder { + period: usize, + layers: Vec, +} + + +enum InsertResult { + SkipPointer(usize), + NoNeedForSkip, +} + +impl SkipListBuilder { + + pub fn new(period: usize) -> SkipListBuilder { + SkipListBuilder { + period: period, + layers: Vec::new(), + } + } + + + fn get_layer<'a>(&'a mut self, layer_id: usize) -> &mut LayerBuilder { + if layer_id == self.layers.len() { + let layer_builder = LayerBuilder::with_period(self.period); + self.layers.push(layer_builder); + } + &mut self.layers[layer_id] + } + + pub fn insert(&mut self, doc_id: DocId, dest: S) { + let mut layer_id = 0; + match self.get_layer(0).insert(doc_id, dest) { + InsertResult::SkipPointer(mut offset) => { + loop { + layer_id += 1; + let skip_result = self.get_layer(layer_id) + .insert(doc_id, offset); + match skip_result { + InsertResult::SkipPointer(next_offset) => { + offset = next_offset; + }, + InsertResult::NoNeedForSkip => { + return; + } + } + } + }, + InsertResult::NoNeedForSkip => { + return; + } + } + } + + pub fn write(self, output: &mut Write) -> error::Result<()> { + output.write_u8(self.layers.len() as u8); + for layer in self.layers.iter() { + match layer.write(output) { + Ok(())=> {}, + Err(someerr)=> { return Err(error::Error::WriteError(format!("Could not write skiplist {:?}", someerr) )) } + } + } + Ok(()) + } +} diff --git a/src/lib.rs b/src/lib.rs index 582733c0b..32462008c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,5 +15,7 @@ extern crate rustc_serialize; extern crate combine; extern crate atomicwrites; extern crate tempdir; +extern crate bincode; +extern crate serde; pub mod core; diff --git a/tests/core.rs b/tests/core.rs index c2e2c553f..5db851ab0 100644 --- a/tests/core.rs +++ b/tests/core.rs @@ -118,20 +118,11 @@ fn test_searcher() { let terms = vec!(Term::from_field_text(Field(1), "a"), Term::from_field_text(Field(1), "b"), ); let mut collector = TestCollector::new(); searcher.search(&terms, &mut collector); - let vals = format!("{:?}", collector.docs()); - println!("{}",vals); - assert_eq!(vals, ""); + let vals: Vec = collector.docs().iter() + .map(|doc| doc.1) + .collect::>(); + assert_eq!(vals, [1, 2]); } - - // - // let debug_serializer = DebugSegmentSerializer::new(); - // let segment_str_before_writing = DebugSegmentSerializer::debug_string(index_writer.current_segment_writer()); - // let commit_result = index_writer.commit(); - // assert!(commit_result.is_ok()); - // let segment = commit_result.unwrap(); - // let segment_reader = SegmentReader::open(segment).unwrap(); - // let segment_str_after_reading = DebugSegmentSerializer::debug_string(&segment_reader); - // assert_eq!(segment_str_before_writing, segment_str_after_reading); } diff --git a/tests/skip.rs b/tests/skip.rs new file mode 100644 index 000000000..1e195d0df --- /dev/null +++ b/tests/skip.rs @@ -0,0 +1,37 @@ +extern crate tantivy; + +use std::io::Write; +use tantivy::core::skip::SkipListBuilder; + +#[test] +fn test_skip_list_builder() { + { + let mut output: Vec = Vec::new(); + let mut skip_list_builder: SkipListBuilder = SkipListBuilder::new(10); + skip_list_builder.insert(2, 3); + skip_list_builder.write::>(&mut output); + assert_eq!(output.len(), 17); + assert_eq!(output[0], 1); + } + { + let mut output: Vec = Vec::new(); + let mut skip_list_builder: SkipListBuilder = SkipListBuilder::new(3); + for i in (0..9) { + skip_list_builder.insert(i, i); + } + skip_list_builder.write::>(&mut output); + assert_eq!(output.len(), 129); + assert_eq!(output[0], 3); + } + { + // checking that void gets serialized to nothing. + let mut output: Vec = Vec::new(); + let mut skip_list_builder: SkipListBuilder = SkipListBuilder::new(3); + for i in (0..9) { + skip_list_builder.insert(i, ()); + } + skip_list_builder.write::>(&mut output); + assert_eq!(output.len(), 93); + assert_eq!(output[0], 3); + } +}