From 1215049adf2db118537fdfd060e2f0cb025c7c47 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 20 Feb 2016 16:37:15 +0900 Subject: [PATCH] storebuidler --- Cargo.toml | 1 + src/core/codec.rs | 3 ++- src/core/serialize.rs | 17 +++++++++---- src/core/store.rs | 56 +++++++++++++++++++++++++++++++++++++++---- src/lib.rs | 1 + 5 files changed, 69 insertions(+), 9 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index acb45ce1d..d20c05e75 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,7 @@ tempdir = "0.3.4" bincode = "0.4.0" serde = "0.6.11" libc = "0.2.6" +lz4 = "*" [build-dependencies] gcc = "0.3.24" diff --git a/src/core/codec.rs b/src/core/codec.rs index c6991b1d6..e80d894cd 100644 --- a/src/core/codec.rs +++ b/src/core/codec.rs @@ -83,9 +83,10 @@ impl SegmentSerializer<()> for SimpleSegmentSerializer { Ok(()) } - fn close(self,) -> Result<()> { + fn close(mut self,) -> Result<()> { // TODO handle errors on close self.term_fst_builder.finish(); + self.store_writer.close(); Ok(()) } } diff --git a/src/core/serialize.rs b/src/core/serialize.rs index 6b9ded357..8a6108cfc 100644 --- a/src/core/serialize.rs +++ b/src/core/serialize.rs @@ -42,6 +42,7 @@ impl BinarySerializable for Vec { } } + impl BinarySerializable for u32 { fn serialize(&self, writer: &mut Write) -> error::Result { writer.write_u32::(self.clone()) @@ -54,6 +55,18 @@ impl BinarySerializable for u32 { } } +impl BinarySerializable for u64 { + fn serialize(&self, writer: &mut Write) -> error::Result { + writer.write_u64::(self.clone()) + .map(|x| 4) + .map_err(Error::BinaryReadError) + } + fn deserialize(reader: &mut Read) -> error::Result { + reader.read_u64::() + .map_err(Error::BinaryReadError) + } +} + impl BinarySerializable for u8 { fn serialize(&self, writer: &mut Write) -> error::Result { @@ -126,8 +139,6 @@ fn test_serialize_u32() { assert!(u32::deserialize(&mut cursor).is_err()); } - - #[test] fn test_serialize_string() { let mut buffer: Vec = Vec::new(); @@ -149,8 +160,6 @@ fn test_serialize_string() { assert!(u32::deserialize(&mut cursor).is_err()); } - - #[test] fn test_serialize_vec() { let mut buffer: Vec = Vec::new(); diff --git a/src/core/store.rs b/src/core/store.rs index bba90a2fb..0370837c8 100644 --- a/src/core/store.rs +++ b/src/core/store.rs @@ -1,31 +1,79 @@ use std::io::BufWriter; use std::fs::File; +use core::global::DocId; use core::schema::Document; use core::schema::FieldValue; use core::error; use core::serialize::BinarySerializable; use std::io::Write; use std::io::Read; +use lz4; + +const BLOCK_SIZE: usize = 262144; pub struct StoreWriter { - doc: usize, - offsets: Vec, + doc: DocId, + offsets: Vec, + written: u64, writer: BufWriter, + current_block: Vec, } +#[derive(Debug)] +struct OffsetIndex(DocId, u64); + +impl BinarySerializable for OffsetIndex { + fn serialize(&self, writer: &mut Write) -> error::Result { + let OffsetIndex(a, b) = *self; + Ok(try!(a.serialize(writer)) + try!(b.serialize(writer))) + } + fn deserialize(reader: &mut Read) -> error::Result { + let a = try!(DocId::deserialize(reader)); + let b = try!(u64::deserialize(reader)); + Ok(OffsetIndex(a, b)) + } +} + + + impl StoreWriter { pub fn new(file: File) -> StoreWriter { StoreWriter { doc: 0, + written: 0, offsets: Vec::new(), writer: BufWriter::new(file), + current_block: Vec::new(), } } - pub fn store(&mut self, fields: &Vec<&FieldValue>) { - + pub fn store(&mut self, field_values: &Vec<&FieldValue>) { + (field_values.len() as u32).serialize(&mut self.current_block); + for field_value in field_values.iter() { + (*field_value).serialize(&mut self.current_block); + } self.doc += 1; + if self.current_block.len() > BLOCK_SIZE { + self.write_and_compress_block(); + } + } + + fn write_and_compress_block(&mut self,) { + // err handling + let mut encoder = lz4::EncoderBuilder::new() + .build(&mut self.writer) + .unwrap(); + encoder.write_all(&self.current_block); + self.written = self.current_block.len() as u64; + self.offsets.push(OffsetIndex(self.doc, self.written)); + self.current_block.clear(); + } + + pub fn close(&mut self,) { + if self.current_block.len() > 0 { + self.write_and_compress_block(); + } } } diff --git a/src/lib.rs b/src/lib.rs index 7b929cda8..46f1c46ec 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,5 +20,6 @@ extern crate tempdir; extern crate bincode; extern crate serde; extern crate libc; +extern crate lz4; pub mod core;