From 6a46b98ea9065f15922ca48678317e0f547493e4 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 23 Feb 2016 00:26:57 +0900 Subject: [PATCH] blop --- build.rs | 1 + cpp/encode.cpp | 1 - src/core/directory.rs | 50 +++++++++++++++---------- src/core/schema.rs | 85 ++++++++++++++++++++++++++++++++----------- src/core/writer.rs | 6 +-- 5 files changed, 96 insertions(+), 47 deletions(-) diff --git a/build.rs b/build.rs index 61fbc6039..573767810 100644 --- a/build.rs +++ b/build.rs @@ -5,6 +5,7 @@ fn main() { .cpp(true) .flag("-std=c++11") .flag("-O3") + .flag("-mssse3") .include("./cpp/SIMDCompressionAndIntersection/include") .object("cpp/SIMDCompressionAndIntersection/bitpacking.o") .object("cpp/SIMDCompressionAndIntersection/integratedbitpacking.o") diff --git a/cpp/encode.cpp b/cpp/encode.cpp index 933ab51d2..9bf0ad7e8 100644 --- a/cpp/encode.cpp +++ b/cpp/encode.cpp @@ -1,7 +1,6 @@ #include #include - #include "codecfactory.h" #include "intersection.h" diff --git a/src/core/directory.rs b/src/core/directory.rs index 2c80ed60c..f42e740db 100644 --- a/src/core/directory.rs +++ b/src/core/directory.rs @@ -30,13 +30,21 @@ pub fn generate_segment_name() -> SegmentId { #[derive(Clone,Debug,RustcDecodable, RustcEncodable)] pub struct DirectoryMeta { - segments: Vec + segments: Vec, + schema: Schema, } impl DirectoryMeta { fn new() -> DirectoryMeta { DirectoryMeta { - segments: Vec::new() + segments: Vec::new(), + schema: Schema::new(), + } + } + fn with_schema(schema: Schema) -> DirectoryMeta { + DirectoryMeta { + segments: Vec::new(), + schema: schema, } } } @@ -80,13 +88,7 @@ pub struct Directory { impl Directory { pub fn schema(&self,) -> Schema { - self.get_read().unwrap().schema.clone() - } - - pub fn set_schema(&mut self, schema: &Schema) { - self.get_write() - .unwrap() - .set_schema(schema); + self.get_read().unwrap().metas.schema.clone() } fn get_write(&mut self) -> Result> { @@ -99,18 +101,22 @@ impl Directory { } fn get_read(&self) -> Result> { - match self.inner_directory.read() { - Ok(dir) => - Ok(dir), - Err(e) => - Err(Error::LockError(format!("Could not acquire read lock on directory. {:?}", e))) - } + self.inner_directory.read().map_err( + |e| Error::LockError(format!("Could not acquire read lock on directory. {:?}", e)) + ) } pub fn publish_segment(&mut self, segment: Segment) -> Result<()> { return try!(self.get_write()).publish_segment(segment); } + pub fn create(filepath: &Path, schema: Schema) -> Result { + let inner_directory = try!(InnerDirectory::create(filepath, schema)); + Ok(Directory { + inner_directory: Arc::new(RwLock::new(inner_directory)), + }) + } + pub fn open(filepath: &Path) -> Result { let inner_directory = try!(InnerDirectory::open(filepath)); Ok(Directory { @@ -173,7 +179,6 @@ struct InnerDirectory { index_path: PathBuf, mmap_cache: RefCell>, metas: DirectoryMeta, - schema: Schema, _temp_directory: Option, } @@ -197,8 +202,14 @@ impl InnerDirectory { self.save_metas() } - pub fn set_schema(&mut self, schema: &Schema) { - self.schema = schema.clone(); + pub fn create(filepath: &Path, schema: Schema) -> Result { + let mut directory = InnerDirectory { + index_path: PathBuf::from(filepath), + mmap_cache: RefCell::new(HashMap::new()), + metas: DirectoryMeta::with_schema(schema), + _temp_directory: None, + }; + Ok(directory) } pub fn open(filepath: &Path) -> Result { @@ -206,7 +217,6 @@ impl InnerDirectory { index_path: PathBuf::from(filepath), mmap_cache: RefCell::new(HashMap::new()), metas: DirectoryMeta::new(), - schema: Schema::new(), // TODO schema _temp_directory: None, }; try!(directory.load_metas()); //< does the directory already exists? @@ -229,7 +239,6 @@ impl InnerDirectory { index_path: PathBuf::from(tempdir_path), mmap_cache: RefCell::new(HashMap::new()), metas: DirectoryMeta::new(), - schema: Schema::new(), _temp_directory: Some(tempdir) }; //< does the directory already exists? @@ -245,6 +254,7 @@ impl InnerDirectory { // TODO check that the directory is empty. return Ok(()); } + let mut meta_file = File::open(&meta_filepath).unwrap(); let mut meta_content = String::new(); meta_file.read_to_string(&mut meta_content); diff --git a/src/core/schema.rs b/src/core/schema.rs index da9a185f3..63d69c6a0 100644 --- a/src/core/schema.rs +++ b/src/core/schema.rs @@ -6,10 +6,15 @@ use std::slice; use std::fmt; use std::io::Read; use core::serialize::BinarySerializable; +use rustc_serialize::Decodable; +use rustc_serialize::Encodable; +use rustc_serialize::Decoder; +use rustc_serialize::Encoder; + pub type DocId = u32; -#[derive(Clone,Debug,PartialEq,Eq)] +#[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)] pub struct FieldOptions { // untokenized_indexed: bool, tokenized_indexed: bool, @@ -48,7 +53,6 @@ impl FieldOptions { } - #[derive(Clone,Debug,PartialEq,PartialOrd,Eq)] pub struct FieldValue { pub field: Field, @@ -87,27 +91,63 @@ impl BinarySerializable for FieldValue { -#[derive(Clone,PartialEq,PartialOrd,Ord,Eq,Hash)] +#[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)] pub struct Term { data: Vec, } -#[derive(Clone,Debug)] +#[derive(Clone, Debug, RustcDecodable, RustcEncodable)] +struct FieldEntry { + name: String, + option: FieldOptions, +} + + +#[derive(Clone, Debug)] pub struct Schema { - fields: HashMap, - field_options: Vec, + fields: Vec, + fields_map: HashMap, // transient + field_options: Vec, // transient +} + +impl Decodable for Schema { + fn decode(d: &mut D) -> Result { + let mut schema = Schema::new(); + try!(d.read_seq(|d, num_fields| { + for i in 0..num_fields { + let field_entry = try!(FieldEntry::decode(d)); + schema.add_field(&field_entry.name, &field_entry.option); + } + Ok(()) + })); + Ok(schema) + } +} + +impl Encodable for Schema { + fn encode(&self, s: &mut S) -> Result<(), S::Error> { + try!(s.emit_seq(self.fields.len(), + |mut e| { + for (ord, field) in self.fields.iter().enumerate() { + try!(e.emit_seq_elt(ord, |e| field.encode(e))); + } + Ok(()) + })); + Ok(()) + } } impl Schema { pub fn new() -> Schema { Schema { - fields: HashMap::new(), + fields: Vec::new(), + fields_map: HashMap::new(), field_options: Vec::new(), } } pub fn find_field_name(&self, field_name: &str) -> Option<(Field, FieldOptions)> { - self.fields + self.fields_map .get(field_name) .map(|&Field(field_id)| { let field_options = self.field_options[field_id as usize].clone(); @@ -115,24 +155,25 @@ impl Schema { }) } - pub fn get_field(&self, field: &Field) -> FieldOptions { + pub fn field(&self, fieldname: &str) -> Option { + self.fields_map.get(&String::from(fieldname)).map(|field| field.clone()) + } + + pub fn field_options(&self, field: &Field) -> FieldOptions { let Field(field_id) = *field; self.field_options[field_id as usize].clone() } - pub fn add_field(&mut self, field_name: &str, field_options: &FieldOptions) -> Field { - let next_field = Field(self.fields.len() as u8); - let field = self.fields - .entry(String::from(field_name)) - .or_insert(next_field.clone()) - .clone(); - if field == next_field { - self.field_options.push(field_options.clone()); - } - else { - let Field(field_id) = field; - self.field_options[field_id as usize] = field_options.clone(); - } + pub fn add_field(&mut self, field_name_str: &str, field_options: &FieldOptions) -> Field { + let field = Field(self.fields.len() as u8); + // TODO case if field already exists + let field_name = String::from(field_name_str); + self.fields.push(FieldEntry { + name: field_name.clone(), + option: field_options.clone(), + }); + self.fields_map.insert(field_name, field.clone()); + self.field_options.push(field_options.clone()); field } } diff --git a/src/core/writer.rs b/src/core/writer.rs index a42770222..6dbcb84c7 100644 --- a/src/core/writer.rs +++ b/src/core/writer.rs @@ -136,13 +136,12 @@ impl SegmentWriter { pub fn add(&mut self, doc: Document, schema: &Schema) { let doc_id = self.max_doc; for field_value in doc.fields() { - let field_options = schema.get_field(&field_value.field); + let field_options = schema.field_options(&field_value.field); if field_options.is_tokenized_indexed() { let mut tokens = self.tokenizer.tokenize(&field_value.text); loop { match tokens.next() { Some(token) => { - // println!("TOKEN :{}:", token); let term = Term::from_field_text(&field_value.field, token); self.suscribe(doc_id, term); self.num_tokens += 1; @@ -153,8 +152,7 @@ impl SegmentWriter { } } let mut stored_fieldvalues_it = doc.fields().filter(|field_value| { - schema.get_field(&field_value.field) - .is_stored() + schema.field_options(&field_value.field).is_stored() }); self.segment_serializer.store_doc(&mut stored_fieldvalues_it); self.max_doc += 1;