This commit is contained in:
Paul Masurel
2016-02-23 00:26:57 +09:00
parent d95c7f446a
commit 6a46b98ea9
5 changed files with 96 additions and 47 deletions

View File

@@ -5,6 +5,7 @@ fn main() {
.cpp(true)
.flag("-std=c++11")
.flag("-O3")
.flag("-mssse3")
.include("./cpp/SIMDCompressionAndIntersection/include")
.object("cpp/SIMDCompressionAndIntersection/bitpacking.o")
.object("cpp/SIMDCompressionAndIntersection/integratedbitpacking.o")

View File

@@ -1,7 +1,6 @@
#include <iostream>
#include <stdint.h>
#include "codecfactory.h"
#include "intersection.h"

View File

@@ -30,13 +30,21 @@ pub fn generate_segment_name() -> SegmentId {
/// Metadata record attached to a directory: a list of segments (stored as
/// strings) together with the `Schema`.
///
/// The derives make the whole record encodable/decodable through
/// `rustc_serialize`.
#[derive(Clone,Debug,RustcDecodable, RustcEncodable)]
pub struct DirectoryMeta {
segments: Vec<String>
segments: Vec<String>,
schema: Schema,
}
impl DirectoryMeta {
/// Builds metadata with an empty segment list and a fresh `Schema::new()`.
fn new() -> DirectoryMeta {
DirectoryMeta {
segments: Vec::new()
segments: Vec::new(),
schema: Schema::new(),
}
}
/// Builds metadata with an empty segment list and the caller-supplied
/// `schema`, which is moved into the record.
fn with_schema(schema: Schema) -> DirectoryMeta {
DirectoryMeta {
segments: Vec::new(),
schema: schema,
}
}
}
@@ -80,13 +88,7 @@ pub struct Directory {
impl Directory {
/// Returns a clone of the schema held by the inner directory.
///
/// # Panics
/// Panics (via `unwrap`) when `get_read` fails to acquire the read lock.
pub fn schema(&self,) -> Schema {
self.get_read().unwrap().schema.clone()
}
pub fn set_schema(&mut self, schema: &Schema) {
self.get_write()
.unwrap()
.set_schema(schema);
self.get_read().unwrap().metas.schema.clone()
}
fn get_write(&mut self) -> Result<RwLockWriteGuard<InnerDirectory>> {
@@ -99,18 +101,22 @@ impl Directory {
}
/// Acquires the read lock on the inner directory.
///
/// # Errors
/// A failure reported by `RwLock::read` is converted into
/// `Error::LockError`, with the underlying error formatted into the message.
fn get_read(&self) -> Result<RwLockReadGuard<InnerDirectory>> {
match self.inner_directory.read() {
Ok(dir) =>
Ok(dir),
Err(e) =>
Err(Error::LockError(format!("Could not acquire read lock on directory. {:?}", e)))
}
self.inner_directory.read().map_err(
|e| Error::LockError(format!("Could not acquire read lock on directory. {:?}", e))
)
}
/// Publishes `segment` by taking the write lock on the inner directory and
/// delegating to its `publish_segment`.
///
/// # Errors
/// Returns the error from `get_write` if the lock cannot be acquired,
/// otherwise whatever the inner `publish_segment` returns.
pub fn publish_segment(&mut self, segment: Segment) -> Result<()> {
    try!(self.get_write()).publish_segment(segment)
}
pub fn create(filepath: &Path, schema: Schema) -> Result<Directory> {
let inner_directory = try!(InnerDirectory::create(filepath, schema));
Ok(Directory {
inner_directory: Arc::new(RwLock::new(inner_directory)),
})
}
pub fn open(filepath: &Path) -> Result<Directory> {
let inner_directory = try!(InnerDirectory::open(filepath));
Ok(Directory {
@@ -173,7 +179,6 @@ struct InnerDirectory {
index_path: PathBuf,
mmap_cache: RefCell<HashMap<PathBuf, MmapReadOnly>>,
metas: DirectoryMeta,
schema: Schema,
_temp_directory: Option<TempDir>,
}
@@ -197,8 +202,14 @@ impl InnerDirectory {
self.save_metas()
}
pub fn set_schema(&mut self, schema: &Schema) {
self.schema = schema.clone();
pub fn create(filepath: &Path, schema: Schema) -> Result<InnerDirectory> {
let mut directory = InnerDirectory {
index_path: PathBuf::from(filepath),
mmap_cache: RefCell::new(HashMap::new()),
metas: DirectoryMeta::with_schema(schema),
_temp_directory: None,
};
Ok(directory)
}
pub fn open(filepath: &Path) -> Result<InnerDirectory> {
@@ -206,7 +217,6 @@ impl InnerDirectory {
index_path: PathBuf::from(filepath),
mmap_cache: RefCell::new(HashMap::new()),
metas: DirectoryMeta::new(),
schema: Schema::new(), // TODO schema
_temp_directory: None,
};
try!(directory.load_metas()); //< does the directory already exists?
@@ -229,7 +239,6 @@ impl InnerDirectory {
index_path: PathBuf::from(tempdir_path),
mmap_cache: RefCell::new(HashMap::new()),
metas: DirectoryMeta::new(),
schema: Schema::new(),
_temp_directory: Some(tempdir)
};
//< does the directory already exist?
@@ -245,6 +254,7 @@ impl InnerDirectory {
// TODO check that the directory is empty.
return Ok(());
}
let mut meta_file = File::open(&meta_filepath).unwrap();
let mut meta_content = String::new();
meta_file.read_to_string(&mut meta_content);

View File

@@ -6,10 +6,15 @@ use std::slice;
use std::fmt;
use std::io::Read;
use core::serialize::BinarySerializable;
use rustc_serialize::Decodable;
use rustc_serialize::Encodable;
use rustc_serialize::Decoder;
use rustc_serialize::Encoder;
pub type DocId = u32;
#[derive(Clone,Debug,PartialEq,Eq)]
#[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)]
pub struct FieldOptions {
// untokenized_indexed: bool,
tokenized_indexed: bool,
@@ -48,7 +53,6 @@ impl FieldOptions {
}
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
pub struct FieldValue {
pub field: Field,
@@ -87,27 +91,63 @@ impl BinarySerializable for FieldValue {
/// A term, stored as an owned byte buffer.
///
/// Comparison, ordering and hashing are all derived from `data`.
#[derive(Clone,PartialEq,PartialOrd,Ord,Eq,Hash)]
#[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
pub struct Term {
data: Vec<u8>,
}
/// One schema entry: a field name paired with its `FieldOptions`.
///
/// This is the unit that `Schema` encodes and decodes, one per field.
#[derive(Clone,Debug)]
#[derive(Clone, Debug, RustcDecodable, RustcEncodable)]
struct FieldEntry {
name: String,
option: FieldOptions,
}
/// The set of fields known to an index.
///
/// `fields` is the ordered list of entries and is the only part written by
/// the `Encodable` impl. `fields_map` (name -> `Field`) and `field_options`
/// (indexed by field id) are lookup structures rebuilt through `add_field`
/// when a schema is decoded.
#[derive(Clone, Debug)]
pub struct Schema {
fields: HashMap<String, Field>,
field_options: Vec<FieldOptions>,
fields: Vec<FieldEntry>,
fields_map: HashMap<String, Field>, // transient
field_options: Vec<FieldOptions>, // transient
}
impl Decodable for Schema {
fn decode<D: Decoder>(d: &mut D) -> Result <Self, D::Error> {
let mut schema = Schema::new();
try!(d.read_seq(|d, num_fields| {
for i in 0..num_fields {
let field_entry = try!(FieldEntry::decode(d));
schema.add_field(&field_entry.name, &field_entry.option);
}
Ok(())
}));
Ok(schema)
}
}
impl Encodable for Schema {
    /// Encodes the schema as a sequence with one element per `FieldEntry`,
    /// in the order the entries are stored in `fields`.
    ///
    /// # Errors
    /// Propagates any error reported by the encoder.
    fn encode<S: Encoder>(&self, s: &mut S) -> Result<(), S::Error> {
        s.emit_seq(self.fields.len(), |seq_encoder| {
            for (position, entry) in self.fields.iter().enumerate() {
                try!(seq_encoder.emit_seq_elt(position, |elt_encoder| entry.encode(elt_encoder)));
            }
            Ok(())
        })
    }
}
impl Schema {
/// Creates an empty schema: no field entries, an empty name lookup map and
/// an empty options vector.
pub fn new() -> Schema {
Schema {
fields: HashMap::new(),
fields: Vec::new(),
fields_map: HashMap::new(),
field_options: Vec::new(),
}
}
pub fn find_field_name(&self, field_name: &str) -> Option<(Field, FieldOptions)> {
self.fields
self.fields_map
.get(field_name)
.map(|&Field(field_id)| {
let field_options = self.field_options[field_id as usize].clone();
@@ -115,24 +155,25 @@ impl Schema {
})
}
pub fn get_field(&self, field: &Field) -> FieldOptions {
/// Looks up the `Field` handle registered under `fieldname`.
///
/// Returns `None` when no field with that name has been added.
pub fn field(&self, fieldname: &str) -> Option<Field> {
    // `HashMap<String, _>::get` accepts a `&str` key directly, so there is
    // no need to allocate a temporary `String` for the lookup.
    self.fields_map.get(fieldname).cloned()
}
/// Returns a clone of the options stored for `field`.
///
/// # Panics
/// Panics if the id wrapped by `field` is not a valid index into
/// `field_options`.
pub fn field_options(&self, field: &Field) -> FieldOptions {
    let &Field(field_id) = field;
    let position = field_id as usize;
    self.field_options[position].clone()
}
pub fn add_field(&mut self, field_name: &str, field_options: &FieldOptions) -> Field {
let next_field = Field(self.fields.len() as u8);
let field = self.fields
.entry(String::from(field_name))
.or_insert(next_field.clone())
.clone();
if field == next_field {
self.field_options.push(field_options.clone());
}
else {
let Field(field_id) = field;
self.field_options[field_id as usize] = field_options.clone();
}
pub fn add_field(&mut self, field_name_str: &str, field_options: &FieldOptions) -> Field {
let field = Field(self.fields.len() as u8);
// TODO case if field already exists
let field_name = String::from(field_name_str);
self.fields.push(FieldEntry {
name: field_name.clone(),
option: field_options.clone(),
});
self.fields_map.insert(field_name, field.clone());
self.field_options.push(field_options.clone());
field
}
}

View File

@@ -136,13 +136,12 @@ impl SegmentWriter {
pub fn add(&mut self, doc: Document, schema: &Schema) {
let doc_id = self.max_doc;
for field_value in doc.fields() {
let field_options = schema.get_field(&field_value.field);
let field_options = schema.field_options(&field_value.field);
if field_options.is_tokenized_indexed() {
let mut tokens = self.tokenizer.tokenize(&field_value.text);
loop {
match tokens.next() {
Some(token) => {
// println!("TOKEN :{}:", token);
let term = Term::from_field_text(&field_value.field, token);
self.suscribe(doc_id, term);
self.num_tokens += 1;
@@ -153,8 +152,7 @@ impl SegmentWriter {
}
}
let mut stored_fieldvalues_it = doc.fields().filter(|field_value| {
schema.get_field(&field_value.field)
.is_stored()
schema.field_options(&field_value.field).is_stored()
});
self.segment_serializer.store_doc(&mut stored_fieldvalues_it);
self.max_doc += 1;