Merge remote-tracking branch 'origin'

This commit is contained in:
Paul Masurel
2016-02-15 09:45:04 +09:00
6 changed files with 165 additions and 54 deletions

View File

@@ -3,6 +3,7 @@ use std::path::{PathBuf, Path};
use std::collections::HashMap;
use std::collections::hash_map::Entry;
use std::fs::File;
use core::schema::Schema;
use std::io::Write;
use std::io::BufWriter;
use std::io;
@@ -84,6 +85,16 @@ pub struct Directory {
impl Directory {
/// Returns an owned clone of the schema held by the inner directory.
///
/// # Panics
///
/// Panics if `get_read()` returns an `Err`, because the result is unwrapped.
pub fn schema(&self) -> Schema {
    let inner = self.get_read().unwrap();
    inner.schema.clone()
}
/// Forwards `schema` to the `set_schema` method of the write-locked inner directory.
///
/// # Panics
///
/// Panics if `get_write()` returns an `Err`, because the result is unwrapped.
pub fn set_schema(&mut self, schema: &Schema) {
    let mut inner = self.get_write().unwrap();
    inner.set_schema(schema);
}
fn get_write(&mut self) -> Result<RwLockWriteGuard<InnerDirectory>> {
match self.inner_directory.write() {
Ok(dir) =>
@@ -173,11 +184,11 @@ struct InnerDirectory {
index_path: PathBuf,
mmap_cache: RefCell<HashMap<PathBuf, MmapReadOnly>>,
metas: DirectoryMeta,
schema: Schema,
_temp_directory: Option<TempDir>,
}
fn create_tempdir() -> Result<TempDir> {
let tempdir_res = TempDir::new("index");
match tempdir_res {
@@ -187,7 +198,6 @@ fn create_tempdir() -> Result<TempDir> {
}
impl InnerDirectory {
// TODO find a rusty way to hide that, while keeping
@@ -198,11 +208,16 @@ impl InnerDirectory {
self.save_metas()
}
/// Overwrites the stored schema with a clone of `schema`.
pub fn set_schema(&mut self, schema: &Schema) {
    self.schema.clone_from(schema);
}
pub fn open(filepath: &Path) -> Result<InnerDirectory> {
let mut directory = InnerDirectory {
index_path: PathBuf::from(filepath),
mmap_cache: RefCell::new(HashMap::new()),
metas: DirectoryMeta::new(),
schema: Schema::new(), // TODO schema
_temp_directory: None,
};
try!(directory.load_metas()); //< does the directory already exists?
@@ -225,6 +240,7 @@ impl InnerDirectory {
index_path: PathBuf::from(tempdir_path),
mmap_cache: RefCell::new(HashMap::new()),
metas: DirectoryMeta::new(),
schema: Schema::new(),
_temp_directory: Some(tempdir)
};
//< does the directory already exist?
@@ -288,7 +304,7 @@ impl InnerDirectory {
mmap_cache.insert(full_path.clone(), try!(open_mmap(&full_path)) );
}
let mmap_readonly: &MmapReadOnly = mmap_cache.get(&full_path).unwrap();
// // TODO remove if a proper clone is available
// TODO remove if a proper clone is available
let len = unsafe { mmap_readonly.as_slice().len() };
Ok(mmap_readonly.range(0, len))
}

View File

@@ -2,7 +2,3 @@ use std::io::{BufWriter, Write};
use std::io;
/// Alias for `u32`.
/// NOTE(review): presumably identifies a document, judging by the name — confirm against callers.
pub type DocId = u32;
/// Alias for `u8`; this is the type wrapped by `Field` below.
pub type FieldId = u8;
/// Newtype around a `FieldId`; the wrapped id is public.
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
pub struct Field(pub FieldId);

View File

@@ -2,9 +2,54 @@ use core::global::*;
use std::fmt::Write;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use std::string::FromUtf8Error;
use std::collections::HashMap;
use std::str;
use std::iter;
use std::slice;
use std::fmt;
/// Per-field flags: whether a field is tokenized-and-indexed and whether it is stored.
///
/// Both flags are `false` after `new()`; the consuming `set_*` methods turn
/// them on and can be chained.
#[derive(Clone,Debug,PartialEq,Eq)]
pub struct FieldOptions {
    // untokenized_indexed: bool,
    tokenized_indexed: bool,
    stored: bool,
}

/// Handle to a field, wrapping a `u8` id.
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
pub struct Field(u8);

impl FieldOptions {
    /// Creates options with every flag turned off.
    pub fn new() -> FieldOptions {
        FieldOptions {
            stored: false,
            tokenized_indexed: false,
        }
    }

    /// Consumes `self` and returns it with the `stored` flag turned on.
    pub fn set_stored(self) -> FieldOptions {
        FieldOptions { stored: true, ..self }
    }

    /// Consumes `self` and returns it with the `tokenized_indexed` flag turned on.
    pub fn set_tokenized_indexed(self) -> FieldOptions {
        FieldOptions { tokenized_indexed: true, ..self }
    }

    /// Returns the `stored` flag.
    pub fn is_stored(&self) -> bool {
        self.stored
    }

    /// Returns the `tokenized_indexed` flag.
    pub fn is_tokenized_indexed(&self) -> bool {
        self.tokenized_indexed
    }
}
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
pub struct FieldValue {
pub field: Field,
@@ -17,6 +62,51 @@ pub struct Term {
data: Vec<u8>,
}
/// Maps field names to `Field` handles and keeps the `FieldOptions` of every field.
///
/// `fields` resolves a name to its `Field`; the id wrapped by that `Field` is
/// the index of the field's options inside `field_options`.
#[derive(Clone,Debug)]
pub struct Schema {
    fields: HashMap<String, Field>,
    field_options: Vec<FieldOptions>,
}

impl Schema {
    /// Creates a schema that contains no field.
    pub fn new() -> Schema {
        Schema {
            fields: HashMap::new(),
            field_options: Vec::new(),
        }
    }

    /// Looks a field up by name.
    ///
    /// Returns the field handle together with a clone of its options, or
    /// `None` when no field is registered under `field_name`.
    pub fn find_field_name(&self, field_name: &str) -> Option<(Field, FieldOptions)> {
        self.fields
            .get(field_name)
            .map(|&Field(field_id)| {
                let field_options = self.field_options[field_id as usize].clone();
                (Field(field_id), field_options)
            })
    }

    /// Returns a clone of the options registered for `field`.
    ///
    /// # Panics
    ///
    /// Panics if the id wrapped by `field` is not a valid index into this
    /// schema's options.
    pub fn get_field(&self, field: Field) -> FieldOptions {
        let Field(field_id) = field;
        self.field_options[field_id as usize].clone()
    }

    /// Registers `field_name` with a clone of `field_options` and returns its handle.
    ///
    /// If the name is already registered, its handle is returned unchanged and
    /// only the stored options are replaced. Otherwise the field receives the
    /// next free id.
    ///
    /// # Panics
    ///
    /// Panics when a new name is added to a schema that already holds 256
    /// fields: ids are `u8`, so a further id would wrap around and collide
    /// with an existing field.
    pub fn add_field(&mut self, field_name: &str, field_options: &FieldOptions) -> Field {
        // Known name: keep the id, overwrite the options in place.
        if let Some(existing) = self.fields.get(field_name).cloned() {
            let Field(field_id) = existing;
            self.field_options[field_id as usize] = field_options.clone();
            return existing;
        }
        // New name: the next id is the current number of fields, which must fit in a u8.
        let next_id = self.field_options.len();
        assert!(next_id <= u8::max_value() as usize,
                "a schema cannot hold more than 256 fields");
        let field = Field(next_id as u8);
        self.fields.insert(String::from(field_name), field.clone());
        self.field_options.push(field_options.clone());
        field
    }
}
impl Term {
// TODO avoid all these copies in Term.
@@ -30,9 +120,9 @@ impl Term {
str::from_utf8(&self.data[1..]).unwrap()
}
pub fn from_field_text(field: Field, text: &str) -> Term {
pub fn from_field_text(field: &Field, text: &str) -> Term {
let mut buffer = Vec::with_capacity(1 + text.len());
let Field(field_idx) = field;
let Field(field_idx) = *field;
buffer.clear();
buffer.push(field_idx);
buffer.extend(text.as_bytes());
@@ -72,9 +162,9 @@ impl Document {
}
}
pub fn set(&mut self, field: Field, text: &str) {
pub fn set(&mut self, field: &Field, text: &str) {
self.add(FieldValue {
field: field,
field: field.clone(),
text: String::from(text)
});
}
@@ -83,14 +173,8 @@ impl Document {
self.fields.push(field_value);
}
}
impl IntoIterator for Document {
type Item = FieldValue;
type IntoIter = ::std::vec::IntoIter<FieldValue>;
fn into_iter(self) -> Self::IntoIter {
self.fields.into_iter()
pub fn fields<'a>(&'a self,) -> slice::Iter<'a, FieldValue> {
self.fields.iter()
}
}

View File

@@ -65,21 +65,21 @@ impl<T: BinarySerializable> LayerBuilder<T> {
}
/// Prints every item yielded by `layer`, one per line, in its `Debug` form.
///
/// The loop keeps calling `next()` until it returns `None`.
fn display_layer<'a, T: BinarySerializable>(layer: &mut Layer<'a, T>) {
    while let Some(item) = layer.next() {
        println!(" - {:?}", item);
    }
}
/// Dumps a skip list to stdout: the data layer first, then each skip layer in order.
///
/// Every layer is printed through `display_layer`, which iterates it.
pub fn display_skip_list<T: BinarySerializable>(skiplist: &mut SkipList<T>) {
    println!("DataLayer");
    display_layer(&mut skiplist.data_layer);

    println!("SkipLayer");
    for skip_layer in skiplist.skip_layers.iter_mut() {
        display_layer(&mut *skip_layer);
    }
}
//
// fn display_layer<'a, T: BinarySerializable>(layer: &mut Layer<'a, T>) {
// for it in layer {
// println!(" - {:?}", it);
// }
// }
//
// pub fn display_skip_list<T: BinarySerializable>(skiplist: &mut SkipList<T>) {
// println!("DataLayer");
// display_layer(&mut skiplist.data_layer);
// println!("SkipLayer");
// for mut layer in skiplist.skip_layers.iter_mut() {
// display_layer(&mut layer);
// }
// }
pub struct SkipListBuilder<T: BinarySerializable> {
period: usize,
@@ -172,7 +172,6 @@ impl<'a, T: BinarySerializable> Iterator for Layer<'a, T> {
type Item = (DocId, T);
fn next(&mut self,)-> Option<(DocId, T)> {
println!("eeeeee {}", self.next_id);
if self.next_id == u32::max_value() {
None
}
@@ -280,7 +279,6 @@ impl<'a, T: BinarySerializable> SkipList<'a, T> {
let num_layers = offsets.len();
let start_position = cursor.position() as usize;
let layers_data: &[u8] = &data[start_position..data.len()];
println!("offsets {:?}", offsets);
let data_layer: Layer<'a, T> =
if num_layers == 0 { Layer::empty() }
else {

View File

@@ -9,6 +9,7 @@ use core::analyzer::tokenize;
use std::collections::{HashMap, BTreeMap};
use std::collections::{hash_map, btree_map};
use std::io::{BufWriter, Write};
use std::sync::Arc;
use std::mem;
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
use std::iter::Peekable;
@@ -40,19 +41,22 @@ impl PostingsWriter {
/// Bundles a segment writer with the directory and schema it works against.
pub struct IndexWriter {
// Receives the documents handed to `IndexWriter::add`.
segment_writer: SegmentWriter,
// Clone of the directory passed to `IndexWriter::open`.
directory: Directory,
// Schema read from the directory in `IndexWriter::open`.
schema: Schema,
}
impl IndexWriter {
/// Builds an `IndexWriter` for `directory`.
///
/// The schema is read from the directory first; then a fresh `SegmentWriter`
/// is created and the directory handle is cloned into the writer.
pub fn open(directory: &Directory) -> IndexWriter {
    let directory_schema = directory.schema();
    let fresh_segment_writer = SegmentWriter::new();
    IndexWriter {
        schema: directory_schema,
        directory: directory.clone(),
        segment_writer: fresh_segment_writer,
    }
}
pub fn add(&mut self, doc: Document) {
self.segment_writer.add(doc);
self.segment_writer.add(doc, &self.schema);
}
// TODO remove that some day
@@ -91,15 +95,17 @@ impl SegmentWriter {
}
}
pub fn add(&mut self, doc: Document) {
pub fn add(&mut self, doc: Document, schema: &Schema) {
let doc_id = self.max_doc;
for field_value in doc {
let field = field_value.field;
for token in tokenize(&field_value.text) {
let term = Term::from_field_text(field.clone(), token);
self.suscribe(doc_id, term);
}
}
for field_value in doc.fields() {
let field_options = schema.get_field(field_value.field.clone());
if field_options.is_tokenized_indexed() {
for token in tokenize(&field_value.text) {
let term = Term::from_field_text(&field_value.field, token);
self.suscribe(doc_id, term);
}
}
}
self.max_doc += 1;
}

View File

@@ -57,23 +57,29 @@ fn test_tokenizer() {
#[test]
fn test_indexing() {
let directory = Directory::from_tempdir().unwrap();
let mut schema = Schema::new();
let text_fieldtype = FieldOptions::new().set_tokenized_indexed();
let text_field = schema.add_field("text", &text_fieldtype);
let mut directory = Directory::from_tempdir().unwrap();
directory.set_schema(&schema);
{
// writing the segment
let mut index_writer = IndexWriter::open(&directory);
{
let mut doc = Document::new();
doc.set(Field(1), "af b");
doc.set(&text_field, "af b");
index_writer.add(doc);
}
{
let mut doc = Document::new();
doc.set(Field(1), "a b c");
doc.set(&text_field, "a b c");
index_writer.add(doc);
}
{
let mut doc = Document::new();
doc.set(Field(1), "a b c d");
doc.set(&text_field, "a b c d");
index_writer.add(doc);
}
@@ -91,23 +97,28 @@ fn test_indexing() {
#[test]
fn test_searcher() {
let directory = Directory::from_tempdir().unwrap();
let mut schema = Schema::new();
let text_fieldtype = FieldOptions::new().set_tokenized_indexed();
let text_field = schema.add_field("text", &text_fieldtype);
let mut directory = Directory::from_tempdir().unwrap();
directory.set_schema(&schema);
{
// writing the segment
let mut index_writer = IndexWriter::open(&directory);
{
let mut doc = Document::new();
doc.set(Field(1), "af b");
doc.set(&text_field, "af b");
index_writer.add(doc);
}
{
let mut doc = Document::new();
doc.set(Field(1), "a b c");
doc.set(&text_field, "a b c");
index_writer.add(doc);
}
{
let mut doc = Document::new();
doc.set(Field(1), "a b c d");
doc.set(&text_field, "a b c d");
index_writer.add(doc);
}
let commit_result = index_writer.commit();
@@ -115,7 +126,7 @@ fn test_searcher() {
}
{
let searcher = Searcher::for_directory(directory);
let terms = vec!(Term::from_field_text(Field(1), "a"), Term::from_field_text(Field(1), "b"), );
let terms = vec!(Term::from_field_text(&text_field, "a"), Term::from_field_text(&text_field, "b"), );
let mut collector = TestCollector::new();
searcher.search(&terms, &mut collector);
let vals: Vec<DocId> = collector.docs().iter()