From bc0ea4cbcbf04a423fcef7826c14f94f2c2545fb Mon Sep 17 00:00:00 2001
From: Paul Masurel
Date: Sun, 14 Feb 2016 15:31:57 +0900
Subject: [PATCH] trying to add schema

---
 src/core/directory.rs |  18 +++++++
 src/core/global.rs    |   4 --
 src/core/schema.rs    | 110 ++++++++++++++++++++++++++++++++++++------
 src/core/writer.rs    |  24 +++++----
 tests/core.rs         |  29 +++++++----
 5 files changed, 147 insertions(+), 38 deletions(-)

diff --git a/src/core/directory.rs b/src/core/directory.rs
index 8262730cc..8a754a62f 100644
--- a/src/core/directory.rs
+++ b/src/core/directory.rs
@@ -3,6 +3,7 @@ use std::path::{PathBuf, Path};
 use std::collections::HashMap;
 use std::collections::hash_map::Entry;
 use std::fs::File;
+use core::schema::Schema;
 use std::io::Write;
 use std::io::BufWriter;
 use std::io;
@@ -84,6 +85,16 @@ pub struct Directory {
 
 impl Directory {
 
+    pub fn schema(&self,) -> Schema {
+        self.get_read().unwrap().schema.clone()
+    }
+
+    pub fn set_schema(&mut self, schema: &Schema) {
+        self.get_write()
+            .unwrap()
+            .set_schema(schema);
+    }
+
     fn get_write(&mut self) -> Result<RwLockWriteGuard<InnerDirectory>, io::Error> {
         match self.inner_directory.write() {
             Ok(dir) =>
@@ -173,6 +184,7 @@ struct InnerDirectory {
     index_path: PathBuf,
     mmap_cache: RefCell<HashMap<PathBuf, MmapReadOnly>>,
     metas: DirectoryMeta,
+    schema: Schema,
     _temp_directory: Option<TempDir>,
 }
 
@@ -196,11 +208,16 @@ impl InnerDirectory {
         self.save_metas()
     }
 
+    pub fn set_schema(&mut self, schema: &Schema) {
+        self.schema = schema.clone();
+    }
+
     pub fn open(filepath: &Path) -> Result<InnerDirectory, io::Error> {
         let mut directory = InnerDirectory {
             index_path: PathBuf::from(filepath),
             mmap_cache: RefCell::new(HashMap::new()),
             metas: DirectoryMeta::new(),
+            schema: Schema::new(), // TODO schema
             _temp_directory: None,
         };
         try!(directory.load_metas()); //< does the directory already exists?
@@ -223,6 +240,7 @@
             index_path: PathBuf::from(tempdir_path),
             mmap_cache: RefCell::new(HashMap::new()),
             metas: DirectoryMeta::new(),
+            schema: Schema::new(),
             _temp_directory: Some(tempdir)
         }; //< does the directory already exists?
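
A note on the two accessors added above: `schema()` clones the `Schema` out of
the read guard instead of handing out a reference, so callers never hold the
directory's lock while they use the schema. A minimal standalone sketch of that
pattern, assuming the usual `Arc<RwLock<...>>` layout (the `Inner` and `Handle`
names here are hypothetical, not from the patch):

    use std::sync::{Arc, RwLock};

    #[derive(Clone)]
    struct Schema; // stand-in for core::schema::Schema

    struct Inner {
        schema: Schema,
    }

    #[derive(Clone)]
    struct Handle {
        inner: Arc<RwLock<Inner>>,
    }

    impl Handle {
        // Clone the schema out so the read lock is released immediately.
        fn schema(&self) -> Schema {
            self.inner.read().unwrap().schema.clone()
        }

        // Swap in a new schema under the write lock.
        fn set_schema(&mut self, schema: &Schema) {
            self.inner.write().unwrap().schema = schema.clone();
        }
    }

    fn main() {
        let mut handle = Handle { inner: Arc::new(RwLock::new(Inner { schema: Schema })) };
        handle.set_schema(&Schema);
        let _schema = handle.schema(); // lock already released here
    }
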
diff --git a/src/core/global.rs b/src/core/global.rs
index 987812fe5..e50a97e00 100644
--- a/src/core/global.rs
+++ b/src/core/global.rs
@@ -2,7 +2,3 @@ use std::io::{BufWriter, Write};
 use std::io;
 
 pub type DocId = u32;
-pub type FieldId = u8;
-
-#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
-pub struct Field(pub FieldId);
diff --git a/src/core/schema.rs b/src/core/schema.rs
index 383e83b7f..1ee4f9a8b 100644
--- a/src/core/schema.rs
+++ b/src/core/schema.rs
@@ -2,10 +2,54 @@ use core::global::*;
 use std::fmt::Write;
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
 use std::string::FromUtf8Error;
+use std::collections::HashMap;
 use std::str;
+use std::iter;
 use std::slice;
 use std::fmt;
 
+
+#[derive(Clone,Debug,PartialEq,Eq)]
+pub struct FieldOptions {
+    // untokenized_indexed: bool,
+    tokenized_indexed: bool,
+    stored: bool,
+}
+
+
+#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
+pub struct Field(u8);
+
+impl FieldOptions {
+    pub fn is_tokenized_indexed(&self,) -> bool {
+        self.tokenized_indexed
+    }
+
+    pub fn is_stored(&self,) -> bool {
+        self.stored
+    }
+
+    pub fn set_stored(mut self,) -> FieldOptions {
+        self.stored = true;
+        self
+    }
+
+    pub fn set_tokenized_indexed(mut self,) -> FieldOptions {
+        self.tokenized_indexed = true;
+        self
+    }
+
+    pub fn new() -> FieldOptions {
+        FieldOptions {
+            tokenized_indexed: false,
+            stored: false,
+        }
+    }
+}
+
+
+
 #[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
 pub struct FieldValue {
     pub field: Field,
@@ -18,6 +62,51 @@
 pub struct Term {
     data: Vec<u8>,
 }
 
+#[derive(Clone,Debug)]
+pub struct Schema {
+    fields: HashMap<String, Field>,
+    field_options: Vec<FieldOptions>,
+}
+
+impl Schema {
+    pub fn new() -> Schema {
+        Schema {
+            fields: HashMap::new(),
+            field_options: Vec::new(),
+        }
+    }
+
+    pub fn find_field_name(&self, field_name: &str) -> Option<(Field, FieldOptions)> {
+        self.fields
+            .get(field_name)
+            .map(|&Field(field_id)| {
+                let field_options = self.field_options[field_id as usize].clone();
+                (Field(field_id), field_options)
+            })
+    }
+
+    pub fn get_field(&self, field: Field) -> FieldOptions {
+        let Field(field_id) = field;
+        self.field_options[field_id as usize].clone()
+    }
+
+    pub fn add_field(&mut self, field_name: &str, field_options: &FieldOptions) -> Field {
+        let next_field = Field(self.fields.len() as u8);
+        let field = self.fields
+            .entry(String::from(field_name))
+            .or_insert(next_field.clone())
+            .clone();
+        if field == next_field {
+            self.field_options.push(field_options.clone());
+        }
+        else {
+            let Field(field_id) = field;
+            self.field_options[field_id as usize] = field_options.clone();
+        }
+        field
+    }
+}
+
 impl Term {
     // TODO avoid all these copies in Term.
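
The FieldOptions methods above take `self` by value and return it, so options
chain builder-style, and `add_field` doubles as "register or update": an
existing field keeps its `Field` id but has its options overwritten. A short
sketch of the API as added by this patch (assuming `Schema` and `FieldOptions`
are in scope, e.g. inside a test):

    fn main() {
        let mut schema = Schema::new();

        // Builder-style chaining.
        let options = FieldOptions::new().set_tokenized_indexed().set_stored();
        let body = schema.add_field("body", &options);

        // Re-adding the same name keeps the Field id but replaces the options.
        let body_again = schema.add_field("body", &FieldOptions::new());
        assert_eq!(body, body_again);

        // Lookup by name returns the Field handle together with its options.
        let (field, opts) = schema.find_field_name("body").unwrap();
        assert_eq!(field, body);
        assert!(!opts.is_stored()); // overwritten by the second add_field
    }
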
@@ -31,9 +120,9 @@ impl Term {
         str::from_utf8(&self.data[1..]).unwrap()
     }
 
-    pub fn from_field_text(field: Field, text: &str) -> Term {
+    pub fn from_field_text(field: &Field, text: &str) -> Term {
         let mut buffer = Vec::with_capacity(1 + text.len());
-        let Field(field_idx) = field;
+        let Field(field_idx) = *field;
         buffer.clear();
         buffer.push(field_idx);
         buffer.extend(text.as_bytes());
@@ -73,9 +162,9 @@ impl Document {
         }
     }
 
-    pub fn set(&mut self, field: Field, text: &str) {
+    pub fn set(&mut self, field: &Field, text: &str) {
         self.add(FieldValue {
-            field: field,
+            field: field.clone(),
             text: String::from(text)
         });
     }
@@ -84,19 +173,8 @@
     pub fn add(&mut self, field_value: FieldValue) {
         self.fields.push(field_value);
     }
 
-    pub fn indexed_field(&self,) -> slice::Iter<FieldValue> {
+    pub fn fields<'a>(&'a self,) -> slice::Iter<'a, FieldValue> {
         self.fields.iter()
     }
-
 }
-//
-// impl IntoIterator for Document {
-//     type Item = FieldValue;
-//     type IntoIter = ::std::vec::IntoIter<FieldValue>;
-//
-//     fn into_iter(self) -> Self::IntoIter {
-//         self.fields.into_iter()
-//     }
-//
-// }
diff --git a/src/core/writer.rs b/src/core/writer.rs
index 5bd01fe3a..a35212c84 100644
--- a/src/core/writer.rs
+++ b/src/core/writer.rs
@@ -9,6 +9,7 @@ use core::analyzer::tokenize;
 use std::collections::{HashMap, BTreeMap};
 use std::collections::{hash_map, btree_map};
 use std::io::{BufWriter, Write};
+use std::sync::Arc;
 use std::mem;
 use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
 use std::iter::Peekable;
@@ -40,19 +41,22 @@ impl PostingsWriter {
 pub struct IndexWriter {
     segment_writer: SegmentWriter,
     directory: Directory,
+    schema: Schema,
 }
 
 impl IndexWriter {
 
     pub fn open(directory: &Directory) -> IndexWriter {
+        let schema = directory.schema();
         IndexWriter {
             segment_writer: SegmentWriter::new(),
             directory: directory.clone(),
+            schema: schema,
         }
     }
 
     pub fn add(&mut self, doc: Document) {
-        self.segment_writer.add(doc);
+        self.segment_writer.add(doc, &self.schema);
     }
 
     // TODO remove that some day
@@ -91,15 +95,17 @@ impl SegmentWriter {
         }
     }
 
-    pub fn add(&mut self, doc: Document) {
+    pub fn add(&mut self, doc: Document, schema: &Schema) {
         let doc_id = self.max_doc;
-        for field_value in doc.indexed_field() {
-            let field = &field_value.field;
-            for token in tokenize(&field_value.text) {
-                let term = Term::from_field_text(field.clone(), token);
-                self.suscribe(doc_id, term);
-            }
-        }
+        for field_value in doc.fields() {
+            let field_options = schema.get_field(field_value.field.clone());
+            if field_options.is_tokenized_indexed() {
+                for token in tokenize(&field_value.text) {
+                    let term = Term::from_field_text(&field_value.field, token);
+                    self.suscribe(doc_id, term);
+                }
+            }
+        }
         self.max_doc += 1;
     }
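
With the schema threaded through, IndexWriter::open snapshots the directory's
schema once, and SegmentWriter::add consults it per field value: only fields
whose options are tokenized+indexed produce terms. A standalone sketch of that
loop, assuming the patched schema/document types are in scope; `tokenize` here
is a whitespace-splitting stand-in for core::analyzer::tokenize:

    // Whitespace stand-in for core::analyzer::tokenize.
    fn tokenize(text: &str) -> Vec<&str> {
        text.split_whitespace().collect()
    }

    fn main() {
        let mut schema = Schema::new();
        let body = schema.add_field("body", &FieldOptions::new().set_tokenized_indexed());
        let title = schema.add_field("title", &FieldOptions::new().set_stored());

        let mut doc = Document::new();
        doc.set(&body, "a b");
        doc.set(&title, "stored but not inverted");

        // Mirrors the new SegmentWriter::add: stored-only fields emit no terms.
        for field_value in doc.fields() {
            let field_options = schema.get_field(field_value.field.clone());
            if field_options.is_tokenized_indexed() {
                for token in tokenize(&field_value.text) {
                    let _term = Term::from_field_text(&field_value.field, token);
                    // a real writer would now feed doc_id + term to its postings
                }
            }
        }
    }
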
diff --git a/tests/core.rs b/tests/core.rs
index 5db851ab0..1fd785ef7 100644
--- a/tests/core.rs
+++ b/tests/core.rs
@@ -57,23 +57,29 @@ fn test_tokenizer() {
 
 #[test]
 fn test_indexing() {
-    let directory = Directory::from_tempdir().unwrap();
+    let mut schema = Schema::new();
+    let text_fieldtype = FieldOptions::new().set_tokenized_indexed();
+    let text_field = schema.add_field("text", &text_fieldtype);
+
+    let mut directory = Directory::from_tempdir().unwrap();
+    directory.set_schema(&schema);
+
     {
         // writing the segment
         let mut index_writer = IndexWriter::open(&directory);
         {
             let mut doc = Document::new();
-            doc.set(Field(1), "af b");
+            doc.set(&text_field, "af b");
            index_writer.add(doc);
         }
         {
             let mut doc = Document::new();
-            doc.set(Field(1), "a b c");
+            doc.set(&text_field, "a b c");
             index_writer.add(doc);
         }
         {
             let mut doc = Document::new();
-            doc.set(Field(1), "a b c d");
+            doc.set(&text_field, "a b c d");
             index_writer.add(doc);
         }
@@ -91,23 +97,28 @@
 
 #[test]
 fn test_searcher() {
-    let directory = Directory::from_tempdir().unwrap();
+    let mut schema = Schema::new();
+    let text_fieldtype = FieldOptions::new().set_tokenized_indexed();
+    let text_field = schema.add_field("text", &text_fieldtype);
+    let mut directory = Directory::from_tempdir().unwrap();
+    directory.set_schema(&schema);
+
     {
         // writing the segment
         let mut index_writer = IndexWriter::open(&directory);
         {
             let mut doc = Document::new();
-            doc.set(Field(1), "af b");
+            doc.set(&text_field, "af b");
             index_writer.add(doc);
         }
         {
             let mut doc = Document::new();
-            doc.set(Field(1), "a b c");
+            doc.set(&text_field, "a b c");
             index_writer.add(doc);
         }
         {
             let mut doc = Document::new();
-            doc.set(Field(1), "a b c d");
+            doc.set(&text_field, "a b c d");
             index_writer.add(doc);
         }
         let commit_result = index_writer.commit();
@@ -115,7 +126,7 @@
     }
     {
         let searcher = Searcher::for_directory(directory);
-        let terms = vec!(Term::from_field_text(Field(1), "a"), Term::from_field_text(Field(1), "b"), );
+        let terms = vec!(Term::from_field_text(&text_field, "a"), Term::from_field_text(&text_field, "b"), );
         let mut collector = TestCollector::new();
         searcher.search(&terms, &mut collector);
         let vals: Vec<DocId> = collector.docs().iter()
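
For reference, the tests above exercise Term::from_field_text, whose packing
(one field-id byte, then the token's UTF-8 bytes) appears in the schema.rs hunk
earlier in this patch. A quick standalone sketch of that layout, using plain
std only:

    fn main() {
        let field_id: u8 = 0; // e.g. the first field registered in the schema
        let text = "a";
        let mut buffer = Vec::with_capacity(1 + text.len());
        buffer.push(field_id);
        buffer.extend(text.as_bytes());
        assert_eq!(buffer, vec![0u8, b'a']);
    }
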