trying to add schema

This commit is contained in:
Paul Masurel
2016-02-14 15:31:57 +09:00
parent 311aa5211f
commit bc0ea4cbcb
5 changed files with 147 additions and 38 deletions

View File

@@ -3,6 +3,7 @@ use std::path::{PathBuf, Path};
use std::collections::HashMap;
use std::collections::hash_map::Entry;
use std::fs::File;
use core::schema::Schema;
use std::io::Write;
use std::io::BufWriter;
use std::io;
@@ -84,6 +85,16 @@ pub struct Directory {
impl Directory {
/// Returns a clone of the schema held by the inner directory.
///
/// Panics if the inner lock is poisoned (the `unwrap` on `get_read`).
pub fn schema(&self,) -> Schema {
    let inner = self.get_read().unwrap();
    inner.schema.clone()
}
/// Stores a copy of `schema` in the inner directory.
///
/// Panics if the write lock cannot be acquired (the `unwrap` on `get_write`).
pub fn set_schema(&mut self, schema: &Schema) {
    let mut inner = self.get_write().unwrap();
    inner.set_schema(schema);
}
fn get_write(&mut self) -> Result<RwLockWriteGuard<InnerDirectory>> {
match self.inner_directory.write() {
Ok(dir) =>
@@ -173,6 +184,7 @@ struct InnerDirectory {
index_path: PathBuf,
mmap_cache: RefCell<HashMap<PathBuf, MmapReadOnly>>,
metas: DirectoryMeta,
schema: Schema,
_temp_directory: Option<TempDir>,
}
@@ -196,11 +208,16 @@ impl InnerDirectory {
self.save_metas()
}
/// Replaces this directory's schema with a copy of `schema`.
///
/// Uses `clone_from` rather than `schema.clone()` assignment so the
/// allocations already held by `self.schema` can be reused instead of
/// being dropped and reallocated.
pub fn set_schema(&mut self, schema: &Schema) {
    self.schema.clone_from(schema);
}
pub fn open(filepath: &Path) -> Result<InnerDirectory> {
let mut directory = InnerDirectory {
index_path: PathBuf::from(filepath),
mmap_cache: RefCell::new(HashMap::new()),
metas: DirectoryMeta::new(),
schema: Schema::new(), // TODO schema
_temp_directory: None,
};
try!(directory.load_metas()); //< does the directory already exist?
@@ -223,6 +240,7 @@ impl InnerDirectory {
index_path: PathBuf::from(tempdir_path),
mmap_cache: RefCell::new(HashMap::new()),
metas: DirectoryMeta::new(),
schema: Schema::new(),
_temp_directory: Some(tempdir)
};
//< does the directory already exist?

View File

@@ -2,7 +2,3 @@ use std::io::{BufWriter, Write};
use std::io;
pub type DocId = u32;
pub type FieldId = u8;
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
pub struct Field(pub FieldId);

View File

@@ -2,10 +2,54 @@ use core::global::*;
use std::fmt::Write;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use std::string::FromUtf8Error;
use std::collections::HashMap;
use std::str;
use std::iter;
use std::slice;
use std::fmt;
/// Per-field configuration flags.
///
/// Both flags start out `false`; `Default` is derived so it agrees with
/// `FieldOptions::new()`. Use the builder-style setters
/// (`set_stored`, `set_tokenized_indexed`) to enable them.
#[derive(Clone,Debug,PartialEq,Eq,Default)]
pub struct FieldOptions {
    // untokenized_indexed: bool, // NOTE(review): planned flag, not implemented yet
    // When true, the field's text is tokenized and indexed.
    tokenized_indexed: bool,
    // When true, the field's value is flagged as stored.
    stored: bool,
}
/// Numeric identifier of a field; the `u8` doubles as an index into
/// the schema's `field_options` vector.
///
/// `Copy` is derived: the type is a single byte, and callers clone it
/// pervasively — copying is cheaper and backward compatible.
#[derive(Clone,Copy,Debug,PartialEq,PartialOrd,Eq,Hash)]
pub struct Field(u8);
impl FieldOptions {
    /// Creates options with both flags disabled.
    pub fn new() -> FieldOptions {
        FieldOptions {
            tokenized_indexed: false,
            stored: false,
        }
    }

    /// True when the field's text should be tokenized and indexed.
    pub fn is_tokenized_indexed(&self,) -> bool {
        self.tokenized_indexed
    }

    /// True when the field's value is flagged as stored.
    pub fn is_stored(&self,) -> bool {
        self.stored
    }

    /// Builder-style setter: consumes `self` and enables the stored flag.
    pub fn set_stored(self,) -> FieldOptions {
        FieldOptions { stored: true, ..self }
    }

    /// Builder-style setter: consumes `self` and enables tokenized indexing.
    pub fn set_tokenized_indexed(self,) -> FieldOptions {
        FieldOptions { tokenized_indexed: true, ..self }
    }
}
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
pub struct FieldValue {
pub field: Field,
@@ -18,6 +62,51 @@ pub struct Term {
data: Vec<u8>,
}
/// Maps field names to `Field` ids and keeps the per-field options.
///
/// Invariant (maintained by `add_field`): a field's `u8` id is its
/// index into `field_options`, and the two collections stay in sync.
/// `Default` is derived so it agrees with `Schema::new()`.
#[derive(Clone,Debug,Default)]
pub struct Schema {
    fields: HashMap<String, Field>,
    field_options: Vec<FieldOptions>,
}
impl Schema {
    /// Creates an empty schema with no fields registered.
    pub fn new() -> Schema {
        Schema {
            fields: HashMap::new(),
            field_options: Vec::new(),
        }
    }

    /// Looks a field up by name; returns its id together with a copy of
    /// its options, or `None` when the name is unknown.
    pub fn find_field_name(&self, field_name: &str) -> Option<(Field, FieldOptions)> {
        match self.fields.get(field_name) {
            Some(&Field(field_id)) => {
                let options = self.field_options[field_id as usize].clone();
                Some((Field(field_id), options))
            }
            None => None,
        }
    }

    /// Returns a copy of the options registered for `field`.
    ///
    /// Panics if `field` was not produced by this schema (its id would
    /// index out of bounds).
    pub fn get_field(&self, field: Field) -> FieldOptions {
        let Field(field_id) = field;
        self.field_options[field_id as usize].clone()
    }

    /// Registers `field_name` with `field_options` and returns its id.
    ///
    /// A new name receives the next id and its options are appended;
    /// re-adding an existing name keeps its id and overwrites its options.
    ///
    /// NOTE(review): ids are `u8`, so `fields.len() as u8` silently
    /// truncates once more than 255 fields exist — TODO confirm intended limit.
    pub fn add_field(&mut self, field_name: &str, field_options: &FieldOptions) -> Field {
        let field = match self.fields.get(field_name).cloned() {
            Some(existing) => existing,
            None => {
                let fresh = Field(self.fields.len() as u8);
                self.fields.insert(String::from(field_name), fresh.clone());
                fresh
            }
        };
        let Field(field_id) = field;
        let slot = field_id as usize;
        if slot == self.field_options.len() {
            // Freshly assigned id: append its options.
            self.field_options.push(field_options.clone());
        } else {
            // Existing id: replace its options in place.
            self.field_options[slot] = field_options.clone();
        }
        field
    }
}
impl Term {
// TODO avoid all these copies in Term.
@@ -31,9 +120,9 @@ impl Term {
str::from_utf8(&self.data[1..]).unwrap()
}
pub fn from_field_text(field: Field, text: &str) -> Term {
pub fn from_field_text(field: &Field, text: &str) -> Term {
let mut buffer = Vec::with_capacity(1 + text.len());
let Field(field_idx) = field;
let Field(field_idx) = *field;
buffer.clear();
buffer.push(field_idx);
buffer.extend(text.as_bytes());
@@ -73,9 +162,9 @@ impl Document {
}
}
pub fn set(&mut self, field: Field, text: &str) {
pub fn set(&mut self, field: &Field, text: &str) {
self.add(FieldValue {
field: field,
field: field.clone(),
text: String::from(text)
});
}
@@ -84,19 +173,8 @@ impl Document {
self.fields.push(field_value);
}
pub fn indexed_field(&self,) -> slice::Iter<FieldValue>{
pub fn fields<'a>(&'a self,) -> slice::Iter<'a, FieldValue> {
self.fields.iter()
}
}
//
// impl IntoIterator for Document {
// type Item = FieldValue;
// type IntoIter = ::std::vec::IntoIter<FieldValue>;
//
// fn into_iter(self) -> Self::IntoIter {
// self.fields.into_iter()
// }
//
// }

View File

@@ -9,6 +9,7 @@ use core::analyzer::tokenize;
use std::collections::{HashMap, BTreeMap};
use std::collections::{hash_map, btree_map};
use std::io::{BufWriter, Write};
use std::sync::Arc;
use std::mem;
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
use std::iter::Peekable;
@@ -40,19 +41,22 @@ impl PostingsWriter {
pub struct IndexWriter {
segment_writer: SegmentWriter,
directory: Directory,
schema: Schema,
}
impl IndexWriter {
pub fn open(directory: &Directory) -> IndexWriter {
let schema = directory.schema();
IndexWriter {
segment_writer: SegmentWriter::new(),
directory: directory.clone(),
schema: schema,
}
}
pub fn add(&mut self, doc: Document) {
self.segment_writer.add(doc);
self.segment_writer.add(doc, &self.schema);
}
// TODO remove that some day
@@ -91,15 +95,17 @@ impl SegmentWriter {
}
}
pub fn add(&mut self, doc: Document) {
pub fn add(&mut self, doc: Document, schema: &Schema) {
let doc_id = self.max_doc;
for field_value in doc.indexed_field() {
let field = &field_value.field;
for field_value in doc.fields() {
let field_options = schema.get_field(field_value.field.clone());
if field_options.is_tokenized_indexed() {
for token in tokenize(&field_value.text) {
let term = Term::from_field_text(field.clone(), token);
let term = Term::from_field_text(&field_value.field, token);
self.suscribe(doc_id, term);
}
}
}
self.max_doc += 1;
}

View File

@@ -57,23 +57,29 @@ fn test_tokenizer() {
#[test]
fn test_indexing() {
let directory = Directory::from_tempdir().unwrap();
let mut schema = Schema::new();
let text_fieldtype = FieldOptions::new().set_tokenized_indexed();
let text_field = schema.add_field("text", &text_fieldtype);
let mut directory = Directory::from_tempdir().unwrap();
directory.set_schema(&schema);
{
// writing the segment
let mut index_writer = IndexWriter::open(&directory);
{
let mut doc = Document::new();
doc.set(Field(1), "af b");
doc.set(&text_field, "af b");
index_writer.add(doc);
}
{
let mut doc = Document::new();
doc.set(Field(1), "a b c");
doc.set(&text_field, "a b c");
index_writer.add(doc);
}
{
let mut doc = Document::new();
doc.set(Field(1), "a b c d");
doc.set(&text_field, "a b c d");
index_writer.add(doc);
}
@@ -91,23 +97,28 @@ fn test_indexing() {
#[test]
fn test_searcher() {
let directory = Directory::from_tempdir().unwrap();
let mut schema = Schema::new();
let text_fieldtype = FieldOptions::new().set_tokenized_indexed();
let text_field = schema.add_field("text", &text_fieldtype);
let mut directory = Directory::from_tempdir().unwrap();
directory.set_schema(&schema);
{
// writing the segment
let mut index_writer = IndexWriter::open(&directory);
{
let mut doc = Document::new();
doc.set(Field(1), "af b");
doc.set(&text_field, "af b");
index_writer.add(doc);
}
{
let mut doc = Document::new();
doc.set(Field(1), "a b c");
doc.set(&text_field, "a b c");
index_writer.add(doc);
}
{
let mut doc = Document::new();
doc.set(Field(1), "a b c d");
doc.set(&text_field, "a b c d");
index_writer.add(doc);
}
let commit_result = index_writer.commit();
@@ -115,7 +126,7 @@ fn test_searcher() {
}
{
let searcher = Searcher::for_directory(directory);
let terms = vec!(Term::from_field_text(Field(1), "a"), Term::from_field_text(Field(1), "b"), );
let terms = vec!(Term::from_field_text(&text_field, "a"), Term::from_field_text(&text_field, "b"), );
let mut collector = TestCollector::new();
searcher.search(&terms, &mut collector);
let vals: Vec<DocId> = collector.docs().iter()