trying to add schema
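
This commit threads a first-cut schema through the index: core::schema gains a FieldOptions builder and a Schema that maps field names to Field handles, the Field newtype moves there from core::global, Directory stores the schema and exposes schema()/set_schema(), IndexWriter and SegmentWriter consult it so that only tokenized-indexed fields produce terms, and the tests build their field through Schema::add_field instead of a hard-coded Field(1).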
@@ -3,6 +3,7 @@ use std::path::{PathBuf, Path};
 use std::collections::HashMap;
 use std::collections::hash_map::Entry;
 use std::fs::File;
+use core::schema::Schema;
 use std::io::Write;
 use std::io::BufWriter;
 use std::io;
@@ -84,6 +85,16 @@ pub struct Directory {

 impl Directory {

+    pub fn schema(&self,) -> Schema {
+        self.get_read().unwrap().schema.clone()
+    }
+
+    pub fn set_schema(&mut self, schema: &Schema) {
+        self.get_write()
+            .unwrap()
+            .set_schema(schema);
+    }
+
     fn get_write(&mut self) -> Result<RwLockWriteGuard<InnerDirectory>> {
         match self.inner_directory.write() {
             Ok(dir) =>
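
Directory now exposes the schema held behind its RwLock: schema() takes the read lock and returns a clone, so callers get an independent snapshot rather than a live view, while set_schema() takes the write lock and delegates to InnerDirectory. A minimal usage sketch against this commit's in-crate API (the "body" field name is illustrative):

    // set_schema stores a clone behind the lock; schema() hands back a
    // snapshot, so later set_schema calls are not seen by earlier snapshots.
    let mut directory = Directory::from_tempdir().unwrap();
    let mut schema = Schema::new();
    schema.add_field("body", &FieldOptions::new().set_tokenized_indexed());
    directory.set_schema(&schema);
    let snapshot: Schema = directory.schema();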
@@ -173,6 +184,7 @@ struct InnerDirectory {
     index_path: PathBuf,
     mmap_cache: RefCell<HashMap<PathBuf, MmapReadOnly>>,
     metas: DirectoryMeta,
+    schema: Schema,
     _temp_directory: Option<TempDir>,
 }

@@ -196,11 +208,16 @@ impl InnerDirectory {
         self.save_metas()
     }

+    pub fn set_schema(&mut self, schema: &Schema) {
+        self.schema = schema.clone();
+    }
+
     pub fn open(filepath: &Path) -> Result<InnerDirectory> {
         let mut directory = InnerDirectory {
             index_path: PathBuf::from(filepath),
             mmap_cache: RefCell::new(HashMap::new()),
             metas: DirectoryMeta::new(),
+            schema: Schema::new(), // TODO schema
             _temp_directory: None,
         };
         try!(directory.load_metas()); //< does the directory already exists?
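
InnerDirectory::set_schema only swaps the in-memory clone; unlike the neighboring save_metas() path, nothing in this diff serializes the schema yet, and open() still starts from an empty Schema::new() — which is exactly what the `// TODO schema` marker flags.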
@@ -223,6 +240,7 @@ impl InnerDirectory {
             index_path: PathBuf::from(tempdir_path),
             mmap_cache: RefCell::new(HashMap::new()),
             metas: DirectoryMeta::new(),
+            schema: Schema::new(),
             _temp_directory: Some(tempdir)
         };
         //< does the directory already exists?

@@ -2,7 +2,3 @@ use std::io::{BufWriter, Write};
 use std::io;

 pub type DocId = u32;
-pub type FieldId = u8;
-
-#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
-pub struct Field(pub FieldId);
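
The Field newtype (and the FieldId alias) leave core::global here. The replacement in core::schema below hard-codes the u8 and makes the inner id private, so within the crate a field handle can only be minted by the schema itself.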
@@ -2,10 +2,54 @@ use core::global::*;
 use std::fmt::Write;
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
 use std::string::FromUtf8Error;
+use std::collections::HashMap;
 use std::str;
 use std::iter;
 use std::slice;
 use std::fmt;
+
+
+
+#[derive(Clone,Debug,PartialEq,Eq)]
+pub struct FieldOptions {
+    // untokenized_indexed: bool,
+    tokenized_indexed: bool,
+    stored: bool,
+}
+
+
+#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
+pub struct Field(u8);
+
+impl FieldOptions {
+    pub fn is_tokenized_indexed(&self,) -> bool {
+        self.tokenized_indexed
+    }
+
+    pub fn is_stored(&self,) -> bool {
+        self.stored
+    }
+
+    pub fn set_stored(mut self,) -> FieldOptions {
+        self.stored = true;
+        self
+    }
+
+    pub fn set_tokenized_indexed(mut self,) -> FieldOptions {
+        self.tokenized_indexed = true;
+        self
+    }
+
+    pub fn new() -> FieldOptions {
+        FieldOptions {
+            tokenized_indexed: false,
+            stored: false,
+        }
+    }
+}
+
+
 #[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
 pub struct FieldValue {
     pub field: Field,
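
FieldOptions is a consuming builder: each setter takes `mut self` and returns the updated value, so options chain from new(). A sketch using just this hunk's API:

    // By-value builder: each setter takes `mut self` and returns it,
    // so a chain starting from new() yields the combined options.
    let opts = FieldOptions::new()
        .set_tokenized_indexed()
        .set_stored();
    assert!(opts.is_tokenized_indexed());
    assert!(opts.is_stored());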
@@ -18,6 +62,51 @@ pub struct Term {
     data: Vec<u8>,
 }

+#[derive(Clone,Debug)]
+pub struct Schema {
+    fields: HashMap<String, Field>,
+    field_options: Vec<FieldOptions>,
+}
+
+impl Schema {
+    pub fn new() -> Schema {
+        Schema {
+            fields: HashMap::new(),
+            field_options: Vec::new(),
+        }
+    }
+
+    pub fn find_field_name(&self, field_name: &str) -> Option<(Field, FieldOptions)> {
+        self.fields
+            .get(field_name)
+            .map(|&Field(field_id)| {
+                let field_options = self.field_options[field_id as usize].clone();
+                (Field(field_id), field_options)
+            })
+    }
+
+    pub fn get_field(&self, field: Field) -> FieldOptions {
+        let Field(field_id) = field;
+        self.field_options[field_id as usize].clone()
+    }
+
+    pub fn add_field(&mut self, field_name: &str, field_options: &FieldOptions) -> Field {
+        let next_field = Field(self.fields.len() as u8);
+        let field = self.fields
+            .entry(String::from(field_name))
+            .or_insert(next_field.clone())
+            .clone();
+        if field == next_field {
+            self.field_options.push(field_options.clone());
+        }
+        else {
+            let Field(field_id) = field;
+            self.field_options[field_id as usize] = field_options.clone();
+        }
+        field
+    }
+}
+
 impl Term {

     // TODO avoid all these copies in Term.
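
Schema keeps a name-to-Field map next to a Vec<FieldOptions> indexed by the u8 field id, which caps a schema at 256 fields. add_field is get-or-overwrite: a fresh name is assigned the next id and its options are appended; re-adding an existing name keeps the old id but replaces the stored options. Note that get_field indexes straight into the Vec, so a handle minted by a different schema can panic. A sketch of both add_field paths (field names are illustrative):

    let mut schema = Schema::new();
    let opts = FieldOptions::new().set_tokenized_indexed();
    let title = schema.add_field("title", &opts);                 // fresh name: gets id 0
    let again = schema.add_field("title", &FieldOptions::new()); // same id, options replaced
    assert_eq!(title, again);
    assert!(!schema.get_field(again).is_tokenized_indexed());
    assert!(schema.find_field_name("missing").is_none());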
@@ -31,9 +120,9 @@ impl Term {
         str::from_utf8(&self.data[1..]).unwrap()
     }

-    pub fn from_field_text(field: Field, text: &str) -> Term {
+    pub fn from_field_text(field: &Field, text: &str) -> Term {
         let mut buffer = Vec::with_capacity(1 + text.len());
-        let Field(field_idx) = field;
+        let Field(field_idx) = *field;
         buffer.clear();
         buffer.push(field_idx);
         buffer.extend(text.as_bytes());
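
A term is laid out as one field-id byte followed by the raw UTF-8 bytes of the text (the buffer.clear() is redundant on a freshly allocated Vec). from_field_text now borrows the field; since Field is Clone but not Copy, this keeps the caller's handle usable, which the updated tests below rely on:

    // Field handles come from the schema; from_field_text only borrows them.
    let mut schema = Schema::new();
    let field = schema.add_field("text", &FieldOptions::new().set_tokenized_indexed());
    let term = Term::from_field_text(&field, "hello");
    // `field` is still usable here for further terms.
    let term2 = Term::from_field_text(&field, "world");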
@@ -73,9 +162,9 @@ impl Document {
         }
     }

-    pub fn set(&mut self, field: Field, text: &str) {
+    pub fn set(&mut self, field: &Field, text: &str) {
         self.add(FieldValue {
-            field: field,
+            field: field.clone(),
             text: String::from(text)
         });
     }
@@ -84,19 +173,8 @@ impl Document {
         self.fields.push(field_value);
     }

-    pub fn indexed_field(&self,) -> slice::Iter<FieldValue>{
+    pub fn fields<'a>(&'a self,) -> slice::Iter<'a, FieldValue> {
         self.fields.iter()
     }

-
 }
-//
-// impl IntoIterator for Document {
-//     type Item = FieldValue;
-//     type IntoIter = ::std::vec::IntoIter<FieldValue>;
-//
-//     fn into_iter(self) -> Self::IntoIter {
-//         self.fields.into_iter()
-//     }
-//
-// }
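
indexed_field() was a misnomer — it iterated every field value regardless of options — so it becomes fields() with an explicit lifetime; Document::set now borrows the field handle and clones it into the FieldValue, matching the Term change above; and the dead, commented-out IntoIterator impl is dropped. The decision of what actually gets indexed moves into SegmentWriter below.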
@@ -9,6 +9,7 @@ use core::analyzer::tokenize;
 use std::collections::{HashMap, BTreeMap};
 use std::collections::{hash_map, btree_map};
 use std::io::{BufWriter, Write};
+use std::sync::Arc;
 use std::mem;
 use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
 use std::iter::Peekable;
@@ -40,19 +41,22 @@ impl PostingsWriter {
 pub struct IndexWriter {
     segment_writer: SegmentWriter,
     directory: Directory,
+    schema: Schema,
 }

 impl IndexWriter {

     pub fn open(directory: &Directory) -> IndexWriter {
+        let schema = directory.schema();
         IndexWriter {
             segment_writer: SegmentWriter::new(),
             directory: directory.clone(),
+            schema: schema,
         }
     }

     pub fn add(&mut self, doc: Document) {
-        self.segment_writer.add(doc);
+        self.segment_writer.add(doc, &self.schema);
     }

     // TODO remove that some day
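
IndexWriter::open snapshots the directory's schema once and stores it; set_schema calls made after open() are not seen by an already-open writer. That is exactly the ordering the updated tests below follow: build the Schema, set it on the Directory, then open the writer. add() simply forwards the snapshot to SegmentWriter.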
@@ -91,15 +95,17 @@ impl SegmentWriter {
         }
     }

-    pub fn add(&mut self, doc: Document) {
+    pub fn add(&mut self, doc: Document, schema: &Schema) {
         let doc_id = self.max_doc;
-        for field_value in doc.indexed_field() {
-            let field = &field_value.field;
+        for field_value in doc.fields() {
+            let field_options = schema.get_field(field_value.field.clone());
+            if field_options.is_tokenized_indexed() {
             for token in tokenize(&field_value.text) {
-                let term = Term::from_field_text(field.clone(), token);
+                let term = Term::from_field_text(&field_value.field, token);
                 self.suscribe(doc_id, term);
             }
+            }
         }
         self.max_doc += 1;
     }

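
Indexing is now schema-driven: each field value is looked up with get_field, and only fields whose options set tokenized_indexed produce terms; stored-only fields contribute nothing on this path (is_stored is not consulted here yet). get_field clones the options for every field value of every document — cheap for two bools, but a per-document cost. (`suscribe` is the pre-existing method name in this file, unchanged by this commit.)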
@@ -57,23 +57,29 @@ fn test_tokenizer() {

 #[test]
 fn test_indexing() {
-    let directory = Directory::from_tempdir().unwrap();
+    let mut schema = Schema::new();
+    let text_fieldtype = FieldOptions::new().set_tokenized_indexed();
+    let text_field = schema.add_field("text", &text_fieldtype);
+
+    let mut directory = Directory::from_tempdir().unwrap();
+    directory.set_schema(&schema);
+
     {
         // writing the segment
         let mut index_writer = IndexWriter::open(&directory);
         {
             let mut doc = Document::new();
-            doc.set(Field(1), "af b");
+            doc.set(&text_field, "af b");
             index_writer.add(doc);
         }
         {
             let mut doc = Document::new();
-            doc.set(Field(1), "a b c");
+            doc.set(&text_field, "a b c");
             index_writer.add(doc);
         }
         {
             let mut doc = Document::new();
-            doc.set(Field(1), "a b c d");
+            doc.set(&text_field, "a b c d");
             index_writer.add(doc);
         }

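
Both tests now follow the schema-first ordering — build the Schema, set it on the Directory, then open the IndexWriter — and address the field through the handle returned by add_field. The old hard-coded Field(1) would no longer compile anyway, since Field's constructor is now private to core::schema.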
@@ -91,23 +97,28 @@ fn test_indexing() {

 #[test]
 fn test_searcher() {
-    let directory = Directory::from_tempdir().unwrap();
+    let mut schema = Schema::new();
+    let text_fieldtype = FieldOptions::new().set_tokenized_indexed();
+    let text_field = schema.add_field("text", &text_fieldtype);
+    let mut directory = Directory::from_tempdir().unwrap();
+    directory.set_schema(&schema);
+
     {
         // writing the segment
         let mut index_writer = IndexWriter::open(&directory);
         {
             let mut doc = Document::new();
-            doc.set(Field(1), "af b");
+            doc.set(&text_field, "af b");
             index_writer.add(doc);
         }
         {
             let mut doc = Document::new();
-            doc.set(Field(1), "a b c");
+            doc.set(&text_field, "a b c");
             index_writer.add(doc);
         }
         {
             let mut doc = Document::new();
-            doc.set(Field(1), "a b c d");
+            doc.set(&text_field, "a b c d");
             index_writer.add(doc);
         }
         let commit_result = index_writer.commit();
@@ -115,7 +126,7 @@ fn test_searcher() {
     }
     {
         let searcher = Searcher::for_directory(directory);
-        let terms = vec!(Term::from_field_text(Field(1), "a"), Term::from_field_text(Field(1), "b"), );
+        let terms = vec!(Term::from_field_text(&text_field, "a"), Term::from_field_text(&text_field, "b"), );
         let mut collector = TestCollector::new();
        searcher.search(&terms, &mut collector);
         let vals: Vec<DocId> = collector.docs().iter()
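
Taken together, the new flow is schema-first end to end. A condensed sketch stitched from this commit's pieces (TestCollector is this repo's test helper; field name and text are illustrative):

    // 1. declare fields, 2. attach the schema to the directory,
    // 3. open a writer (which snapshots the schema), 4. search by handle.
    let mut schema = Schema::new();
    let text_field = schema.add_field("text", &FieldOptions::new().set_tokenized_indexed());

    let mut directory = Directory::from_tempdir().unwrap();
    directory.set_schema(&schema);              // must precede IndexWriter::open

    let mut index_writer = IndexWriter::open(&directory);
    let mut doc = Document::new();
    doc.set(&text_field, "a b c");
    index_writer.add(doc);                      // indexes per the captured schema
    let commit_result = index_writer.commit();

    let searcher = Searcher::for_directory(directory);
    let terms = vec!(Term::from_field_text(&text_field, "a"));
    let mut collector = TestCollector::new();
    searcher.search(&terms, &mut collector);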