mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-20 02:00:40 +00:00
Merge remote-tracking branch 'origin'
This commit is contained in:
@@ -3,6 +3,7 @@ use std::path::{PathBuf, Path};
|
||||
use std::collections::HashMap;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::fs::File;
|
||||
use core::schema::Schema;
|
||||
use std::io::Write;
|
||||
use std::io::BufWriter;
|
||||
use std::io;
|
||||
@@ -84,6 +85,16 @@ pub struct Directory {
|
||||
|
||||
impl Directory {
|
||||
|
||||
/// Returns a clone of the `schema` field reached through `get_read()`.
///
/// Panics if `get_read()` does not yield a usable value, because its
/// result is unwrapped.
pub fn schema(&self,) -> Schema {
|
||||
self.get_read().unwrap().schema.clone()
|
||||
}
|
||||
|
||||
/// Forwards `schema` to `set_schema` on the value obtained from `get_write()`.
///
/// Panics if `get_write()` returns an error, because its result is unwrapped.
pub fn set_schema(&mut self, schema: &Schema) {
|
||||
self.get_write()
|
||||
.unwrap()
|
||||
.set_schema(schema);
|
||||
}
|
||||
|
||||
fn get_write(&mut self) -> Result<RwLockWriteGuard<InnerDirectory>> {
|
||||
match self.inner_directory.write() {
|
||||
Ok(dir) =>
|
||||
@@ -173,11 +184,11 @@ struct InnerDirectory {
|
||||
index_path: PathBuf,
|
||||
mmap_cache: RefCell<HashMap<PathBuf, MmapReadOnly>>,
|
||||
metas: DirectoryMeta,
|
||||
schema: Schema,
|
||||
_temp_directory: Option<TempDir>,
|
||||
}
|
||||
|
||||
|
||||
|
||||
fn create_tempdir() -> Result<TempDir> {
|
||||
let tempdir_res = TempDir::new("index");
|
||||
match tempdir_res {
|
||||
@@ -187,7 +198,6 @@ fn create_tempdir() -> Result<TempDir> {
|
||||
}
|
||||
|
||||
|
||||
|
||||
impl InnerDirectory {
|
||||
|
||||
// TODO find a rusty way to hide that, while keeping
|
||||
@@ -198,11 +208,16 @@ impl InnerDirectory {
|
||||
self.save_metas()
|
||||
}
|
||||
|
||||
/// Replaces the directory's `schema` field with a clone of `schema`.
///
/// This body only updates the in-memory field; it does not call `save_metas()`.
pub fn set_schema(&mut self, schema: &Schema) {
|
||||
self.schema = schema.clone();
|
||||
}
|
||||
|
||||
pub fn open(filepath: &Path) -> Result<InnerDirectory> {
|
||||
let mut directory = InnerDirectory {
|
||||
index_path: PathBuf::from(filepath),
|
||||
mmap_cache: RefCell::new(HashMap::new()),
|
||||
metas: DirectoryMeta::new(),
|
||||
schema: Schema::new(), // TODO schema
|
||||
_temp_directory: None,
|
||||
};
|
||||
try!(directory.load_metas()); //< does the directory already exists?
|
||||
@@ -225,6 +240,7 @@ impl InnerDirectory {
|
||||
index_path: PathBuf::from(tempdir_path),
|
||||
mmap_cache: RefCell::new(HashMap::new()),
|
||||
metas: DirectoryMeta::new(),
|
||||
schema: Schema::new(),
|
||||
_temp_directory: Some(tempdir)
|
||||
};
|
||||
//< does the directory already exists?
|
||||
@@ -288,7 +304,7 @@ impl InnerDirectory {
|
||||
mmap_cache.insert(full_path.clone(), try!(open_mmap(&full_path)) );
|
||||
}
|
||||
let mmap_readonly: &MmapReadOnly = mmap_cache.get(&full_path).unwrap();
|
||||
// // TODO remove if a proper clone is available
|
||||
// TODO remove if a proper clone is available
|
||||
let len = unsafe { mmap_readonly.as_slice().len() };
|
||||
Ok(mmap_readonly.range(0, len))
|
||||
}
|
||||
|
||||
@@ -2,7 +2,3 @@ use std::io::{BufWriter, Write};
|
||||
use std::io;
|
||||
|
||||
pub type DocId = u32;
|
||||
pub type FieldId = u8;
|
||||
|
||||
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
|
||||
pub struct Field(pub FieldId);
|
||||
|
||||
@@ -2,9 +2,54 @@ use core::global::*;
|
||||
use std::fmt::Write;
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use std::string::FromUtf8Error;
|
||||
use std::collections::HashMap;
|
||||
use std::str;
|
||||
use std::iter;
|
||||
use std::slice;
|
||||
use std::fmt;
|
||||
|
||||
|
||||
|
||||
/// Per-field flags describing how a field is handled.
///
/// Both flags default to `false`. Options are built by chaining the
/// consuming setters, e.g. `FieldOptions::new().set_tokenized_indexed()`.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct FieldOptions {
    // untokenized_indexed: bool,
    tokenized_indexed: bool,
    stored: bool,
}

/// Handle for a schema field, wrapping its `u8` id.
#[derive(Clone, Debug, PartialEq, PartialOrd, Eq, Hash)]
pub struct Field(u8);

impl FieldOptions {
    /// Returns the `tokenized_indexed` flag.
    pub fn is_tokenized_indexed(&self) -> bool {
        self.tokenized_indexed
    }

    /// Returns the `stored` flag.
    pub fn is_stored(&self) -> bool {
        self.stored
    }

    /// Consumes the options and returns them with the `stored` flag set.
    ///
    /// The returned value must be used; the receiver is moved.
    pub fn set_stored(mut self) -> FieldOptions {
        self.stored = true;
        self
    }

    /// Consumes the options and returns them with the `tokenized_indexed`
    /// flag set.
    ///
    /// The returned value must be used; the receiver is moved.
    pub fn set_tokenized_indexed(mut self) -> FieldOptions {
        self.tokenized_indexed = true;
        self
    }

    /// Creates options with every flag cleared (same as `Default::default()`).
    pub fn new() -> FieldOptions {
        FieldOptions::default()
    }
}
|
||||
|
||||
|
||||
|
||||
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
|
||||
pub struct FieldValue {
|
||||
pub field: Field,
|
||||
@@ -17,6 +62,51 @@ pub struct Term {
|
||||
data: Vec<u8>,
|
||||
}
|
||||
|
||||
#[derive(Clone,Debug)]
|
||||
pub struct Schema {
|
||||
fields: HashMap<String, Field>,
|
||||
field_options: Vec<FieldOptions>,
|
||||
}
|
||||
|
||||
impl Schema {
|
||||
pub fn new() -> Schema {
|
||||
Schema {
|
||||
fields: HashMap::new(),
|
||||
field_options: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn find_field_name(&self, field_name: &str) -> Option<(Field, FieldOptions)> {
|
||||
self.fields
|
||||
.get(field_name)
|
||||
.map(|&Field(field_id)| {
|
||||
let field_options = self.field_options[field_id as usize].clone();
|
||||
(Field(field_id), field_options)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get_field(&self, field: Field) -> FieldOptions {
|
||||
let Field(field_id) = field;
|
||||
self.field_options[field_id as usize].clone()
|
||||
}
|
||||
|
||||
pub fn add_field(&mut self, field_name: &str, field_options: &FieldOptions) -> Field {
|
||||
let next_field = Field(self.fields.len() as u8);
|
||||
let field = self.fields
|
||||
.entry(String::from(field_name))
|
||||
.or_insert(next_field.clone())
|
||||
.clone();
|
||||
if field == next_field {
|
||||
self.field_options.push(field_options.clone());
|
||||
}
|
||||
else {
|
||||
let Field(field_id) = field;
|
||||
self.field_options[field_id as usize] = field_options.clone();
|
||||
}
|
||||
field
|
||||
}
|
||||
}
|
||||
|
||||
impl Term {
|
||||
|
||||
// TODO avoid all these copies in Term.
|
||||
@@ -30,9 +120,9 @@ impl Term {
|
||||
str::from_utf8(&self.data[1..]).unwrap()
|
||||
}
|
||||
|
||||
pub fn from_field_text(field: Field, text: &str) -> Term {
|
||||
pub fn from_field_text(field: &Field, text: &str) -> Term {
|
||||
let mut buffer = Vec::with_capacity(1 + text.len());
|
||||
let Field(field_idx) = field;
|
||||
let Field(field_idx) = *field;
|
||||
buffer.clear();
|
||||
buffer.push(field_idx);
|
||||
buffer.extend(text.as_bytes());
|
||||
@@ -72,9 +162,9 @@ impl Document {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set(&mut self, field: Field, text: &str) {
|
||||
pub fn set(&mut self, field: &Field, text: &str) {
|
||||
self.add(FieldValue {
|
||||
field: field,
|
||||
field: field.clone(),
|
||||
text: String::from(text)
|
||||
});
|
||||
}
|
||||
@@ -83,14 +173,8 @@ impl Document {
|
||||
self.fields.push(field_value);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
impl IntoIterator for Document {
|
||||
type Item = FieldValue;
|
||||
type IntoIter = ::std::vec::IntoIter<FieldValue>;
|
||||
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
self.fields.into_iter()
|
||||
pub fn fields<'a>(&'a self,) -> slice::Iter<'a, FieldValue> {
|
||||
self.fields.iter()
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -65,21 +65,21 @@ impl<T: BinarySerializable> LayerBuilder<T> {
|
||||
}
|
||||
|
||||
|
||||
|
||||
fn display_layer<'a, T: BinarySerializable>(layer: &mut Layer<'a, T>) {
|
||||
for it in layer {
|
||||
println!(" - {:?}", it);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn display_skip_list<T: BinarySerializable>(skiplist: &mut SkipList<T>) {
|
||||
println!("DataLayer");
|
||||
display_layer(&mut skiplist.data_layer);
|
||||
println!("SkipLayer");
|
||||
for mut layer in skiplist.skip_layers.iter_mut() {
|
||||
display_layer(&mut layer);
|
||||
}
|
||||
}
|
||||
//
|
||||
// fn display_layer<'a, T: BinarySerializable>(layer: &mut Layer<'a, T>) {
|
||||
// for it in layer {
|
||||
// println!(" - {:?}", it);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// pub fn display_skip_list<T: BinarySerializable>(skiplist: &mut SkipList<T>) {
|
||||
// println!("DataLayer");
|
||||
// display_layer(&mut skiplist.data_layer);
|
||||
// println!("SkipLayer");
|
||||
// for mut layer in skiplist.skip_layers.iter_mut() {
|
||||
// display_layer(&mut layer);
|
||||
// }
|
||||
// }
|
||||
|
||||
pub struct SkipListBuilder<T: BinarySerializable> {
|
||||
period: usize,
|
||||
@@ -172,7 +172,6 @@ impl<'a, T: BinarySerializable> Iterator for Layer<'a, T> {
|
||||
type Item = (DocId, T);
|
||||
|
||||
fn next(&mut self,)-> Option<(DocId, T)> {
|
||||
println!("eeeeee {}", self.next_id);
|
||||
if self.next_id == u32::max_value() {
|
||||
None
|
||||
}
|
||||
@@ -280,7 +279,6 @@ impl<'a, T: BinarySerializable> SkipList<'a, T> {
|
||||
let num_layers = offsets.len();
|
||||
let start_position = cursor.position() as usize;
|
||||
let layers_data: &[u8] = &data[start_position..data.len()];
|
||||
println!("offsets {:?}", offsets);
|
||||
let data_layer: Layer<'a, T> =
|
||||
if num_layers == 0 { Layer::empty() }
|
||||
else {
|
||||
|
||||
@@ -9,6 +9,7 @@ use core::analyzer::tokenize;
|
||||
use std::collections::{HashMap, BTreeMap};
|
||||
use std::collections::{hash_map, btree_map};
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::sync::Arc;
|
||||
use std::mem;
|
||||
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
|
||||
use std::iter::Peekable;
|
||||
@@ -40,19 +41,22 @@ impl PostingsWriter {
|
||||
pub struct IndexWriter {
|
||||
segment_writer: SegmentWriter,
|
||||
directory: Directory,
|
||||
schema: Schema,
|
||||
}
|
||||
|
||||
impl IndexWriter {
|
||||
|
||||
pub fn open(directory: &Directory) -> IndexWriter {
|
||||
let schema = directory.schema();
|
||||
IndexWriter {
|
||||
segment_writer: SegmentWriter::new(),
|
||||
directory: directory.clone(),
|
||||
schema: schema,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add(&mut self, doc: Document) {
|
||||
self.segment_writer.add(doc);
|
||||
self.segment_writer.add(doc, &self.schema);
|
||||
}
|
||||
|
||||
// TODO remove that some day
|
||||
@@ -91,15 +95,17 @@ impl SegmentWriter {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add(&mut self, doc: Document) {
|
||||
pub fn add(&mut self, doc: Document, schema: &Schema) {
|
||||
let doc_id = self.max_doc;
|
||||
for field_value in doc {
|
||||
let field = field_value.field;
|
||||
for token in tokenize(&field_value.text) {
|
||||
let term = Term::from_field_text(field.clone(), token);
|
||||
self.suscribe(doc_id, term);
|
||||
}
|
||||
}
|
||||
for field_value in doc.fields() {
|
||||
let field_options = schema.get_field(field_value.field.clone());
|
||||
if field_options.is_tokenized_indexed() {
|
||||
for token in tokenize(&field_value.text) {
|
||||
let term = Term::from_field_text(&field_value.field, token);
|
||||
self.suscribe(doc_id, term);
|
||||
}
|
||||
}
|
||||
}
|
||||
self.max_doc += 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -57,23 +57,29 @@ fn test_tokenizer() {
|
||||
|
||||
#[test]
|
||||
fn test_indexing() {
|
||||
let directory = Directory::from_tempdir().unwrap();
|
||||
let mut schema = Schema::new();
|
||||
let text_fieldtype = FieldOptions::new().set_tokenized_indexed();
|
||||
let text_field = schema.add_field("text", &text_fieldtype);
|
||||
|
||||
let mut directory = Directory::from_tempdir().unwrap();
|
||||
directory.set_schema(&schema);
|
||||
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = IndexWriter::open(&directory);
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.set(Field(1), "af b");
|
||||
doc.set(&text_field, "af b");
|
||||
index_writer.add(doc);
|
||||
}
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.set(Field(1), "a b c");
|
||||
doc.set(&text_field, "a b c");
|
||||
index_writer.add(doc);
|
||||
}
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.set(Field(1), "a b c d");
|
||||
doc.set(&text_field, "a b c d");
|
||||
index_writer.add(doc);
|
||||
}
|
||||
|
||||
@@ -91,23 +97,28 @@ fn test_indexing() {
|
||||
|
||||
#[test]
|
||||
fn test_searcher() {
|
||||
let directory = Directory::from_tempdir().unwrap();
|
||||
let mut schema = Schema::new();
|
||||
let text_fieldtype = FieldOptions::new().set_tokenized_indexed();
|
||||
let text_field = schema.add_field("text", &text_fieldtype);
|
||||
let mut directory = Directory::from_tempdir().unwrap();
|
||||
directory.set_schema(&schema);
|
||||
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = IndexWriter::open(&directory);
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.set(Field(1), "af b");
|
||||
doc.set(&text_field, "af b");
|
||||
index_writer.add(doc);
|
||||
}
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.set(Field(1), "a b c");
|
||||
doc.set(&text_field, "a b c");
|
||||
index_writer.add(doc);
|
||||
}
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.set(Field(1), "a b c d");
|
||||
doc.set(&text_field, "a b c d");
|
||||
index_writer.add(doc);
|
||||
}
|
||||
let commit_result = index_writer.commit();
|
||||
@@ -115,7 +126,7 @@ fn test_searcher() {
|
||||
}
|
||||
{
|
||||
let searcher = Searcher::for_directory(directory);
|
||||
let terms = vec!(Term::from_field_text(Field(1), "a"), Term::from_field_text(Field(1), "b"), );
|
||||
let terms = vec!(Term::from_field_text(&text_field, "a"), Term::from_field_text(&text_field, "b"), );
|
||||
let mut collector = TestCollector::new();
|
||||
searcher.search(&terms, &mut collector);
|
||||
let vals: Vec<DocId> = collector.docs().iter()
|
||||
|
||||
Reference in New Issue
Block a user