This commit is contained in:
Paul Masurel
2016-02-26 10:08:58 +09:00
parent 821f11795d
commit 914a79372b
5 changed files with 81 additions and 41 deletions

View File

@@ -97,6 +97,6 @@ impl SimpleCodec {
pub fn write<I: SerializableSegment>(index: &I, segment: &Segment) -> Result<(), IOError> {
let mut serializer = try!(SimpleCodec::serializer(segment));
index.write(&mut serializer)
index.write(serializer)
}
}

View File

@@ -1,27 +1,29 @@
use std::io;
use std::io::Write;
use std::fs::File;
use fst::Map;
use fst::MapBuilder;
use std::rc::Rc;
use core::serialize::BinarySerializable;
use std::marker::PhantomData;
use fst;
use std::ops::Deref;
/// Adapts an `fst::Error` to the `io::Error` type used by this module's
/// `io::Result` signatures. The original error is kept as the payload and
/// the resulting error kind is `ErrorKind::Other`.
fn convert_fst_error(e: fst::Error) -> io::Error {
    let kind = io::ErrorKind::Other;
    io::Error::new(kind, e)
}
struct FstMapBuilder<V: BinarySerializable> {
fst_builder: MapBuilder<File>,
pub struct FstMapBuilder<W: Write, V: BinarySerializable> {
fst_builder: MapBuilder<W>,
data: Vec<u8>,
_phantom_: PhantomData<V>,
}
impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
impl<V: BinarySerializable> FstMapBuilder<V> {
fn new(file: File) -> io::Result<FstMapBuilder<V>> {
let fst_builder = try!(MapBuilder::new(file).map_err(convert_fst_error));
fn new(w: W) -> io::Result<FstMapBuilder<W, V>> {
let fst_builder = try!(MapBuilder::new(w).map_err(convert_fst_error));
Ok(FstMapBuilder {
fst_builder: fst_builder,
data: Vec::new(),
@@ -37,12 +39,46 @@ impl<V: BinarySerializable> FstMapBuilder<V> {
Ok(())
}
fn close(self,) -> io::Result<()> {
fn close(self,) -> io::Result<W> {
let mut file = try!(self.fst_builder
.into_inner()
.map_err(convert_fst_error));
file.write_all(&self.data);
Ok(())
Ok(file)
}
}
/// Read-side counterpart of `FstMapBuilder`: owns a byte buffer `R`
/// (anything that derefs to `[u8]`) and is parameterised by the value type `V`.
pub struct FstMap<R: Deref<Target=[u8]>, V: BinarySerializable> {
// NOTE(review): an `fst::Map` field is stubbed out here; presumably it is
// meant to be added once reading is implemented. Confirm.
//fst::Map,
// Raw bytes handed to `FstMap::new`.
data: R,
// Zero-sized marker tying the otherwise unused type parameter `V` to the struct.
_phantom_: PhantomData<V>,
}
// Constructor and lookup stub for `FstMap`.
impl<R: Deref<Target=[u8]>, V: BinarySerializable> FstMap<R, V> {
/// Wraps `data` in a new `FstMap`. The bytes are stored as-is; nothing is
/// parsed or validated here.
pub fn new(data: R) -> FstMap<R, V> {
FstMap {
data: data,
_phantom_: PhantomData,
}
}
/// Placeholder lookup: ignores `key` and always returns `None`.
///
/// NOTE(review): this takes no `self`, so it cannot consult `data` yet;
/// it looks unfinished. Confirm before relying on it.
pub fn read(key: &[u8]) -> Option<V> {
None
}
}
// Unit tests for the fst-backed map. Gated on `cfg(test)` so that the module
// and its imports are not compiled into regular builds.
#[cfg(test)]
mod tests {
    use super::{FstMapBuilder, FstMap};

    /// Builds a two-entry map into an in-memory buffer and checks that the
    /// builder produced output that `FstMap` accepts.
    #[test]
    fn test_fstmap() {
        let mut fst_map_builder: FstMapBuilder<Vec<u8>, u32> = FstMapBuilder::new(Vec::new()).unwrap();
        // Keys are inserted in lexicographic order.
        fst_map_builder.insert("abc".as_bytes(), &34).unwrap();
        fst_map_builder.insert("abcd".as_bytes(), &343).unwrap();
        let data = fst_map_builder.close().unwrap();
        // `close` hands back the writer; after two inserts it must hold bytes.
        assert!(!data.is_empty());
        // The produced buffer is what the read side is constructed from.
        let _fst_map: FstMap<Vec<u8>, u32> = FstMap::new(data);
    }
}

View File

@@ -1,16 +1,16 @@
use core::schema::*;
use std::fmt;
use std::io::Error as IOError;
use std::io;
pub trait SegmentSerializer<Output> {
fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<(), IOError>;
fn write_docs(&mut self, docs: &[DocId]) -> Result<(), IOError>; // TODO add size
fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<(), io::Error>;
fn write_docs(&mut self, docs: &[DocId]) -> Result<(), io::Error>; // TODO add size
fn store_doc(&mut self, field: &mut Iterator<Item=&FieldValue>);
fn close(self,) -> Result<Output, IOError>;
fn close(self,) -> Result<Output, io::Error>;
}
pub trait SerializableSegment {
fn write<Output, SegSer: SegmentSerializer<Output>>(&self, serializer: &mut SegSer) -> Result<Output, IOError>;
fn write<Output, SegSer: SegmentSerializer<Output>>(&self, serializer: SegSer) -> io::Result<Output>;
}
@@ -29,7 +29,7 @@ impl DebugSegmentSerializer {
pub fn debug_string<S: SerializableSegment>(index: &S) -> String {
let mut serializer = DebugSegmentSerializer::new();
index.write(&mut serializer).unwrap()
index.write(serializer).unwrap()
}
pub fn new() -> DebugSegmentSerializer {
@@ -41,7 +41,7 @@ impl DebugSegmentSerializer {
}
impl SegmentSerializer<String> for DebugSegmentSerializer {
fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<(), IOError> {
fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<(), io::Error> {
self.text.push_str(&format!("{:?}\n", term));
Ok(())
}
@@ -57,14 +57,14 @@ impl SegmentSerializer<String> for DebugSegmentSerializer {
}
}
fn write_docs(&mut self, docs: &[DocId]) -> Result<(), IOError> {
fn write_docs(&mut self, docs: &[DocId]) -> Result<(), io::Error> {
for doc in docs {
self.text.push_str(&format!(" - Doc {:?}\n", doc));
}
Ok(())
}
fn close(self,) -> Result<String, IOError> {
fn close(self,) -> Result<String, io::Error> {
Ok(self.text)
}
}

View File

@@ -1,5 +1,6 @@
use core::schema::*;
use core::codec::*;
use std::io;
use std::rc::Rc;
use core::directory::Directory;
use core::analyzer::SimpleTokenizer;
@@ -35,8 +36,7 @@ pub struct IndexWriter {
schema: Schema,
}
fn new_segment_writer(directory: &Directory, ) -> SegmentWriter {
fn new_segment_writer(directory: &Directory, ) -> SegmentWriter {
let segment = directory.new_segment();
SegmentWriter::for_segment(segment)
}
@@ -174,15 +174,15 @@ impl SegmentWriter {
self.get_postings_writer(term).suscribe(doc);
}
}
//
// impl SerializableSegment for SegmentWriter {
// fn write<Output, SegSer: SegmentSerializer<Output>>(&self, serializer: &mut SegSer) -> Result<Output> {
// for (term, postings_id) in self.term_index.iter() {
// let doc_ids = &self.postings[postings_id.clone()].doc_ids;
// let term_docfreq = doc_ids.len() as u32;
// serializer.new_term(&term, term_docfreq);
// serializer.write_docs(&doc_ids);
// }
// serializer.close()
// }
// }
impl SerializableSegment for SegmentWriter {
fn write<Output, SegSer: SegmentSerializer<Output>>(&self, mut serializer: SegSer) -> io::Result<Output> {
for (term, postings_id) in self.term_index.iter() {
let doc_ids = &self.postings[postings_id.clone()].doc_ids;
let term_docfreq = doc_ids.len() as u32;
serializer.new_term(&term, term_docfreq);
serializer.write_docs(&doc_ids);
}
serializer.close()
}
}

View File

@@ -7,11 +7,11 @@ extern crate tempdir;
use tantivy::core::schema::*;
use tantivy::core::writer::IndexWriter;
use tantivy::core::collector::Collector;
use tantivy::core::searcher::{Searcher, DocAddress};
use tantivy::core::searcher::Searcher;
use tantivy::core::directory::{Directory, generate_segment_name, SegmentId};
use tantivy::core::reader::SegmentReader;
use regex::Regex;
use tantivy::core::serial::DebugSegmentSerializer;
// only makes sense for a single segment
@@ -49,7 +49,7 @@ fn test_indexing() {
let text_fieldtype = FieldOptions::new().set_tokenized_indexed();
let text_field = schema.add_field("text", &text_fieldtype);
let mut directory = Directory::create_from_tempdir(schema).unwrap();
let directory = Directory::create_from_tempdir(schema).unwrap();
{
// writing the segment
@@ -70,16 +70,20 @@ fn test_indexing() {
index_writer.add(doc);
}
//let debug_serializer = DebugSegmentSerializer::new();
//let segment_str_before_writing = DebugSegmentSerializer::debug_string(index_writer.current_segment_writer());
let segment_str_before_writing = DebugSegmentSerializer::debug_string(index_writer.current_segment_writer());
println!("{:?}", segment_str_before_writing);
let commit_result = index_writer.commit();
assert!(commit_result.is_ok());
let segment = commit_result.unwrap();
SegmentReader::open(segment).unwrap();
let segment_reader = SegmentReader::open(segment).unwrap();
// TODO ENABLE TEST
//let segment_str_after_reading = DebugSegmentSerializer::debug_string(&segment_reader);
//assert_eq!(segment_str_before_writing, segment_str_after_reading);
// let segment_str_after_reading = DebugSegmentSerializer::debug_string(&segment_reader);
// assert_eq!(segment_str_before_writing, segment_str_after_reading);
}
}
@@ -88,7 +92,7 @@ fn test_searcher() {
let mut schema = Schema::new();
let text_fieldtype = FieldOptions::new().set_tokenized_indexed();
let text_field = schema.add_field("text", &text_fieldtype);
let mut directory = Directory::create_from_tempdir(schema).unwrap();
let directory = Directory::create_from_tempdir(schema).unwrap();
{
// writing the segment