Use tempdir for the indexing test; the unit test is now working.

This commit is contained in:
Paul Masurel
2016-01-22 13:44:07 +09:00
parent 2d0054f08b
commit 1e61cefc99
7 changed files with 67 additions and 61 deletions

View File

@@ -15,3 +15,4 @@ atomicwrites = "0.0.14"
tempfile = "2.0.0"
rustc-serialize = "0.3.16"
log = "0.3.5"
tempdir = "0.3.4"

View File

@@ -16,11 +16,11 @@ pub struct SimpleCodec;
pub struct SimpleSegmentSerializer {
written_bytes_postings: usize,
postings_write: File,
term_fst_builder: MapBuilder<File>,
term_fst_builder: MapBuilder<File>, // TODO find an alternative to work around the "move"
cur_term_num_docs: DocId,
}
impl SegmentSerializer for SimpleSegmentSerializer {
impl SegmentSerializer<()> for SimpleSegmentSerializer {
fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<()> {
self.term_fst_builder.insert(term.as_slice(), self.written_bytes_postings as u64);
self.cur_term_num_docs = doc_freq;
@@ -48,7 +48,9 @@ impl SegmentSerializer for SimpleSegmentSerializer {
Ok(())
}
fn close(&mut self,) -> Result<()> {
fn close(self,) -> Result<()> {
// TODO handle errors on close
self.term_fst_builder.finish();
Ok(())
}
}
@@ -72,7 +74,7 @@ impl SimpleCodec {
pub fn write<I: SerializableSegment>(index: &I, segment: &Segment) -> Result<()> {
let mut serializer = try!(SimpleCodec::serializer(segment));
index.write(&mut serializer)
let serializer = try!(SimpleCodec::serializer(segment));
index.write(serializer)
}
}

View File

@@ -1,4 +1,5 @@
use std::path::PathBuf;
use std::path::Path;
use std::collections::HashMap;
use std::collections::hash_map::Entry;
use std::fs::File;
@@ -88,7 +89,7 @@ impl Directory {
self.save_metas();
}
pub fn from(filepath: &str) -> Result<Directory> {
pub fn open(filepath: &Path) -> Result<Directory> {
// TODO error management
let mut directory = Directory {
index_path: PathBuf::from(filepath),
@@ -180,6 +181,8 @@ impl Directory {
}
}
/////////////////////////
// Segment

View File

@@ -44,7 +44,7 @@ impl SegmentIndexReader {
}
fn write_postings<R: io::Read, SegSer: SegmentSerializer>(mut cursor: R, num_docs: DocId, serializer: &mut SegSer) -> Result<()> {
fn write_postings<R: io::Read, Output, SegSer: SegmentSerializer<Output>>(mut cursor: R, num_docs: DocId, serializer: &mut SegSer) -> Result<()> {
for i in 0..num_docs {
let doc_id = cursor.read_u32::<LittleEndian>().unwrap();
try!(serializer.add_doc(doc_id));
@@ -54,7 +54,7 @@ fn write_postings<R: io::Read, SegSer: SegmentSerializer>(mut cursor: R, num_doc
impl SerializableSegment for SegmentIndexReader {
fn write<SegSer: SegmentSerializer>(&self, serializer: &mut SegSer) -> Result<()> {
fn write<Output, SegSer: SegmentSerializer<Output>>(&self, mut serializer: SegSer) -> Result<Output> {
let mut term_offsets_it = self.term_offsets.stream();
loop {
match term_offsets_it.next() {
@@ -65,12 +65,12 @@ impl SerializableSegment for SegmentIndexReader {
let mut cursor = Cursor::new(data);
let num_docs = cursor.read_u32::<LittleEndian>().unwrap() as DocId;
try!(serializer.new_term(&term, num_docs));
try!(write_postings(cursor, num_docs, serializer));
try!(write_postings(cursor, num_docs, &mut serializer));
},
None => { break; }
}
}
Ok(())
serializer.close()
}
}

View File

@@ -1,28 +1,48 @@
use core::global::*;
use core::schema::*;
use core::error::{Result, Error};
use std::fmt;
/// Sink that receives the contents of a segment, one term at a time.
///
/// As driven by the `SerializableSegment::write` implementations, `new_term`
/// is called for each term, followed by one `add_doc` call per document of
/// that term, and `close` is called last.
///
/// `Output` is the type of the value handed back by `close`.
pub trait SegmentSerializer<Output> {
/// Starts a new term. Callers pass the number of documents for that term
/// as `doc_freq`.
fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<()>;
/// Records `doc_id`; callers invoke this after `new_term`, once per document.
fn add_doc(&mut self, doc_id: DocId) -> Result<()>;
/// Consumes the serializer and returns its final `Output`.
fn close(self,) -> Result<Output>;
}
/// A segment whose contents can be pushed into a `SegmentSerializer`.
pub trait SerializableSegment {
/// Feeds this segment into `serializer`, which is taken by value, and
/// returns a `Result` carrying the serializer's `Output`.
fn write<Output, SegSer: SegmentSerializer<Output>>(&self, serializer: SegSer) -> Result<Output>;
}
// change the API to remove the lifetime, by
// "pushing" the data to a SegmentSerializer.
#[derive(Debug)]
pub struct DebugSegmentSerialize {
pub struct DebugSegmentSerializer {
text: String,
}
impl DebugSegmentSerialize {
pub fn to_string(&self,) -> &String {
&self.text
/// `Debug` output is the accumulated text, emitted verbatim (no quoting or
/// escaping, and formatter flags such as width are not applied).
impl fmt::Debug for DebugSegmentSerializer {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(&self.text)
    }
}
impl DebugSegmentSerializer {
/// Writes `index` into a freshly created `DebugSegmentSerializer` and
/// returns the value produced by that write.
///
/// # Panics
///
/// Panics if `index.write` returns an error, since the result is unwrapped.
pub fn debug_string<S: SerializableSegment>(index: &S) -> String {
let serializer = DebugSegmentSerializer::new();
index.write(serializer).unwrap()
}
pub fn new() -> DebugSegmentSerialize {
DebugSegmentSerialize {
pub fn new() -> DebugSegmentSerializer {
DebugSegmentSerializer {
text: String::new(),
}
}
}
impl SegmentSerializer for DebugSegmentSerialize {
impl SegmentSerializer<String> for DebugSegmentSerializer {
fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<()> {
self.text.push_str(&format!("{:?}\n", term));
Ok(())
@@ -33,19 +53,16 @@ impl SegmentSerializer for DebugSegmentSerialize {
Ok(())
}
fn close(&mut self,) -> Result<()> {
Ok(())
fn close(self,) -> Result<String> {
Ok(self.text)
}
}
pub trait SegmentSerializer {
fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<()>;
fn add_doc(&mut self, doc_id: DocId) -> Result<()>;
fn close(&mut self,) -> Result<()>;
/// Returns `true` when `left` and `right` produce the same debug dump.
///
/// Each segment is rendered with `DebugSegmentSerializer::debug_string`
/// (left first, then right) and the two strings are compared for equality.
pub fn serialize_eq<L: SerializableSegment, R: SerializableSegment>(left: &L, right: &R) -> bool {
    DebugSegmentSerializer::debug_string(left) == DebugSegmentSerializer::debug_string(right)
}
pub trait SerializableSegment {
fn write<SegSer: SegmentSerializer>(&self, serializer: &mut SegSer) -> Result<()>;
}
// TODO make iteration over Fields somehow sorted

View File

@@ -55,6 +55,11 @@ impl IndexWriter {
self.segment_writer.add(doc);
}
// TODO remove that some day
/// Returns a shared reference to this writer's `segment_writer`.
pub fn current_segment_writer(&self,) -> &SegmentWriter {
&self.segment_writer
}
pub fn commit(&mut self,) -> Result<Segment> {
let segment = self.directory.new_segment();
try!(SimpleCodec::write(&self.segment_writer, &segment).map(|sz| (segment.clone(), sz)));
@@ -119,8 +124,8 @@ impl SegmentWriter {
}
impl SerializableSegment for SegmentWriter {
fn write<SegSer: SegmentSerializer>(&self, serializer: &mut SegSer) -> Result<()> {
for (term, postings_id) in self.term_index.iter() {
fn write<Output, SegSer: SegmentSerializer<Output>>(&self, mut serializer: SegSer) -> Result<Output> {
for (term, postings_id) in self.term_index.iter() {
let doc_ids = &self.postings[postings_id.clone()].doc_ids;
let term_docfreq = doc_ids.len() as u32;
serializer.new_term(&term, term_docfreq);
@@ -128,6 +133,6 @@ impl SerializableSegment for SegmentWriter {
serializer.add_doc(doc_id.clone());
}
}
Ok(())
serializer.close()
}
}

View File

@@ -1,7 +1,6 @@
extern crate tantivy;
extern crate itertools;
extern crate byteorder;
extern crate regex;
extern crate tempdir;
use tantivy::core::postings::{VecPostings, intersection};
use tantivy::core::postings::Postings;
@@ -36,7 +35,8 @@ fn test_tokenizer() {
#[test]
fn test_indexing() {
let directory = Directory::from("/Users/pmasurel/temp/idx").unwrap();
let tmp_dir = tempdir::TempDir::new("test_indexing").unwrap();
let directory = Directory::open(tmp_dir.path()).unwrap();
{
// writing the segment
let mut index_writer = IndexWriter::open(&directory);
@@ -55,37 +55,15 @@ fn test_indexing() {
doc.set(Field(1), "a b c d");
index_writer.add(doc);
}
let debug_serializer = DebugSegmentSerialize::new();
// let segment_writer = index_writer.current_segment_writer();
let mut debug_serializer = DebugSegmentSerializer::new();
let segment_str_before_writing = DebugSegmentSerializer::debug_string(index_writer.current_segment_writer());
let commit_result = index_writer.commit();
println!("{:?}", commit_result);
assert!(commit_result.is_ok());
// reading the segment
println!("------");
// {
// let segment = commit_result.unwrap();
// let index_reader = SegmentIndexReader::open(segment).unwrap();
// let mut term_cursor = index_reader.term_cursor();
// loop {
// match term_cursor.next() {
// Some((term, mut doc_cursor)) => {
// println!("{:?}", term);
// for doc in doc_cursor {
// println!(" Doc {}", doc);
// }
// },
// None => {
// break;
// },
// }
// }
// }
assert!(false);
}
{
// TODO add index opening stuff
// let index_reader = IndexReader::open(&directory);
let segment = commit_result.unwrap();
let index_reader = SegmentIndexReader::open(segment).unwrap();
let segment_str_after_reading = DebugSegmentSerializer::debug_string(&index_reader);
assert_eq!(segment_str_before_writing, segment_str_after_reading);
}
}