From 1e61cefc99bb15fe2cc03c2523bc3dbf639b7ef9 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 22 Jan 2016 13:44:07 +0900 Subject: [PATCH] using tempdir. unit test working. --- Cargo.toml | 1 + src/core/codec.rs | 12 +++++----- src/core/directory.rs | 5 ++++- src/core/reader.rs | 8 +++---- src/core/serial.rs | 51 ++++++++++++++++++++++++++++--------------- src/core/writer.rs | 11 +++++++--- tests/core.rs | 40 ++++++++------------------------- 7 files changed, 67 insertions(+), 61 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 17a2bd304..e3c74fcb2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,3 +15,4 @@ atomicwrites = "0.0.14" tempfile = "2.0.0" rustc-serialize = "0.3.16" log = "0.3.5" +tempdir = "0.3.4" diff --git a/src/core/codec.rs b/src/core/codec.rs index 174c8b22c..65db2fec2 100644 --- a/src/core/codec.rs +++ b/src/core/codec.rs @@ -16,11 +16,11 @@ pub struct SimpleCodec; pub struct SimpleSegmentSerializer { written_bytes_postings: usize, postings_write: File, - term_fst_builder: MapBuilder, + term_fst_builder: MapBuilder, // TODO find an alternative to work around the "move" cur_term_num_docs: DocId, } -impl SegmentSerializer for SimpleSegmentSerializer { +impl SegmentSerializer<()> for SimpleSegmentSerializer { fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<()> { self.term_fst_builder.insert(term.as_slice(), self.written_bytes_postings as u64); self.cur_term_num_docs = doc_freq; @@ -48,7 +48,9 @@ impl SegmentSerializer for SimpleSegmentSerializer { Ok(()) } - fn close(&mut self,) -> Result<()> { + fn close(self,) -> Result<()> { + // TODO handle errors on close + self.term_fst_builder.finish(); Ok(()) } } @@ -72,7 +74,7 @@ impl SimpleCodec { pub fn write(index: &I, segment: &Segment) -> Result<()> { - let mut serializer = try!(SimpleCodec::serializer(segment)); - index.write(&mut serializer) + let serializer = try!(SimpleCodec::serializer(segment)); + index.write(serializer) } } diff --git a/src/core/directory.rs b/src/core/directory.rs index 151f5aeb1..039cd71d9 100644 --- a/src/core/directory.rs +++ b/src/core/directory.rs @@ -1,4 +1,5 @@ use std::path::PathBuf; +use std::path::Path; use std::collections::HashMap; use std::collections::hash_map::Entry; use std::fs::File; @@ -88,7 +89,7 @@ impl Directory { self.save_metas(); } - pub fn from(filepath: &str) -> Result { + pub fn open(filepath: &Path) -> Result { // TODO error management let mut directory = Directory { index_path: PathBuf::from(filepath), @@ -180,6 +181,8 @@ impl Directory { } } + + ///////////////////////// // Segment diff --git a/src/core/reader.rs b/src/core/reader.rs index 0723a75a1..ade062716 100644 --- a/src/core/reader.rs +++ b/src/core/reader.rs @@ -44,7 +44,7 @@ impl SegmentIndexReader { } -fn write_postings(mut cursor: R, num_docs: DocId, serializer: &mut SegSer) -> Result<()> { +fn write_postings>(mut cursor: R, num_docs: DocId, serializer: &mut SegSer) -> Result<()> { for i in 0..num_docs { let doc_id = cursor.read_u32::().unwrap(); try!(serializer.add_doc(doc_id)); @@ -54,7 +54,7 @@ fn write_postings(mut cursor: R, num_doc impl SerializableSegment for SegmentIndexReader { - fn write(&self, serializer: &mut SegSer) -> Result<()> { + fn write>(&self, mut serializer: SegSer) -> Result { let mut term_offsets_it = self.term_offsets.stream(); loop { match term_offsets_it.next() { @@ -65,12 +65,12 @@ impl SerializableSegment for SegmentIndexReader { let mut cursor = Cursor::new(data); let num_docs = cursor.read_u32::().unwrap() as DocId; try!(serializer.new_term(&term, num_docs)); - try!(write_postings(cursor, num_docs, serializer)); + try!(write_postings(cursor, num_docs, &mut serializer)); }, None => { break; } } } - Ok(()) + serializer.close() } } diff --git a/src/core/serial.rs b/src/core/serial.rs index 0f19427b2..a6349156e 100644 --- a/src/core/serial.rs +++ b/src/core/serial.rs @@ -1,28 +1,48 @@ use core::global::*; use core::schema::*; use core::error::{Result, Error}; +use std::fmt; + + +pub trait SegmentSerializer { + fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<()>; + fn add_doc(&mut self, doc_id: DocId) -> Result<()>; + fn close(self,) -> Result; +} + +pub trait SerializableSegment { + fn write>(&self, serializer: SegSer) -> Result; +} + // change the API to remove the lifetime, by // "pushing" the data to a SegmentSerializer. -#[derive(Debug)] -pub struct DebugSegmentSerialize { +pub struct DebugSegmentSerializer { text: String, } -impl DebugSegmentSerialize { - pub fn to_string(&self,) -> &String { - &self.text +impl fmt::Debug for DebugSegmentSerializer { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.text) + } +} + +impl DebugSegmentSerializer { + + pub fn debug_string(index: &S) -> String { + let serializer = DebugSegmentSerializer::new(); + index.write(serializer).unwrap() } - pub fn new() -> DebugSegmentSerialize { - DebugSegmentSerialize { + pub fn new() -> DebugSegmentSerializer { + DebugSegmentSerializer { text: String::new(), } } } -impl SegmentSerializer for DebugSegmentSerialize { +impl SegmentSerializer for DebugSegmentSerializer { fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<()> { self.text.push_str(&format!("{:?}\n", term)); Ok(()) @@ -33,19 +53,16 @@ impl SegmentSerializer for DebugSegmentSerialize { Ok(()) } - fn close(&mut self,) -> Result<()> { - Ok(()) + fn close(self,) -> Result { + Ok(self.text) } } -pub trait SegmentSerializer { - fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<()>; - fn add_doc(&mut self, doc_id: DocId) -> Result<()>; - fn close(&mut self,) -> Result<()>; +pub fn serialize_eq(left: &L, right: &R) -> bool{ + let str_left = DebugSegmentSerializer::debug_string(left); + let str_right = DebugSegmentSerializer::debug_string(right); + str_left == str_right } -pub trait SerializableSegment { - fn write(&self, serializer: &mut SegSer) -> Result<()>; -} // TODO make iteration over Fields somehow sorted diff --git a/src/core/writer.rs b/src/core/writer.rs index 7716fadf3..739d581f8 100644 --- a/src/core/writer.rs +++ b/src/core/writer.rs @@ -55,6 +55,11 @@ impl IndexWriter { self.segment_writer.add(doc); } + // TODO remove that some day + pub fn current_segment_writer(&self,) -> &SegmentWriter { + &self.segment_writer + } + pub fn commit(&mut self,) -> Result { let segment = self.directory.new_segment(); try!(SimpleCodec::write(&self.segment_writer, &segment).map(|sz| (segment.clone(), sz))); @@ -119,8 +124,8 @@ impl SegmentWriter { } impl SerializableSegment for SegmentWriter { - fn write(&self, serializer: &mut SegSer) -> Result<()> { - for (term, postings_id) in self.term_index.iter() { + fn write>(&self, mut serializer: SegSer) -> Result { + for (term, postings_id) in self.term_index.iter() { let doc_ids = &self.postings[postings_id.clone()].doc_ids; let term_docfreq = doc_ids.len() as u32; serializer.new_term(&term, term_docfreq); @@ -128,6 +133,6 @@ impl SerializableSegment for SegmentWriter { serializer.add_doc(doc_id.clone()); } } - Ok(()) + serializer.close() } } diff --git a/tests/core.rs b/tests/core.rs index 30de96bc3..cb85c366e 100644 --- a/tests/core.rs +++ b/tests/core.rs @@ -1,7 +1,6 @@ extern crate tantivy; -extern crate itertools; -extern crate byteorder; extern crate regex; +extern crate tempdir; use tantivy::core::postings::{VecPostings, intersection}; use tantivy::core::postings::Postings; @@ -36,7 +35,8 @@ fn test_tokenizer() { #[test] fn test_indexing() { - let directory = Directory::from("/Users/pmasurel/temp/idx").unwrap(); + let tmp_dir = tempdir::TempDir::new("test_indexing").unwrap(); + let directory = Directory::open(tmp_dir.path()).unwrap(); { // writing the segment let mut index_writer = IndexWriter::open(&directory); @@ -55,37 +55,15 @@ fn test_indexing() { doc.set(Field(1), "a b c d"); index_writer.add(doc); } - let debug_serializer = DebugSegmentSerialize::new(); - // let segment_writer = index_writer.current_segment_writer(); + let mut debug_serializer = DebugSegmentSerializer::new(); + let segment_str_before_writing = DebugSegmentSerializer::debug_string(index_writer.current_segment_writer()); let commit_result = index_writer.commit(); - println!("{:?}", commit_result); assert!(commit_result.is_ok()); - // reading the segment - println!("------"); - // { - // let segment = commit_result.unwrap(); - // let index_reader = SegmentIndexReader::open(segment).unwrap(); - // let mut term_cursor = index_reader.term_cursor(); - // loop { - // match term_cursor.next() { - // Some((term, mut doc_cursor)) => { - // println!("{:?}", term); - // for doc in doc_cursor { - // println!(" Doc {}", doc); - // } - // }, - // None => { - // break; - // }, - // } - // } - // } - assert!(false); - } - { - // TODO add index opening stuff - // let index_reader = IndexReader::open(&directory); + let segment = commit_result.unwrap(); + let index_reader = SegmentIndexReader::open(segment).unwrap(); + let segment_str_after_reading = DebugSegmentSerializer::debug_string(&index_reader); + assert_eq!(segment_str_before_writing, segment_str_after_reading); } }