mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-26 13:10:41 +00:00
using tempdir. unit test working.
This commit is contained in:
@@ -15,3 +15,4 @@ atomicwrites = "0.0.14"
|
||||
tempfile = "2.0.0"
|
||||
rustc-serialize = "0.3.16"
|
||||
log = "0.3.5"
|
||||
tempdir = "0.3.4"
|
||||
|
||||
@@ -16,11 +16,11 @@ pub struct SimpleCodec;
|
||||
pub struct SimpleSegmentSerializer {
|
||||
written_bytes_postings: usize,
|
||||
postings_write: File,
|
||||
term_fst_builder: MapBuilder<File>,
|
||||
term_fst_builder: MapBuilder<File>, // TODO find an alternative to work around the "move"
|
||||
cur_term_num_docs: DocId,
|
||||
}
|
||||
|
||||
impl SegmentSerializer for SimpleSegmentSerializer {
|
||||
impl SegmentSerializer<()> for SimpleSegmentSerializer {
|
||||
fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<()> {
|
||||
self.term_fst_builder.insert(term.as_slice(), self.written_bytes_postings as u64);
|
||||
self.cur_term_num_docs = doc_freq;
|
||||
@@ -48,7 +48,9 @@ impl SegmentSerializer for SimpleSegmentSerializer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn close(&mut self,) -> Result<()> {
|
||||
fn close(self,) -> Result<()> {
|
||||
// TODO handle errors on close
|
||||
self.term_fst_builder.finish();
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -72,7 +74,7 @@ impl SimpleCodec {
|
||||
|
||||
|
||||
pub fn write<I: SerializableSegment>(index: &I, segment: &Segment) -> Result<()> {
|
||||
let mut serializer = try!(SimpleCodec::serializer(segment));
|
||||
index.write(&mut serializer)
|
||||
let serializer = try!(SimpleCodec::serializer(segment));
|
||||
index.write(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use std::path::PathBuf;
|
||||
use std::path::Path;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::fs::File;
|
||||
@@ -88,7 +89,7 @@ impl Directory {
|
||||
self.save_metas();
|
||||
}
|
||||
|
||||
pub fn from(filepath: &str) -> Result<Directory> {
|
||||
pub fn open(filepath: &Path) -> Result<Directory> {
|
||||
// TODO error management
|
||||
let mut directory = Directory {
|
||||
index_path: PathBuf::from(filepath),
|
||||
@@ -180,6 +181,8 @@ impl Directory {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/////////////////////////
|
||||
// Segment
|
||||
|
||||
|
||||
@@ -44,7 +44,7 @@ impl SegmentIndexReader {
|
||||
}
|
||||
|
||||
|
||||
fn write_postings<R: io::Read, SegSer: SegmentSerializer>(mut cursor: R, num_docs: DocId, serializer: &mut SegSer) -> Result<()> {
|
||||
fn write_postings<R: io::Read, Output, SegSer: SegmentSerializer<Output>>(mut cursor: R, num_docs: DocId, serializer: &mut SegSer) -> Result<()> {
|
||||
for i in 0..num_docs {
|
||||
let doc_id = cursor.read_u32::<LittleEndian>().unwrap();
|
||||
try!(serializer.add_doc(doc_id));
|
||||
@@ -54,7 +54,7 @@ fn write_postings<R: io::Read, SegSer: SegmentSerializer>(mut cursor: R, num_doc
|
||||
|
||||
impl SerializableSegment for SegmentIndexReader {
|
||||
|
||||
fn write<SegSer: SegmentSerializer>(&self, serializer: &mut SegSer) -> Result<()> {
|
||||
fn write<Output, SegSer: SegmentSerializer<Output>>(&self, mut serializer: SegSer) -> Result<Output> {
|
||||
let mut term_offsets_it = self.term_offsets.stream();
|
||||
loop {
|
||||
match term_offsets_it.next() {
|
||||
@@ -65,12 +65,12 @@ impl SerializableSegment for SegmentIndexReader {
|
||||
let mut cursor = Cursor::new(data);
|
||||
let num_docs = cursor.read_u32::<LittleEndian>().unwrap() as DocId;
|
||||
try!(serializer.new_term(&term, num_docs));
|
||||
try!(write_postings(cursor, num_docs, serializer));
|
||||
try!(write_postings(cursor, num_docs, &mut serializer));
|
||||
},
|
||||
None => { break; }
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
serializer.close()
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,28 +1,48 @@
|
||||
use core::global::*;
|
||||
use core::schema::*;
|
||||
use core::error::{Result, Error};
|
||||
use std::fmt;
|
||||
|
||||
|
||||
pub trait SegmentSerializer<Output> {
|
||||
fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<()>;
|
||||
fn add_doc(&mut self, doc_id: DocId) -> Result<()>;
|
||||
fn close(self,) -> Result<Output>;
|
||||
}
|
||||
|
||||
pub trait SerializableSegment {
|
||||
fn write<Output, SegSer: SegmentSerializer<Output>>(&self, serializer: SegSer) -> Result<Output>;
|
||||
}
|
||||
|
||||
|
||||
// change the API to remove the lifetime, by
|
||||
// "pushing" the data to a SegmentSerializer.
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct DebugSegmentSerialize {
|
||||
pub struct DebugSegmentSerializer {
|
||||
text: String,
|
||||
}
|
||||
|
||||
impl DebugSegmentSerialize {
|
||||
pub fn to_string(&self,) -> &String {
|
||||
&self.text
|
||||
impl fmt::Debug for DebugSegmentSerializer {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "{}", self.text)
|
||||
}
|
||||
}
|
||||
|
||||
impl DebugSegmentSerializer {
|
||||
|
||||
pub fn debug_string<S: SerializableSegment>(index: &S) -> String {
|
||||
let serializer = DebugSegmentSerializer::new();
|
||||
index.write(serializer).unwrap()
|
||||
}
|
||||
|
||||
pub fn new() -> DebugSegmentSerialize {
|
||||
DebugSegmentSerialize {
|
||||
pub fn new() -> DebugSegmentSerializer {
|
||||
DebugSegmentSerializer {
|
||||
text: String::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentSerializer for DebugSegmentSerialize {
|
||||
impl SegmentSerializer<String> for DebugSegmentSerializer {
|
||||
fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<()> {
|
||||
self.text.push_str(&format!("{:?}\n", term));
|
||||
Ok(())
|
||||
@@ -33,19 +53,16 @@ impl SegmentSerializer for DebugSegmentSerialize {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn close(&mut self,) -> Result<()> {
|
||||
Ok(())
|
||||
fn close(self,) -> Result<String> {
|
||||
Ok(self.text)
|
||||
}
|
||||
}
|
||||
|
||||
pub trait SegmentSerializer {
|
||||
fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<()>;
|
||||
fn add_doc(&mut self, doc_id: DocId) -> Result<()>;
|
||||
fn close(&mut self,) -> Result<()>;
|
||||
pub fn serialize_eq<L: SerializableSegment, R: SerializableSegment>(left: &L, right: &R) -> bool{
|
||||
let str_left = DebugSegmentSerializer::debug_string(left);
|
||||
let str_right = DebugSegmentSerializer::debug_string(right);
|
||||
str_left == str_right
|
||||
}
|
||||
|
||||
pub trait SerializableSegment {
|
||||
fn write<SegSer: SegmentSerializer>(&self, serializer: &mut SegSer) -> Result<()>;
|
||||
}
|
||||
|
||||
// TODO make iteration over Fields somehow sorted
|
||||
|
||||
@@ -55,6 +55,11 @@ impl IndexWriter {
|
||||
self.segment_writer.add(doc);
|
||||
}
|
||||
|
||||
// TODO remove that some day
|
||||
pub fn current_segment_writer(&self,) -> &SegmentWriter {
|
||||
&self.segment_writer
|
||||
}
|
||||
|
||||
pub fn commit(&mut self,) -> Result<Segment> {
|
||||
let segment = self.directory.new_segment();
|
||||
try!(SimpleCodec::write(&self.segment_writer, &segment).map(|sz| (segment.clone(), sz)));
|
||||
@@ -119,8 +124,8 @@ impl SegmentWriter {
|
||||
}
|
||||
|
||||
impl SerializableSegment for SegmentWriter {
|
||||
fn write<SegSer: SegmentSerializer>(&self, serializer: &mut SegSer) -> Result<()> {
|
||||
for (term, postings_id) in self.term_index.iter() {
|
||||
fn write<Output, SegSer: SegmentSerializer<Output>>(&self, mut serializer: SegSer) -> Result<Output> {
|
||||
for (term, postings_id) in self.term_index.iter() {
|
||||
let doc_ids = &self.postings[postings_id.clone()].doc_ids;
|
||||
let term_docfreq = doc_ids.len() as u32;
|
||||
serializer.new_term(&term, term_docfreq);
|
||||
@@ -128,6 +133,6 @@ impl SerializableSegment for SegmentWriter {
|
||||
serializer.add_doc(doc_id.clone());
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
serializer.close()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
extern crate tantivy;
|
||||
extern crate itertools;
|
||||
extern crate byteorder;
|
||||
extern crate regex;
|
||||
extern crate tempdir;
|
||||
|
||||
use tantivy::core::postings::{VecPostings, intersection};
|
||||
use tantivy::core::postings::Postings;
|
||||
@@ -36,7 +35,8 @@ fn test_tokenizer() {
|
||||
|
||||
#[test]
|
||||
fn test_indexing() {
|
||||
let directory = Directory::from("/Users/pmasurel/temp/idx").unwrap();
|
||||
let tmp_dir = tempdir::TempDir::new("test_indexing").unwrap();
|
||||
let directory = Directory::open(tmp_dir.path()).unwrap();
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = IndexWriter::open(&directory);
|
||||
@@ -55,37 +55,15 @@ fn test_indexing() {
|
||||
doc.set(Field(1), "a b c d");
|
||||
index_writer.add(doc);
|
||||
}
|
||||
let debug_serializer = DebugSegmentSerialize::new();
|
||||
// let segment_writer = index_writer.current_segment_writer();
|
||||
let mut debug_serializer = DebugSegmentSerializer::new();
|
||||
let segment_str_before_writing = DebugSegmentSerializer::debug_string(index_writer.current_segment_writer());
|
||||
|
||||
let commit_result = index_writer.commit();
|
||||
println!("{:?}", commit_result);
|
||||
assert!(commit_result.is_ok());
|
||||
// reading the segment
|
||||
println!("------");
|
||||
// {
|
||||
// let segment = commit_result.unwrap();
|
||||
// let index_reader = SegmentIndexReader::open(segment).unwrap();
|
||||
// let mut term_cursor = index_reader.term_cursor();
|
||||
// loop {
|
||||
// match term_cursor.next() {
|
||||
// Some((term, mut doc_cursor)) => {
|
||||
// println!("{:?}", term);
|
||||
// for doc in doc_cursor {
|
||||
// println!(" Doc {}", doc);
|
||||
// }
|
||||
// },
|
||||
// None => {
|
||||
// break;
|
||||
// },
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
assert!(false);
|
||||
}
|
||||
{
|
||||
// TODO add index opening stuff
|
||||
// let index_reader = IndexReader::open(&directory);
|
||||
let segment = commit_result.unwrap();
|
||||
let index_reader = SegmentIndexReader::open(segment).unwrap();
|
||||
let segment_str_after_reading = DebugSegmentSerializer::debug_string(&index_reader);
|
||||
assert_eq!(segment_str_before_writing, segment_str_after_reading);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user