This commit is contained in:
Paul Masurel
2016-01-17 14:12:40 +09:00
parent 67eb27367c
commit 726d38d26d
9 changed files with 119 additions and 29 deletions

8
readme.md Normal file
View File

@@ -0,0 +1,8 @@
Closing an IndexWriter spawns an IndexSerializable.
An IndexSerializable contains all of the methods
required to write it to disk as a segment.
IndexReader implements IndexSerializable.

View File

@@ -1,21 +1,73 @@
use std::io;
use core::serial::SerializableSegment;
use core::serial::*;
use std::io::Write;
use fst::MapBuilder;
use core::error::*;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
pub struct SegmentOutput<'a> {
terms: &'a Write,
postings: &'a Write,
/// Destination of the byte streams that make up a serialized segment.
///
/// Each accessor hands out a writer of type `W` by value. The codec in this
/// file wraps the `terms()` writer in an `fst::MapBuilder` and streams the
/// posting lists into the `postings()` writer.
pub trait SegmentOutput<'a, W>
where
    W: Write,
{
    /// Writer that receives the term dictionary.
    fn terms(&self) -> W;

    /// Writer that receives the posting lists.
    fn postings(&self) -> W;

    // TODO positions, docvalues, ...
}
pub trait Codec {
fn write<'a, 'b, I: SerializableSegment<'a>>(index: &I, output: &'b SegmentOutput) -> Result<usize, io::Error> {
Ok(0)
fn write<'a, I: SerializableSegment<'a>, W: Write>(index: &'a I, output: &'a SegmentOutput<'a, W>) -> Result<usize>;
}
pub struct SimpleCodec;
impl SimpleCodec {
fn write_postings<D: DocCursor, W: Write>(mut doc_it: D, postings: &mut W) -> Result<usize> {
let mut written_bytes: usize = 4;
postings.write_u32::<LittleEndian>(doc_it.len() as u32);
// TODO handle error correctly
for doc_id in doc_it {
postings.write_u32::<LittleEndian>(doc_id as u32);
written_bytes += 4;
}
Ok(written_bytes)
}
}
/// Codec type with no implementation visible in this section.
/// NOTE(review): only a commented-out `impl DebugCodec` sketch exists further
/// down — presumably a work-in-progress, human-readable codec; confirm.
pub struct DebugCodec;
impl Codec for SimpleCodec {
fn write<'a, I: SerializableSegment<'a>, W: Write>(index: &'a I, output: &'a SegmentOutput<'a, W>) -> Result<usize> {
let term_trie_builder_result = MapBuilder::new(output.terms());
if term_trie_builder_result.is_err() {
// TODO include cause somehow
return Err(Error::IOError(String::from("Failed creating the term builder")));
}
let mut term_buffer: String = String::new();
let mut term_trie_builder = term_trie_builder_result.unwrap();
let mut term_cursor = index.term_cursor();
let mut offset: usize = 0;
let mut postings_output = output.postings();
loop {
match term_cursor.next() {
Some((term, doc_it)) => {
term.write_into(&mut term_buffer);
match term_trie_builder.insert(&term_buffer, offset as u64) {
Ok(_) => {}
Err(_) => {
return Err(Error::IOError(String::from("Failed while inserting into the fst")))
},
}
offset += try!(SimpleCodec::write_postings(doc_it, &mut postings_output));
},
None => {
break;
}
}
}
Ok(0)
}
}
// impl DebugCodec {
// fn write_field(field_name) {

8
src/core/error.rs Normal file
View File

@@ -0,0 +1,8 @@
use std::result;
/// Errors produced by the indexing core.
///
/// Implements `Debug`, `Display` and `std::error::Error` so that values can be
/// printed, used with `unwrap`/`expect` on results, and handled as standard
/// errors.
#[derive(Debug)]
pub enum Error {
    /// An I/O-level failure, described by a human-readable message.
    IOError(String),
}

impl ::std::fmt::Display for Error {
    fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result {
        match *self {
            Error::IOError(ref msg) => write!(f, "IO error: {}", msg),
        }
    }
}

impl ::std::error::Error for Error {
    fn description(&self) -> &str {
        match *self {
            Error::IOError(ref msg) => msg,
        }
    }
}

/// Result type whose error variant is this module's `Error`.
pub type Result<T> = result::Result<T, Error>;

View File

@@ -8,5 +8,6 @@ pub mod analyzer;
pub mod serial;
pub mod reader;
pub mod codec;
pub mod error;
pub use core::global::DocId;

View File

@@ -1,4 +1,5 @@
use core::global::*;
use std::fmt::Write;
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
pub struct FieldValue {
@@ -13,6 +14,15 @@ pub struct Term<'a> {
pub text: &'a str,
}
impl<'a> Term<'a> {
    /// Overwrites `term_str` with `"<field index>:<text>"`.
    ///
    /// The buffer is cleared first, so a single `String` can be reused for
    /// successive terms without reallocating.
    pub fn write_into(&self, term_str: &mut String) {
        term_str.clear();
        let Field(field_idx) = self.field;
        // TODO avoid writing the field idx.
        // Appending to a `String` never reports an error, so a failure here
        // would be a bug in a `Display` impl rather than something to ignore.
        write!(term_str, "{}:{}", field_idx, self.text)
            .expect("formatting into a String cannot fail");
    }
}
pub struct Document {
fields: Vec<FieldValue>,

View File

@@ -4,15 +4,15 @@ use core::schema::*;
// Trait sufficient to serialize a segment.
pub trait SerializableSegment<'a> {
type TermCur: TermCursor<'a>; // TODO rename TermCursorImpl
fn term_cursor(&'a mut self) -> Self::TermCur;
fn term_cursor(&'a self) -> Self::TermCur;
}
/// Cursor over a list of doc ids; iterating it yields `DocId` values.
pub trait DocCursor: Iterator<Item=DocId> {
/// Returns the cursor's current doc id.
/// NOTE(review): the one implementation visible here returns its stored
/// `current` value and panics when none is set — confirm the intended contract.
fn doc(&self) -> DocId;
/// Returns the number of documents. The visible implementation reports the
/// full length of the underlying doc id list, captured at construction.
fn len(&self) -> usize;
}
// TODO make iteration over Fields somehow sorted
// (Not only forms)
pub trait TermCursor<'a> {
type DocCur: DocCursor;

View File

@@ -183,11 +183,14 @@ impl<'a> CIWTermCursor<'a> {
}
fn doc_cursor(&self,) -> CIWDocCursor<'a> {
let postings = self.current_form_postings
.as_ref()
.unwrap()
.postings;
let num_docs = postings.doc_ids.len();
CIWDocCursor {
docs_it: self.current_form_postings
.as_ref()
.unwrap()
.postings
num_docs: num_docs,
docs_it: postings
.doc_ids
.iter(),
current: None
@@ -245,7 +248,7 @@ impl<'a> SerializableSegment<'a> for ClosedIndexWriter {
type TermCur = CIWTermCursor<'a>;
fn term_cursor(&'a mut self) -> CIWTermCursor<'a> {
fn term_cursor(&'a self) -> CIWTermCursor<'a> {
let mut field_it: hash_map::Iter<'a, Field, FieldWriter> = self.index_writer.term_writers.iter();
let (field, field_writer) = field_it.next().unwrap(); // TODO handle no field
let mut term_cursor = CIWTermCursor {
@@ -267,6 +270,7 @@ impl<'a> SerializableSegment<'a> for ClosedIndexWriter {
/// Doc cursor over a borrowed slice of doc ids.
pub struct CIWDocCursor<'a> {
// Iterator over the borrowed doc ids.
docs_it: slice::Iter<'a, DocId>,
// Value handed back by `doc()`; `None` when no document is set.
current: Option<DocId>,
// Length of the doc id list, reported by `len()`.
num_docs: usize,
}
impl<'a> Iterator for CIWDocCursor<'a> {
@@ -279,7 +283,12 @@ impl<'a> Iterator for CIWDocCursor<'a> {
}
impl<'a> DocCursor for CIWDocCursor<'a> {
    /// Returns the doc id stored in `current`.
    ///
    /// # Panics
    ///
    /// Panics if `current` is `None`, i.e. when no document is set on the
    /// cursor.
    fn doc(&self) -> DocId {
        self.current
            .expect("CIWDocCursor::doc called while no current document is set")
    }

    /// Returns the number of doc ids recorded in `num_docs`.
    fn len(&self) -> usize {
        self.num_docs
    }
}

View File

@@ -1,6 +1,6 @@
#[macro_use]
extern crate lazy_static;
extern crate fst;
extern crate byteorder;
extern crate rand;

View File

@@ -8,6 +8,7 @@ use tantivy::core::postings::Postings;
use tantivy::core::analyzer::tokenize;
use tantivy::core::serial::*;
use tantivy::core::schema::*;
use tantivy::core::codec::SimpleCodec;
use tantivy::core::global::*;
use tantivy::core::writer::{IndexWriter, ClosedIndexWriter};
use tantivy::core::directory::{Directory, generate_segment_name, SegmentId};
@@ -54,20 +55,21 @@ fn test_indexing() {
index_writer.add(doc);
}
let mut closed_index_writer: ClosedIndexWriter = index_writer.close();
let mut term_cursor = closed_index_writer.term_cursor();
loop {
match term_cursor.next() {
Some((term, doc_it)) => {
println!("{:?}", term);
for doc in doc_it {
println!(" doc {}", doc);
}
},
None => {
break;
}
}
}
// SimpleCodec::write(closed_index_writer, output);
// let mut term_cursor = closed_index_writer.term_cursor();
// loop {
// match term_cursor.next() {
// Some((term, doc_it)) => {
// println!("{:?}", term);
// for doc in doc_it {
// println!(" doc {}", doc);
// }
// },
// None => {
// break;
// }
// }
// }
assert!(false);
}
{