From 726d38d26d7882b0b097a8425d2b7ddbf60cfa72 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 17 Jan 2016 14:12:40 +0900 Subject: [PATCH] blop --- readme.md | 8 ++++++ src/core/codec.rs | 66 +++++++++++++++++++++++++++++++++++++++++----- src/core/error.rs | 8 ++++++ src/core/mod.rs | 1 + src/core/schema.rs | 10 +++++++ src/core/serial.rs | 4 +-- src/core/writer.rs | 19 +++++++++---- src/lib.rs | 2 +- tests/core.rs | 30 +++++++++++---------- 9 files changed, 119 insertions(+), 29 deletions(-) create mode 100644 readme.md create mode 100644 src/core/error.rs diff --git a/readme.md b/readme.md new file mode 100644 index 000000000..4d7462d8e --- /dev/null +++ b/readme.md @@ -0,0 +1,8 @@ + +Closing an IndexWriter spawns an IndexSerializable + +An indexserializable contains all of the method +to be written on disk as a segment. + + +IndexReader impl IndexSerializable diff --git a/src/core/codec.rs b/src/core/codec.rs index 08ccaacc9..005b07c06 100644 --- a/src/core/codec.rs +++ b/src/core/codec.rs @@ -1,21 +1,73 @@ use std::io; -use core::serial::SerializableSegment; +use core::serial::*; use std::io::Write; +use fst::MapBuilder; +use core::error::*; +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -pub struct SegmentOutput<'a> { - terms: &'a Write, - postings: &'a Write, +pub trait SegmentOutput<'a, W: Write> { + fn terms(&self,) -> W; + fn postings(&self,) -> W; // TODO positions, docvalues, ... } pub trait Codec { - fn write<'a, 'b, I: SerializableSegment<'a>>(index: &I, output: &'b SegmentOutput) -> Result { - Ok(0) + fn write<'a, I: SerializableSegment<'a>, W: Write>(index: &'a I, output: &'a SegmentOutput<'a, W>) -> Result; +} + +pub struct SimpleCodec; + +impl SimpleCodec { + fn write_postings(mut doc_it: D, postings: &mut W) -> Result { + let mut written_bytes: usize = 4; + postings.write_u32::(doc_it.len() as u32); + // TODO handle error correctly + for doc_id in doc_it { + postings.write_u32::(doc_id as u32); + written_bytes += 4; + } + Ok(written_bytes) } } -pub struct DebugCodec; +impl Codec for SimpleCodec { + fn write<'a, I: SerializableSegment<'a>, W: Write>(index: &'a I, output: &'a SegmentOutput<'a, W>) -> Result { + let term_trie_builder_result = MapBuilder::new(output.terms()); + if term_trie_builder_result.is_err() { + // TODO include cause somehow + return Err(Error::IOError(String::from("Failed creating the term builder"))); + } + let mut term_buffer: String = String::new(); + let mut term_trie_builder = term_trie_builder_result.unwrap(); + let mut term_cursor = index.term_cursor(); + let mut offset: usize = 0; + let mut postings_output = output.postings(); + loop { + match term_cursor.next() { + Some((term, doc_it)) => { + term.write_into(&mut term_buffer); + match term_trie_builder.insert(&term_buffer, offset as u64) { + Ok(_) => {} + Err(_) => { + return Err(Error::IOError(String::from("Failed while inserting into the fst"))) + }, + } + offset += try!(SimpleCodec::write_postings(doc_it, &mut postings_output)); + }, + None => { + break; + } + } + } + Ok(0) + + } +} + + + + // impl DebugCodec { // fn write_field(field_name) { diff --git a/src/core/error.rs b/src/core/error.rs new file mode 100644 index 000000000..c104d6b97 --- /dev/null +++ b/src/core/error.rs @@ -0,0 +1,8 @@ +use std::result; + + +pub enum Error { + IOError(String), +} + +pub type Result = result::Result; diff --git a/src/core/mod.rs b/src/core/mod.rs index b4a470576..f5cd4fa11 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -8,5 +8,6 @@ pub mod analyzer; pub mod serial; pub mod reader; pub mod codec; +pub mod error; pub use core::global::DocId; diff --git a/src/core/schema.rs b/src/core/schema.rs index b9e02e753..668b974b0 100644 --- a/src/core/schema.rs +++ b/src/core/schema.rs @@ -1,4 +1,5 @@ use core::global::*; +use std::fmt::Write; #[derive(Clone,Debug,PartialEq,PartialOrd,Eq)] pub struct FieldValue { @@ -13,6 +14,15 @@ pub struct Term<'a> { pub text: &'a str, } +impl<'a> Term<'a> { + pub fn write_into(&self, term_str: &mut String) { + term_str.clear(); + let Field(field_idx) = self.field; + // TODO avoid writing the field idx. + term_str.write_fmt(format_args!("{}:{}", field_idx, self.text)); + } +} + pub struct Document { fields: Vec, diff --git a/src/core/serial.rs b/src/core/serial.rs index 452f2c2ae..7163452e0 100644 --- a/src/core/serial.rs +++ b/src/core/serial.rs @@ -4,15 +4,15 @@ use core::schema::*; // Trait sufficient to serialize a segment. pub trait SerializableSegment<'a> { type TermCur: TermCursor<'a>; // TODO rename TermCursorImpl - fn term_cursor(&'a mut self) -> Self::TermCur; + fn term_cursor(&'a self) -> Self::TermCur; } pub trait DocCursor: Iterator { fn doc(&self) -> DocId; + fn len(&self) -> usize; } // TODO make iteration over Fields somehow sorted -// (Not only forms) pub trait TermCursor<'a> { type DocCur: DocCursor; diff --git a/src/core/writer.rs b/src/core/writer.rs index 69433e7ed..016c563ed 100644 --- a/src/core/writer.rs +++ b/src/core/writer.rs @@ -183,11 +183,14 @@ impl<'a> CIWTermCursor<'a> { } fn doc_cursor(&self,) -> CIWDocCursor<'a> { + let postings = self.current_form_postings + .as_ref() + .unwrap() + .postings; + let num_docs = postings.doc_ids.len(); CIWDocCursor { - docs_it: self.current_form_postings - .as_ref() - .unwrap() - .postings + num_docs: num_docs, + docs_it: postings .doc_ids .iter(), current: None @@ -245,7 +248,7 @@ impl<'a> SerializableSegment<'a> for ClosedIndexWriter { type TermCur = CIWTermCursor<'a>; - fn term_cursor(&'a mut self) -> CIWTermCursor<'a> { + fn term_cursor(&'a self) -> CIWTermCursor<'a> { let mut field_it: hash_map::Iter<'a, Field, FieldWriter> = self.index_writer.term_writers.iter(); let (field, field_writer) = field_it.next().unwrap(); // TODO handle no field let mut term_cursor = CIWTermCursor { @@ -267,6 +270,7 @@ impl<'a> SerializableSegment<'a> for ClosedIndexWriter { pub struct CIWDocCursor<'a> { docs_it: slice::Iter<'a, DocId>, current: Option, + num_docs: usize, } impl<'a> Iterator for CIWDocCursor<'a> { @@ -279,7 +283,12 @@ impl<'a> Iterator for CIWDocCursor<'a> { } impl<'a> DocCursor for CIWDocCursor<'a> { + fn doc(&self,) -> DocId { self.current.unwrap() } + + fn len(&self) -> usize { + self.num_docs + } } diff --git a/src/lib.rs b/src/lib.rs index 5b58ea342..efc554e4a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,6 @@ #[macro_use] extern crate lazy_static; - +extern crate fst; extern crate byteorder; extern crate rand; diff --git a/tests/core.rs b/tests/core.rs index 4e6545bf6..c8a17556d 100644 --- a/tests/core.rs +++ b/tests/core.rs @@ -8,6 +8,7 @@ use tantivy::core::postings::Postings; use tantivy::core::analyzer::tokenize; use tantivy::core::serial::*; use tantivy::core::schema::*; +use tantivy::core::codec::SimpleCodec; use tantivy::core::global::*; use tantivy::core::writer::{IndexWriter, ClosedIndexWriter}; use tantivy::core::directory::{Directory, generate_segment_name, SegmentId}; @@ -54,20 +55,21 @@ fn test_indexing() { index_writer.add(doc); } let mut closed_index_writer: ClosedIndexWriter = index_writer.close(); - let mut term_cursor = closed_index_writer.term_cursor(); - loop { - match term_cursor.next() { - Some((term, doc_it)) => { - println!("{:?}", term); - for doc in doc_it { - println!(" doc {}", doc); - } - }, - None => { - break; - } - } - } + // SimpleCodec::write(closed_index_writer, output); + // let mut term_cursor = closed_index_writer.term_cursor(); + // loop { + // match term_cursor.next() { + // Some((term, doc_it)) => { + // println!("{:?}", term); + // for doc in doc_it { + // println!(" doc {}", doc); + // } + // }, + // None => { + // break; + // } + // } + // } assert!(false); } {