From e43e023e8695b48ef2c65a2170c39a0885a879ca Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 5 Mar 2016 16:21:02 +0900 Subject: [PATCH] blop --- src/core/codec.rs | 10 ++++++++++ src/core/collector.rs | 3 --- src/core/index.rs | 10 ++++++++++ src/core/serial.rs | 6 ++++++ src/core/writer.rs | 35 ++++++++++++++++++++++++----------- src/lib.rs | 14 +++++--------- 6 files changed, 55 insertions(+), 23 deletions(-) diff --git a/src/core/codec.rs b/src/core/codec.rs index c33c711d4..515adf0e4 100644 --- a/src/core/codec.rs +++ b/src/core/codec.rs @@ -1,8 +1,10 @@ use std::io; use std::io::{Read, Write}; use core::serial::{SegmentSerializer, SerializableSegment}; +use rustc_serialize::json; use core::directory::WritePtr; use core::index::Segment; +use core::index::SegmentInfo; use core::index::SegmentComponent; use core::schema::Term; use core::schema::DocId; @@ -79,6 +81,14 @@ impl SegmentSerializer<()> for SimpleSegmentSerializer { Ok(()) } + fn write_segment_info(&self, segment_info: &SegmentInfo) -> io::Result<()> { // JSON-encodes segment_info into the segment's INFO component; open, write and flush errors are returned to the caller + let mut write = try!(self.segment.open_write(SegmentComponent::INFO)); + let json_data = json::encode(segment_info).unwrap(); + try!(write.write_all(json_data.as_bytes())); + try!(write.flush()); + Ok(()) + } + fn close(mut self,) -> io::Result<()> { // TODO handle errors on close try!(self.term_fst_builder diff --git a/src/core/collector.rs b/src/core/collector.rs index ce1278201..1f2f6712e 100644 --- a/src/core/collector.rs +++ b/src/core/collector.rs @@ -41,9 +41,6 @@ impl Collector for FirstNCollector { } } - -// - pub struct CountCollector { count: usize, } diff --git a/src/core/index.rs b/src/core/index.rs index 8828853a8..1ac118649 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -1,6 +1,7 @@ use std::path::{PathBuf, Path}; use std::io; use core::schema::Schema; +use core::schema::DocId; use std::io::Write; use std::sync::{Arc, RwLock, RwLockWriteGuard, RwLockReadGuard}; use std::fmt; @@ -190,10 +191,18 @@ impl Index { } + 
///////////////////////// // Segment +#[derive(Clone,Debug,RustcDecodable,RustcEncodable)] +pub struct SegmentInfo { + pub max_doc: DocId, +} + + pub enum SegmentComponent { + INFO, POSTINGS, // POSITIONS, TERMS, @@ -215,6 +224,7 @@ impl Segment { fn path_suffix(component: &SegmentComponent)-> &'static str { match *component { // SegmentComponent::POSITIONS => ".pos", + SegmentComponent::INFO => ".info", SegmentComponent::POSTINGS => ".idx", SegmentComponent::TERMS => ".term", SegmentComponent::STORE => ".store", diff --git a/src/core/serial.rs b/src/core/serial.rs index 9142fa3a4..e18ed5a1b 100644 --- a/src/core/serial.rs +++ b/src/core/serial.rs @@ -1,12 +1,14 @@ use core::schema::*; use std::fmt; use std::io; +use core::index::SegmentInfo; pub trait SegmentSerializer { fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<(), io::Error>; fn write_docs(&mut self, docs: &[DocId]) -> Result<(), io::Error>; // TODO add size fn store_doc(&mut self, field: &mut Iterator); fn close(self,) -> Result; + fn write_segment_info(&self, segment_info: &SegmentInfo) -> io::Result<()>; } pub trait SerializableSegment { @@ -67,6 +69,10 @@ impl SegmentSerializer for DebugSegmentSerializer { fn close(self,) -> Result { Ok(self.text) } + + fn write_segment_info(&self, _segment_info: &SegmentInfo) -> io::Result<()> { // no-op: the debug serializer records nothing for the segment info, hence the unused `_` parameter + Ok(()) + } } pub fn serialize_eq(left: &L, right: &R) -> bool{ diff --git a/src/core/writer.rs b/src/core/writer.rs index fcc218a15..da04b4d4b 100644 --- a/src/core/writer.rs +++ b/src/core/writer.rs @@ -9,6 +9,7 @@ use core::serial::{SegmentSerializer, SerializableSegment}; use core::analyzer::StreamingIterator; use std::io::Error as IOError; use core::index::Segment; +use core::index::SegmentInfo; pub struct PostingsWriter { @@ -69,7 +70,7 @@ impl IndexWriter { match segment_writer_res { Ok(segment_writer) => { let segment = segment_writer.segment(); - segment_writer.write_pending(); + segment_writer.finalize(); // write(self.segment_serializer); // 
try!(SimpleCodec::write(&self.segment_writer, &segment).map(|sz| (segment.clone(), sz))); // At this point, the segment is written @@ -88,6 +89,7 @@ impl IndexWriter { } + pub struct SegmentWriter { num_tokens: usize, max_doc: DocId, @@ -99,22 +101,33 @@ pub struct SegmentWriter { impl SegmentWriter { - // write on disk all of the stuff that - // are still on RAM. - // for this version, that's the term dictionary - // and the postings - fn write_pending(mut self,) -> Result<(), IOError> { + // Writes to disk everything that is still in RAM: + // - the dictionary in an fst + // - the postings + // - the segment info + // Stops at the first I/O error and returns it to the caller. + fn finalize(mut self,) -> Result<(), IOError> { { - for (term, postings_id) in self.term_index.iter() { - let doc_ids = &self.postings[postings_id.clone()].doc_ids; - let term_docfreq = doc_ids.len() as u32; - self.segment_serializer.new_term(&term, term_docfreq); - self.segment_serializer.write_docs(&doc_ids); + for (term, postings_id) in self.term_index.iter() { + let doc_ids = &self.postings[postings_id.clone()].doc_ids; + let term_docfreq = doc_ids.len() as u32; + try!(self.segment_serializer.new_term(&term, term_docfreq)); + try!(self.segment_serializer.write_docs(&doc_ids)); + } } + { + let segment_info = SegmentInfo { + max_doc: self.max_doc + }; + try!(self.segment_serializer.write_segment_info(&segment_info)); } self.segment_serializer.close() } + pub fn num_docs(&self,) -> DocId { + self.max_doc + } + pub fn segment(&self,) -> Segment { self.segment_serializer.segment() } diff --git a/src/lib.rs b/src/lib.rs index fa726fc7d..bae95b4ed 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -96,18 +96,14 @@ mod tests { } let segment_str_before_writing = DebugSegmentSerializer::debug_string(index_writer.current_segment_writer()); - println!("{:?}", segment_str_before_writing); - - let commit_result = index_writer.commit(); assert!(commit_result.is_ok()); - let segment = commit_result.unwrap(); - SegmentReader::open(segment).unwrap(); - // let segment_reader = 
SegmentReader::open(segment).unwrap(); - // TODO ENABLE TEST - // let segment_str_after_reading = DebugSegmentSerializer::debug_string(&segment_reader); - // assert_eq!(segment_str_before_writing, segment_str_after_reading); + let segment = commit_result.unwrap(); + let segment_reader = SegmentReader::open(segment).unwrap(); + assert_eq!(segment_reader.num_docs(), 3); + let segment_str_after_reading = DebugSegmentSerializer::debug_string(&segment_reader); + assert_eq!(segment_str_before_writing, segment_str_after_reading); } }