This commit is contained in:
Paul Masurel
2016-03-05 16:21:02 +09:00
parent 0bd52f7d8f
commit e43e023e86
6 changed files with 55 additions and 23 deletions

View File

@@ -1,8 +1,10 @@
use std::io;
use std::io::{Read, Write};
use core::serial::{SegmentSerializer, SerializableSegment};
use rustc_serialize::json;
use core::directory::WritePtr;
use core::index::Segment;
use core::index::SegmentInfo;
use core::index::SegmentComponent;
use core::schema::Term;
use core::schema::DocId;
@@ -79,6 +81,14 @@ impl SegmentSerializer<()> for SimpleSegmentSerializer {
Ok(())
}
fn write_segment_info(&self, segment_info: &SegmentInfo) -> io::Result<()> {
let mut write = try!(self.segment.open_write(SegmentComponent::INFO));
let json_data = json::encode(segment_info).unwrap();
write.write_all(json_data.as_bytes());
write.flush();
Ok(())
}
fn close(mut self,) -> io::Result<()> {
// TODO handle errors on close
try!(self.term_fst_builder

View File

@@ -41,9 +41,6 @@ impl Collector for FirstNCollector {
}
}
//
// Collector whose whole state is a single `count` field.
// NOTE(review): presumably the number of documents collected so far;
// confirm against its `Collector` implementation.
pub struct CountCollector {
count: usize,
}

View File

@@ -1,6 +1,7 @@
use std::path::{PathBuf, Path};
use std::io;
use core::schema::Schema;
use core::schema::DocId;
use std::io::Write;
use std::sync::{Arc, RwLock, RwLockWriteGuard, RwLockReadGuard};
use std::fmt;
@@ -190,10 +191,18 @@ impl Index {
}
/////////////////////////
// Segment
// Metadata attached to a segment.
// The `RustcEncodable` / `RustcDecodable` derives make it serializable
// with `rustc_serialize`.
#[derive(Clone,Debug,RustcDecodable,RustcEncodable)]
pub struct SegmentInfo {
// NOTE(review): presumably the number of documents held by the segment; confirm.
pub max_doc: DocId,
}
pub enum SegmentComponent {
INFO,
POSTINGS,
// POSITIONS,
TERMS,
@@ -215,6 +224,7 @@ impl Segment {
fn path_suffix(component: &SegmentComponent)-> &'static str {
match *component {
// SegmentComponent::POSITIONS => ".pos",
SegmentComponent::INFO => ".info",
SegmentComponent::POSTINGS => ".idx",
SegmentComponent::TERMS => ".term",
SegmentComponent::STORE => ".store",

View File

@@ -1,12 +1,14 @@
use core::schema::*;
use std::fmt;
use std::io;
use core::index::SegmentInfo;
// Receives the pieces of a segment while it is being serialized.
//
// `Output` is the value handed back by `close` once everything has
// been written.
pub trait SegmentSerializer<Output> {
    // Declares a term together with its document frequency.
    fn new_term(&mut self, term: &Term, doc_freq: DocId) -> io::Result<()>;

    // Writes a list of doc ids.
    // TODO add size
    fn write_docs(&mut self, docs: &[DocId]) -> io::Result<()>;

    // NOTE(review): presumably stores the field values of one document;
    // confirm against the implementors.
    fn store_doc(&mut self, field: &mut Iterator<Item=&FieldValue>);

    // Writes the segment-level metadata.
    fn write_segment_info(&self, segment_info: &SegmentInfo) -> io::Result<()>;

    // Consumes the serializer and returns its output.
    fn close(self,) -> io::Result<Output>;
}
pub trait SerializableSegment {
@@ -67,6 +69,10 @@ impl SegmentSerializer<String> for DebugSegmentSerializer {
// Consumes the serializer and returns its `text` field.
fn close(self) -> io::Result<String> {
    let text = self.text;
    Ok(text)
}
// No-op: this serializer does nothing with the segment info and
// always reports success.
fn write_segment_info(&self, _segment_info: &SegmentInfo) -> io::Result<()> {
    let nothing_to_write = ();
    Ok(nothing_to_write)
}
}
pub fn serialize_eq<L: SerializableSegment, R: SerializableSegment>(left: &L, right: &R) -> bool{

View File

@@ -9,6 +9,7 @@ use core::serial::{SegmentSerializer, SerializableSegment};
use core::analyzer::StreamingIterator;
use std::io::Error as IOError;
use core::index::Segment;
use core::index::SegmentInfo;
pub struct PostingsWriter {
@@ -69,7 +70,7 @@ impl IndexWriter {
match segment_writer_res {
Ok(segment_writer) => {
let segment = segment_writer.segment();
segment_writer.write_pending();
segment_writer.finalize();
// write(self.segment_serializer);
// try!(SimpleCodec::write(&self.segment_writer, &segment).map(|sz| (segment.clone(), sz)));
// At this point, the segment is written
@@ -88,6 +89,7 @@ impl IndexWriter {
}
pub struct SegmentWriter {
num_tokens: usize,
max_doc: DocId,
@@ -99,22 +101,33 @@ pub struct SegmentWriter {
impl SegmentWriter {
// Write to disk everything that
// is still in RAM.
// For this version, that's the term dictionary
// and the postings.
fn write_pending(mut self,) -> Result<(), IOError> {
// Write on disk all of the stuff that
// is still on RAM :
// - the dictionary in an fst
// - the postings
// - the segment info
fn finalize(mut self,) -> Result<(), IOError> {
{
for (term, postings_id) in self.term_index.iter() {
let doc_ids = &self.postings[postings_id.clone()].doc_ids;
let term_docfreq = doc_ids.len() as u32;
self.segment_serializer.new_term(&term, term_docfreq);
self.segment_serializer.write_docs(&doc_ids);
for (term, postings_id) in self.term_index.iter() {
let doc_ids = &self.postings[postings_id.clone()].doc_ids;
let term_docfreq = doc_ids.len() as u32;
self.segment_serializer.new_term(&term, term_docfreq);
self.segment_serializer.write_docs(&doc_ids);
}
}
{
let segment_info = SegmentInfo {
max_doc: self.max_doc
};
self.segment_serializer.write_segment_info(&segment_info);
}
self.segment_serializer.close()
}
// Number of documents, as tracked by the writer's `max_doc` field.
pub fn num_docs(&self) -> DocId {
    let doc_count = self.max_doc;
    doc_count
}
// Returns the segment reported by the underlying segment serializer.
pub fn segment(&self) -> Segment {
    let serializer = &self.segment_serializer;
    serializer.segment()
}

View File

@@ -96,18 +96,14 @@ mod tests {
}
let segment_str_before_writing = DebugSegmentSerializer::debug_string(index_writer.current_segment_writer());
println!("{:?}", segment_str_before_writing);
let commit_result = index_writer.commit();
assert!(commit_result.is_ok());
let segment = commit_result.unwrap();
SegmentReader::open(segment).unwrap();
// let segment_reader = SegmentReader::open(segment).unwrap();
// TODO ENABLE TEST
// let segment_str_after_reading = DebugSegmentSerializer::debug_string(&segment_reader);
// assert_eq!(segment_str_before_writing, segment_str_after_reading);
let segment = commit_result.unwrap();
let segment_reader = SegmentReader::open(segment).unwrap();
assert_eq!(segment_reader.num_docs(), 3);
let segment_str_after_reading = DebugSegmentSerializer::debug_string(&segment_reader);
assert_eq!(segment_str_before_writing, segment_str_after_reading);
}
}