This commit is contained in:
Paul Masurel
2016-02-21 17:39:16 +09:00
parent ae05b7e35a
commit 77fc25da49
6 changed files with 66 additions and 40 deletions

View File

@@ -1,16 +1,13 @@
use core::DocId;
use core::reader::SegmentReader;
use core::directory::SegmentId;
use core::searcher::DocAddress;
pub trait Collector {
fn set_segment(&mut self, segment: &SegmentReader);
fn collect(&mut self, doc_id: DocId);
}
#[derive(Debug)]
pub struct DocAddress(pub SegmentId, pub DocId);
pub struct TestCollector {
docs: Vec<DocAddress>,
current_segment: Option<SegmentId>,
@@ -29,7 +26,6 @@ impl TestCollector {
}
}
impl Collector for TestCollector {
fn set_segment(&mut self, segment: &SegmentReader) {

View File

@@ -23,7 +23,7 @@ use atomicwrites;
use tempdir::TempDir;
use std::io::Read;
#[derive(Clone, Debug)]
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct SegmentId(pub String);
pub fn generate_segment_name() -> SegmentId {

View File

@@ -2,6 +2,8 @@ use core::directory::Directory;
use core::directory::{Segment, SegmentId};
use std::collections::BinaryHeap;
use core::schema::Term;
use core::store::StoreReader;
use core::schema::Document;
use fst::Streamer;
use fst;
use std::io;
@@ -25,9 +27,9 @@ pub struct SegmentReader {
segment: Segment,
term_offsets: fst::Map,
postings_data: MmapReadOnly,
store_reader: StoreReader,
}
pub struct SegmentPostings {
doc_id: usize,
doc_ids: Vec<u32>,
@@ -95,10 +97,6 @@ impl Iterator for SegmentPostings {
}
}
impl SegmentReader {
pub fn id(&self,) -> SegmentId {
@@ -114,14 +112,19 @@ impl SegmentReader {
return Err(Error::FSTFormat(format!("The file {:?} does not seem to be a valid term to offset transducer.", filepath)));
}
};
let store_reader = StoreReader::new(try!(segment.mmap(SegmentComponent::STORE)));
let postings_shared_mmap = try!(segment.mmap(SegmentComponent::POSTINGS));
Ok(SegmentReader {
postings_data: postings_shared_mmap,
term_offsets: term_offsets,
segment: segment,
store_reader: store_reader,
})
}
pub fn get_doc(&self, doc_id: &DocId) -> Document {
self.store_reader.get(doc_id)
}
pub fn read_postings(&self, offset: usize) -> SegmentPostings {
let postings_data = unsafe {&self.postings_data.as_slice()[offset..]};
@@ -129,12 +132,9 @@ impl SegmentReader {
}
pub fn get_term<'a>(&'a self, term: &Term) -> Option<SegmentPostings> {
match self.term_offsets.get(term.as_slice()) {
Some(offset) => {
Some(self.read_postings(offset as usize))
},
None => None,
}
self.term_offsets
.get(term.as_slice())
.map(|offset| self.read_postings(offset as usize))
}
pub fn search(&self, terms: &Vec<Term>) -> IntersectionPostings<SegmentPostings> {
@@ -143,9 +143,11 @@ impl SegmentReader {
for term in terms.iter() {
match self.get_term(term) {
Some(segment_posting) => {
println!("term found {:?}", term);
segment_postings.push(segment_posting);
}
None => {
println!("not found {:?}", term);
segment_postings.clear();
segment_postings.push(SegmentPostings::empty());
break;

View File

@@ -1,35 +1,56 @@
use core::reader::SegmentReader;
use core::directory::Directory;
use core::directory::SegmentId;
use core::global::DocId;
use core::schema::Document;
use core::directory::Segment;
use core::collector::Collector;
use std::collections::HashMap;
use core::schema::Term;
use core::postings::Postings;
use core::error::Result;
pub struct Searcher {
segments: Vec<SegmentReader>,
segments_idx: HashMap<SegmentId, usize>,
}
#[derive(Debug)]
pub struct DocAddress(pub SegmentId, pub DocId);
impl Searcher {
pub fn for_directory(directory: Directory) -> Searcher {
let mut segment_readers: Vec<SegmentReader> = Vec::new();
for segment in directory.segments().into_iter() {
match SegmentReader::open(segment.clone()) {
Ok(segment_reader) => {
segment_readers.push(segment_reader);
}
Err(err) => {
// TODO return err
println!("Error while opening {:?}, {:?}", segment, err);
}
}
}
pub fn get_doc(&self, doc_address: &DocAddress) -> Document {
// TODO err
let DocAddress(ref segment_id, ref doc_id) = *doc_address;
let segment_ord = self.segments_idx.get(&segment_id).unwrap();
let segment_reader = &self.segments[segment_ord.clone()];
segment_reader.get_doc(doc_id)
}
fn add_segment(&mut self, segment: Segment) -> Result<()> {
SegmentReader::open(segment.clone())
.map(|segment_reader| {
let segment_ord = self.segments.len();
self.segments.push(segment_reader);
self.segments_idx.insert(segment.id(), segment_ord);
})
}
pub fn new() -> Searcher {
Searcher {
segments: segment_readers
segments: Vec::new(),
segments_idx: HashMap::new(),
}
}
}
impl Searcher {
pub fn for_directory(directory: Directory) -> Searcher {
let mut searcher = Searcher::new();
for segment in directory.segments().into_iter() {
searcher.add_segment(segment);
}
searcher
}
pub fn search(&self, terms: &Vec<Term>, collector: &mut Collector) {
for segment in &self.segments {

View File

@@ -19,6 +19,8 @@ use std::io::Seek;
use lz4;
use tempfile;
// TODO cache uncompressed pages
const BLOCK_SIZE: usize = 262144;
pub struct StoreWriter {
@@ -103,7 +105,7 @@ impl StoreWriter {
}
struct StoreReader {
pub struct StoreReader {
data: MmapReadOnly,
offsets: Vec<OffsetIndex>,
current_block: RefCell<Vec<u8>>,
@@ -119,10 +121,11 @@ impl StoreReader {
Vec::deserialize(&mut cursor).unwrap()
}
fn block_offset(&self, doc_id: DocId) -> OffsetIndex {
fn block_offset(&self, doc_id: &DocId) -> OffsetIndex {
let mut offset = OffsetIndex(0, 0);
for &OffsetIndex(first_doc_id, block_offset) in self.offsets.iter() {
if first_doc_id > doc_id {
println!("First doc id {}", first_doc_id);
if first_doc_id > *doc_id {
break;
}
else {
@@ -143,12 +146,13 @@ impl StoreReader {
lz4_decoder.read_to_end(&mut current_block_mut);
}
pub fn get(&self, doc_id: DocId) -> Document {
pub fn get(&self, doc_id: &DocId) -> Document {
let OffsetIndex(first_doc_id, block_offset) = self.block_offset(doc_id);
self.read_block(block_offset as usize);
let mut current_block_mut = self.current_block.borrow_mut();
let mut cursor = Cursor::new(&mut current_block_mut[..]);
for _ in first_doc_id..doc_id {
println!("{} / {}", first_doc_id, doc_id);
for _ in first_doc_id..*doc_id {
let block_length = u32::deserialize(&mut cursor).unwrap();
cursor.seek(SeekFrom::Current(block_length as i64));
}
@@ -209,5 +213,7 @@ fn test_store() {
let store_mmap = MmapReadOnly::open(&store_file).unwrap();
let store = StoreReader::new(store_mmap);
assert_eq!(offsets, store.offsets);
assert_eq!(store.get(4093).get_one(&field_title).unwrap(), "Doc 4093");
for i in 0..10000 {
assert_eq!(*store.get(&i).get_one(&field_title).unwrap(), format!("Doc {}", i));
}
}

View File

@@ -155,6 +155,7 @@ impl SegmentWriter {
loop {
match tokens.next() {
Some(token) => {
// println!("TOKEN :{}:", token);
let term = Term::from_field_text(&field_value.field, token);
self.suscribe(doc_id, term);
self.num_tokens += 1;