From 77fc25da490a252275144dfd59b91a886f78a2d0 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 21 Feb 2016 17:39:16 +0900 Subject: [PATCH] werwer --- src/core/collector.rs | 6 +---- src/core/directory.rs | 2 +- src/core/reader.rs | 24 ++++++++++--------- src/core/searcher.rs | 55 ++++++++++++++++++++++++++++++------------- src/core/store.rs | 18 +++++++++----- src/core/writer.rs | 1 + 6 files changed, 66 insertions(+), 40 deletions(-) diff --git a/src/core/collector.rs b/src/core/collector.rs index 90728727a..25e168306 100644 --- a/src/core/collector.rs +++ b/src/core/collector.rs @@ -1,16 +1,13 @@ use core::DocId; use core::reader::SegmentReader; use core::directory::SegmentId; - +use core::searcher::DocAddress; pub trait Collector { fn set_segment(&mut self, segment: &SegmentReader); fn collect(&mut self, doc_id: DocId); } -#[derive(Debug)] -pub struct DocAddress(pub SegmentId, pub DocId); - pub struct TestCollector { docs: Vec, current_segment: Option, @@ -29,7 +26,6 @@ impl TestCollector { } } - impl Collector for TestCollector { fn set_segment(&mut self, segment: &SegmentReader) { diff --git a/src/core/directory.rs b/src/core/directory.rs index 97b781464..79fb288a3 100644 --- a/src/core/directory.rs +++ b/src/core/directory.rs @@ -23,7 +23,7 @@ use atomicwrites; use tempdir::TempDir; use std::io::Read; -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq, Eq, Hash)] pub struct SegmentId(pub String); pub fn generate_segment_name() -> SegmentId { diff --git a/src/core/reader.rs b/src/core/reader.rs index a4393c161..ed3f0869c 100644 --- a/src/core/reader.rs +++ b/src/core/reader.rs @@ -2,6 +2,8 @@ use core::directory::Directory; use core::directory::{Segment, SegmentId}; use std::collections::BinaryHeap; use core::schema::Term; +use core::store::StoreReader; +use core::schema::Document; use fst::Streamer; use fst; use std::io; @@ -25,9 +27,9 @@ pub struct SegmentReader { segment: Segment, term_offsets: fst::Map, postings_data: MmapReadOnly, + store_reader: StoreReader, } - pub struct SegmentPostings { doc_id: usize, doc_ids: Vec, @@ -95,10 +97,6 @@ impl Iterator for SegmentPostings { } } - - - - impl SegmentReader { pub fn id(&self,) -> SegmentId { @@ -114,14 +112,19 @@ impl SegmentReader { return Err(Error::FSTFormat(format!("The file {:?} does not seem to be a valid term to offset transducer.", filepath))); } }; + let store_reader = StoreReader::new(try!(segment.mmap(SegmentComponent::STORE))); let postings_shared_mmap = try!(segment.mmap(SegmentComponent::POSTINGS)); Ok(SegmentReader { postings_data: postings_shared_mmap, term_offsets: term_offsets, segment: segment, + store_reader: store_reader, }) } + pub fn get_doc(&self, doc_id: &DocId) -> Document { + self.store_reader.get(doc_id) + } pub fn read_postings(&self, offset: usize) -> SegmentPostings { let postings_data = unsafe {&self.postings_data.as_slice()[offset..]}; @@ -129,12 +132,9 @@ impl SegmentReader { } pub fn get_term<'a>(&'a self, term: &Term) -> Option { - match self.term_offsets.get(term.as_slice()) { - Some(offset) => { - Some(self.read_postings(offset as usize)) - }, - None => None, - } + self.term_offsets + .get(term.as_slice()) + .map(|offset| self.read_postings(offset as usize)) } pub fn search(&self, terms: &Vec) -> IntersectionPostings { @@ -143,9 +143,11 @@ impl SegmentReader { for term in terms.iter() { match self.get_term(term) { Some(segment_posting) => { + println!("term found {:?}", term); segment_postings.push(segment_posting); } None => { + println!("not found {:?}", term); segment_postings.clear(); segment_postings.push(SegmentPostings::empty()); break; diff --git a/src/core/searcher.rs b/src/core/searcher.rs index b7733fbec..facccc213 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -1,35 +1,56 @@ use core::reader::SegmentReader; use core::directory::Directory; +use core::directory::SegmentId; +use core::global::DocId; +use core::schema::Document; +use core::directory::Segment; use core::collector::Collector; +use std::collections::HashMap; use core::schema::Term; use core::postings::Postings; +use core::error::Result; pub struct Searcher { segments: Vec, + segments_idx: HashMap, } +#[derive(Debug)] +pub struct DocAddress(pub SegmentId, pub DocId); + impl Searcher { - pub fn for_directory(directory: Directory) -> Searcher { - let mut segment_readers: Vec = Vec::new(); - for segment in directory.segments().into_iter() { - match SegmentReader::open(segment.clone()) { - Ok(segment_reader) => { - segment_readers.push(segment_reader); - } - Err(err) => { - // TODO return err - println!("Error while opening {:?}, {:?}", segment, err); - } - } - } + + pub fn get_doc(&self, doc_address: &DocAddress) -> Document { + // TODO err + let DocAddress(ref segment_id, ref doc_id) = *doc_address; + let segment_ord = self.segments_idx.get(&segment_id).unwrap(); + let segment_reader = &self.segments[segment_ord.clone()]; + segment_reader.get_doc(doc_id) + } + + fn add_segment(&mut self, segment: Segment) -> Result<()> { + SegmentReader::open(segment.clone()) + .map(|segment_reader| { + let segment_ord = self.segments.len(); + self.segments.push(segment_reader); + self.segments_idx.insert(segment.id(), segment_ord); + }) + } + + pub fn new() -> Searcher { Searcher { - segments: segment_readers + segments: Vec::new(), + segments_idx: HashMap::new(), } } -} - -impl Searcher { + pub fn for_directory(directory: Directory) -> Searcher { + let mut searcher = Searcher::new(); + for segment in directory.segments().into_iter() { + searcher.add_segment(segment); + } + searcher + } pub fn search(&self, terms: &Vec, collector: &mut Collector) { for segment in &self.segments { diff --git a/src/core/store.rs b/src/core/store.rs index 859496499..36777c0ec 100644 --- a/src/core/store.rs +++ b/src/core/store.rs @@ -19,6 +19,8 @@ use std::io::Seek; use lz4; use tempfile; +// TODO cache uncompressed pages + const BLOCK_SIZE: usize = 262144; pub struct StoreWriter { @@ -103,7 +105,7 @@ impl StoreWriter { } -struct StoreReader { +pub struct StoreReader { data: MmapReadOnly, offsets: Vec, current_block: RefCell>, @@ -119,10 +121,11 @@ impl StoreReader { Vec::deserialize(&mut cursor).unwrap() } - fn block_offset(&self, doc_id: DocId) -> OffsetIndex { + fn block_offset(&self, doc_id: &DocId) -> OffsetIndex { let mut offset = OffsetIndex(0, 0); for &OffsetIndex(first_doc_id, block_offset) in self.offsets.iter() { - if first_doc_id > doc_id { + println!("First doc id {}", first_doc_id); + if first_doc_id > *doc_id { break; } else { @@ -143,12 +146,13 @@ impl StoreReader { lz4_decoder.read_to_end(&mut current_block_mut); } - pub fn get(&self, doc_id: DocId) -> Document { + pub fn get(&self, doc_id: &DocId) -> Document { let OffsetIndex(first_doc_id, block_offset) = self.block_offset(doc_id); self.read_block(block_offset as usize); let mut current_block_mut = self.current_block.borrow_mut(); let mut cursor = Cursor::new(&mut current_block_mut[..]); - for _ in first_doc_id..doc_id { + println!("{} / {}", first_doc_id, doc_id); + for _ in first_doc_id..*doc_id { let block_length = u32::deserialize(&mut cursor).unwrap(); cursor.seek(SeekFrom::Current(block_length as i64)); } @@ -209,5 +213,7 @@ fn test_store() { let store_mmap = MmapReadOnly::open(&store_file).unwrap(); let store = StoreReader::new(store_mmap); assert_eq!(offsets, store.offsets); - assert_eq!(store.get(4093).get_one(&field_title).unwrap(), "Doc 4093"); + for i in 0..10000 { + assert_eq!(*store.get(&i).get_one(&field_title).unwrap(), format!("Doc {}", i)); + } } diff --git a/src/core/writer.rs b/src/core/writer.rs index bca453226..eefbfb822 100644 --- a/src/core/writer.rs +++ b/src/core/writer.rs @@ -155,6 +155,7 @@ impl SegmentWriter { loop { match tokens.next() { Some(token) => { + // println!("TOKEN :{}:", token); let term = Term::from_field_text(&field_value.field, token); self.suscribe(doc_id, term); self.num_tokens += 1;