diff --git a/src/core/merger.rs b/src/core/merger.rs index 9a4071a35..ea84eb166 100644 --- a/src/core/merger.rs +++ b/src/core/merger.rs @@ -95,9 +95,10 @@ impl<'a> PostingsMerger<'a> { { let offset = self.doc_offsets[heap_item.segment_ord]; let reader = &self.readers[heap_item.segment_ord]; - let segment_postings = reader.read_postings(&heap_item.term_info); - let offset_postings = OffsetPostings::new(segment_postings, offset); - segment_postings_list.push(offset_postings); + // TODO FIX MERGER!!!!!!!!! + // let segment_postings = reader.read_postings(&heap_item.term_info); + // let offset_postings = OffsetPostings::new(segment_postings, offset); + // segment_postings_list.push(offset_postings); } self.push_next_segment_el(heap_item.segment_ord); } diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 021e77e82..f3d04e65d 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -57,27 +57,4 @@ impl Searcher { pub fn search(&self, query: &Q, collector: &mut C) -> io::Result { query.search(self, collector) } - - // pub fn search(&self, terms: &Vec, collector: &mut C) -> io::Result { - // let mut timer_tree = TimerTree::new(); - // { - // let mut search_timer = timer_tree.open("search"); - // for (segment_ord, segment) in self.segments.iter().enumerate() { - // let mut segment_search_timer = search_timer.open("segment_search"); - // { - // let _ = segment_search_timer.open("set_segment"); - // try!(collector.set_segment(segment_ord as SegmentLocalId, &segment)); - // } - // let mut postings = segment.search(terms, segment_search_timer.open("get_postings")); - // { - // let _collection_timer = segment_search_timer.open("collection"); - // while postings.next() { - // collector.collect(postings.doc()); - // } - // } - // } - // } - // Ok(timer_tree) - // } - } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 097f420e2..f0a95ba1b 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -22,7 +22,7 @@ use postings::intersection; use schema::FieldEntry; use schema::Schema; use schema::FieldValue; - +use postings::FreqHandler; pub struct SegmentReader { segment_info: SegmentInfo, @@ -35,6 +35,8 @@ pub struct SegmentReader { } impl SegmentReader { + + /// Returns the highest document id ever attributed in /// this segment + 1. /// Today, `tantivy` does not handle deletes so, it happens @@ -43,6 +45,21 @@ impl SegmentReader { self.segment_info.max_doc } + pub fn get_fast_field_reader(&self, field: Field) -> io::Result { + let field_entry = self.schema.get_field_entry(field); + match *field_entry { + FieldEntry::Text(_, _) => { + Err(io::Error::new(io::ErrorKind::Other, "fast field are not yet supported for text fields.")) + }, + FieldEntry::U32(_, _) => { + // TODO check that the schema allows that + //Err(io::Error::new(io::ErrorKind::Other, "fast field are not yet supported for text fields.")) + self.fast_fields_reader.get_field(field) + }, + } + + } + pub fn get_store_reader(&self) -> &StoreReader { &self.store_reader } @@ -73,12 +90,10 @@ impl SegmentReader { }) } - pub fn term_infos(&self) -> &FstMap { &self.term_infos } - /// Returns the document (or to be accurate, its stored field) /// bearing the given doc id. /// This method is slow and should seldom be called from @@ -87,88 +102,34 @@ impl SegmentReader { self.store_reader.get(doc_id) } - pub fn get_fast_field_reader(&self, field: Field) -> io::Result { + pub fn read_postings(&self, term: &Term) -> Option { + let field = term.get_field(); let field_entry = self.schema.get_field_entry(field); - match *field_entry { - FieldEntry::Text(_, _) => { - Err(io::Error::new(io::ErrorKind::Other, "fast field are not yet supported for text fields.")) - }, - FieldEntry::U32(_, _) => { - // TODO check that the schema allows that - //Err(io::Error::new(io::ErrorKind::Other, "fast field are not yet supported for text fields.")) - self.fast_fields_reader.get_field(field) - }, - } - - } - - pub fn read_postings(&self, term_info: &TermInfo) -> SegmentPostings { + let term_info = get!(self.get_term_info(&term)); let offset = term_info.postings_offset as usize; - let postings_data = &self.postings_data.as_slice()[offset..]; - SegmentPostings::from_data(term_info.doc_freq, &postings_data) - } - - // TODO better error handling - pub fn read_postings_with_positions(&self, field_value: &FieldValue) -> SegmentPostings { - let field = field_value.field(); - let field_entry = self.schema.get_field_entry(field); - match field_entry { + let postings_data = &self.postings_data[offset..]; + let freq_handler = match field_entry { &FieldEntry::Text(_, ref options) => { - if !options.get_indexing_options().is_position_enabled() { - panic!("Position not indexed"); - } + if options.get_indexing_options().is_termfreq_enabled() { + FreqHandler::new_freq_reader() + } + else { + FreqHandler::NoFreq + } } _ => { panic!("Expected text field, got {:?}", field_entry); } - } - let term = field_value.to_term(); - let term_info = self.get_term(&term).unwrap(); - let offset = term_info.postings_offset as usize; - let postings_data = &self.postings_data[offset..]; - SegmentPostings::from_data(term_info.doc_freq, &postings_data) + }; + Some(SegmentPostings::from_data(term_info.doc_freq, &postings_data, freq_handler)) } - pub fn get_term<'a>(&'a self, term: &Term) -> Option { + pub fn get_term_info<'a>(&'a self, term: &Term) -> Option { self.term_infos.get(term.as_slice()) } - - /// Returns the list of doc ids containing all of the - /// given terms. - pub fn search<'a, 'b>(&'b self, terms: &Vec, mut timer: OpenTimer<'a>) -> Box { - if terms.len() == 1 { - match self.get_term(&terms[0]) { - Some(term_info) => { - let postings: SegmentPostings<'b> = self.read_postings(&term_info); - Box::new(postings) - }, - None => { - Box::new(SegmentPostings::empty()) - }, - } - } else { - let mut segment_postings: Vec = Vec::new(); - { - let mut decode_timer = timer.open("decode_all"); - for term in terms.iter() { - match self.get_term(term) { - Some(term_info) => { - let _decode_one_timer = decode_timer.open("decode_one"); - let segment_posting = self.read_postings(&term_info); - segment_postings.push(segment_posting); - } - None => { - // currently this is a strict intersection. - return Box::new(SegmentPostings::empty()); - } - } - } - } - Box::new(intersection(segment_postings)) - } - } } + impl fmt::Debug for SegmentReader { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "SegmentReader({:?})", self.segment_id) diff --git a/src/lib.rs b/src/lib.rs index 53f80c517..f7bae468b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,6 +27,13 @@ extern crate num_cpus; #[cfg(test)] extern crate test; #[cfg(test)] extern crate rand; +#[macro_use] +mod macros { + macro_rules! get( + ($e:expr) => (match $e { Some(e) => e, None => return None }) + ); +} + mod core; mod datastruct; mod postings; @@ -56,6 +63,8 @@ pub use self::common::TimerTree; /// as they are added in the segment. pub type DocId = u32; + + #[cfg(test)] mod tests { diff --git a/src/postings/intersection.rs b/src/postings/intersection.rs index 68c7b1a26..b2a65c879 100644 --- a/src/postings/intersection.rs +++ b/src/postings/intersection.rs @@ -90,9 +90,7 @@ impl<'a> Postings for IntersectionPostings<'a> { Ordering::Greater => { return SkipResult::OverStep; } - Ordering::Less => { - // - } + Ordering::Less => {} } if !self.next() { return SkipResult::End; @@ -101,9 +99,6 @@ impl<'a> Postings for IntersectionPostings<'a> { } } - - - #[inline(never)] pub fn intersection<'a>(postings: Vec>) -> IntersectionPostings<'a> { let boxed_postings: Vec> = postings @@ -114,24 +109,4 @@ pub fn intersection<'a>(postings: Vec>) -> IntersectionPosti }) .collect(); IntersectionPostings::new(boxed_postings) - // let min_len = postings.iter() - // .map(|v| v.len()) - // .min() - // .unwrap(); - // let buffer: Vec = postings.pop().unwrap().0; - // let mut output: Vec = Vec::with_capacity(min_len); - // unsafe { - // output.set_len(min_len); - // } - // let mut pair = (output, buffer); - // for posting in postings.iter() { - // pair = (pair.1, pair.0); - // let output_len = compression::intersection(posting.0.as_slice(), - // pair.0.as_slice(), - // pair.1.as_mut_slice()); - // unsafe { - // pair.1.set_len(output_len); - // } - // } - // SegmentPostings(pair.1) } \ No newline at end of file diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 10eb9f050..1590639df 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -51,6 +51,29 @@ mod tests { let read = segment.open_read(SegmentComponent::POSITIONS).unwrap(); assert_eq!(read.len(), 12); } + + #[test] + fn test_intersection() { + { + let left = Box::new(VecPostings::new(vec!(1, 3, 9))); + let right = Box::new(VecPostings::new(vec!(3, 4, 9, 18))); + let mut intersection = IntersectionPostings::new(vec!(left, right)); + assert!(intersection.next()); + assert_eq!(intersection.doc(), 3); + assert!(intersection.next()); + assert_eq!(intersection.doc(), 9); + assert!(!intersection.next()); + } + { + let a = Box::new(VecPostings::new(vec!(1, 3, 9))); + let b = Box::new(VecPostings::new(vec!(3, 4, 9, 18))); + let c = Box::new(VecPostings::new(vec!(1, 5, 9, 111))); + let mut intersection = IntersectionPostings::new(vec!(a, b, c)); + assert!(intersection.next()); + assert_eq!(intersection.doc(), 9); + assert!(!intersection.next()); + } + } } @@ -58,27 +81,10 @@ mod tests { // #[cfg(test)] // mod tests { -// + // use super::*; // use test::Bencher; -// #[test] -// fn test_intersection() { -// { -// let left = VecPostings::new(vec!(1, 3, 9)); -// let right = VecPostings::new(vec!(3, 4, 9, 18)); -// let inter = IntersectionPostings::from_postings(vec!(left, right)); -// let vals: Vec = inter.collect(); -// assert_eq!(vals, vec!(3, 9)); -// } -// { -// let a = VecPostings::new(vec!(1, 3, 9)); -// let b = VecPostings::new(vec!(3, 4, 9, 18)); -// let c = VecPostings::new(vec!(1, 5, 9, 111)); -// let inter = IntersectionPostings::from_postings(vec!(a, b, c)); -// let vals: Vec = inter.collect(); -// assert_eq!(vals, vec!(9)); -// } -// } + // // #[bench] // fn bench_single_intersection(b: &mut Bencher) { diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 3dce489d8..d49c573af 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -12,7 +12,7 @@ pub struct SegmentPostings<'a> { doc_freq: usize, doc_offset: u32, block_decoder: SIMDBlockDecoder, - freq_reader: FreqHandler, + freq_handler: FreqHandler, remaining_data: &'a [u8], cur: Wrapping, } @@ -26,7 +26,7 @@ impl<'a> SegmentPostings<'a> { doc_freq: 0, doc_offset: 0, block_decoder: SIMDBlockDecoder::new(), - freq_reader: FreqHandler::NoFreq, + freq_handler: FreqHandler::NoFreq, remaining_data: &EMPTY_ARRAY, cur: Wrapping(usize::max_value()), } @@ -36,21 +36,21 @@ impl<'a> SegmentPostings<'a> { let num_remaining_docs = self.doc_freq - self.cur.0; if num_remaining_docs >= NUM_DOCS_PER_BLOCK { self.remaining_data = self.block_decoder.uncompress_block_sorted(self.remaining_data, self.doc_offset); - self.remaining_data = self.freq_reader.read_freq_block(self.remaining_data); + self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data); self.doc_offset = self.block_decoder.output()[NUM_DOCS_PER_BLOCK - 1]; } else { self.remaining_data = self.block_decoder.uncompress_vint_sorted(self.remaining_data, self.doc_offset, num_remaining_docs); - self.freq_reader.read_freq_vint(self.remaining_data, num_remaining_docs); + self.freq_handler.read_freq_vint(self.remaining_data, num_remaining_docs); } } - pub fn from_data(doc_freq: u32, data: &'a [u8]) -> SegmentPostings<'a> { + pub fn from_data(doc_freq: u32, data: &'a [u8], freq_handler: FreqHandler) -> SegmentPostings<'a> { SegmentPostings { doc_freq: doc_freq as usize, doc_offset: 0, block_decoder: SIMDBlockDecoder::new(), - freq_reader: FreqHandler::new_freq_reader(), + freq_handler: freq_handler, remaining_data: data, cur: Wrapping(usize::max_value()), } diff --git a/src/query/multi_term_query.rs b/src/query/multi_term_query.rs index 1a7d83f84..aa2ecfbb4 100644 --- a/src/query/multi_term_query.rs +++ b/src/query/multi_term_query.rs @@ -51,9 +51,8 @@ impl MultiTermQuery { fn search_segment<'a, 'b>(&'b self, reader: &'b SegmentReader, mut timer: OpenTimer<'a>) -> Box { if self.terms.len() == 1 { - match reader.get_term(&self.terms[0]) { - Some(term_info) => { - let postings: SegmentPostings<'b> = reader.read_postings(&term_info); + match reader.read_postings(&self.terms[0]) { + Some(postings) => { Box::new(postings) }, None => { @@ -65,11 +64,10 @@ impl MultiTermQuery { { let mut decode_timer = timer.open("decode_all"); for term in self.terms.iter() { - match reader.get_term(term) { - Some(term_info) => { - let _decode_one_timer = decode_timer.open("decode_one"); - let segment_posting = reader.read_postings(&term_info); - segment_postings.push(segment_posting); + let _decode_one_timer = decode_timer.open("decode_one"); + match reader.read_postings(term) { + Some(postings) => { + segment_postings.push(postings); } None => { // currently this is a strict intersection.