Added a bunch of tests, changed read_postings to work on a term

This commit is contained in:
Paul Masurel
2016-06-20 21:27:02 +09:00
parent 55507a8e0f
commit 36684d76c5
8 changed files with 84 additions and 157 deletions

View File

@@ -95,9 +95,10 @@ impl<'a> PostingsMerger<'a> {
{
let offset = self.doc_offsets[heap_item.segment_ord];
let reader = &self.readers[heap_item.segment_ord];
let segment_postings = reader.read_postings(&heap_item.term_info);
let offset_postings = OffsetPostings::new(segment_postings, offset);
segment_postings_list.push(offset_postings);
// TODO FIX MERGER!!!!!!!!!
// let segment_postings = reader.read_postings(&heap_item.term_info);
// let offset_postings = OffsetPostings::new(segment_postings, offset);
// segment_postings_list.push(offset_postings);
}
self.push_next_segment_el(heap_item.segment_ord);
}

View File

@@ -57,27 +57,4 @@ impl Searcher {
/// Runs `query` against this searcher.
///
/// This is a thin wrapper: it hands `self` and `collector` to
/// `query.search(..)` and returns that call's result unchanged.
pub fn search<Q: Query, C: Collector>(&self, query: &Q, collector: &mut C) -> io::Result<TimerTree> {
query.search(self, collector)
}
// pub fn search<C: Collector>(&self, terms: &Vec<Term>, collector: &mut C) -> io::Result<TimerTree> {
// let mut timer_tree = TimerTree::new();
// {
// let mut search_timer = timer_tree.open("search");
// for (segment_ord, segment) in self.segments.iter().enumerate() {
// let mut segment_search_timer = search_timer.open("segment_search");
// {
// let _ = segment_search_timer.open("set_segment");
// try!(collector.set_segment(segment_ord as SegmentLocalId, &segment));
// }
// let mut postings = segment.search(terms, segment_search_timer.open("get_postings"));
// {
// let _collection_timer = segment_search_timer.open("collection");
// while postings.next() {
// collector.collect(postings.doc());
// }
// }
// }
// }
// Ok(timer_tree)
// }
}

View File

@@ -22,7 +22,7 @@ use postings::intersection;
use schema::FieldEntry;
use schema::Schema;
use schema::FieldValue;
use postings::FreqHandler;
pub struct SegmentReader {
segment_info: SegmentInfo,
@@ -35,6 +35,8 @@ pub struct SegmentReader {
}
impl SegmentReader {
/// Returns the highest document id ever attributed in
/// this segment + 1.
/// Today, `tantivy` does not handle deletes so, it happens
@@ -43,6 +45,21 @@ impl SegmentReader {
self.segment_info.max_doc
}
/// Returns the fast field reader associated with `field`.
///
/// Only `u32` fields are served, by delegating to the segment's
/// fast fields reader. Asking for a text field yields an
/// `io::Error` of kind `Other`.
pub fn get_fast_field_reader(&self, field: Field) -> io::Result<U32FastFieldReader> {
    match *self.schema.get_field_entry(field) {
        FieldEntry::U32(_, _) => {
            // TODO check that the schema allows that
            self.fast_fields_reader.get_field(field)
        }
        FieldEntry::Text(_, _) => {
            let message = "fast field are not yet supported for text fields.";
            Err(io::Error::new(io::ErrorKind::Other, message))
        }
    }
}
/// Returns a shared reference to this segment's `StoreReader`.
pub fn get_store_reader(&self) -> &StoreReader {
&self.store_reader
}
@@ -73,12 +90,10 @@ impl SegmentReader {
})
}
/// Returns a shared reference to the `FstMap` holding the `TermInfo`
/// entries of this segment.
pub fn term_infos(&self) -> &FstMap<TermInfo> {
&self.term_infos
}
/// Returns the document (or to be accurate, its stored field)
/// bearing the given doc id.
/// This method is slow and should seldom be called from
@@ -87,88 +102,34 @@ impl SegmentReader {
self.store_reader.get(doc_id)
}
pub fn get_fast_field_reader(&self, field: Field) -> io::Result<U32FastFieldReader> {
pub fn read_postings(&self, term: &Term) -> Option<SegmentPostings> {
let field = term.get_field();
let field_entry = self.schema.get_field_entry(field);
match *field_entry {
FieldEntry::Text(_, _) => {
Err(io::Error::new(io::ErrorKind::Other, "fast field are not yet supported for text fields."))
},
FieldEntry::U32(_, _) => {
// TODO check that the schema allows that
//Err(io::Error::new(io::ErrorKind::Other, "fast field are not yet supported for text fields."))
self.fast_fields_reader.get_field(field)
},
}
}
pub fn read_postings(&self, term_info: &TermInfo) -> SegmentPostings {
let term_info = get!(self.get_term_info(&term));
let offset = term_info.postings_offset as usize;
let postings_data = &self.postings_data.as_slice()[offset..];
SegmentPostings::from_data(term_info.doc_freq, &postings_data)
}
// TODO better error handling
pub fn read_postings_with_positions(&self, field_value: &FieldValue) -> SegmentPostings {
let field = field_value.field();
let field_entry = self.schema.get_field_entry(field);
match field_entry {
let postings_data = &self.postings_data[offset..];
let freq_handler = match field_entry {
&FieldEntry::Text(_, ref options) => {
if !options.get_indexing_options().is_position_enabled() {
panic!("Position not indexed");
}
if options.get_indexing_options().is_termfreq_enabled() {
FreqHandler::new_freq_reader()
}
else {
FreqHandler::NoFreq
}
}
_ => {
panic!("Expected text field, got {:?}", field_entry);
}
}
let term = field_value.to_term();
let term_info = self.get_term(&term).unwrap();
let offset = term_info.postings_offset as usize;
let postings_data = &self.postings_data[offset..];
SegmentPostings::from_data(term_info.doc_freq, &postings_data)
};
Some(SegmentPostings::from_data(term_info.doc_freq, &postings_data, freq_handler))
}
pub fn get_term<'a>(&'a self, term: &Term) -> Option<TermInfo> {
pub fn get_term_info<'a>(&'a self, term: &Term) -> Option<TermInfo> {
self.term_infos.get(term.as_slice())
}
/// Returns the list of doc ids containing all of the
/// given terms.
///
/// When exactly one term is given, its postings are returned directly
/// (or an empty `SegmentPostings` if the term is absent from this
/// segment). Otherwise the postings of every term are read and handed
/// to `intersection`; as soon as one term is missing, an empty
/// `SegmentPostings` is returned instead.
///
/// `timer` is used to open the "decode_all" / "decode_one" timers
/// around the postings reads of the multi-term path.
pub fn search<'a, 'b>(&'b self, terms: &Vec<Term>, mut timer: OpenTimer<'a>) -> Box<Postings + 'b> {
// Single-term case: no intersection is needed.
if terms.len() == 1 {
match self.get_term(&terms[0]) {
Some(term_info) => {
let postings: SegmentPostings<'b> = self.read_postings(&term_info);
Box::new(postings)
},
None => {
Box::new(SegmentPostings::empty())
},
}
} else {
// One `SegmentPostings` per term, intersected at the end.
let mut segment_postings: Vec<SegmentPostings> = Vec::new();
{
let mut decode_timer = timer.open("decode_all");
for term in terms.iter() {
match self.get_term(term) {
Some(term_info) => {
let _decode_one_timer = decode_timer.open("decode_one");
let segment_posting = self.read_postings(&term_info);
segment_postings.push(segment_posting);
}
None => {
// currently this is a strict intersection.
return Box::new(SegmentPostings::empty());
}
}
}
}
Box::new(intersection(segment_postings))
}
}
}
impl fmt::Debug for SegmentReader {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "SegmentReader({:?})", self.segment_id)

View File

@@ -27,6 +27,13 @@ extern crate num_cpus;
#[cfg(test)] extern crate test;
#[cfg(test)] extern crate rand;
#[macro_use]
mod macros {
    /// Unwraps an `Option` expression inside a function that itself
    /// returns an `Option`.
    ///
    /// Evaluates to the inner value when the expression is `Some`;
    /// otherwise makes the enclosing function return `None` right away.
    macro_rules! get {
        ($e:expr) => {
            match $e {
                Some(value) => value,
                None => return None,
            }
        };
    }
}
mod core;
mod datastruct;
mod postings;
@@ -56,6 +63,8 @@ pub use self::common::TimerTree;
/// as they are added in the segment.
pub type DocId = u32;
#[cfg(test)]
mod tests {

View File

@@ -90,9 +90,7 @@ impl<'a> Postings for IntersectionPostings<'a> {
Ordering::Greater => {
return SkipResult::OverStep;
}
Ordering::Less => {
//
}
Ordering::Less => {}
}
if !self.next() {
return SkipResult::End;
@@ -101,9 +99,6 @@ impl<'a> Postings for IntersectionPostings<'a> {
}
}
#[inline(never)]
pub fn intersection<'a>(postings: Vec<SegmentPostings<'a>>) -> IntersectionPostings<'a> {
let boxed_postings: Vec<Box<Postings + 'a>> = postings
@@ -114,24 +109,4 @@ pub fn intersection<'a>(postings: Vec<SegmentPostings<'a>>) -> IntersectionPosti
})
.collect();
IntersectionPostings::new(boxed_postings)
// let min_len = postings.iter()
// .map(|v| v.len())
// .min()
// .unwrap();
// let buffer: Vec<u32> = postings.pop().unwrap().0;
// let mut output: Vec<u32> = Vec::with_capacity(min_len);
// unsafe {
// output.set_len(min_len);
// }
// let mut pair = (output, buffer);
// for posting in postings.iter() {
// pair = (pair.1, pair.0);
// let output_len = compression::intersection(posting.0.as_slice(),
// pair.0.as_slice(),
// pair.1.as_mut_slice());
// unsafe {
// pair.1.set_len(output_len);
// }
// }
// SegmentPostings(pair.1)
}

View File

@@ -51,6 +51,29 @@ mod tests {
let read = segment.open_read(SegmentComponent::POSITIONS).unwrap();
assert_eq!(read.len(), 12);
}
/// Checks that `IntersectionPostings` yields exactly the doc ids shared
/// by all of its inputs, in increasing order, for two and three lists.
#[test]
fn test_intersection() {
    {
        // Two lists: the common doc ids are 3 and 9.
        let left = Box::new(VecPostings::new(vec![1, 3, 9]));
        let right = Box::new(VecPostings::new(vec![3, 4, 9, 18]));
        let mut intersection = IntersectionPostings::new(vec![left, right]);
        for expected_doc in &[3, 9] {
            assert!(intersection.next());
            assert_eq!(intersection.doc(), *expected_doc);
        }
        assert!(!intersection.next());
    }
    {
        // Three lists: only doc id 9 appears in all of them.
        let a = Box::new(VecPostings::new(vec![1, 3, 9]));
        let b = Box::new(VecPostings::new(vec![3, 4, 9, 18]));
        let c = Box::new(VecPostings::new(vec![1, 5, 9, 111]));
        let mut intersection = IntersectionPostings::new(vec![a, b, c]);
        assert!(intersection.next());
        assert_eq!(intersection.doc(), 9);
        assert!(!intersection.next());
    }
}
}
@@ -58,27 +81,10 @@ mod tests {
// #[cfg(test)]
// mod tests {
//
// use super::*;
// use test::Bencher;
// #[test]
// fn test_intersection() {
// {
// let left = VecPostings::new(vec!(1, 3, 9));
// let right = VecPostings::new(vec!(3, 4, 9, 18));
// let inter = IntersectionPostings::from_postings(vec!(left, right));
// let vals: Vec<DocId> = inter.collect();
// assert_eq!(vals, vec!(3, 9));
// }
// {
// let a = VecPostings::new(vec!(1, 3, 9));
// let b = VecPostings::new(vec!(3, 4, 9, 18));
// let c = VecPostings::new(vec!(1, 5, 9, 111));
// let inter = IntersectionPostings::from_postings(vec!(a, b, c));
// let vals: Vec<DocId> = inter.collect();
// assert_eq!(vals, vec!(9));
// }
// }
//
// #[bench]
// fn bench_single_intersection(b: &mut Bencher) {

View File

@@ -12,7 +12,7 @@ pub struct SegmentPostings<'a> {
doc_freq: usize,
doc_offset: u32,
block_decoder: SIMDBlockDecoder,
freq_reader: FreqHandler,
freq_handler: FreqHandler,
remaining_data: &'a [u8],
cur: Wrapping<usize>,
}
@@ -26,7 +26,7 @@ impl<'a> SegmentPostings<'a> {
doc_freq: 0,
doc_offset: 0,
block_decoder: SIMDBlockDecoder::new(),
freq_reader: FreqHandler::NoFreq,
freq_handler: FreqHandler::NoFreq,
remaining_data: &EMPTY_ARRAY,
cur: Wrapping(usize::max_value()),
}
@@ -36,21 +36,21 @@ impl<'a> SegmentPostings<'a> {
let num_remaining_docs = self.doc_freq - self.cur.0;
if num_remaining_docs >= NUM_DOCS_PER_BLOCK {
self.remaining_data = self.block_decoder.uncompress_block_sorted(self.remaining_data, self.doc_offset);
self.remaining_data = self.freq_reader.read_freq_block(self.remaining_data);
self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data);
self.doc_offset = self.block_decoder.output()[NUM_DOCS_PER_BLOCK - 1];
}
else {
self.remaining_data = self.block_decoder.uncompress_vint_sorted(self.remaining_data, self.doc_offset, num_remaining_docs);
self.freq_reader.read_freq_vint(self.remaining_data, num_remaining_docs);
self.freq_handler.read_freq_vint(self.remaining_data, num_remaining_docs);
}
}
pub fn from_data(doc_freq: u32, data: &'a [u8]) -> SegmentPostings<'a> {
pub fn from_data(doc_freq: u32, data: &'a [u8], freq_handler: FreqHandler) -> SegmentPostings<'a> {
SegmentPostings {
doc_freq: doc_freq as usize,
doc_offset: 0,
block_decoder: SIMDBlockDecoder::new(),
freq_reader: FreqHandler::new_freq_reader(),
freq_handler: freq_handler,
remaining_data: data,
cur: Wrapping(usize::max_value()),
}

View File

@@ -51,9 +51,8 @@ impl MultiTermQuery {
fn search_segment<'a, 'b>(&'b self, reader: &'b SegmentReader, mut timer: OpenTimer<'a>) -> Box<Postings + 'b> {
if self.terms.len() == 1 {
match reader.get_term(&self.terms[0]) {
Some(term_info) => {
let postings: SegmentPostings<'b> = reader.read_postings(&term_info);
match reader.read_postings(&self.terms[0]) {
Some(postings) => {
Box::new(postings)
},
None => {
@@ -65,11 +64,10 @@ impl MultiTermQuery {
{
let mut decode_timer = timer.open("decode_all");
for term in self.terms.iter() {
match reader.get_term(term) {
Some(term_info) => {
let _decode_one_timer = decode_timer.open("decode_one");
let segment_posting = reader.read_postings(&term_info);
segment_postings.push(segment_posting);
let _decode_one_timer = decode_timer.open("decode_one");
match reader.read_postings(term) {
Some(postings) => {
segment_postings.push(postings);
}
None => {
// currently this is a strict intersection.