mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-21 18:50:42 +00:00
Added a bunch of tests, changed read_postings to work on a term
This commit is contained in:
@@ -95,9 +95,10 @@ impl<'a> PostingsMerger<'a> {
|
||||
{
|
||||
let offset = self.doc_offsets[heap_item.segment_ord];
|
||||
let reader = &self.readers[heap_item.segment_ord];
|
||||
let segment_postings = reader.read_postings(&heap_item.term_info);
|
||||
let offset_postings = OffsetPostings::new(segment_postings, offset);
|
||||
segment_postings_list.push(offset_postings);
|
||||
// TODO FIX MERGER!!!!!!!!!
|
||||
// let segment_postings = reader.read_postings(&heap_item.term_info);
|
||||
// let offset_postings = OffsetPostings::new(segment_postings, offset);
|
||||
// segment_postings_list.push(offset_postings);
|
||||
}
|
||||
self.push_next_segment_el(heap_item.segment_ord);
|
||||
}
|
||||
|
||||
@@ -57,27 +57,4 @@ impl Searcher {
|
||||
/// Runs `query` with `collector` against this searcher.
///
/// This is a thin wrapper: it calls `query.search(self, collector)` and
/// returns that call's `io::Result<TimerTree>` unchanged.
pub fn search<Q: Query, C: Collector>(&self, query: &Q, collector: &mut C) -> io::Result<TimerTree> {
|
||||
query.search(self, collector)
|
||||
}
|
||||
|
||||
// pub fn search<C: Collector>(&self, terms: &Vec<Term>, collector: &mut C) -> io::Result<TimerTree> {
|
||||
// let mut timer_tree = TimerTree::new();
|
||||
// {
|
||||
// let mut search_timer = timer_tree.open("search");
|
||||
// for (segment_ord, segment) in self.segments.iter().enumerate() {
|
||||
// let mut segment_search_timer = search_timer.open("segment_search");
|
||||
// {
|
||||
// let _ = segment_search_timer.open("set_segment");
|
||||
// try!(collector.set_segment(segment_ord as SegmentLocalId, &segment));
|
||||
// }
|
||||
// let mut postings = segment.search(terms, segment_search_timer.open("get_postings"));
|
||||
// {
|
||||
// let _collection_timer = segment_search_timer.open("collection");
|
||||
// while postings.next() {
|
||||
// collector.collect(postings.doc());
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// Ok(timer_tree)
|
||||
// }
|
||||
|
||||
}
|
||||
|
||||
@@ -22,7 +22,7 @@ use postings::intersection;
|
||||
use schema::FieldEntry;
|
||||
use schema::Schema;
|
||||
use schema::FieldValue;
|
||||
|
||||
use postings::FreqHandler;
|
||||
|
||||
pub struct SegmentReader {
|
||||
segment_info: SegmentInfo,
|
||||
@@ -35,6 +35,8 @@ pub struct SegmentReader {
|
||||
}
|
||||
|
||||
impl SegmentReader {
|
||||
|
||||
|
||||
/// Returns the highest document id ever attributed in
|
||||
/// this segment + 1.
|
||||
/// Today, `tantivy` does not handle deletes so, it happens
|
||||
@@ -43,6 +45,21 @@ impl SegmentReader {
|
||||
self.segment_info.max_doc
|
||||
}
|
||||
|
||||
pub fn get_fast_field_reader(&self, field: Field) -> io::Result<U32FastFieldReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
match *field_entry {
|
||||
FieldEntry::Text(_, _) => {
|
||||
Err(io::Error::new(io::ErrorKind::Other, "fast field are not yet supported for text fields."))
|
||||
},
|
||||
FieldEntry::U32(_, _) => {
|
||||
// TODO check that the schema allows that
|
||||
//Err(io::Error::new(io::ErrorKind::Other, "fast field are not yet supported for text fields."))
|
||||
self.fast_fields_reader.get_field(field)
|
||||
},
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// Returns a shared reference to the `StoreReader` held by this
/// segment reader (the `store_reader` field).
pub fn get_store_reader(&self) -> &StoreReader {
|
||||
&self.store_reader
|
||||
}
|
||||
@@ -73,12 +90,10 @@ impl SegmentReader {
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/// Returns a shared reference to the `FstMap<TermInfo>` held by this
/// segment reader (the `term_infos` field).
pub fn term_infos(&self) -> &FstMap<TermInfo> {
|
||||
&self.term_infos
|
||||
}
|
||||
|
||||
|
||||
/// Returns the document (or to be accurate, its stored field)
|
||||
/// bearing the given doc id.
|
||||
/// This method is slow and should seldom be called from
|
||||
@@ -87,88 +102,34 @@ impl SegmentReader {
|
||||
self.store_reader.get(doc_id)
|
||||
}
|
||||
|
||||
pub fn get_fast_field_reader(&self, field: Field) -> io::Result<U32FastFieldReader> {
|
||||
pub fn read_postings(&self, term: &Term) -> Option<SegmentPostings> {
|
||||
let field = term.get_field();
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
match *field_entry {
|
||||
FieldEntry::Text(_, _) => {
|
||||
Err(io::Error::new(io::ErrorKind::Other, "fast field are not yet supported for text fields."))
|
||||
},
|
||||
FieldEntry::U32(_, _) => {
|
||||
// TODO check that the schema allows that
|
||||
//Err(io::Error::new(io::ErrorKind::Other, "fast field are not yet supported for text fields."))
|
||||
self.fast_fields_reader.get_field(field)
|
||||
},
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
pub fn read_postings(&self, term_info: &TermInfo) -> SegmentPostings {
|
||||
let term_info = get!(self.get_term_info(&term));
|
||||
let offset = term_info.postings_offset as usize;
|
||||
let postings_data = &self.postings_data.as_slice()[offset..];
|
||||
SegmentPostings::from_data(term_info.doc_freq, &postings_data)
|
||||
}
|
||||
|
||||
// TODO better error handling
|
||||
pub fn read_postings_with_positions(&self, field_value: &FieldValue) -> SegmentPostings {
|
||||
let field = field_value.field();
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
match field_entry {
|
||||
let postings_data = &self.postings_data[offset..];
|
||||
let freq_handler = match field_entry {
|
||||
&FieldEntry::Text(_, ref options) => {
|
||||
if !options.get_indexing_options().is_position_enabled() {
|
||||
panic!("Position not indexed");
|
||||
}
|
||||
if options.get_indexing_options().is_termfreq_enabled() {
|
||||
FreqHandler::new_freq_reader()
|
||||
}
|
||||
else {
|
||||
FreqHandler::NoFreq
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
panic!("Expected text field, got {:?}", field_entry);
|
||||
}
|
||||
}
|
||||
let term = field_value.to_term();
|
||||
let term_info = self.get_term(&term).unwrap();
|
||||
let offset = term_info.postings_offset as usize;
|
||||
let postings_data = &self.postings_data[offset..];
|
||||
SegmentPostings::from_data(term_info.doc_freq, &postings_data)
|
||||
};
|
||||
Some(SegmentPostings::from_data(term_info.doc_freq, &postings_data, freq_handler))
|
||||
}
|
||||
|
||||
pub fn get_term<'a>(&'a self, term: &Term) -> Option<TermInfo> {
|
||||
pub fn get_term_info<'a>(&'a self, term: &Term) -> Option<TermInfo> {
|
||||
self.term_infos.get(term.as_slice())
|
||||
}
|
||||
|
||||
/// Returns the list of doc ids containing all of the
|
||||
/// given terms.
|
||||
pub fn search<'a, 'b>(&'b self, terms: &Vec<Term>, mut timer: OpenTimer<'a>) -> Box<Postings + 'b> {
|
||||
if terms.len() == 1 {
|
||||
match self.get_term(&terms[0]) {
|
||||
Some(term_info) => {
|
||||
let postings: SegmentPostings<'b> = self.read_postings(&term_info);
|
||||
Box::new(postings)
|
||||
},
|
||||
None => {
|
||||
Box::new(SegmentPostings::empty())
|
||||
},
|
||||
}
|
||||
} else {
|
||||
let mut segment_postings: Vec<SegmentPostings> = Vec::new();
|
||||
{
|
||||
let mut decode_timer = timer.open("decode_all");
|
||||
for term in terms.iter() {
|
||||
match self.get_term(term) {
|
||||
Some(term_info) => {
|
||||
let _decode_one_timer = decode_timer.open("decode_one");
|
||||
let segment_posting = self.read_postings(&term_info);
|
||||
segment_postings.push(segment_posting);
|
||||
}
|
||||
None => {
|
||||
// currently this is a strict intersection.
|
||||
return Box::new(SegmentPostings::empty());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Box::new(intersection(segment_postings))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl fmt::Debug for SegmentReader {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "SegmentReader({:?})", self.segment_id)
|
||||
|
||||
@@ -27,6 +27,13 @@ extern crate num_cpus;
|
||||
#[cfg(test)] extern crate test;
|
||||
#[cfg(test)] extern crate rand;
|
||||
|
||||
/// Crate-internal helper macros. `#[macro_use]` makes them visible to the
/// code that follows this module declaration.
#[macro_use]
mod macros {
    /// Unwraps an `Option` expression.
    ///
    /// Evaluates to the inner value when the expression is `Some`; otherwise
    /// makes the enclosing function return `None` immediately.
    macro_rules! get {
        ($e:expr) => {
            match $e {
                Some(value) => value,
                None => return None,
            }
        };
    }
}
|
||||
|
||||
mod core;
|
||||
mod datastruct;
|
||||
mod postings;
|
||||
@@ -56,6 +63,8 @@ pub use self::common::TimerTree;
|
||||
/// as they are added in the segment.
|
||||
pub type DocId = u32;
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
|
||||
@@ -90,9 +90,7 @@ impl<'a> Postings for IntersectionPostings<'a> {
|
||||
Ordering::Greater => {
|
||||
return SkipResult::OverStep;
|
||||
}
|
||||
Ordering::Less => {
|
||||
//
|
||||
}
|
||||
Ordering::Less => {}
|
||||
}
|
||||
if !self.next() {
|
||||
return SkipResult::End;
|
||||
@@ -101,9 +99,6 @@ impl<'a> Postings for IntersectionPostings<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#[inline(never)]
|
||||
pub fn intersection<'a>(postings: Vec<SegmentPostings<'a>>) -> IntersectionPostings<'a> {
|
||||
let boxed_postings: Vec<Box<Postings + 'a>> = postings
|
||||
@@ -114,24 +109,4 @@ pub fn intersection<'a>(postings: Vec<SegmentPostings<'a>>) -> IntersectionPosti
|
||||
})
|
||||
.collect();
|
||||
IntersectionPostings::new(boxed_postings)
|
||||
// let min_len = postings.iter()
|
||||
// .map(|v| v.len())
|
||||
// .min()
|
||||
// .unwrap();
|
||||
// let buffer: Vec<u32> = postings.pop().unwrap().0;
|
||||
// let mut output: Vec<u32> = Vec::with_capacity(min_len);
|
||||
// unsafe {
|
||||
// output.set_len(min_len);
|
||||
// }
|
||||
// let mut pair = (output, buffer);
|
||||
// for posting in postings.iter() {
|
||||
// pair = (pair.1, pair.0);
|
||||
// let output_len = compression::intersection(posting.0.as_slice(),
|
||||
// pair.0.as_slice(),
|
||||
// pair.1.as_mut_slice());
|
||||
// unsafe {
|
||||
// pair.1.set_len(output_len);
|
||||
// }
|
||||
// }
|
||||
// SegmentPostings(pair.1)
|
||||
}
|
||||
@@ -51,6 +51,29 @@ mod tests {
|
||||
let read = segment.open_read(SegmentComponent::POSITIONS).unwrap();
|
||||
assert_eq!(read.len(), 12);
|
||||
}
|
||||
|
||||
/// Checks that `IntersectionPostings` yields, in order, exactly the doc ids
/// present in every input list, and then reports that it is exhausted.
#[test]
fn test_intersection() {
    // Two lists: the shared doc ids are 3 and 9.
    {
        let first = Box::new(VecPostings::new(vec!(1, 3, 9)));
        let second = Box::new(VecPostings::new(vec!(3, 4, 9, 18)));
        let mut common = IntersectionPostings::new(vec!(first, second));
        for &expected_doc in [3, 9].iter() {
            assert!(common.next());
            assert_eq!(common.doc(), expected_doc);
        }
        assert!(!common.next());
    }
    // Three lists: only doc id 9 appears in all of them.
    {
        let first = Box::new(VecPostings::new(vec!(1, 3, 9)));
        let second = Box::new(VecPostings::new(vec!(3, 4, 9, 18)));
        let third = Box::new(VecPostings::new(vec!(1, 5, 9, 111)));
        let mut common = IntersectionPostings::new(vec!(first, second, third));
        for &expected_doc in [9].iter() {
            assert!(common.next());
            assert_eq!(common.doc(), expected_doc);
        }
        assert!(!common.next());
    }
}
|
||||
|
||||
}
|
||||
|
||||
@@ -58,27 +81,10 @@ mod tests {
|
||||
|
||||
// #[cfg(test)]
|
||||
// mod tests {
|
||||
//
|
||||
|
||||
// use super::*;
|
||||
// use test::Bencher;
|
||||
// #[test]
|
||||
// fn test_intersection() {
|
||||
// {
|
||||
// let left = VecPostings::new(vec!(1, 3, 9));
|
||||
// let right = VecPostings::new(vec!(3, 4, 9, 18));
|
||||
// let inter = IntersectionPostings::from_postings(vec!(left, right));
|
||||
// let vals: Vec<DocId> = inter.collect();
|
||||
// assert_eq!(vals, vec!(3, 9));
|
||||
// }
|
||||
// {
|
||||
// let a = VecPostings::new(vec!(1, 3, 9));
|
||||
// let b = VecPostings::new(vec!(3, 4, 9, 18));
|
||||
// let c = VecPostings::new(vec!(1, 5, 9, 111));
|
||||
// let inter = IntersectionPostings::from_postings(vec!(a, b, c));
|
||||
// let vals: Vec<DocId> = inter.collect();
|
||||
// assert_eq!(vals, vec!(9));
|
||||
// }
|
||||
// }
|
||||
|
||||
//
|
||||
// #[bench]
|
||||
// fn bench_single_intersection(b: &mut Bencher) {
|
||||
|
||||
@@ -12,7 +12,7 @@ pub struct SegmentPostings<'a> {
|
||||
doc_freq: usize,
|
||||
doc_offset: u32,
|
||||
block_decoder: SIMDBlockDecoder,
|
||||
freq_reader: FreqHandler,
|
||||
freq_handler: FreqHandler,
|
||||
remaining_data: &'a [u8],
|
||||
cur: Wrapping<usize>,
|
||||
}
|
||||
@@ -26,7 +26,7 @@ impl<'a> SegmentPostings<'a> {
|
||||
doc_freq: 0,
|
||||
doc_offset: 0,
|
||||
block_decoder: SIMDBlockDecoder::new(),
|
||||
freq_reader: FreqHandler::NoFreq,
|
||||
freq_handler: FreqHandler::NoFreq,
|
||||
remaining_data: &EMPTY_ARRAY,
|
||||
cur: Wrapping(usize::max_value()),
|
||||
}
|
||||
@@ -36,21 +36,21 @@ impl<'a> SegmentPostings<'a> {
|
||||
let num_remaining_docs = self.doc_freq - self.cur.0;
|
||||
if num_remaining_docs >= NUM_DOCS_PER_BLOCK {
|
||||
self.remaining_data = self.block_decoder.uncompress_block_sorted(self.remaining_data, self.doc_offset);
|
||||
self.remaining_data = self.freq_reader.read_freq_block(self.remaining_data);
|
||||
self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data);
|
||||
self.doc_offset = self.block_decoder.output()[NUM_DOCS_PER_BLOCK - 1];
|
||||
}
|
||||
else {
|
||||
self.remaining_data = self.block_decoder.uncompress_vint_sorted(self.remaining_data, self.doc_offset, num_remaining_docs);
|
||||
self.freq_reader.read_freq_vint(self.remaining_data, num_remaining_docs);
|
||||
self.freq_handler.read_freq_vint(self.remaining_data, num_remaining_docs);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_data(doc_freq: u32, data: &'a [u8]) -> SegmentPostings<'a> {
|
||||
pub fn from_data(doc_freq: u32, data: &'a [u8], freq_handler: FreqHandler) -> SegmentPostings<'a> {
|
||||
SegmentPostings {
|
||||
doc_freq: doc_freq as usize,
|
||||
doc_offset: 0,
|
||||
block_decoder: SIMDBlockDecoder::new(),
|
||||
freq_reader: FreqHandler::new_freq_reader(),
|
||||
freq_handler: freq_handler,
|
||||
remaining_data: data,
|
||||
cur: Wrapping(usize::max_value()),
|
||||
}
|
||||
|
||||
@@ -51,9 +51,8 @@ impl MultiTermQuery {
|
||||
|
||||
fn search_segment<'a, 'b>(&'b self, reader: &'b SegmentReader, mut timer: OpenTimer<'a>) -> Box<Postings + 'b> {
|
||||
if self.terms.len() == 1 {
|
||||
match reader.get_term(&self.terms[0]) {
|
||||
Some(term_info) => {
|
||||
let postings: SegmentPostings<'b> = reader.read_postings(&term_info);
|
||||
match reader.read_postings(&self.terms[0]) {
|
||||
Some(postings) => {
|
||||
Box::new(postings)
|
||||
},
|
||||
None => {
|
||||
@@ -65,11 +64,10 @@ impl MultiTermQuery {
|
||||
{
|
||||
let mut decode_timer = timer.open("decode_all");
|
||||
for term in self.terms.iter() {
|
||||
match reader.get_term(term) {
|
||||
Some(term_info) => {
|
||||
let _decode_one_timer = decode_timer.open("decode_one");
|
||||
let segment_posting = reader.read_postings(&term_info);
|
||||
segment_postings.push(segment_posting);
|
||||
let _decode_one_timer = decode_timer.open("decode_one");
|
||||
match reader.read_postings(term) {
|
||||
Some(postings) => {
|
||||
segment_postings.push(postings);
|
||||
}
|
||||
None => {
|
||||
// currently this is a strict intersection.
|
||||
|
||||
Reference in New Issue
Block a user