Added timer info.

This commit is contained in:
Paul Masurel
2016-04-13 10:14:48 +09:00
parent 32ede06fd3
commit 0087ec67b2
5 changed files with 139 additions and 67 deletions

View File

@@ -20,6 +20,7 @@ tempdir = "0.3.4"
bincode = "0.4.0"
serde = "0.6.11"
libc = "0.2.6"
argparse = "*"
num_cpus = "0.2"
lz4 = "1.13.131"
time = "0.1.34"
@@ -30,3 +31,7 @@ rand = "0.3.13"
[build-dependencies]
gcc = "0.3.24"
[[bin]]
name = "tantivy-merge"
path = "src/cli/merge.rs"

View File

@@ -15,6 +15,7 @@ pub mod index;
pub mod fastfield;
pub mod fastdivide;
pub mod merger;
pub mod timer;
use std::error;
use std::io;

View File

@@ -7,20 +7,23 @@ use core::directory::ReadOnlySource;
use std::io::Cursor;
use core::schema::DocId;
use core::index::SegmentComponent;
use core::postings::Postings;
use core::simdcompression::Decoder;
use std::io;
use std::iter;
use std::str;
use core::postings::TermInfo;
use core::fstmap::FstMap;
use std::fmt;
use rustc_serialize::json;
use core::index::SegmentInfo;
use core::timer::OpenTimer;
use core::schema::U32Field;
use core::convert_to_ioerror;
use core::serialize::BinarySerializable;
use core::fastfield::U32FastFieldsReader;
use core::fastfield::U32FastFieldReader;
use core::simdcompression;
use std::mem;
impl fmt::Debug for SegmentReader {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
@@ -28,68 +31,95 @@ impl fmt::Debug for SegmentReader {
}
}
pub struct SegmentPostings {
doc_id: usize,
doc_ids: Vec<DocId>,
#[inline(never)]
pub fn intersection(mut postings: Vec<SegmentPostings>) -> SegmentPostings {
let min_len = postings
.iter()
.map(|v| v.len())
.min()
.unwrap();
let mut buffer: Vec<u32> = postings.pop().unwrap().0;
let mut output: Vec<u32> = Vec::with_capacity(min_len);
unsafe { output.set_len(min_len); }
let mut pair = (output, buffer);
for posting in postings.iter() {
pair = (pair.1, pair.0);
let output_len = simdcompression::intersection(posting.0.as_slice(), pair.0.as_slice(), pair.1.as_mut_slice());
unsafe { pair.1.set_len(output_len); }
}
SegmentPostings(pair.1)
}
pub struct SegmentPostings(Vec<DocId>);
impl IntoIterator for SegmentPostings {
type Item = DocId;
type IntoIter = ::std::vec::IntoIter<DocId>;
fn into_iter(self) -> Self::IntoIter {
self.0.into_iter()
}
}
impl SegmentPostings {
pub fn empty()-> SegmentPostings {
SegmentPostings {
doc_id: 0,
doc_ids: Vec::new(),
}
SegmentPostings(Vec::new())
}
pub fn len(&self,) -> usize {
self.0.len()
}
pub fn from_data(doc_freq: DocId, data: &[u8]) -> SegmentPostings {
let mut cursor = Cursor::new(data);
let data: Vec<u32> = Vec::deserialize(&mut cursor).unwrap();
let mut doc_ids: Vec<u32> = (0u32..doc_freq).collect();
let decoder = Decoder::new();
let num_doc_ids = decoder.decode_sorted(&data, &mut doc_ids);
doc_ids.truncate(num_doc_ids);
SegmentPostings {
doc_ids: doc_ids,
doc_id: 0,
let num_u32s = u32::deserialize(&mut cursor).unwrap();
let data_u32: &[u32] = unsafe { mem::transmute(data) };
let mut doc_ids: Vec<u32> = Vec::with_capacity(doc_freq as usize);
unsafe { doc_ids.set_len(doc_freq as usize); }
{
let decoder = Decoder::new();
let num_doc_ids = decoder.decode_sorted(&data_u32[1..(num_u32s+1) as usize], &mut doc_ids);
SegmentPostings(doc_ids)
}
}
}
//
// impl Postings for SegmentPostings {
// fn skip_next(&mut self, target: DocId) -> Option<DocId> {
// loop {
// match Iterator::next(self) {
// Some(val) if val >= target => {
// return Some(val);
// },
// None => {
// return None;
// },
// _ => {}
// }
// }
// }
// }
impl Postings for SegmentPostings {
fn skip_next(&mut self, target: DocId) -> Option<DocId> {
loop {
match Iterator::next(self) {
Some(val) if val >= target => {
return Some(val);
},
None => {
return None;
},
_ => {}
}
}
}
}
//
// impl Iterator for SegmentPostings {
//
// type Item = DocId;
//
// fn next(&mut self,) -> Option<DocId> {
// if self.doc_id < self.doc_ids.len() {
// let res = Some(self.doc_ids[self.doc_id]);
// self.doc_id += 1;
// return res;
// }
// else {
// None
// }
// }
// }
impl Iterator for SegmentPostings {
type Item = DocId;
fn next(&mut self,) -> Option<DocId> {
if self.doc_id < self.doc_ids.len() {
let res = Some(self.doc_ids[self.doc_id]);
self.doc_id += 1;
return res;
}
else {
None
}
}
}
pub struct SegmentReader {
segment_info: SegmentInfo,
segment_id: SegmentId,
@@ -165,22 +195,40 @@ impl SegmentReader {
/// Returns the list of doc ids containing all of the
/// given terms.
pub fn search(&self, terms: &Vec<Term>) -> IntersectionPostings<SegmentPostings> {
let mut segment_postings: Vec<SegmentPostings> = Vec::new();
for term in terms.iter() {
match self.get_term(term) {
pub fn search<'a>(&self, terms: &Vec<Term>, mut timer: OpenTimer<'a>) -> SegmentPostings {
if terms.len() == 1 {
match self.get_term(&terms[0]) {
Some(term_info) => {
let segment_posting = self.read_postings(&term_info);
segment_postings.push(segment_posting);
self.read_postings(&term_info)
}
None => {
segment_postings.clear();
segment_postings.push(SegmentPostings::empty());
break;
SegmentPostings::empty()
}
}
}
IntersectionPostings::from_postings(segment_postings)
else {
let mut segment_postings: Vec<SegmentPostings> = Vec::new();
{
let mut decode_timer = timer.open("decode_all");
for term in terms.iter() {
match self.get_term(term) {
Some(term_info) => {
let decode_one_timer = decode_timer.open("decode_one");
let segment_posting = self.read_postings(&term_info);
segment_postings.push(segment_posting);
}
None => {
// currently this is a strict intersection.
return SegmentPostings::empty();
}
}
}
}
{
let mut intersection_time = timer.open("intersection");
intersection(segment_postings)
}
}
}
}

View File

@@ -5,6 +5,7 @@ use core::schema::DocId;
use core::schema::Document;
use core::collector::Collector;
use std::io;
use core::timer::TimerTree;
use core::schema::Term;
#[derive(Debug)]
@@ -48,15 +49,26 @@ impl Searcher {
Ok(searcher)
}
pub fn search(&self, terms: &Vec<Term>, collector: &mut Collector) -> io::Result<()> {
for (segment_ord, segment) in self.segments.iter().enumerate() {
try!(collector.set_segment(segment_ord as SegmentLocalId, &segment));
let postings = segment.search(terms);
for doc_id in postings {
collector.collect(doc_id);
pub fn search<C: Collector>(&self, terms: &Vec<Term>, collector: &mut C) -> io::Result<TimerTree> {
let mut timer_tree = TimerTree::new();
{
let mut search_timer = timer_tree.open("search");
for (segment_ord, segment) in self.segments.iter().enumerate() {
let mut segment_search_timer = search_timer.open("segment_search");
{
let set_segment_timer = segment_search_timer.open("set_segment");
try!(collector.set_segment(segment_ord as SegmentLocalId, &segment));
}
let postings = segment.search(terms, segment_search_timer.open("get_postings"));
{
let collection_timer = segment_search_timer.open("collection");
for doc_id in postings {
collector.collect(doc_id);
}
}
}
}
Ok(())
Ok(timer_tree)
}
}

View File

@@ -1,15 +1,21 @@
use libc::size_t;
use std::ptr;
// use std::cmp::min;
// use std::iter;
extern {
// fn encode_unsorted_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t;
// fn decode_unsorted_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
// fn intersection_native(left_data: *const u32, left_size: size_t, right_data: *const u32, right_size: size_t, output: *mut u32) -> size_t;
fn intersection_native(left_data: *const u32, left_size: size_t, right_data: *const u32, right_size: size_t, output: *mut u32) -> size_t;
fn encode_sorted_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t;
fn decode_sorted_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
}
pub fn intersection(left: &[u32], right: &[u32], output: &mut [u32]) -> usize {
unsafe {
intersection_native(
left.as_ptr(), left.len(),
right.as_ptr(), right.len(),
output.as_mut_ptr())
}
}
pub struct Encoder {