Positions writes to an external Vec

This commit is contained in:
Paul Masurel
2018-02-24 11:14:45 +09:00
parent be830b03c5
commit 23387b0ed0
4 changed files with 74 additions and 66 deletions

View File

@@ -314,14 +314,15 @@ impl IndexMerger {
{
// we make sure to only write the term iff
// there is at least one document.
let positions: &[u32] = segment_postings.positions();
let term_freq = segment_postings.term_freq();
let delta_positions = delta_computer.compute_delta(positions);
field_serializer.write_doc(
remapped_doc_id,
term_freq,
delta_positions,
)?;
unreachable!();
// let positions: &[u32] = segment_postings.positions();
// let term_freq = segment_postings.term_freq();
// let delta_positions = delta_computer.compute_delta(positions);
// field_serializer.write_doc(
// remapped_doc_id,
// term_freq,
// delta_positions,
// )?;
}
if !segment_postings.advance() {
break;

View File

@@ -16,9 +16,5 @@ pub trait Postings: DocSet + 'static {
/// Returns the list of positions of the term, expressed as a list of
/// token ordinals.
fn positions_with_offset(&self, offset: u32) -> &[u32];
fn positions(&self) -> &[u32] {
self.positions_with_offset(0u32)
}
fn positions_with_offset(&self, offset: u32, output: &mut Vec<u32>);
}

View File

@@ -23,7 +23,6 @@ struct PositionComputer {
// if none, positions are already loaded in
// the positions vec.
position_to_skip: Option<usize>,
positions: Vec<u32>,
positions_stream: CompressedIntStream,
}
@@ -31,7 +30,6 @@ impl PositionComputer {
pub fn new(positions_stream: CompressedIntStream) -> PositionComputer {
PositionComputer {
position_to_skip: None,
positions: vec![],
positions_stream,
}
}
@@ -44,26 +42,19 @@ impl PositionComputer {
);
}
pub fn positions(&mut self, term_freq: usize, offset: u32) -> &[u32] {
pub fn positions(&mut self, offset: u32, output: &mut [u32]) {
let term_freq = output.len();
if let Some(num_skip) = self.position_to_skip {
let capacity = self.positions.capacity();
if capacity < term_freq {
let extra_capacity = term_freq - self.positions.len();
self.positions.reserve(extra_capacity);
}
unsafe {self.positions.set_len(term_freq)};
self.positions_stream.skip(num_skip);
let positions_buf = &mut self.positions[..term_freq];
self.positions_stream.read(positions_buf);
self.positions_stream.read(output);
self.position_to_skip = None;
let mut cum = offset;
for position_mut in positions_buf.iter_mut() {
cum += *position_mut;
*position_mut = cum;
for output_mut in output.iter_mut() {
cum += *output_mut;
*output_mut = cum;
}
positions_buf
} else {
&self.positions[..term_freq]
panic!("Failed positions");
}
}
}
@@ -321,14 +312,21 @@ impl<TDeleteSet: DeleteSet> Postings for SegmentPostings<TDeleteSet> {
self.block_cursor.freq(self.cur)
}
fn positions_with_offset(&self, offset: u32) -> &[u32] {
let term_freq = self.term_freq();
self.position_computer
.as_ref()
.map(|position_computer| unsafe {
(&mut *position_computer.get()).positions(term_freq as usize, offset)
})
.unwrap_or(&EMPTY_POSITIONS[..])
fn positions_with_offset(&self, offset: u32, output: &mut Vec<u32>) {
if let Some(ref position_computer) = self.position_computer.as_ref() {
let prev_capacity = output.capacity();
let term_freq = self.term_freq() as usize;
if term_freq > prev_capacity {
let additional_len = term_freq - output.len();
output.reserve(additional_len);
}
unsafe {
output.set_len(term_freq);
(&mut *position_computer.get()).positions(offset, &mut output[..])
}
} else {
unimplemented!("You may not read positions twice!");
}
}
}

View File

@@ -18,8 +18,8 @@ impl<TPostings: Postings> PostingsWithOffset<TPostings> {
}
}
pub fn positions(&self) -> &[u32] {
self.postings.positions_with_offset(self.offset)
pub fn positions(&self, output: &mut Vec<u32>) {
self.postings.positions_with_offset(self.offset, output)
}
}
@@ -44,11 +44,11 @@ impl<TPostings: Postings> DocSet for PostingsWithOffset<TPostings> {
pub struct PhraseScorer<TPostings: Postings> {
intersection_docset: Intersection<PostingsWithOffset<TPostings>, PostingsWithOffset<TPostings>>,
num_docsets: usize,
source: Vec<u32>,
result: Vec<u32>
left: Vec<u32>,
right: Vec<u32>
}
fn intersection_arr(left: &[u32], right: &[u32], output: &mut [u32]) -> usize {
fn intersection_arr(left: &mut [u32], right: &[u32]) -> usize {
let mut left_i = 0;
let mut right_i = 0;
let mut count = 0;
@@ -58,7 +58,7 @@ fn intersection_arr(left: &[u32], right: &[u32], output: &mut [u32]) -> usize {
} else if right[right_i] < left[left_i] {
right_i += 1;
} else {
output[count] = left[left_i];
left[count] = left[left_i];
count+=1;
left_i += 1;
right_i += 1;
@@ -67,6 +67,7 @@ fn intersection_arr(left: &[u32], right: &[u32], output: &mut [u32]) -> usize {
count
}
impl<TPostings: Postings> PhraseScorer<TPostings> {
pub fn new(term_postings: Vec<TPostings>) -> PhraseScorer<TPostings> {
@@ -79,34 +80,18 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
PhraseScorer {
intersection_docset: Intersection::new(postings_with_offsets),
num_docsets,
source: Vec::with_capacity(100),
result: Vec::with_capacity(100)
left: Vec::with_capacity(100),
right: Vec::with_capacity(100)
}
}
fn phrase_match(&mut self) -> bool {
// TODO early exit when we don't care about the phrase frequency
let mut intersection_len;
{
let left = self.intersection_docset.docset(0).positions();
let right = self.intersection_docset.docset(1).positions();
let max_intersection_len = left.len().min(right.len());
if max_intersection_len > self.result.len() {
self.result.resize(max_intersection_len, 0u32);
self.source.resize(max_intersection_len, 0u32)
}
intersection_len = intersection_arr(left, right, &mut self.result[..]);
}
if intersection_len == 0 {
return false;
}
for i in 2..self.num_docsets {
mem::swap(&mut self.source, &mut self.result);
let term_positions = self.intersection_docset.docset(i).positions();
intersection_len = intersection_arr(
&self.source[..intersection_len],
term_positions,
&mut self.result[..]);
self.intersection_docset.docset(0).positions(&mut self.left);
let mut intersection_len = self.left.len();
for i in 1..self.num_docsets {
self.intersection_docset.docset(i).positions(&mut self.right);
intersection_len = intersection_arr(&mut self.left[..intersection_len], &self.right[..]);
if intersection_len == 0 {
return false;
}
@@ -157,3 +142,31 @@ impl<TPostings: Postings> Scorer for PhraseScorer<TPostings> {
1f32
}
}
// Benchmarks for the sorted-array intersection routines of this module.
// Only compiled for test builds (see the `cfg(test)` attribute below).
#[cfg(test)]
mod tests {
use tests;
use test::Bencher;
// NOTE(review): `intersection_avx` is not defined in the portion of this
// change that is visible here — confirm it exists in the parent module.
use super::{intersection_arr, intersection_avx};
/// Times one call to `intersection_arr` per iteration on two inputs produced
/// by `tests::sample_with_seed`, writing into a preallocated 200_000-element
/// zeroed buffer.
///
/// NOTE(review): this passes three arguments (`left`, `right`, `output`), but
/// the `intersection_arr` signature introduced in the same change appears to
/// be `(left: &mut [u32], right: &[u32])`, intersecting in place into `left`.
/// Confirm that this benchmark still compiles against the new signature.
#[bench]
fn bench_intersection(b: &mut Bencher) {
// presumably `sample_with_seed` yields sorted `u32` values — TODO confirm
let left = tests::sample_with_seed(100_000, 0.1, 1);
let right = tests::sample_with_seed(200_000, 0.05, 2);
let mut output = vec![0u32; 200_000];
b.iter(|| {
intersection_arr(&left, &right, &mut output);
});
}
/// Same setup as `bench_intersection`, but times `intersection_avx` instead.
///
/// NOTE(review): the signature of `intersection_avx` is not visible here;
/// verify that it accepts `(left, right, output)` as called below.
#[bench]
fn bench_intersection_avx(b: &mut Bencher) {
let left = tests::sample_with_seed(100_000, 0.1, 1);
let right = tests::sample_with_seed(200_000, 0.05, 2);
let mut output = vec![0u32; 200_000];
b.iter(|| {
intersection_avx(&left, &right, &mut output);
});
}
}