In sync with master. Fixed merging

This commit is contained in:
Paul Masurel
2018-03-19 12:58:42 +09:00
parent b0e5e1f61d
commit 59639cd311
6 changed files with 49 additions and 49 deletions

View File

@@ -58,7 +58,7 @@ impl InvertedIndexReader {
TermDictionaryImpl::empty(field_type),
ReadOnlySource::empty(),
ReadOnlySource::empty(),
DeleteBitSet::empty(),
None,
record_option,
)
}

View File

@@ -6,7 +6,6 @@ use core::SerializableSegment;
use indexer::SegmentSerializer;
use postings::InvertedIndexSerializer;
use itertools::Itertools;
use postings::Postings;
use docset::DocSet;
use fastfield::DeleteBitSet;
use schema::{Field, Schema};
@@ -18,6 +17,7 @@ use std::cmp::{max, min};
use termdict::TermDictionary;
use termdict::TermStreamer;
use postings::DeleteSet;
use postings::Postings;
pub struct IndexMerger {
schema: Schema,
@@ -206,6 +206,8 @@ impl IndexMerger {
}
fn write_postings(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> {
let mut positions_buffer: Vec<u32> = Vec::with_capacity(1_000);
let mut delta_computer = DeltaComputer::new();
let mut indexed_fields = vec![];
@@ -314,15 +316,15 @@ impl IndexMerger {
{
// we make sure to only write the term iff
// there is at least one document.
unreachable!();
// let positions: &[u32] = segment_postings.positions();
// let term_freq = segment_postings.term_freq();
// let delta_positions = delta_computer.compute_delta(positions);
// field_serializer.write_doc(
// remapped_doc_id,
// term_freq,
// delta_positions,
// )?;
let term_freq = segment_postings.term_freq();
segment_postings.positions(&mut positions_buffer);
let delta_positions = delta_computer.compute_delta(&positions_buffer);
field_serializer.write_doc(
remapped_doc_id,
term_freq,
delta_positions,
)?;
}
if !segment_postings.advance() {
break;

View File

@@ -103,15 +103,18 @@ pub mod tests {
let inverted_index = searcher.segment_reader(0u32).inverted_index(title);
let term = Term::from_field_text(title, "abc");
let mut positions = Vec::new();
{
let mut postings = inverted_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
postings.advance();
assert_eq!(&[0, 1, 2], postings.positions());
postings.positions(&mut positions);
assert_eq!(&[0, 1, 2], &positions[..]);
postings.advance();
assert_eq!(&[0, 5], postings.positions());
postings.positions(&mut positions);
assert_eq!(&[0, 5], &positions[..]);
}
{
let mut postings = inverted_index
@@ -119,7 +122,8 @@ pub mod tests {
.unwrap();
postings.advance();
postings.advance();
assert_eq!(&[0, 5], postings.positions());
postings.positions(&mut positions);
assert_eq!(&[0, 5], &positions[..]);
}
{
@@ -128,7 +132,8 @@ pub mod tests {
.unwrap();
assert_eq!(postings.skip_next(1), SkipResult::Reached);
assert_eq!(postings.doc(), 1);
assert_eq!(&[0, 5], postings.positions());
postings.positions(&mut positions);
assert_eq!(&[0, 5], &positions[..]);
}
{
let mut postings = inverted_index
@@ -136,7 +141,8 @@ pub mod tests {
.unwrap();
assert_eq!(postings.skip_next(1002), SkipResult::Reached);
assert_eq!(postings.doc(), 1002);
assert_eq!(&[0, 5], postings.positions());
postings.positions(&mut positions);
assert_eq!(&[0, 5], &positions[..]);
}
{
let mut postings = inverted_index
@@ -145,12 +151,14 @@ pub mod tests {
assert_eq!(postings.skip_next(100), SkipResult::Reached);
assert_eq!(postings.skip_next(1002), SkipResult::Reached);
assert_eq!(postings.doc(), 1002);
assert_eq!(&[0, 5], postings.positions());
postings.positions(&mut positions);
assert_eq!(&[0, 5], &positions[..]);
}
}
#[test]
pub fn test_position_and_fieldnorm1() {
let mut positions = Vec::new();
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
@@ -223,15 +231,16 @@ pub mod tests {
assert!(postings_a.advance());
assert_eq!(postings_a.doc(), 0);
assert_eq!(postings_a.term_freq(), 6);
assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
postings_a.positions(&mut positions);
assert_eq!(&positions[..], [0, 2, 4, 6, 7, 13]);
assert!(postings_a.advance());
assert_eq!(postings_a.doc(), 1u32);
assert_eq!(postings_a.term_freq(), 1);
for i in 2u32..1000u32 {
assert!(postings_a.advance());
assert_eq!(postings_a.term_freq(), 1);
assert_eq!(postings_a.positions(), [i]);
postings_a.positions(&mut positions);
assert_eq!(&positions[..], [i]);
assert_eq!(postings_a.doc(), i);
}
assert!(!postings_a.advance());
@@ -246,7 +255,7 @@ pub mod tests {
for i in 2u32..1000u32 {
assert!(postings_e.advance());
assert_eq!(postings_e.term_freq(), i);
let positions = postings_e.positions();
postings_e.positions(&mut positions);
assert_eq!(positions.len(), i as usize);
for j in 0..positions.len() {
assert_eq!(positions[j], (j as u32));
@@ -260,6 +269,7 @@ pub mod tests {
#[test]
pub fn test_position_and_fieldnorm2() {
let mut positions: Vec<u32> = Vec::new();
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
@@ -288,7 +298,8 @@ pub mod tests {
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 1u32);
assert_eq!(postings.positions(), &[1u32, 4]);
postings.positions(&mut positions);
assert_eq!(&positions[..], &[1u32, 4]);
}
#[test]

View File

@@ -17,4 +17,8 @@ pub trait Postings: DocSet + 'static {
/// Returns the list of positions of the term, expressed as a list of
/// token ordinals.
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>);
/// Fills `output` with the positions of the term for the current document.
///
/// Default implementation: delegates to `positions_with_offset`
/// with an offset of `0`.
fn positions(&mut self, output: &mut Vec<u32>) {
self.positions_with_offset(0u32, output);
}
}

View File

@@ -9,13 +9,10 @@ use std::cmp;
use fst::Streamer;
use compression::compressed_block_size;
use postings::{NoDelete, DeleteSet};
use std::cell::UnsafeCell;
use directory::{ReadOnlySource, SourceRead};
use postings::FreqReadingOption;
use postings::serializer::PostingsSerializer;
const EMPTY_POSITIONS: [u32; 0] = [0u32; 0];
struct PositionComputer {
// store the amount of position int
// before reading positions.
@@ -41,8 +38,7 @@ impl PositionComputer {
}
// Positions can only be read once.
pub fn positions(&mut self, offset: u32, output: &mut [u32]) {
let term_freq = output.len();
pub fn positions_with_offset(&mut self, offset: u32, output: &mut [u32]) {
if let Some(num_skip) = self.position_to_skip {
self.positions_stream.skip(num_skip);
self.positions_stream.read(output);
@@ -183,7 +179,7 @@ impl<TDeleteSet: DeleteSet> DocSet for SegmentPostings<TDeleteSet> {
// add the term freq.
if self.position_computer.is_some() {
let freqs_skipped = &self.block_cursor.freqs()[self.cur..];
let sum_freq: u32 = freqs_skipped.iter().sum()
let sum_freq: u32 = freqs_skipped.iter().sum();
self.position_computer.as_mut()
.unwrap()
.add_skip(sum_freq as usize);
@@ -319,10 +315,10 @@ impl<TDeleteSet: DeleteSet> Postings for SegmentPostings<TDeleteSet> {
}
unsafe {
output.set_len(term_freq);
self.position_computer.as_mut().unwrap().positions(offset, &mut output[..])
self.position_computer.as_mut().unwrap().positions_with_offset(offset, &mut output[..])
}
} else {
unimplemented!("You may not read positions twice!");
output.clear();
}
}
}

View File

@@ -2,7 +2,6 @@ use DocId;
use docset::{DocSet, SkipResult};
use postings::Postings;
use query::{Intersection, Scorer};
use std::mem;
struct PostingsWithOffset<TPostings> {
@@ -48,7 +47,7 @@ pub struct PhraseScorer<TPostings: Postings> {
right: Vec<u32>
}
fn intersection_arr(left: &mut [u32], right: &[u32]) -> usize {
fn intersection_count(left: &[u32], right: &[u32]) -> usize {
let mut left_i = 0;
let mut right_i = 0;
let mut count = 0;
@@ -58,7 +57,6 @@ fn intersection_arr(left: &mut [u32], right: &[u32]) -> usize {
} else if right[right_i] < left[left_i] {
right_i += 1;
} else {
left[count] = left[left_i];
count+=1;
left_i += 1;
right_i += 1;
@@ -95,7 +93,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
{
self.intersection_docset.docset_mut_specialized(i).positions(&mut self.right);
}
intersection_len = intersection_arr(&mut self.left[..intersection_len], &self.right[..]);
intersection_len = intersection_count(&mut self.left[..intersection_len], &self.right[..]);
if intersection_len == 0 {
return false;
}
@@ -152,25 +150,14 @@ mod tests {
use tests;
use test::Bencher;
use super::{intersection_arr, intersection_avx};
use super::intersection_count;
#[bench]
fn bench_intersection(b: &mut Bencher) {
let left = tests::sample_with_seed(100_000, 0.1, 1);
let right = tests::sample_with_seed(200_000, 0.05, 2);
let mut output = vec![0u32; 200_000];
let left = tests::sample_with_seed(10, 0.1, 1);
let right = tests::sample_with_seed(2, 0.05, 2);
b.iter(|| {
intersection_arr(&left, &right, &mut output);
});
}
#[bench]
fn bench_intersection_avx(b: &mut Bencher) {
let left = tests::sample_with_seed(100_000, 0.1, 1);
let right = tests::sample_with_seed(200_000, 0.05, 2);
let mut output = vec![0u32; 200_000];
b.iter(|| {
intersection_avx(&left, &right, &mut output);
intersection_count(&left, &right);
});
}
}