From 59639cd31180375cc3f0e94ffe12817435c687be Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 19 Mar 2018 12:58:42 +0900 Subject: [PATCH] In sync with master. Fixed merging --- src/core/inverted_index_reader.rs | 2 +- src/indexer/merger.rs | 22 +++++++++-------- src/postings/mod.rs | 33 ++++++++++++++++--------- src/postings/postings.rs | 4 +++ src/postings/segment_postings.rs | 12 +++------ src/query/phrase_query/phrase_scorer.rs | 25 +++++-------------- 6 files changed, 49 insertions(+), 49 deletions(-) diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index aff30704c..c6f730f19 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -58,7 +58,7 @@ impl InvertedIndexReader { TermDictionaryImpl::empty(field_type), ReadOnlySource::empty(), ReadOnlySource::empty(), - DeleteBitSet::empty(), + None, record_option, ) } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index bbdf7c2f7..e47943c45 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -6,7 +6,6 @@ use core::SerializableSegment; use indexer::SegmentSerializer; use postings::InvertedIndexSerializer; use itertools::Itertools; -use postings::Postings; use docset::DocSet; use fastfield::DeleteBitSet; use schema::{Field, Schema}; @@ -18,6 +17,7 @@ use std::cmp::{max, min}; use termdict::TermDictionary; use termdict::TermStreamer; use postings::DeleteSet; +use postings::Postings; pub struct IndexMerger { schema: Schema, @@ -206,6 +206,8 @@ impl IndexMerger { } fn write_postings(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> { + + let mut positions_buffer: Vec = Vec::with_capacity(1_000); let mut delta_computer = DeltaComputer::new(); let mut indexed_fields = vec![]; @@ -314,15 +316,15 @@ impl IndexMerger { { // we make sure to only write the term iff // there is at least one document. - unreachable!(); -// let positions: &[u32] = segment_postings.positions(); -// let term_freq = segment_postings.term_freq(); -// let delta_positions = delta_computer.compute_delta(positions); -// field_serializer.write_doc( -// remapped_doc_id, -// term_freq, -// delta_positions, -// )?; + let term_freq = segment_postings.term_freq(); + segment_postings.positions(&mut positions_buffer); + + let delta_positions = delta_computer.compute_delta(&positions_buffer); + field_serializer.write_doc( + remapped_doc_id, + term_freq, + delta_positions, + )?; } if !segment_postings.advance() { break; diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 7734d1d6e..1c30e00a2 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -103,15 +103,18 @@ pub mod tests { let inverted_index = searcher.segment_reader(0u32).inverted_index(title); let term = Term::from_field_text(title, "abc"); + let mut positions = Vec::new(); { let mut postings = inverted_index .read_postings(&term, IndexRecordOption::WithFreqsAndPositions) .unwrap(); postings.advance(); - assert_eq!(&[0, 1, 2], postings.positions()); + postings.positions(&mut positions); + assert_eq!(&[0, 1, 2], &positions[..]); postings.advance(); - assert_eq!(&[0, 5], postings.positions()); + postings.positions(&mut positions); + assert_eq!(&[0, 5], &positions[..]); } { let mut postings = inverted_index @@ -119,7 +122,8 @@ pub mod tests { .unwrap(); postings.advance(); postings.advance(); - assert_eq!(&[0, 5], postings.positions()); + postings.positions(&mut positions); + assert_eq!(&[0, 5], &positions[..]); } { @@ -128,7 +132,8 @@ pub mod tests { .unwrap(); assert_eq!(postings.skip_next(1), SkipResult::Reached); assert_eq!(postings.doc(), 1); - assert_eq!(&[0, 5], postings.positions()); + postings.positions(&mut positions); + assert_eq!(&[0, 5], &positions[..]); } { let mut postings = inverted_index @@ -136,7 +141,8 @@ pub mod tests { .unwrap(); assert_eq!(postings.skip_next(1002), SkipResult::Reached); assert_eq!(postings.doc(), 1002); - assert_eq!(&[0, 5], postings.positions()); + postings.positions(&mut positions); + assert_eq!(&[0, 5], &positions[..]); } { let mut postings = inverted_index @@ -145,12 +151,14 @@ pub mod tests { assert_eq!(postings.skip_next(100), SkipResult::Reached); assert_eq!(postings.skip_next(1002), SkipResult::Reached); assert_eq!(postings.doc(), 1002); - assert_eq!(&[0, 5], postings.positions()); + postings.positions(&mut positions); + assert_eq!(&[0, 5], &positions[..]); } } #[test] pub fn test_position_and_fieldnorm1() { + let mut positions = Vec::new(); let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); @@ -223,15 +231,16 @@ pub mod tests { assert!(postings_a.advance()); assert_eq!(postings_a.doc(), 0); assert_eq!(postings_a.term_freq(), 6); - assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]); - assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]); + postings_a.positions(&mut positions); + assert_eq!(&positions[..], [0, 2, 4, 6, 7, 13]); assert!(postings_a.advance()); assert_eq!(postings_a.doc(), 1u32); assert_eq!(postings_a.term_freq(), 1); for i in 2u32..1000u32 { assert!(postings_a.advance()); assert_eq!(postings_a.term_freq(), 1); - assert_eq!(postings_a.positions(), [i]); + postings_a.positions(&mut positions); + assert_eq!(&positions[..], [i]); assert_eq!(postings_a.doc(), i); } assert!(!postings_a.advance()); @@ -246,7 +255,7 @@ pub mod tests { for i in 2u32..1000u32 { assert!(postings_e.advance()); assert_eq!(postings_e.term_freq(), i); - let positions = postings_e.positions(); + postings_e.positions(&mut positions); assert_eq!(positions.len(), i as usize); for j in 0..positions.len() { assert_eq!(positions[j], (j as u32)); @@ -260,6 +269,7 @@ pub mod tests { #[test] pub fn test_position_and_fieldnorm2() { + let mut positions: Vec = Vec::new(); let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); @@ -288,7 +298,8 @@ pub mod tests { .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 1u32); - assert_eq!(postings.positions(), &[1u32, 4]); + postings.positions(&mut positions); + assert_eq!(&positions[..], &[1u32, 4]); } #[test] diff --git a/src/postings/postings.rs b/src/postings/postings.rs index f66c6434d..b415860d5 100644 --- a/src/postings/postings.rs +++ b/src/postings/postings.rs @@ -17,4 +17,8 @@ pub trait Postings: DocSet + 'static { /// Returns the list of positions of the term, expressed as a list of /// token ordinals. fn positions_with_offset(&mut self, offset: u32, output: &mut Vec); + + fn positions(&mut self, output: &mut Vec) { + self.positions_with_offset(0u32, output); + } } diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 292239f56..154381fbd 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -9,13 +9,10 @@ use std::cmp; use fst::Streamer; use compression::compressed_block_size; use postings::{NoDelete, DeleteSet}; -use std::cell::UnsafeCell; use directory::{ReadOnlySource, SourceRead}; use postings::FreqReadingOption; use postings::serializer::PostingsSerializer; -const EMPTY_POSITIONS: [u32; 0] = [0u32; 0]; - struct PositionComputer { // store the amount of position int // before reading positions. @@ -41,8 +38,7 @@ impl PositionComputer { } // Positions can only be read once. - pub fn positions(&mut self, offset: u32, output: &mut [u32]) { - let term_freq = output.len(); + pub fn positions_with_offset(&mut self, offset: u32, output: &mut [u32]) { if let Some(num_skip) = self.position_to_skip { self.positions_stream.skip(num_skip); self.positions_stream.read(output); @@ -183,7 +179,7 @@ impl DocSet for SegmentPostings { // add the term freq. if self.position_computer.is_some() { let freqs_skipped = &self.block_cursor.freqs()[self.cur..]; - let sum_freq: u32 = freqs_skipped.iter().sum() + let sum_freq: u32 = freqs_skipped.iter().sum(); self.position_computer.as_mut() .unwrap() .add_skip(sum_freq as usize); @@ -319,10 +315,10 @@ impl Postings for SegmentPostings { } unsafe { output.set_len(term_freq); - self.position_computer.as_mut().unwrap().positions(offset, &mut output[..]) + self.position_computer.as_mut().unwrap().positions_with_offset(offset, &mut output[..]) } } else { - unimplemented!("You may not read positions twice!"); + output.clear(); } } } diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index f77c63d68..f31b4238e 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -2,7 +2,6 @@ use DocId; use docset::{DocSet, SkipResult}; use postings::Postings; use query::{Intersection, Scorer}; -use std::mem; struct PostingsWithOffset { @@ -48,7 +47,7 @@ pub struct PhraseScorer { right: Vec } -fn intersection_arr(left: &mut [u32], right: &[u32]) -> usize { +fn intersection_count(left: &[u32], right: &[u32]) -> usize { let mut left_i = 0; let mut right_i = 0; let mut count = 0; @@ -58,7 +57,6 @@ fn intersection_arr(left: &mut [u32], right: &[u32]) -> usize { } else if right[right_i] < left[left_i] { right_i += 1; } else { - left[count] = left[left_i]; count+=1; left_i += 1; right_i += 1; @@ -95,7 +93,7 @@ impl PhraseScorer { { self.intersection_docset.docset_mut_specialized(i).positions(&mut self.right); } - intersection_len = intersection_arr(&mut self.left[..intersection_len], &self.right[..]); + intersection_len = intersection_count(&mut self.left[..intersection_len], &self.right[..]); if intersection_len == 0 { return false; } @@ -152,25 +150,14 @@ mod tests { use tests; use test::Bencher; - use super::{intersection_arr, intersection_avx}; + use super::intersection_count; #[bench] fn bench_intersection(b: &mut Bencher) { - let left = tests::sample_with_seed(100_000, 0.1, 1); - let right = tests::sample_with_seed(200_000, 0.05, 2); - let mut output = vec![0u32; 200_000]; + let left = tests::sample_with_seed(10, 0.1, 1); + let right = tests::sample_with_seed(2, 0.05, 2); b.iter(|| { - intersection_arr(&left, &right, &mut output); - }); - } - - #[bench] - fn bench_intersection_avx(b: &mut Bencher) { - let left = tests::sample_with_seed(100_000, 0.1, 1); - let right = tests::sample_with_seed(200_000, 0.05, 2); - let mut output = vec![0u32; 200_000]; - b.iter(|| { - intersection_avx(&left, &right, &mut output); + intersection_count(&left, &right); }); } } \ No newline at end of file