In sync with master. Fixed merging

2026-01-09 02:22:54 +00:00 · 2018-03-19 12:58:42 +09:00
parent b0e5e1f61d
commit 59639cd311
6 changed files with 49 additions and 49 deletions
--- a/src/core/inverted_index_reader.rs
+++ b/src/core/inverted_index_reader.rs
@@ -58,7 +58,7 @@ impl InvertedIndexReader {
            TermDictionaryImpl::empty(field_type),
            ReadOnlySource::empty(),
            ReadOnlySource::empty(),
-            DeleteBitSet::empty(),
+            None,
            record_option,
        )
    }
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -6,7 +6,6 @@ use core::SerializableSegment;
 use indexer::SegmentSerializer;
 use postings::InvertedIndexSerializer;
 use itertools::Itertools;
-use postings::Postings;
 use docset::DocSet;
 use fastfield::DeleteBitSet;
 use schema::{Field, Schema};
@@ -18,6 +17,7 @@ use std::cmp::{max, min};
 use termdict::TermDictionary;
 use termdict::TermStreamer;
 use postings::DeleteSet;
+use postings::Postings;

 pub struct IndexMerger {
    schema: Schema,
@@ -206,6 +206,8 @@ impl IndexMerger {
    }

    fn write_postings(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> {
+
+        let mut positions_buffer: Vec<u32> = Vec::with_capacity(1_000);
        let mut delta_computer = DeltaComputer::new();

        let mut indexed_fields = vec![];
@@ -314,15 +316,15 @@ impl IndexMerger {
                            {
                                // we make sure to only write the term iff
                                // there is at least one document.
-                                unreachable!();
-//                                let positions: &[u32] = segment_postings.positions();
-//                                let term_freq = segment_postings.term_freq();
-//                                let delta_positions = delta_computer.compute_delta(positions);
-//                                field_serializer.write_doc(
-//                                    remapped_doc_id,
-//                                    term_freq,
-//                                    delta_positions,
-//                                )?;
+                                let term_freq = segment_postings.term_freq();
+                                segment_postings.positions(&mut positions_buffer);
+
+                                let delta_positions = delta_computer.compute_delta(&positions_buffer);
+                                field_serializer.write_doc(
+                                    remapped_doc_id,
+                                    term_freq,
+                                    delta_positions,
+                                )?;
                            }
                            if !segment_postings.advance() {
                                break;
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -103,15 +103,18 @@ pub mod tests {
        let inverted_index = searcher.segment_reader(0u32).inverted_index(title);
        let term = Term::from_field_text(title, "abc");

+        let mut positions = Vec::new();

        {
            let mut postings = inverted_index
                .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
                .unwrap();
            postings.advance();
-            assert_eq!(&[0, 1, 2], postings.positions());
+            postings.positions(&mut positions);
+            assert_eq!(&[0, 1, 2], &positions[..]);
            postings.advance();
-            assert_eq!(&[0, 5], postings.positions());
+            postings.positions(&mut positions);
+            assert_eq!(&[0, 5], &positions[..]);
        }
        {
            let mut postings = inverted_index
@@ -119,7 +122,8 @@ pub mod tests {
                .unwrap();
            postings.advance();
            postings.advance();
-            assert_eq!(&[0, 5], postings.positions());
+            postings.positions(&mut positions);
+            assert_eq!(&[0, 5], &positions[..]);
        }
        {

@@ -128,7 +132,8 @@ pub mod tests {
                .unwrap();
            assert_eq!(postings.skip_next(1), SkipResult::Reached);
            assert_eq!(postings.doc(), 1);
-            assert_eq!(&[0, 5], postings.positions());
+            postings.positions(&mut positions);
+            assert_eq!(&[0, 5], &positions[..]);
        }
        {
            let mut postings = inverted_index
@@ -136,7 +141,8 @@ pub mod tests {
                .unwrap();
            assert_eq!(postings.skip_next(1002), SkipResult::Reached);
            assert_eq!(postings.doc(), 1002);
-            assert_eq!(&[0, 5], postings.positions());
+            postings.positions(&mut positions);
+            assert_eq!(&[0, 5], &positions[..]);
        }
        {
            let mut postings = inverted_index
@@ -145,12 +151,14 @@ pub mod tests {
            assert_eq!(postings.skip_next(100), SkipResult::Reached);
            assert_eq!(postings.skip_next(1002), SkipResult::Reached);
            assert_eq!(postings.doc(), 1002);
-            assert_eq!(&[0, 5], postings.positions());
+            postings.positions(&mut positions);
+            assert_eq!(&[0, 5], &positions[..]);
        }
    }

    #[test]
    pub fn test_position_and_fieldnorm1() {
+        let mut positions = Vec::new();
        let mut schema_builder = SchemaBuilder::default();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();
@@ -223,15 +231,16 @@ pub mod tests {
                assert!(postings_a.advance());
                assert_eq!(postings_a.doc(), 0);
                assert_eq!(postings_a.term_freq(), 6);
-                assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
-                assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
+                postings_a.positions(&mut positions);
+                assert_eq!(&positions[..], [0, 2, 4, 6, 7, 13]);
                assert!(postings_a.advance());
                assert_eq!(postings_a.doc(), 1u32);
                assert_eq!(postings_a.term_freq(), 1);
                for i in 2u32..1000u32 {
                    assert!(postings_a.advance());
                    assert_eq!(postings_a.term_freq(), 1);
-                    assert_eq!(postings_a.positions(), [i]);
+                    postings_a.positions(&mut positions);
+                    assert_eq!(&positions[..], [i]);
                    assert_eq!(postings_a.doc(), i);
                }
                assert!(!postings_a.advance());
@@ -246,7 +255,7 @@ pub mod tests {
                for i in 2u32..1000u32 {
                    assert!(postings_e.advance());
                    assert_eq!(postings_e.term_freq(), i);
-                    let positions = postings_e.positions();
+                    postings_e.positions(&mut positions);
                    assert_eq!(positions.len(), i as usize);
                    for j in 0..positions.len() {
                        assert_eq!(positions[j], (j as u32));
@@ -260,6 +269,7 @@ pub mod tests {

    #[test]
    pub fn test_position_and_fieldnorm2() {
+        let mut positions: Vec<u32> = Vec::new();
        let mut schema_builder = SchemaBuilder::default();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();
@@ -288,7 +298,8 @@ pub mod tests {
            .unwrap();
        assert!(postings.advance());
        assert_eq!(postings.doc(), 1u32);
-        assert_eq!(postings.positions(), &[1u32, 4]);
+        postings.positions(&mut positions);
+        assert_eq!(&positions[..], &[1u32, 4]);
    }

    #[test]
--- a/src/postings/postings.rs
+++ b/src/postings/postings.rs
@@ -17,4 +17,8 @@ pub trait Postings: DocSet + 'static {
    /// Returns the list of positions of the term, expressed as a list of
    /// token ordinals.
    fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>);
+
+    fn positions(&mut self, output: &mut Vec<u32>) {
+        self.positions_with_offset(0u32, output);
+    }
 }
--- a/src/postings/segment_postings.rs
+++ b/src/postings/segment_postings.rs
@@ -9,13 +9,10 @@ use std::cmp;
 use fst::Streamer;
 use compression::compressed_block_size;
 use postings::{NoDelete, DeleteSet};
-use std::cell::UnsafeCell;
 use directory::{ReadOnlySource, SourceRead};
 use postings::FreqReadingOption;
 use postings::serializer::PostingsSerializer;

-const EMPTY_POSITIONS: [u32; 0] = [0u32; 0];
-
 struct PositionComputer {
    // store the amount of position int
    // before reading positions.
@@ -41,8 +38,7 @@ impl PositionComputer {
    }

    // Positions can only be read once.
-    pub fn positions(&mut self, offset: u32, output: &mut [u32]) {
-        let term_freq = output.len();
+    pub fn positions_with_offset(&mut self, offset: u32, output: &mut [u32]) {
        if let Some(num_skip) = self.position_to_skip {
            self.positions_stream.skip(num_skip);
            self.positions_stream.read(output);
@@ -183,7 +179,7 @@ impl<TDeleteSet: DeleteSet> DocSet for SegmentPostings<TDeleteSet> {
                // add the term freq.
                if self.position_computer.is_some() {
                    let freqs_skipped = &self.block_cursor.freqs()[self.cur..];
-                    let sum_freq: u32 = freqs_skipped.iter().sum()
+                    let sum_freq: u32 = freqs_skipped.iter().sum();
                    self.position_computer.as_mut()
                        .unwrap()
                        .add_skip(sum_freq as usize);
@@ -319,10 +315,10 @@ impl<TDeleteSet: DeleteSet> Postings for SegmentPostings<TDeleteSet> {
            }
            unsafe {
                output.set_len(term_freq);
-                self.position_computer.as_mut().unwrap().positions(offset, &mut output[..])
+                self.position_computer.as_mut().unwrap().positions_with_offset(offset, &mut output[..])
            }
        } else {
-            unimplemented!("You may not read positions twice!");
+            output.clear();
        }
    }
 }
--- a/src/query/phrase_query/phrase_scorer.rs
+++ b/src/query/phrase_query/phrase_scorer.rs
@@ -2,7 +2,6 @@ use DocId;
 use docset::{DocSet, SkipResult};
 use postings::Postings;
 use query::{Intersection, Scorer};
-use std::mem;


 struct PostingsWithOffset<TPostings> {
@@ -48,7 +47,7 @@ pub struct PhraseScorer<TPostings: Postings> {
    right: Vec<u32>
 }

-fn intersection_arr(left: &mut [u32], right: &[u32]) -> usize {
+fn intersection_count(left: &[u32], right: &[u32]) -> usize {
    let mut left_i = 0;
    let mut right_i = 0;
    let mut count = 0;
@@ -58,7 +57,6 @@ fn intersection_arr(left: &mut [u32], right: &[u32]) -> usize {
        } else if right[right_i] < left[left_i] {
            right_i += 1;
        } else {
-            left[count] = left[left_i];
            count+=1;
            left_i += 1;
            right_i += 1;
@@ -95,7 +93,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
            {
                self.intersection_docset.docset_mut_specialized(i).positions(&mut self.right);
            }
-            intersection_len = intersection_arr(&mut self.left[..intersection_len], &self.right[..]);
+            intersection_len = intersection_count(&mut self.left[..intersection_len], &self.right[..]);
            if intersection_len == 0 {
                return false;
            }
@@ -152,25 +150,14 @@ mod tests {

    use tests;
    use test::Bencher;
-    use super::{intersection_arr, intersection_avx};
+    use super::intersection_count;

    #[bench]
    fn bench_intersection(b: &mut Bencher) {
-        let left = tests::sample_with_seed(100_000, 0.1, 1);
-        let right = tests::sample_with_seed(200_000, 0.05, 2);
-        let mut output = vec![0u32; 200_000];
+        let left = tests::sample_with_seed(10, 0.1, 1);
+        let right = tests::sample_with_seed(2, 0.05, 2);
        b.iter(|| {
-            intersection_arr(&left, &right, &mut output);
-        });
-    }
-
-    #[bench]
-    fn bench_intersection_avx(b: &mut Bencher) {
-        let left = tests::sample_with_seed(100_000, 0.1, 1);
-        let right = tests::sample_with_seed(200_000, 0.05, 2);
-        let mut output = vec![0u32; 200_000];
-        b.iter(|| {
-            intersection_avx(&left, &right, &mut output);
+            intersection_count(&left, &right);
        });
    }
 }