Positions almost working.

2026-06-04 01:20:41 +00:00 · 2017-08-05 23:17:35 +09:00
parent 63b35dd87b
commit 236fa74767
4 changed files with 103 additions and 22 deletions
--- a/src/core/segment_reader.rs
+++ b/src/core/segment_reader.rs
@@ -24,7 +24,6 @@ use postings::SegmentPostingsOption;
 use postings::{SegmentPostings, BlockSegmentPostings};
 use fastfield::{FastFieldsReader, FastFieldReader, U64FastFieldReader};
 use schema::Schema;
-use postings::FreqHandler;



@@ -198,10 +197,10 @@ impl SegmentReader {
    /// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a
    /// `TextIndexingOptions` that does not index position will return a `SegmentPostings`
    /// with `DocId`s and frequencies.
-    pub fn read_postings(&self,
+    pub fn read_postings<'a>(&'a self,
                         term: &Term,
                         option: SegmentPostingsOption)
-                         -> Option<SegmentPostings> {
+                         -> Option<SegmentPostings<'a>> {
        let field = term.field();
        let field_entry = self.schema.get_field_entry(field);
        let term_info = get!(self.get_term_info(term));
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -12,7 +12,6 @@ mod term_info;
 mod vec_postings;
 mod segment_postings;
 mod intersection;
-mod freq_handler;
 mod docset;
 mod segment_postings_option;

@@ -28,7 +27,6 @@ pub use self::vec_postings::VecPostings;

 pub use self::segment_postings::{SegmentPostings, BlockSegmentPostings};
 pub use self::intersection::IntersectionDocSet;
-pub use self::freq_handler::FreqHandler;
 pub use self::segment_postings_option::SegmentPostingsOption;
 pub use common::HasLen;

@@ -63,18 +61,18 @@ mod tests {
        let mut posting_serializer = PostingsSerializer::open(&mut segment).unwrap();
        posting_serializer.new_field(text_field);
        posting_serializer.new_term("abc".as_bytes()).unwrap();
-        for doc_id in 0u32..3u32 {
-            let positions = vec![1, 2, 3, 2];
-            posting_serializer.write_doc(doc_id, 2, &positions).unwrap();
+        for doc_id in 0u32..120u32 {
+            let delta_positions = vec![1, 2, 3, 2];
+            posting_serializer.write_doc(doc_id, 2, &delta_positions).unwrap();
        }
        posting_serializer.close_term().unwrap();
        posting_serializer.close().unwrap();
        let read = segment.open_read(SegmentComponent::POSITIONS).unwrap();
-        assert!(read.len() <= 16);
+        assert!(read.len() <= 140);
    }

    #[test]
-    pub fn test_position_and_fieldnorm() {
+    pub fn test_position_and_fieldnorm1() {
        let mut schema_builder = SchemaBuilder::default();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();
@@ -144,6 +142,7 @@ mod tests {
                assert_eq!(postings_a.doc(), 0);
                assert_eq!(postings_a.term_freq(), 6);
                assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
+                assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
                assert!(postings_a.advance());
                assert_eq!(postings_a.doc(), 1u32);
                assert_eq!(postings_a.term_freq(), 1);
--- a/src/postings/segment_postings.rs
+++ b/src/postings/segment_postings.rs
@@ -4,9 +4,65 @@ use postings::{Postings, DocSet, HasLen, SkipResult};
 use std::cmp;
 use fst::Streamer;
 use fastfield::DeleteBitSet;
-
+use std::cell::UnsafeCell;

 const EMPTY_DATA: [u8; 0] = [0u8; 0];
+const EMPTY_POSITIONS: [u32; 0] = [0u32; 0];
+
+struct PositionComputer<'a> {
+    // Number of position ints to skip in `positions_stream`
+    // before the next read done by `delta_positions()`.
+    //
+    // If `None`, `delta_positions()` leaves the stream untouched
+    // and serves the `delta_positions` / `positions` vecs as they are.
+    position_to_skip: Option<usize>,
+    // Buffers refilled by `delta_positions()`, and the stream they are decoded from.
+    delta_positions: Vec<u32>, // deltas exactly as read from the stream
+    positions: Vec<u32>, // running sums of `delta_positions`
+    positions_stream: CompressedIntStream<'a>,
+}
+
+impl<'a> PositionComputer<'a> {
+    /// Wraps `positions_stream`, starting with empty buffers and no pending skip.
+    pub fn new(positions_stream: CompressedIntStream<'a>) -> PositionComputer<'a> {
+        PositionComputer {
+            position_to_skip: None,
+            positions: vec!(),
+            delta_positions: vec!(),
+            positions_stream: positions_stream,
+        }
+    }
+    /// Marks a skip as pending: adds `num_skip` to an existing pending skip, else stores `Some(0)`.
+    pub fn add_skip(&mut self, num_skip: usize) {
+        self.position_to_skip = Some(
+            self.position_to_skip
+                .map(|prev_skip| prev_skip + num_skip)
+                .unwrap_or(0) // NOTE(review): `num_skip` is discarded when no skip was pending — confirm this is intended
+            );
+        }
+    /// Returns the first `term_freq` cumulated positions, refreshing the buffers first if a skip is pending.
+    pub fn positions(&mut self, term_freq: usize) -> &[u32] {
+        self.delta_positions(term_freq); // called only for its side effect of refilling both buffers
+        &self.positions[..term_freq]
+    }
+    /// Returns the first `term_freq` position deltas, decoding them from the stream if a skip is pending.
+    pub fn delta_positions(&mut self, term_freq: usize) -> &[u32] {
+        if let Some(num_skip) = self.position_to_skip {
+            self.delta_positions.resize(term_freq, 0u32);
+            self.positions_stream.skip(num_skip);
+            self.positions_stream.read(&mut self.delta_positions[..term_freq]);
+            self.positions.resize(term_freq, 0u32);
+            let mut cum = 0u32; // running sum turning deltas into positions
+            for i in 0..term_freq as usize {
+                cum += self.delta_positions[i];
+                self.positions[i] = cum;
+            }
+            self.position_to_skip = None; // later calls reuse the buffers until `add_skip` runs again
+        }
+        &self.delta_positions[..term_freq] // panics if the buffer holds fewer than `term_freq` entries
+    }
+}
+


 /// `SegmentPostings` represents the inverted list or postings associated to
@@ -18,9 +74,11 @@ pub struct SegmentPostings<'a> {
    block_cursor: BlockSegmentPostings<'a>,
    cur: usize,
    delete_bitset: DeleteBitSet,
-    positions_stream: Option<CompressedIntStream<'a>>,
+
+    position_computer: Option<UnsafeCell<PositionComputer<'a>>>,
 }

+
 impl<'a> SegmentPostings<'a> {
    /// Reads a Segment postings from an &[u8]
    ///
@@ -30,24 +88,27 @@ impl<'a> SegmentPostings<'a> {
    ///   frequencies and/or positions
    pub fn from_block_postings(segment_block_postings: BlockSegmentPostings<'a>,
                               delete_bitset: DeleteBitSet,
-                               positions_stream: Option<CompressedIntStream<'a>>)
+                               positions_stream_opt: Option<CompressedIntStream<'a>>)
                               -> SegmentPostings<'a> {
+        let position_computer = positions_stream_opt.map(|stream| {
+            UnsafeCell::new(PositionComputer::new(stream))
+        });
        SegmentPostings {
            block_cursor: segment_block_postings,
            cur: NUM_DOCS_PER_BLOCK, // cursor within the block
            delete_bitset: delete_bitset,
-            positions_stream: positions_stream
+            position_computer: position_computer,
        }
    }

    /// Returns an empty segment postings object
-    pub fn empty() -> SegmentPostings<'static> {
+    pub fn empty() -> SegmentPostings<'a> {
        let empty_block_cursor = BlockSegmentPostings::empty();
        SegmentPostings {
            block_cursor: empty_block_cursor,
            delete_bitset: DeleteBitSet::empty(),
            cur: NUM_DOCS_PER_BLOCK,
-            positions_stream: None,
+            position_computer: None,
        }
    }
 }
@@ -58,7 +119,9 @@ impl<'a> DocSet for SegmentPostings<'a> {
    // next needs to be called a first time to point to the correct element.
    #[inline]
    fn advance(&mut self) -> bool {
+        let mut pos_to_skip = 0u32;
        loop {
+            pos_to_skip += self.term_freq();
            self.cur += 1;
            if self.cur >= self.block_cursor.block_len() {
                self.cur = 0;
@@ -68,6 +131,11 @@ impl<'a> DocSet for SegmentPostings<'a> {
                }
            }
            if !self.delete_bitset.is_deleted(self.doc()) {
+                if let Some(ref mut position_computer) = self.position_computer.as_mut() {
+                    unsafe {
+                        (*position_computer.get()).add_skip(pos_to_skip as usize);
+                    }
+                }
                return true;
            }
        }
@@ -181,11 +249,26 @@ impl<'a> Postings for SegmentPostings<'a> {
    }

    fn positions(&self) -> &[u32] {
-        unimplemented!();
+        let term_freq = self.term_freq();
+        let position_computer_ptr: *mut PositionComputer = self.position_computer
+            .as_ref()
+            .expect("Segment reader does not have positions.")
+            .get();
+        unsafe {
+            (&mut *position_computer_ptr).positions(term_freq as usize)
+        }
    }

    fn delta_positions(&self) -> &[u32] {
-        unimplemented!();
+        let term_freq = self.term_freq();
+        self.position_computer
+            .as_ref()
+            .map(|position_computer| {
+                unsafe {
+                    (&mut *position_computer.get()).delta_positions(term_freq as usize)
+                }
+            })
+            .unwrap_or(&EMPTY_POSITIONS[..])
    }

 }
@@ -333,7 +416,7 @@ impl<'a> BlockSegmentPostings<'a> {
            num_vint_docs: 0,

            doc_decoder: BlockDecoder::new(),
-            freq_decoder: BlockDecoder::new(),
+            freq_decoder: BlockDecoder::with_val(1),
            has_freq: false,

            remaining_data: &EMPTY_DATA,
--- a/src/query/term_query/term_weight.rs
+++ b/src/query/term_query/term_weight.rs
@@ -27,13 +27,13 @@ impl TermWeight {
        1.0 + (self.num_docs as f32 / (self.doc_freq as f32 + 1.0)).ln()
    }

-    pub fn specialized_scorer<'a>(&'a self,
+    pub fn specialized_scorer<'a>(&self,
                                  reader: &'a SegmentReader)
                                  -> Result<TermScorer<SegmentPostings<'a>>> {
        let field = self.term.field();
        let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field);
-        Ok(reader
-               .read_postings(&self.term, self.segment_postings_options)
+        let postings: Option<SegmentPostings<'a>> = reader.read_postings(&self.term, self.segment_postings_options);
+        Ok(postings
               .map(|segment_postings| {
                        TermScorer {
                            idf: self.idf(),