From 2be5f08cd6b567806531e8a3f6ce840e48adadce Mon Sep 17 00:00:00 2001
From: Paul Masurel <paul.masurel@gmail.com>
Date: Sat, 20 May 2017 11:46:40 +0900
Subject: [PATCH] issue/162 Added block iteration API

---
 src/core/segment_reader.rs       |   2 -
 src/postings/mod.rs              |   3 +-
 src/postings/segment_postings.rs | 295 ++++++++++---------------------
 3 files changed, 90 insertions(+), 210 deletions(-)
diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs
index 62a9347d8..03ad7d248 100644
--- a/src/core/segment_reader.rs
+++ b/src/core/segment_reader.rs
@@ -22,9 +22,7 @@ use postings::SegmentPostingsOption;
 use postings::{SegmentPostings, BlockSegmentPostings};
 use fastfield::{FastFieldsReader, FastFieldReader, U64FastFieldReader};
 use schema::Schema;
-use schema::FieldType;
 use postings::FreqHandler;
-use schema::TextIndexingOptions;
 
 
 
diff --git a/src/postings/mod.rs b/src/postings/mod.rs
index 483b7ed46..a338ae8db 100644
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -468,7 +468,6 @@ mod tests {
         });
     }
 
-
     fn bench_skip_next(p: f32, b: &mut Bencher) {
         let searcher = INDEX.searcher();
         let segment_reader = searcher.segment_reader(0);
@@ -479,6 +478,7 @@ mod tests {
             .unwrap();
         
         let mut existing_docs = Vec::new();
+        segment_postings.advance();
         for doc in &docs {
             if *doc >= segment_postings.doc() {
                 existing_docs.push(*doc);
@@ -493,7 +493,6 @@ mod tests {
                 .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
                 .unwrap();
             for doc in &existing_docs {
-                println!("doc {}", doc);
                 if segment_postings.skip_next(*doc) == SkipResult::End {
                     break;
                 }
diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs
index f917449b2..b5a191d3c 100644
--- a/src/postings/segment_postings.rs
+++ b/src/postings/segment_postings.rs
@@ -2,14 +2,12 @@ use compression::{NUM_DOCS_PER_BLOCK, BlockDecoder, VIntDecoder};
 use DocId;
 use postings::{Postings, FreqHandler, DocSet, HasLen, SkipResult};
 use std::cmp;
-use std::num::Wrapping;
 use fastfield::DeleteBitSet;
 
 
 const EMPTY_DATA: [u8; 0] = [0u8; 0];
 
 
-/*
 /// `SegmentPostings` represents the inverted list or postings associated to
 /// a term in a `Segment`.
 ///
@@ -17,142 +15,7 @@ const EMPTY_DATA: [u8; 0] = [0u8; 0];
 /// Positions on the other hand, are optionally entirely decoded upfront.
 pub struct SegmentPostings<'a> {
     len: usize,
-    // Removing this makes the code slower
-    // See https://github.com/tantivy-search/tantivy/issues/89
-    block_len: usize,
-    doc_offset: u32,
-    block_decoder: BlockDecoder,
-    freq_handler: FreqHandler,
-    remaining_data: &'a [u8],
-    cur: Wrapping<usize>,
-    delete_bitset: DeleteBitSet,
-}
-
-impl<'a> SegmentPostings<'a> {
-    fn load_next_block(&mut self) {
-        let num_remaining_docs = self.len - self.cur.0;
-        if num_remaining_docs >= NUM_DOCS_PER_BLOCK {
-            self.remaining_data =
-                self.block_decoder
-                    .uncompress_block_sorted(self.remaining_data, self.doc_offset);
-            self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data);
-            self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
-            self.block_len = NUM_DOCS_PER_BLOCK;
-        } else {
-            self.remaining_data =
-                self.block_decoder
-                    .uncompress_vint_sorted(self.remaining_data,
-                                            self.doc_offset,
-                                            num_remaining_docs);
-            self.freq_handler
-                .read_freq_vint(self.remaining_data, num_remaining_docs);
-            self.block_len = num_remaining_docs;
-        }
-    }
-
-    /// Reads a Segment postings from an &[u8]
-    ///
-    /// * `len` - number of document in the posting lists.
-    /// * `data` - data array. The complete data is not necessarily used.
-    /// * `freq_handler` - the freq handler is in charge of decoding
-    ///   frequencies and/or positions
-    pub fn from_data(len: u32,
-                     data: &'a [u8],
-                     delete_bitset: &'a DeleteBitSet,
-                     freq_handler: FreqHandler)
-                     -> SegmentPostings<'a> {
-        SegmentPostings {
-            len: len as usize,
-            block_len: len as usize,
-            doc_offset: 0,
-            block_decoder: BlockDecoder::new(),
-            freq_handler: freq_handler,
-            remaining_data: data,
-            cur: Wrapping(usize::max_value()),
-            delete_bitset: delete_bitset.clone(),
-        }
-    }
-
-    /// Returns an empty segment postings object
-    pub fn empty() -> SegmentPostings<'static> {
-        SegmentPostings {
-            len: 0,
-            block_len: 0,
-            doc_offset: 0,
-            block_decoder: BlockDecoder::new(),
-            freq_handler: FreqHandler::new_without_freq(),
-            remaining_data: &EMPTY_DATA,
-            delete_bitset: DeleteBitSet::empty(),
-            cur: Wrapping(usize::max_value()),
-        }
-    }
-
-
-
-    /// Sets the current position to a location relative
-    /// to the current block
-    #[inline]
-    fn set_within_block(&mut self, inner_pos: usize) {
-        self.cur = Wrapping(self.cur.0 & !(NUM_DOCS_PER_BLOCK - 1)) + Wrapping(inner_pos)
-    }
-}
-
-
-impl<'a> DocSet for SegmentPostings<'a> {
-    // goes to the next element.
-    // next needs to be called a first time to point to the correct element.
-    #[inline]
-    fn advance(&mut self) -> bool {
-        loop {
-            self.cur += Wrapping(1);
-            if self.cur.0 >= self.len {
-                return false;
-            }
-            if self.index_within_block() == 0 {
-                self.load_next_block();
-            }
-            if !self.delete_bitset.is_deleted(self.doc()) {
-                return true;
-            }
-        }
-    }
-
-    
-
-    #[inline]
-    fn doc(&self) -> DocId {
-        self.block_decoder.output(self.index_within_block())
-    }
-}
-
-impl<'a> HasLen for SegmentPostings<'a> {
-    fn len(&self) -> usize {
-        self.len
-    }
-}
-
-impl<'a> Postings for SegmentPostings<'a> {
-    fn term_freq(&self) -> u32 {
-        self.freq_handler.freq(self.index_within_block())
-    }
-
-    fn positions(&self) -> &[u32] {
-        self.freq_handler.positions(self.index_within_block())
-    }
-}
-
-*/
-
-
-
-/// `SegmentPostings` represents the inverted list or postings associated to
-/// a term in a `Segment`.
-///
-/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded.
-/// Positions on the other hand, are optionally entirely decoded upfront.
-pub struct SegmentPostings<'a> {
-    len: usize,
-    cur: Wrapping<usize>,
+    cur: usize,
     block_cursor: BlockSegmentPostings<'a>,
     cur_block_len: usize,
     delete_bitset: DeleteBitSet,
@@ -173,7 +36,7 @@ impl<'a> SegmentPostings<'a> {
         SegmentPostings {
             len: segment_block_postings.len,
             block_cursor: segment_block_postings,
-            cur: Wrapping(usize::max_value()),
+            cur: NUM_DOCS_PER_BLOCK,  // cursor within the block
             cur_block_len: 0,
             delete_bitset: delete_bitset,
         }
@@ -186,7 +49,7 @@ impl<'a> SegmentPostings<'a> {
             len: 0,
             block_cursor: empty_block_cursor,
             delete_bitset: DeleteBitSet::empty(),
-            cur: Wrapping(usize::max_value()),
+            cur: NUM_DOCS_PER_BLOCK,
             cur_block_len: 0,
         }
     }
@@ -198,15 +61,13 @@ impl<'a> DocSet for SegmentPostings<'a> {
     // next needs to be called a first time to point to the correct element.
     #[inline]
     fn advance(&mut self) -> bool {
-        loop {   
-            self.cur += Wrapping(1);
-            assert!(self.cur.0 >= 0);
-            assert!(self.cur.0 <= self.cur_block_len);
-            if self.cur.0 == self.cur_block_len {
-                self.cur = Wrapping(0);
+        loop {
+            self.cur += 1;
+            if self.cur >= self.cur_block_len {
+                self.cur = 0;
                 if !self.block_cursor.advance() {
                     self.cur_block_len = 0;
-                    self.cur = Wrapping(usize::max_value());
+                    self.cur = NUM_DOCS_PER_BLOCK;
                     return false;
                 }
                 self.cur_block_len = self.block_cursor.docs().len();
@@ -217,90 +78,96 @@ impl<'a> DocSet for SegmentPostings<'a> {
         }
     }
 
-    /*
+    
     fn skip_next(&mut self, target: DocId) -> SkipResult {
         if !self.advance() {
             return SkipResult::End;
         }
 
-        let mut pos = self.index_within_block();
         // skip blocks until one that might contain the target
         loop {
             // check if we need to go to the next block
-            if target > self.block_decoder.output(self.block_len - 1) {
-                self.cur += Wrapping(self.block_len - pos);
-                self.load_next_block();
-                pos = 0;
-
-                // there was no more data
-                if self.cur.0 == self.len {
+            let last_doc_in_block = {
+                let block_docs = self.block_cursor.docs();
+                block_docs[block_docs.len() - 1]
+            };
+            if target > last_doc_in_block {
+                if !self.block_cursor.advance() {
                     return SkipResult::End;
                 }
-            } else if target < self.block_decoder.output(pos) {
-                // We've overpassed the target after the first `advance` call
-                // or we're at the beginning of a block.
-                // Either way, we're on the first `DocId` greater than `target`
-                return SkipResult::OverStep;
+                self.cur = 0;
+                // `advance()` compares `cur` to `cur_block_len`; keep it in sync with the new block.
+                self.cur_block_len = self.block_cursor.docs().len();
             } else {
+                let block_docs = self.block_cursor.docs();
+                if target < block_docs[self.cur] {
+                    // We've overshot the target after the first `advance` call
+                    // or we're at the beginning of a block.
+                    // Either way, we're on the first `DocId` greater than `target`
+                    return SkipResult::OverStep;
+                }
                 break;
             }
         }
+        {
+            // we're in the right block now, start with an exponential search
+            let block_docs = self.block_cursor.docs();
+            let block_len = block_docs.len();
-        debug_assert!(target >= self.block_decoder.output(pos));
-        debug_assert!(target <= self.block_decoder.output(self.block_len - 1));
+            debug_assert!(target >= block_docs[self.cur]);
+            debug_assert!(target <= block_docs[block_len - 1]);
 
-        // we're in the right block now, start with an exponential search
-        let mut start = pos;
-        let mut end = self.block_len;
-        let mut count = 1;
-        loop {
-            let new = start + count;
-            if new < end && self.block_decoder.output(new) < target {
-                start = new;
-                count *= 2;
-            } else {
-                break;
+            let mut start = self.cur;
+            let mut end = block_len;
+            let mut count = 1;
+            loop {
+                let new = start + count;
+                if new < end && block_docs[new] < target {
+                    start = new;
+                    count *= 2;
+                } else {
+                    break;
+                }
+            }
+            end = cmp::min(start + count, end);
+
+            // now do a binary search
+            let mut count = end - start;
+            while count > 0 {
+                let step = count / 2;
+                let mid = start + step;
+                let doc = block_docs[mid];
+                if doc < target {
+                    start = mid + 1;
+                    count -= step + 1;
+                } else {
+                    count = step;
+                }
+            }
+
+            // `doc` is now >= `target`
+            let doc = block_docs[start];
+            self.cur = start;
+            if !self.delete_bitset.is_deleted(doc) {
+                if doc == target {
+                    return SkipResult::Reached;
+                } else {
+                    return SkipResult::OverStep;
+                }
             }
         }
-        end = cmp::min(start + count, end);
-
-        // now do a binary search
-        let mut count = end - start;
-        while count > 0 {
-            let step = count / 2;
-            let mid = start + step;
-            let doc = self.block_decoder.output(mid);
-            if doc < target {
-                start = mid + 1;
-                count -= step + 1;
-            } else {
-                count = step;
-            }
-        }
-
-        // `doc` is now >= `target`
-        let doc = self.block_decoder.output(start);
-        self.set_within_block(start);
-
-        if !self.delete_bitset.is_deleted(doc) {
-            if doc == target {
-                return SkipResult::Reached;
-            } else {
-                return SkipResult::OverStep;
-            }
-        }
-
         if self.advance() {
             SkipResult::OverStep
         } else {
             SkipResult::End
         }
     }
-    */
     
+
     #[inline]
     fn doc(&self) -> DocId {
-        self.block_cursor.docs()[self.cur.0]
+        let docs = self.block_cursor.docs();
+        assert!(self.cur < docs.len(), "Have you forgotten to call `.advance()` at least once before calling .doc().");
+        docs[self.cur]
     }
 }
 
@@ -312,11 +179,11 @@ impl<'a> HasLen for SegmentPostings<'a> {
 
 impl<'a> Postings for SegmentPostings<'a> {
     fn term_freq(&self) -> u32 {
-        self.block_cursor.freq_handler().freq(self.cur.0)
+        self.block_cursor.freq_handler().freq(self.cur)
     }
 
     fn positions(&self) -> &[u32] {
-        self.block_cursor.freq_handler().positions(self.cur.0)
+        self.block_cursor.freq_handler().positions(self.cur)
     }
 }
 
@@ -359,6 +226,8 @@ impl<'a> BlockSegmentPostings<'a> {
         self.len = len;
     }
 
+
+    /// Returns the array of docs in the current block.
     pub fn docs(&self) -> &[DocId] {
         self.block_decoder.output_array()
     }
@@ -402,3 +271,17 @@ impl<'a> BlockSegmentPostings<'a> {
     }
     
 }
+
+#[cfg(test)]
+mod tests {
+
+    use DocSet;
+    use super::SegmentPostings;
+
+    #[test]
+    fn test_empty_segment_postings() {
+        let mut postings = SegmentPostings::empty();
+        assert!(!postings.advance());
+        assert!(!postings.advance());
+    }
+}
\ No newline at end of file