Issue #67 - Removed segment ord array from term iteration.

This was probably an early optimization.
Paul Masurel
2016-12-17 09:38:57 +01:00
parent ca5f3e1d46
commit 4d7d201f21
4 changed files with 49 additions and 66 deletions

View File

@@ -54,10 +54,13 @@ impl Searcher {
///
/// This includes all of the fields from all of the segment_readers.
/// See [TermIterator](struct.TermIterator.html).
+///
+/// # Warning
+/// This API is very likely to change in the future.
pub fn terms<'a>(&'a self) -> TermIterator<'a> {
TermIterator::from(self.segment_readers())
}
/// Return the list of segment readers
pub fn segment_readers(&self,) -> &[SegmentReader] {
&self.segment_readers
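Usage note (not part of the commit): after this change, `terms()` yields bare `&Term` values. A minimal sketch, assuming tantivy's `Searcher` is in scope; note that `TermIterator` implements `fst::Streamer`, not `std::iter::Iterator`:

    use fst::Streamer; // TermIterator is a lending stream, not a std Iterator

    fn list_terms(searcher: &Searcher) {
        let mut term_it = searcher.terms();
        // next() now yields only `&Term`; before this commit it yielded
        // `(&Term, &[usize])`, the slice naming the segments containing the term.
        while let Some(term) = term_it.next() {
            println!("{:?}", term.value());
        }
    }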

View File

@@ -2,12 +2,12 @@ use fst::Streamer;
use std::mem;
use std::collections::BinaryHeap;
use fst::map::Keys;
+use schema::Field;
use schema::Term;
use core::SegmentReader;
use std::cmp::Ordering;
-static EMPTY: [u8; 0] = [];
#[derive(PartialEq, Eq, Debug)]
struct HeapItem {
@@ -49,7 +49,7 @@ impl<'a> TermIterator<'a> {
let mut term_iterator = TermIterator {
key_streams: key_streams,
heap: BinaryHeap::new(),
-current_term: Term::from(&EMPTY[..]),
+current_term: Term::from_field_text(Field(0), ""),
current_segment_ords: vec![],
};
for segment_ord in 0..key_streams_len {
@@ -70,7 +70,7 @@ impl<'a> TermIterator<'a> {
}
impl<'a, 'f> Streamer<'a> for TermIterator<'f> {
-type Item = (&'a Term, &'a [usize]);
+type Item = &'a Term;
fn next(&'a mut self) -> Option<Self::Item> {
self.current_segment_ords.clear();
@@ -82,26 +82,24 @@ impl<'a, 'f> Streamer<'a> for TermIterator<'f> {
loop {
match self.heap.peek() {
Some(&ref next_heap_it) if next_heap_it.term == self.current_term => {}
-_ => {
-break;
-}
+_ => { break; }
}
-let next_heap_it = self.heap
-.pop()
-.expect("This is only reached if an element was \
-peeked beforehand.");
+let next_heap_it = self.heap.pop().unwrap(); // safe : we peeked beforehand
self.push_next_segment_el(next_heap_it.segment_ord);
}
-(&self.current_term, self.current_segment_ords.as_slice())
+&self.current_term
})
}
}
impl<'a> From<&'a [SegmentReader]> for TermIterator<'a> {
fn from(segment_readers: &'a [SegmentReader]) -> TermIterator<'a> {
-TermIterator::new(segment_readers.iter()
-.map(|reader| reader.term_infos().keys())
-.collect())
+TermIterator::new(
+segment_readers
+.iter()
+.map(|reader| reader.term_infos().keys())
+.collect()
+)
}
}
@@ -148,44 +146,13 @@ mod tests {
}
let searcher = index.searcher();
let mut term_it = searcher.terms();
-{
-let (term, segments) = term_it.next().unwrap();
-assert_eq!(term.value(), "a".as_bytes());
-let expected_segments = [0, 1];
-assert_eq!(segments, &expected_segments);
-}
-{
-let (term, segments): (&Term, &[usize]) = term_it.next().unwrap();
-assert_eq!(term.value(), "b".as_bytes());
-let expected_segments = [0, 1];
-assert_eq!(segments, &expected_segments);
-}
-{
-let (ref term, ref segments) = term_it.next().unwrap();
-assert_eq!(term.value(), "c".as_bytes());
-let expected_segments = [1];
-assert_eq!(segments, &expected_segments);
-}
-{
-let (term, segments) = term_it.next().unwrap();
-assert_eq!(term.value(), "d".as_bytes());
-let expected_segments = [0, 1];
-assert_eq!(segments, &expected_segments);
-}
-{
-let (term, segments) = term_it.next().unwrap();
-assert_eq!(term.value(), "e".as_bytes());
-let expected_segments = [2];
-assert_eq!(segments, &expected_segments);
-}
-{
-let (term, segments) = term_it.next().unwrap();
-assert_eq!(term.value(), "f".as_bytes());
-let expected_segments = [0, 1, 2];
-assert_eq!(segments, &expected_segments);
-}
+let mut terms = String::new();
+while let Some(term) = term_it.next() {
+unsafe {
+terms.push_str(term.text());
+}
+}
+assert_eq!(terms, "abcdef");
}
}
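Aside (not from the commit): the merge loop above is a k-way merge — pop the smallest term off a `BinaryHeap` fed by the per-segment streams, advance the stream it came from, and skip duplicate pops of terms that occur in several segments. A self-contained sketch of that shape, with plain sorted `Vec<&str>` streams standing in for the fst key streams and `Reverse` tuples standing in for `HeapItem`'s ordering:

    use std::cmp::Reverse;
    use std::collections::BinaryHeap;

    // Merge several sorted streams into one ascending, de-duplicated sequence.
    // Each heap entry carries (key, stream_ord), mirroring TermIterator's
    // HeapItem { term, segment_ord }; Reverse turns the max-heap into a min-heap.
    fn merge_sorted(streams: &[Vec<&str>]) -> Vec<String> {
        let mut positions = vec![0usize; streams.len()];
        let mut heap = BinaryHeap::new();
        for (ord, stream) in streams.iter().enumerate() {
            if let Some(first) = stream.first() {
                heap.push(Reverse((first.to_string(), ord)));
            }
        }
        let mut merged: Vec<String> = Vec::new();
        while let Some(Reverse((key, ord))) = heap.pop() {
            // Advance the stream that produced this key.
            positions[ord] += 1;
            if let Some(next) = streams[ord].get(positions[ord]) {
                heap.push(Reverse((next.to_string(), ord)));
            }
            // A key present in several streams is popped several times in a
            // row; keep only its first occurrence.
            if merged.last() != Some(&key) {
                merged.push(key);
            }
        }
        merged
    }

    fn main() {
        let merged = merge_sorted(&[vec!["a", "b", "d", "f"],
                                    vec!["a", "b", "c", "d", "f"],
                                    vec!["e", "f"]]);
        assert_eq!(merged, ["a", "b", "c", "d", "e", "f"]);
    }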

View File

@@ -132,7 +132,7 @@ impl IndexMerger {
max_doc += reader.max_doc();
}
-while let Some((term, segment_ords)) = merged_terms.next() {
+while let Some(term) = merged_terms.next() {
// Create the total list of doc ids
// by stacking the doc ids from the different segments.
//
@@ -143,17 +143,18 @@ impl IndexMerger {
// - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, seg0.max_doc + seg1.max_doc + seg2.max_doc]
// ...
let mut merged_postings =
-ChainedPostings::from(segment_ords.iter()
-.flat_map(|segment_ord| {
-let offset = offsets[*segment_ord];
-self.readers[*segment_ord]
-.read_postings_all_info(&term)
-.map(|segment_postings| {
-OffsetPostings::new(segment_postings,
-offset)
-})
-})
-.collect::<Vec<_>>());
+ChainedPostings::from(
+self.readers
+.iter()
+.enumerate()
+.flat_map(|(segment_ord, reader)| {
+let offset = offsets[segment_ord];
+reader
+.read_postings_all_info(&term)
+.map(|segment_postings| OffsetPostings::new(segment_postings, offset))
+})
+.collect::<Vec<_>>()
+)
// We can now serialize this postings, by pushing each document to the
// postings serializer.
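To make the offset arithmetic from the comments above concrete, a small standalone sketch (the per-segment max_doc values are made up):

    // Worked example of the doc id stacking described above: segment i's doc
    // ids are shifted by the sum of max_doc over all preceding segments.
    fn main() {
        let max_docs = [10u32, 20, 15]; // hypothetical per-segment max_doc values
        let mut offsets = Vec::with_capacity(max_docs.len());
        let mut total = 0u32;
        for &max_doc in &max_docs {
            offsets.push(total); // this segment's doc ids map to [total, total + max_doc)
            total += max_doc;
        }
        assert_eq!(offsets, [0, 10, 30]);
        // e.g. doc 3 of segment 2 becomes doc 30 + 3 = 33 in the merged segment.
    }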

View File

@@ -2,7 +2,7 @@ use std::fmt;
use common::BinarySerializable;
use super::Field;
+use std::str;
/// Term represents the value that the token can take.
@@ -65,7 +65,9 @@ impl Term {
Term(buffer)
}
-/// Returns the serialized value associated to the field.
+/// Returns the serialized value of the term.
+/// (this does not include the field.)
+///
///
/// If the term is a string, its value is utf-8 encoded.
/// If the term is a u32, its value is encoded according
/// to `byteorder::LittleEndian`.
@@ -73,6 +75,16 @@ impl Term {
&self.0[1..]
}
+/// Returns the text associated with the term.
+///
+/// # Safety
+/// The value must be valid utf-8. This may not hold
+/// if the index is corrupted or if you call this
+/// method on a non-string term; `from_utf8_unchecked`
+/// does not validate its input, so invalid utf-8 is
+/// undefined behavior rather than a panic.
+pub unsafe fn text(&self) -> &str {
+str::from_utf8_unchecked(self.value())
+}
/// Sets the text only, keeping the field untouched.
pub fn set_text(&mut self, text: &str) {
self.0.resize(1, 0u8);
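For reference, a standalone sketch (buffer contents made up) of the one-byte-field layout that `value()` (`&self.0[1..]`) and `text()` rely on in this version, together with a checked alternative to the unsafe accessor:

    fn main() {
        // Term's backing buffer here: byte 0 is the field id,
        // the remaining bytes are the serialized value.
        let term_buffer: Vec<u8> = vec![2u8, b'h', b'i']; // field 2, text "hi"
        let value = &term_buffer[1..]; // what Term::value() returns
        // Checked counterpart of the unsafe text() above: validate the utf-8
        // instead of assuming it, so a corrupted index yields an error rather
        // than undefined behavior.
        let text = std::str::from_utf8(value).expect("term value is not valid utf-8");
        assert_eq!(text, "hi");
    }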