Issue #67 - Removed segment ord array from term iteration.

This was probably an early optimization.
Paul Masurel
2016-12-17 09:38:57 +01:00
parent ca5f3e1d46
commit 4d7d201f21
4 changed files with 49 additions and 66 deletions

View File

@@ -54,10 +54,13 @@ impl Searcher {
///
/// This includes all of the fields from all of the segment_readers.
/// See [TermIterator](struct.TermIterator.html).
+///
+/// # Warning
+/// This API is very likely to change in the future.
pub fn terms<'a>(&'a self) -> TermIterator<'a> {
TermIterator::from(self.segment_readers())
}
/// Return the list of segment readers
pub fn segment_readers(&self,) -> &[SegmentReader] {
&self.segment_readers
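Usage note (not part of the commit): after this change, `terms()` yields bare `&Term` values. A minimal sketch, assuming tantivy's `Searcher` is in scope; note that `TermIterator` implements `fst::Streamer`, not `std::iter::Iterator`:

    use fst::Streamer; // TermIterator is a lending stream, not a std Iterator

    fn list_terms(searcher: &Searcher) {
        let mut term_it = searcher.terms();
        // next() now yields only `&Term`; before this commit it yielded
        // `(&Term, &[usize])`, the slice naming the segments containing the term.
        while let Some(term) = term_it.next() {
            println!("{:?}", term.value());
        }
    }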

View File

@@ -2,12 +2,12 @@ use fst::Streamer;
use std::mem;
use std::collections::BinaryHeap;
use fst::map::Keys;
+use schema::Field;
use schema::Term;
use core::SegmentReader;
use std::cmp::Ordering;
-static EMPTY: [u8; 0] = [];
#[derive(PartialEq, Eq, Debug)]
struct HeapItem {
@@ -49,7 +49,7 @@ impl<'a> TermIterator<'a> {
let mut term_iterator = TermIterator {
key_streams: key_streams,
heap: BinaryHeap::new(),
-current_term: Term::from(&EMPTY[..]),
+current_term: Term::from_field_text(Field(0), ""),
current_segment_ords: vec![],
};
for segment_ord in 0..key_streams_len {
@@ -70,7 +70,7 @@ impl<'a> TermIterator<'a> {
}
impl<'a, 'f> Streamer<'a> for TermIterator<'f> {
-type Item = (&'a Term, &'a [usize]);
+type Item = &'a Term;
fn next(&'a mut self) -> Option<Self::Item> {
self.current_segment_ords.clear();
@@ -82,26 +82,24 @@ impl<'a, 'f> Streamer<'a> for TermIterator<'f> {
loop {
match self.heap.peek() {
Some(&ref next_heap_it) if next_heap_it.term == self.current_term => {}
-_ => {
-break;
-}
+_ => { break; }
}
-let next_heap_it = self.heap
-.pop()
-.expect("This is only reached if an element was \
-peeked beforehand.");
+let next_heap_it = self.heap.pop().unwrap(); // safe : we peeked beforehand
self.push_next_segment_el(next_heap_it.segment_ord);
}
-(&self.current_term, self.current_segment_ords.as_slice())
+&self.current_term
})
}
}
impl<'a> From<&'a [SegmentReader]> for TermIterator<'a> {
fn from(segment_readers: &'a [SegmentReader]) -> TermIterator<'a> {
-TermIterator::new(segment_readers.iter()
-.map(|reader| reader.term_infos().keys())
-.collect())
+TermIterator::new(
+segment_readers
+.iter()
+.map(|reader| reader.term_infos().keys())
+.collect()
+)
}
}
@@ -148,44 +146,13 @@ mod tests {
}
let searcher = index.searcher();
let mut term_it = searcher.terms();
-{
-let (term, segments) = term_it.next().unwrap();
-assert_eq!(term.value(), "a".as_bytes());
-let expected_segments = [0, 1];
-assert_eq!(segments, &expected_segments);
-}
-{
-let (term, segments): (&Term, &[usize]) = term_it.next().unwrap();
-assert_eq!(term.value(), "b".as_bytes());
-let expected_segments = [0, 1];
-assert_eq!(segments, &expected_segments);
-}
-{
-let (ref term, ref segments) = term_it.next().unwrap();
-assert_eq!(term.value(), "c".as_bytes());
-let expected_segments = [1];
-assert_eq!(segments, &expected_segments);
-}
-{
-let (term, segments) = term_it.next().unwrap();
-assert_eq!(term.value(), "d".as_bytes());
-let expected_segments = [0, 1];
-assert_eq!(segments, &expected_segments);
-}
-{
-let (term, segments) = term_it.next().unwrap();
-assert_eq!(term.value(), "e".as_bytes());
-let expected_segments = [2];
-assert_eq!(segments, &expected_segments);
-}
-{
-let (term, segments) = term_it.next().unwrap();
-assert_eq!(term.value(), "f".as_bytes());
-let expected_segments = [0, 1, 2];
-assert_eq!(segments, &expected_segments);
-}
+let mut terms = String::new();
+while let Some(term) = term_it.next() {
+unsafe {
+terms.push_str(term.text());
+}
+}
+assert_eq!(terms, "abcdef");
}
}
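Aside (not from the commit): the merge loop above is a k-way merge — pop the smallest term off a `BinaryHeap` fed by the per-segment streams, advance the stream it came from, and skip duplicate pops of terms that occur in several segments. A self-contained sketch of that shape, with plain sorted `Vec<&str>` streams standing in for the fst key streams and `Reverse` tuples standing in for `HeapItem`'s ordering:

    use std::cmp::Reverse;
    use std::collections::BinaryHeap;

    // Merge several sorted streams into one ascending, de-duplicated sequence.
    // Each heap entry carries (key, stream_ord), mirroring TermIterator's
    // HeapItem { term, segment_ord }; Reverse turns the max-heap into a min-heap.
    fn merge_sorted(streams: &[Vec<&str>]) -> Vec<String> {
        let mut positions = vec![0usize; streams.len()];
        let mut heap = BinaryHeap::new();
        for (ord, stream) in streams.iter().enumerate() {
            if let Some(first) = stream.first() {
                heap.push(Reverse((first.to_string(), ord)));
            }
        }
        let mut merged: Vec<String> = Vec::new();
        while let Some(Reverse((key, ord))) = heap.pop() {
            // Advance the stream that produced this key.
            positions[ord] += 1;
            if let Some(next) = streams[ord].get(positions[ord]) {
                heap.push(Reverse((next.to_string(), ord)));
            }
            // A key present in several streams is popped several times in a
            // row; keep only its first occurrence.
            if merged.last() != Some(&key) {
                merged.push(key);
            }
        }
        merged
    }

    fn main() {
        let merged = merge_sorted(&[vec!["a", "b", "d", "f"],
                                    vec!["a", "b", "c", "d", "f"],
                                    vec!["e", "f"]]);
        assert_eq!(merged, ["a", "b", "c", "d", "e", "f"]);
    }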

View File

@@ -132,7 +132,7 @@ impl IndexMerger {
max_doc += reader.max_doc();
}
-while let Some((term, segment_ords)) = merged_terms.next() {
+while let Some(term) = merged_terms.next() {
// Create the total list of doc ids
// by stacking the doc ids from the different segments.
//
@@ -143,17 +143,18 @@ impl IndexMerger {
// - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, seg0.max_doc + seg1.max_doc + seg2.max_doc]
// ...
let mut merged_postings =
-ChainedPostings::from(segment_ords.iter()
-.flat_map(|segment_ord| {
-let offset = offsets[*segment_ord];
-self.readers[*segment_ord]
-.read_postings_all_info(&term)
-.map(|segment_postings| {
-OffsetPostings::new(segment_postings,
-offset)
-})
-})
-.collect::<Vec<_>>());
+ChainedPostings::from(
+self.readers
+.iter()
+.enumerate()
+.flat_map(|(segment_ord, reader)| {
+let offset = offsets[segment_ord];
+reader
+.read_postings_all_info(&term)
+.map(|segment_postings| OffsetPostings::new(segment_postings, offset))
+})
+.collect::<Vec<_>>()
+)
// We can now serialize this postings, by pushing each document to the
// postings serializer.
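To make the offset arithmetic from the comments above concrete, a small standalone sketch (the per-segment max_doc values are made up):

    // Worked example of the doc id stacking described above: segment i's doc
    // ids are shifted by the sum of max_doc over all preceding segments.
    fn main() {
        let max_docs = [10u32, 20, 15]; // hypothetical per-segment max_doc values
        let mut offsets = Vec::with_capacity(max_docs.len());
        let mut total = 0u32;
        for &max_doc in &max_docs {
            offsets.push(total); // this segment's doc ids map to [total, total + max_doc)
            total += max_doc;
        }
        assert_eq!(offsets, [0, 10, 30]);
        // e.g. doc 3 of segment 2 becomes doc 30 + 3 = 33 in the merged segment.
    }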

View File

@@ -2,7 +2,7 @@ use std::fmt;
use common::BinarySerializable;
use super::Field;
+use std::str;
/// Term represents the value that the token can take.
@@ -65,7 +65,9 @@ impl Term {
Term(buffer)
}
-/// Returns the serialized value associated to the field.
+/// Returns the serialized value of the term.
+/// (this does not include the field.)
+///
///
/// If the term is a string, its value is utf-8 encoded.
/// If the term is a u32, its value is encoded according
/// to `byteorder::LittleEndian`.
@@ -73,6 +75,16 @@ impl Term {
&self.0[1..]
}
+/// Returns the text associated with the term.
+///
+/// # Safety
+/// The value must be valid utf-8. This may not hold
+/// if the index is corrupted or if you call this
+/// method on a non-string term; `from_utf8_unchecked`
+/// does not validate its input, so invalid utf-8 is
+/// undefined behavior rather than a panic.
+pub unsafe fn text(&self) -> &str {
+str::from_utf8_unchecked(self.value())
+}
/// Sets the text only, keeping the field untouched.
pub fn set_text(&mut self, text: &str) {
self.0.resize(1, 0u8);
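For reference, a standalone sketch (buffer contents made up) of the one-byte-field layout that `value()` (`&self.0[1..]`) and `text()` rely on in this version, together with a checked alternative to the unsafe accessor:

    fn main() {
        // Term's backing buffer here: byte 0 is the field id,
        // the remaining bytes are the serialized value.
        let term_buffer: Vec<u8> = vec![2u8, b'h', b'i']; // field 2, text "hi"
        let value = &term_buffer[1..]; // what Term::value() returns
        // Checked counterpart of the unsafe text() above: validate the utf-8
        // instead of assuming it, so a corrupted index yields an error rather
        // than undefined behavior.
        let text = std::str::from_utf8(value).expect("term value is not valid utf-8");
        assert_eq!(text, "hi");
    }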