mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-06)
Issue #67 - Removed segment ord array from term iteration.
The per-term array of segment ordinals was probably a premature optimization.
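In effect, the commit narrows TermIterator's `Streamer` item from a (term, segment ordinals) pair to the term alone. A minimal before/after sketch of caller code, distilled from the merger and test changes below:

    // Before: each term arrived with the ordinals of the segments containing it.
    while let Some((term, segment_ords)) = merged_terms.next() {
        // ...
    }

    // After: only the term is yielded; callers that need per-segment postings
    // now probe every reader for the term instead (see IndexMerger below).
    while let Some(term) = merged_terms.next() {
        // ...
    }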
@@ -54,10 +54,13 @@ impl Searcher {
     ///
     /// This includes all of the fields from all of the segment_readers.
     /// See [TermIterator](struct.TermIterator.html).
     ///
     /// # Warning
     /// This API is very likely to change in the future.
     pub fn terms<'a>(&'a self) -> TermIterator<'a> {
         TermIterator::from(self.segment_readers())
     }

     /// Return the list of segment readers
     pub fn segment_readers(&self,) -> &[SegmentReader] {
         &self.segment_readers
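For reference, end-to-end usage of this API, as exercised by the reworked test further down (`index` is assumed to be an already-built tantivy Index):

    let searcher = index.searcher();
    let mut term_it = searcher.terms();
    while let Some(term) = term_it.next() {
        // Safety: `text()` skips utf-8 validation, so this is only sound for
        // string terms from an uncorrupted index (see the Term change below).
        let text = unsafe { term.text() };
        println!("{}", text);
    }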
@@ -2,12 +2,12 @@ use fst::Streamer;
 use std::mem;
 use std::collections::BinaryHeap;
 use fst::map::Keys;
+use schema::Field;
 use schema::Term;
 use core::SegmentReader;
 use std::cmp::Ordering;


-static EMPTY: [u8; 0] = [];

 #[derive(PartialEq, Eq, Debug)]
 struct HeapItem {
@@ -49,7 +49,7 @@ impl<'a> TermIterator<'a> {
         let mut term_iterator = TermIterator {
             key_streams: key_streams,
             heap: BinaryHeap::new(),
-            current_term: Term::from(&EMPTY[..]),
+            current_term: Term::from_field_text(Field(0), ""),
             current_segment_ords: vec![],
         };
         for segment_ord in 0..key_streams_len {
@@ -70,7 +70,7 @@ impl<'a> TermIterator<'a> {
 }

 impl<'a, 'f> Streamer<'a> for TermIterator<'f> {
-    type Item = (&'a Term, &'a [usize]);
+    type Item = &'a Term;

     fn next(&'a mut self) -> Option<Self::Item> {
         self.current_segment_ords.clear();
@@ -82,26 +82,24 @@ impl<'a, 'f> Streamer<'a> for TermIterator<'f> {
             loop {
                 match self.heap.peek() {
                     Some(&ref next_heap_it) if next_heap_it.term == self.current_term => {}
-                    _ => {
-                        break;
-                    }
+                    _ => { break; }
                 }
-                let next_heap_it = self.heap
-                    .pop()
-                    .expect("This is only reached if an element was \
-                             peeked beforehand.");
+                let next_heap_it = self.heap.pop().unwrap(); // safe : we peeked beforehand
                 self.push_next_segment_el(next_heap_it.segment_ord);
             }
-            (&self.current_term, self.current_segment_ords.as_slice())
+            &self.current_term
         })
     }
 }

 impl<'a> From<&'a [SegmentReader]> for TermIterator<'a> {
     fn from(segment_readers: &'a [SegmentReader]) -> TermIterator<'a> {
-        TermIterator::new(segment_readers.iter()
-            .map(|reader| reader.term_infos().keys())
-            .collect())
+        TermIterator::new(
+            segment_readers
+                .iter()
+                .map(|reader| reader.term_infos().keys())
+                .collect()
+        )
     }
 }
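For context, `TermIterator` is a classic k-way merge: each segment's term dictionary is a sorted key stream, the smallest head sits in a `BinaryHeap`, and equal terms from several segments are popped together before advancing. A self-contained sketch of the same technique over plain sorted slices (the function name and the `Reverse` min-heap trick are illustrative, not tantivy code):

    use std::cmp::Reverse;
    use std::collections::BinaryHeap;

    /// Merge several sorted streams, yielding each distinct item once.
    fn merge_sorted<'a>(streams: &[&'a [&'a str]]) -> Vec<&'a str> {
        // Min-heap of (head item, stream ordinal, position within stream).
        let mut heap = BinaryHeap::new();
        for (ord, s) in streams.iter().enumerate() {
            if let Some(&head) = s.first() {
                heap.push(Reverse((head, ord, 0usize)));
            }
        }
        let mut out: Vec<&str> = Vec::new();
        while let Some(Reverse((item, ord, pos))) = heap.pop() {
            // Keep only the first occurrence of each item, mirroring the
            // peek/pop loop in `next()` that drains equal terms.
            if out.last() != Some(&item) {
                out.push(item);
            }
            if let Some(&next) = streams[ord].get(pos + 1) {
                heap.push(Reverse((next, ord, pos + 1)));
            }
        }
        out
    }

With the three segments from the reworked test (["a", "b", "d", "f"], ["a", "b", "c", "d", "f"], ["e", "f"]), this yields "a" through "f" once each, matching the "abcdef" assertion.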
@@ -148,44 +146,13 @@ mod tests {
         }
         let searcher = index.searcher();
         let mut term_it = searcher.terms();
-        {
-
-            let (term, segments) = term_it.next().unwrap();
-            assert_eq!(term.value(), "a".as_bytes());
-            let expected_segments = [0, 1];
-            assert_eq!(segments, &expected_segments);
-
-        }
-        {
-            let (term, segments): (&Term, &[usize]) = term_it.next().unwrap();
-            assert_eq!(term.value(), "b".as_bytes());
-            let expected_segments = [0, 1];
-            assert_eq!(segments, &expected_segments);
-        }
-        {
-            let (ref term, ref segments) = term_it.next().unwrap();
-            assert_eq!(term.value(), "c".as_bytes());
-            let expected_segments = [1];
-            assert_eq!(segments, &expected_segments);
-        }
-        {
-            let (term, segments) = term_it.next().unwrap();
-            assert_eq!(term.value(), "d".as_bytes());
-            let expected_segments = [0, 1];
-            assert_eq!(segments, &expected_segments);
-        }
-        {
-            let (term, segments) = term_it.next().unwrap();
-            assert_eq!(term.value(), "e".as_bytes());
-            let expected_segments = [2];
-            assert_eq!(segments, &expected_segments);
-        }
-        {
-            let (term, segments) = term_it.next().unwrap();
-            assert_eq!(term.value(), "f".as_bytes());
-            let expected_segments = [0, 1, 2];
-            assert_eq!(segments, &expected_segments);
-        }
+        let mut terms = String::new();
+        while let Some(term) = term_it.next() {
+            unsafe {
+                terms.push_str(term.text());
+            }
+        }
+        assert_eq!(terms, "abcdef");
     }
 }
@@ -132,7 +132,7 @@ impl IndexMerger {
             max_doc += reader.max_doc();
         }

-        while let Some((term, segment_ords)) = merged_terms.next() {
+        while let Some(term) = merged_terms.next() {
             // Create the total list of doc ids
             // by stacking the doc ids from the different segment.
             //
@@ -143,17 +143,18 @@ impl IndexMerger {
             // - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, seg0.max_doc + seg1.max_doc + seg2.max_doc]
             // ...
             let mut merged_postings =
-                ChainedPostings::from(segment_ords.iter()
-                    .flat_map(|segment_ord| {
-                        let offset = offsets[*segment_ord];
-                        self.readers[*segment_ord]
-                            .read_postings_all_info(&term)
-                            .map(|segment_postings| {
-                                OffsetPostings::new(segment_postings,
-                                                    offset)
-                            })
-                    })
-                    .collect::<Vec<_>>());
+                ChainedPostings::from(
+                    self.readers
+                        .iter()
+                        .enumerate()
+                        .flat_map(|(segment_ord, reader)| {
+                            let offset = offsets[segment_ord];
+                            reader
+                                .read_postings_all_info(&term)
+                                .map(|segment_postings| OffsetPostings::new(segment_postings, offset))
+                        })
+                        .collect::<Vec<_>>()
+                );

             // We can now serialize this postings, by pushing each document to the
             // postings serializer.
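The offset arithmetic described in the comments above is a prefix sum over `max_doc`: segment i's doc ids are shifted by the combined `max_doc` of segments 0..i. A standalone sketch (the helper name is hypothetical; `OffsetPostings` itself is not reproduced):

    /// Doc-id offset of each segment: a running prefix sum of max_doc.
    fn doc_id_offsets(max_docs: &[u32]) -> Vec<u32> {
        let mut offsets = Vec::with_capacity(max_docs.len());
        let mut total = 0u32;
        for &max_doc in max_docs {
            offsets.push(total);
            total += max_doc;
        }
        offsets
    }

For example, with max_docs [10, 7, 5], segment 0 keeps ids 0..10, segment 1's ids shift into 10..17, and segment 2's into 17..22, which is exactly the shift `OffsetPostings::new(segment_postings, offset)` applies before the lists are chained.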
@@ -2,7 +2,7 @@ use std::fmt;

 use common::BinarySerializable;
 use super::Field;
-
+use std::str;


 /// Term represents the value that the token can take.
@@ -65,7 +65,9 @@ impl Term {
         Term(buffer)
     }

-    /// Returns the serialized value associated to the field.
+    /// Returns the serialized value of the term.
+    /// (this does not include the field.)
+    ///
     /// If the term is a string, its value is utf-8 encoded.
     /// If the term is a u32, its value is encoded according
     /// to `byteorder::LittleEndian`.
@@ -73,6 +75,16 @@ impl Term {
         &self.0[1..]
     }

+    /// Returns the text associated with the term.
+    ///
+    /// # Panics
+    /// If the value is not valid utf-8. This may happen
+    /// if the index is corrupted or if you try to
+    /// call this method on a non-string type.
+    pub unsafe fn text(&self) -> &str {
+        str::from_utf8_unchecked(self.value())
+    }
+
     /// Set the texts only, keeping the field untouched.
     pub fn set_text(&mut self, text: &str) {
         self.0.resize(1, 0u8);
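`text()` is `unsafe` because it skips utf-8 validation of the value bytes (everything after the leading field byte, per `value()` above). A checked alternative a caller could write against the same accessor (hypothetical helper, not part of the commit):

    /// Safe counterpart to `Term::text()`: validate instead of trusting
    /// the index; returns None for non-string or corrupted terms.
    fn term_text_checked(term: &Term) -> Option<&str> {
        std::str::from_utf8(term.value()).ok()
    }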