diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 548ffc2cb..0d99a2897 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -54,10 +54,13 @@ impl Searcher { /// /// This includes all of the fields from all of the segment_readers. /// See [TermIterator](struct.TermIterator.html). + /// + /// # Warning + /// This API is very likely to change in the future. pub fn terms<'a>(&'a self) -> TermIterator<'a> { TermIterator::from(self.segment_readers()) } - + /// Return the list of segment readers pub fn segment_readers(&self,) -> &[SegmentReader] { &self.segment_readers diff --git a/src/core/term_iterator.rs b/src/core/term_iterator.rs index 108bc03b6..3a9adf2d5 100644 --- a/src/core/term_iterator.rs +++ b/src/core/term_iterator.rs @@ -2,12 +2,12 @@ use fst::Streamer; use std::mem; use std::collections::BinaryHeap; use fst::map::Keys; +use schema::Field; use schema::Term; use core::SegmentReader; use std::cmp::Ordering; -static EMPTY: [u8; 0] = []; #[derive(PartialEq, Eq, Debug)] struct HeapItem { @@ -49,7 +49,7 @@ impl<'a> TermIterator<'a> { let mut term_iterator = TermIterator { key_streams: key_streams, heap: BinaryHeap::new(), - current_term: Term::from(&EMPTY[..]), + current_term: Term::from_field_text(Field(0), ""), current_segment_ords: vec![], }; for segment_ord in 0..key_streams_len { @@ -70,7 +70,7 @@ impl<'a> TermIterator<'a> { } impl<'a, 'f> Streamer<'a> for TermIterator<'f> { - type Item = (&'a Term, &'a [usize]); + type Item = &'a Term; fn next(&'a mut self) -> Option { self.current_segment_ords.clear(); @@ -82,26 +82,24 @@ impl<'a, 'f> Streamer<'a> for TermIterator<'f> { loop { match self.heap.peek() { Some(&ref next_heap_it) if next_heap_it.term == self.current_term => {} - _ => { - break; - } + _ => { break; } } - let next_heap_it = self.heap - .pop() - .expect("This is only reached if an element was \ - peeked beforehand."); + let next_heap_it = self.heap.pop().unwrap(); // safe : we peeked beforehand 
self.push_next_segment_el(next_heap_it.segment_ord); } - (&self.current_term, self.current_segment_ords.as_slice()) + &self.current_term }) } } impl<'a> From<&'a [SegmentReader]> for TermIterator<'a> { fn from(segment_readers: &'a [SegmentReader]) -> TermIterator<'a> { - TermIterator::new(segment_readers.iter() - .map(|reader| reader.term_infos().keys()) - .collect()) + TermIterator::new( + segment_readers + .iter() + .map(|reader| reader.term_infos().keys()) + .collect() + ) } } @@ -148,44 +146,13 @@ mod tests { } let searcher = index.searcher(); let mut term_it = searcher.terms(); - { - - let (term, segments) = term_it.next().unwrap(); - assert_eq!(term.value(), "a".as_bytes()); - let expected_segments = [0, 1]; - assert_eq!(segments, &expected_segments); - - } - { - let (term, segments): (&Term, &[usize]) = term_it.next().unwrap(); - assert_eq!(term.value(), "b".as_bytes()); - let expected_segments = [0, 1]; - assert_eq!(segments, &expected_segments); - } - { - let (ref term, ref segments) = term_it.next().unwrap(); - assert_eq!(term.value(), "c".as_bytes()); - let expected_segments = [1]; - assert_eq!(segments, &expected_segments); - } - { - let (term, segments) = term_it.next().unwrap(); - assert_eq!(term.value(), "d".as_bytes()); - let expected_segments = [0, 1]; - assert_eq!(segments, &expected_segments); - } - { - let (term, segments) = term_it.next().unwrap(); - assert_eq!(term.value(), "e".as_bytes()); - let expected_segments = [2]; - assert_eq!(segments, &expected_segments); - } - { - let (term, segments) = term_it.next().unwrap(); - assert_eq!(term.value(), "f".as_bytes()); - let expected_segments = [0, 1, 2]; - assert_eq!(segments, &expected_segments); + let mut terms = String::new(); + while let Some(term) = term_it.next() { + unsafe { + terms.push_str(term.text()); + } } + assert_eq!(terms, "abcdef"); } } \ No newline at end of file diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index ed575ecd2..4bc7e049e 100644 --- 
a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -132,7 +132,7 @@ impl IndexMerger { max_doc += reader.max_doc(); } - while let Some((term, segment_ords)) = merged_terms.next() { + while let Some(term) = merged_terms.next() { // Create the total list of doc ids // by stacking the doc ids from the different segment. // @@ -143,17 +143,18 @@ impl IndexMerger { // - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, seg0.max_doc + seg1.max_doc + seg2.max_doc] // ... let mut merged_postings = - ChainedPostings::from(segment_ords.iter() - .flat_map(|segment_ord| { - let offset = offsets[*segment_ord]; - self.readers[*segment_ord] - .read_postings_all_info(&term) - .map(|segment_postings| { - OffsetPostings::new(segment_postings, - offset) - }) - }) - .collect::>()); + ChainedPostings::from( + self.readers + .iter() + .enumerate() + .flat_map(|(segment_ord, reader)| { + let offset = offsets[segment_ord]; + reader + .read_postings_all_info(&term) + .map(|segment_postings| OffsetPostings::new(segment_postings, offset)) + }) + .collect::>() + ); // We can now serialize this postings, by pushing each document to the // postings serializer. diff --git a/src/schema/term.rs b/src/schema/term.rs index 4cb2adab4..c0c5009fe 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -2,7 +2,7 @@ use std::fmt; use common::BinarySerializable; use super::Field; - +use std::str; /// Term represents the value that the token can take. @@ -65,7 +65,9 @@ impl Term { Term(buffer) } - /// Returns the serialized value associated to the field. + /// Returns the serialized value of the term. + /// (this does not include the field.) + /// /// If the term is a string, its value is utf-8 encoded. /// If the term is a u32, its value is encoded according /// to `byteorder::LittleEndian`. @@ -73,6 +75,16 @@ impl Term { &self.0[1..] } + /// Returns the text associated with the term. + /// + /// # Panics + /// If the value is not valid utf-8. 
This may happen + /// if the index is corrupted or the term is not a string. + /// NOTE: the bytes are not validated, so this is UB rather than a panic. + pub unsafe fn text(&self) -> &str { + str::from_utf8_unchecked(self.value()) + } + /// Set the texts only, keeping the field untouched. pub fn set_text(&mut self, text: &str) { self.0.resize(1, 0u8);