mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-03 17:10:48 +00:00
issue/67 First stab. Iterator working.
This commit is contained in:
@@ -7,7 +7,7 @@ mod segment_component;
|
||||
mod segment;
|
||||
mod index_meta;
|
||||
mod pool;
|
||||
|
||||
mod term_iterator;
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
@@ -19,7 +19,7 @@ pub use self::segment::SegmentInfo;
|
||||
pub use self::segment::SerializableSegment;
|
||||
pub use self::index::Index;
|
||||
pub use self::index_meta::{IndexMeta, SegmentMeta};
|
||||
|
||||
pub use self::term_iterator::TermIterator;
|
||||
|
||||
lazy_static! {
|
||||
pub static ref META_FILEPATH: PathBuf = PathBuf::from("meta.json");
|
||||
|
||||
@@ -7,6 +7,7 @@ use query::Query;
|
||||
use DocId;
|
||||
use DocAddress;
|
||||
use schema::Term;
|
||||
use core::TermIterator;
|
||||
|
||||
|
||||
/// Holds a list of `SegmentReader`s ready for search.
|
||||
@@ -47,9 +48,18 @@ impl Searcher {
|
||||
.map(|segment_reader| segment_reader.doc_freq(term))
|
||||
.fold(0u32, |acc, val| acc + val)
|
||||
}
|
||||
|
||||
/// Returns a Stream over all of the sorted unique terms of
|
||||
/// the searcher.
|
||||
///
|
||||
/// This includes all of the fields from all of the segment_readers.
|
||||
/// See [TermIterator](struct.TermIterator.html).
|
||||
pub fn terms<'a>(&'a self) -> TermIterator<'a> {
|
||||
TermIterator::from(self.segment_readers())
|
||||
}
|
||||
|
||||
/// Return the list of segment readers
|
||||
pub fn segment_readers(&self,) -> &Vec<SegmentReader> {
|
||||
pub fn segment_readers(&self,) -> &[SegmentReader] {
|
||||
&self.segment_readers
|
||||
}
|
||||
|
||||
|
||||
191
src/core/term_iterator.rs
Normal file
191
src/core/term_iterator.rs
Normal file
@@ -0,0 +1,191 @@
|
||||
use fst::Streamer;
|
||||
use std::mem;
|
||||
use std::collections::BinaryHeap;
|
||||
use fst::map::Keys;
|
||||
use schema::Term;
|
||||
use core::SegmentReader;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
|
||||
static EMPTY: [u8; 0] = [];
|
||||
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
struct HeapItem {
|
||||
term: Term,
|
||||
segment_ord: usize,
|
||||
}
|
||||
|
||||
impl PartialOrd for HeapItem {
|
||||
fn partial_cmp(&self, other: &HeapItem) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for HeapItem {
|
||||
fn cmp(&self, other: &HeapItem) -> Ordering {
|
||||
(&other.term, &other.segment_ord).cmp(&(&self.term, &self.segment_ord))
|
||||
}
|
||||
}
|
||||
|
||||
/// Given a list of sorted term streams,
|
||||
/// returns an iterator over sorted unique terms.
|
||||
///
|
||||
/// The item yield is actually a pair with
|
||||
/// - the term
|
||||
/// - a slice with the ordinal of the segments containing
|
||||
/// the terms.
|
||||
pub struct TermIterator<'a> {
|
||||
key_streams: Vec<Keys<'a>>,
|
||||
heap: BinaryHeap<HeapItem>,
|
||||
// Buffer hosting the list of segment ordinals containing
|
||||
// the current term.
|
||||
current_term: Term,
|
||||
current_segment_ords: Vec<usize>,
|
||||
}
|
||||
|
||||
impl<'a> TermIterator<'a> {
|
||||
fn new(key_streams: Vec<Keys<'a>>) -> TermIterator<'a> {
|
||||
let key_streams_len = key_streams.len();
|
||||
let mut term_iterator = TermIterator {
|
||||
key_streams: key_streams,
|
||||
heap: BinaryHeap::new(),
|
||||
current_term: Term::from(&EMPTY[..]),
|
||||
current_segment_ords: vec![],
|
||||
};
|
||||
for segment_ord in 0..key_streams_len {
|
||||
term_iterator.push_next_segment_el(segment_ord);
|
||||
}
|
||||
term_iterator
|
||||
}
|
||||
|
||||
fn push_next_segment_el(&mut self, segment_ord: usize) {
|
||||
self.current_segment_ords.push(segment_ord);
|
||||
if let Some(term) = self.key_streams[segment_ord].next() {
|
||||
self.heap.push(HeapItem {
|
||||
term: Term::from(term),
|
||||
segment_ord: segment_ord,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'f> Streamer<'a> for TermIterator<'f> {
|
||||
type Item = (&'a Term, &'a [usize]);
|
||||
|
||||
fn next(&'a mut self) -> Option<Self::Item> {
|
||||
self.current_segment_ords.clear();
|
||||
self.heap
|
||||
.pop()
|
||||
.map(move |mut head| {
|
||||
mem::swap(&mut self.current_term, &mut head.term);
|
||||
self.push_next_segment_el(head.segment_ord);
|
||||
loop {
|
||||
match self.heap.peek() {
|
||||
Some(&ref next_heap_it) if next_heap_it.term == self.current_term => {}
|
||||
_ => {
|
||||
break;
|
||||
}
|
||||
}
|
||||
let next_heap_it = self.heap
|
||||
.pop()
|
||||
.expect("This is only reached if an element was \
|
||||
peeked beforehand.");
|
||||
self.push_next_segment_el(next_heap_it.segment_ord);
|
||||
}
|
||||
(&self.current_term, self.current_segment_ords.as_slice())
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a [SegmentReader]> for TermIterator<'a> {
|
||||
fn from(segment_readers: &'a [SegmentReader]) -> TermIterator<'a> {
|
||||
TermIterator::new(segment_readers.iter()
|
||||
.map(|reader| reader.term_infos().keys())
|
||||
.collect())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use schema::{SchemaBuilder, Document, TEXT};
|
||||
use core::Index;
|
||||
|
||||
#[test]
|
||||
fn test_term_iterator() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
{
|
||||
// writing the segment
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a b d f");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
{
|
||||
// writing the segment
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a b c d f");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
{
|
||||
// writing the segment
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "e f");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
}
|
||||
let searcher = index.searcher();
|
||||
let mut term_it = searcher.terms();
|
||||
{
|
||||
|
||||
let (term, segments) = term_it.next().unwrap();
|
||||
assert_eq!(term.value(), "a".as_bytes());
|
||||
let expected_segments = [0, 1];
|
||||
assert_eq!(segments, &expected_segments);
|
||||
|
||||
}
|
||||
{
|
||||
let (term, segments): (&Term, &[usize]) = term_it.next().unwrap();
|
||||
assert_eq!(term.value(), "b".as_bytes());
|
||||
let expected_segments = [0, 1];
|
||||
assert_eq!(segments, &expected_segments);
|
||||
}
|
||||
{
|
||||
let (ref term, ref segments) = term_it.next().unwrap();
|
||||
assert_eq!(term.value(), "c".as_bytes());
|
||||
let expected_segments = [1];
|
||||
assert_eq!(segments, &expected_segments);
|
||||
}
|
||||
{
|
||||
let (term, segments) = term_it.next().unwrap();
|
||||
assert_eq!(term.value(), "d".as_bytes());
|
||||
let expected_segments = [0, 1];
|
||||
assert_eq!(segments, &expected_segments);
|
||||
}
|
||||
{
|
||||
let (term, segments) = term_it.next().unwrap();
|
||||
assert_eq!(term.value(), "e".as_bytes());
|
||||
let expected_segments = [2];
|
||||
assert_eq!(segments, &expected_segments);
|
||||
}
|
||||
{
|
||||
let (term, segments) = term_it.next().unwrap();
|
||||
assert_eq!(term.value(), "f".as_bytes());
|
||||
let expected_segments = [0, 1, 2];
|
||||
assert_eq!(segments, &expected_segments);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user