From b3f62b8accf6b1fbcfeecff2ff302619b74a38c9 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 18 May 2017 23:33:15 +0900 Subject: [PATCH] Better API --- src/core/segment_reader.rs | 12 +++++------ src/datastruct/fstmap/fstmap.rs | 32 +++++++++++++++++++++++++++--- src/datastruct/fstmap/fstmerger.rs | 2 +- src/datastruct/fstmap/streamer.rs | 4 +++- src/schema/field_type.rs | 2 +- 5 files changed, 40 insertions(+), 12 deletions(-) diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 1be0451f6..25f2a023f 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -43,7 +43,7 @@ use schema::TextIndexingOptions; pub struct SegmentReader { segment_id: SegmentId, segment_meta: SegmentMeta, - term_infos: Arc>, + terms: Arc>, postings_data: ReadOnlySource, store_reader: StoreReader, fast_fields_reader: Arc, @@ -135,7 +135,7 @@ impl SegmentReader { pub fn open(segment: Segment) -> Result { let source = try!(segment.open_read(SegmentComponent::TERMS)); - let term_infos = try!(FstMap::from_source(source)); + let terms = try!(FstMap::from_source(source)); let store_reader = StoreReader::from(try!(segment.open_read(SegmentComponent::STORE))); let postings_shared_mmap = try!(segment.open_read(SegmentComponent::POSTINGS)); @@ -160,7 +160,7 @@ impl SegmentReader { Ok(SegmentReader { segment_meta: segment.meta().clone(), postings_data: postings_shared_mmap, - term_infos: Arc::new(term_infos), + terms: Arc::new(terms), segment_id: segment.id(), store_reader: store_reader, fast_fields_reader: Arc::new(fast_fields_reader), @@ -172,8 +172,8 @@ impl SegmentReader { } /// Return the term dictionary datastructure. - pub fn term_infos(&self) -> &FstMap { - &self.term_infos + pub fn terms(&self) -> &FstMap { + &self.terms } /// Returns the document (or to be accurate, its stored field) @@ -259,7 +259,7 @@ impl SegmentReader { /// Returns the term info associated with the term. 
pub fn get_term_info(&self, term: &Term) -> Option { - self.term_infos.get(term.as_slice()) + self.terms.get(term.as_slice()) } /// Returns the segment id diff --git a/src/datastruct/fstmap/fstmap.rs b/src/datastruct/fstmap/fstmap.rs index f6fedf577..8338cee37 100644 --- a/src/datastruct/fstmap/fstmap.rs +++ b/src/datastruct/fstmap/fstmap.rs @@ -5,6 +5,7 @@ use super::{FstMapStreamerBuilder, FstMapStreamer}; use directory::ReadOnlySource; use common::BinarySerializable; use std::marker::PhantomData; +use schema::{Field, Term}; fn convert_fst_error(e: fst::Error) -> io::Error { @@ -104,22 +105,47 @@ impl FstMap }) } - pub(crate) fn read_value(&self, offset: u64) -> V { + + /// In the `FstMap`, the dictionary itself maps + /// each key `&[u8]` to a `u64` that is in fact the address + /// of the value object in a data array. + /// + /// This method deserializes that object and returns it. + pub(crate) fn read_value(&self, offset: u64) -> io::Result { let buffer = self.values_mmap.as_slice(); let mut cursor = &buffer[(offset as usize)..]; - V::deserialize(&mut cursor).expect("Data in FST is corrupted") + V::deserialize(&mut cursor) } + /// Returns the value associated with the given key, if present. pub fn get>(&self, key: K) -> Option { self.fst_index .get(key) - .map(|offset| self.read_value(offset)) + .map(|offset| { + self.read_value(offset) + .expect("The fst is corrupted. Failed to deserialize a value.") + }) } + + /// Returns a stream of all the sorted terms. pub fn stream(&self) -> FstMapStreamer { self.range().into_stream() } + + /// Returns a stream of all the sorted terms in the given field. + pub fn stream_field(&self, field: Field) -> FstMapStreamer { + let start_term = Term::from_field_text(field, ""); + let stop_term = Term::from_field_text(Field(field.0 + 1), ""); + self.range() + .ge(start_term.as_slice()) + .lt(stop_term.as_slice()) + .into_stream() + } + + /// Returns a range builder, to stream all of the terms + /// within an interval. 
pub fn range(&self) -> FstMapStreamerBuilder { FstMapStreamerBuilder::new(self, self.fst_index.range()) } diff --git a/src/datastruct/fstmap/fstmerger.rs b/src/datastruct/fstmap/fstmerger.rs index c535bc28e..a4342855a 100644 --- a/src/datastruct/fstmap/fstmerger.rs +++ b/src/datastruct/fstmap/fstmerger.rs @@ -136,7 +136,7 @@ impl<'a> From<&'a [SegmentReader]> for FstMerger<'a, TermInfo> fn from(segment_readers: &'a [SegmentReader]) -> FstMerger<'a, TermInfo> { FstMerger::new(segment_readers .iter() - .map(|reader| reader.term_infos().stream()) + .map(|reader| reader.terms().stream()) .collect()) } } diff --git a/src/datastruct/fstmap/streamer.rs b/src/datastruct/fstmap/streamer.rs index ba8edcccb..3bef03389 100644 --- a/src/datastruct/fstmap/streamer.rs +++ b/src/datastruct/fstmap/streamer.rs @@ -99,6 +99,8 @@ impl<'a, V> FstMapStreamer<'a, V> } pub fn value(&self) -> V { - self.fst_map.read_value(self.offset) + self.fst_map + .read_value(self.offset) + .expect("Fst data is corrupted. Failed to deserialize a value.") } } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 06b62cdc8..fcaaf8013 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -49,7 +49,7 @@ impl FieldType { FieldType::Str(ref text_options) => { match text_options.get_indexing_options() { TextIndexingOptions::Untokenized | - TextIndexingOptions::TokenizedNoFreq => Some(SegmentPostingsOption::NoFreq), + TextIndexingOptions::TokenizedNoFreq => Some(SegmentPostingsOption::NoFreq), TextIndexingOptions::TokenizedWithFreq => Some(SegmentPostingsOption::Freq), TextIndexingOptions::TokenizedWithFreqAndPosition => { Some(SegmentPostingsOption::FreqAndPositions)