diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 9a7471881..d8df2fa8f 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -14,7 +14,7 @@ use DocId; use std::str; use std::cmp; use postings::TermInfo; -use termdict::FstMap; +use termdict::TermDictionary; use std::sync::Arc; use std::fmt; use schema::Field; @@ -43,7 +43,7 @@ use schema::TextIndexingOptions; pub struct SegmentReader { segment_id: SegmentId, segment_meta: SegmentMeta, - terms: Arc>, + terms: Arc, postings_data: ReadOnlySource, store_reader: StoreReader, fast_fields_reader: Arc, @@ -135,7 +135,7 @@ impl SegmentReader { pub fn open(segment: Segment) -> Result { let source = try!(segment.open_read(SegmentComponent::TERMS)); - let terms = try!(FstMap::from_source(source)); + let terms = try!(TermDictionary::from_source(source)); let store_reader = StoreReader::from(try!(segment.open_read(SegmentComponent::STORE))); let postings_shared_mmap = try!(segment.open_read(SegmentComponent::POSTINGS)); @@ -172,7 +172,7 @@ impl SegmentReader { } /// Return the term dictionary datastructure. - pub fn terms(&self) -> &FstMap { + pub fn terms(&self) -> &TermDictionary { &self.terms } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 6a038812c..0a774a160 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -273,7 +273,7 @@ fn index_documents(heap: &mut Heap, // // Tantivy does not resize its hashtable. When it reaches // capacity, we just stop indexing new document. - if segment_writer.is_termdic_saturated() { + if segment_writer.is_term_saturated() { info!("Term dic saturated, flushing segment with maxdoc={}.", segment_writer.max_doc()); break; diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index ffaae2cca..c5d4d6662 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -98,8 +98,8 @@ impl<'a> SegmentWriter<'a> { /// Return true if the term dictionary hashmap is reaching capacity. /// It is one of the condition that triggers a `SegmentWriter` to /// be finalized. - pub(crate) fn is_termdic_saturated(&self) -> bool { - self.multifield_postings.is_termdic_saturated() + pub(crate) fn is_term_saturated(&self) -> bool { + self.multifield_postings.is_term_saturated() } diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 33d297e62..65fba2f20 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -109,7 +109,7 @@ impl<'a> MultiFieldPostingsWriter<'a> { } /// Return true iff the term dictionary is saturated. - pub fn is_termdic_saturated(&self) -> bool { + pub fn is_term_saturated(&self) -> bool { self.term_index.is_saturated() } } diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 048f23dca..1313ad445 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -1,5 +1,5 @@ use Result; -use termdict::FstMapBuilder; +use termdict::TermDictionaryBuilder; use super::TermInfo; use schema::Field; use schema::FieldEntry; @@ -50,7 +50,7 @@ use common::BinarySerializable; /// A description of the serialization format is /// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html). pub struct PostingsSerializer { - terms_fst_builder: FstMapBuilder, + terms_fst_builder: TermDictionaryBuilder, postings_write: WritePtr, positions_write: WritePtr, written_bytes_postings: usize, @@ -74,7 +74,7 @@ impl PostingsSerializer { positions_write: WritePtr, schema: Schema) -> Result { - let terms_fst_builder = try!(FstMapBuilder::new(terms_write)); + let terms_fst_builder = try!(TermDictionaryBuilder::new(terms_write)); Ok(PostingsSerializer { terms_fst_builder: terms_fst_builder, postings_write: postings_write, diff --git a/src/termdict/fstmap.rs.bk b/src/termdict/fstmap.rs.bk deleted file mode 100644 index 732d14084..000000000 --- a/src/termdict/fstmap.rs.bk +++ /dev/null @@ -1,183 +0,0 @@ -use std::io::{self, Write}; -use fst; -use fst::raw::Fst; -use super::{FstMapStreamerBuilder, FstMapStreamer}; -use directory::ReadOnlySource; -use common::BinarySerializable; -use std::marker::PhantomData; -use schema::{Field, Term}; - - -fn convert_fst_error(e: fst::Error) -> io::Error { - io::Error::new(io::ErrorKind::Other, e) -} - -pub struct FstMapBuilder { - fst_builder: fst::MapBuilder, - data: Vec, - _phantom_: PhantomData, -} - -impl FstMapBuilder { - pub fn new(w: W) -> io::Result> { - let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?; - Ok(FstMapBuilder { - fst_builder: fst_builder, - data: Vec::new(), - _phantom_: PhantomData, - }) - } - - /// Horribly unsafe, nobody should ever do that... except me :) - /// - /// If used, it must be used by systematically alternating calls - /// to insert_key and insert_value. - /// - /// TODO see if I can bend Rust typesystem to enforce that - /// in a nice way. - pub fn insert_key(&mut self, key: &[u8]) -> io::Result<()> { - self.fst_builder - .insert(key, self.data.len() as u64) - .map_err(convert_fst_error)?; - Ok(()) - } - - /// Horribly unsafe, nobody should ever do that... except me :) - pub fn insert_value(&mut self, value: &V) -> io::Result<()> { - value.serialize(&mut self.data)?; - Ok(()) - } - - #[cfg(test)] - pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()> { - self.fst_builder - .insert(key, self.data.len() as u64) - .map_err(convert_fst_error)?; - value.serialize(&mut self.data)?; - Ok(()) - } - - pub fn finish(self) -> io::Result { - let mut file = self.fst_builder.into_inner().map_err(convert_fst_error)?; - let footer_size = self.data.len() as u32; - file.write_all(&self.data)?; - (footer_size as u32).serialize(&mut file)?; - file.flush()?; - Ok(file) - } -} - -pub struct FstMap { - fst_index: fst::Map, - values_mmap: ReadOnlySource, - _phantom_: PhantomData, -} - - -fn open_fst_index(source: ReadOnlySource) -> io::Result { - Ok(fst::Map::from(match source { - ReadOnlySource::Anonymous(data) => { - Fst::from_shared_bytes(data.data, data.start, data.len) - .map_err(convert_fst_error)? - } - ReadOnlySource::Mmap(mmap_readonly) => { - Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)? - } - })) -} - -impl FstMap - where V: BinarySerializable -{ - pub fn from_source(source: ReadOnlySource) -> io::Result> { - let total_len = source.len(); - let length_offset = total_len - 4; - let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..]; - let footer_size = u32::deserialize(&mut split_len_buffer)? as usize; - let split_len = length_offset - footer_size; - let fst_source = source.slice(0, split_len); - let values_source = source.slice(split_len, length_offset); - let fst_index = open_fst_index(fst_source)?; - Ok(FstMap { - fst_index: fst_index, - values_mmap: values_source, - _phantom_: PhantomData, - }) - } - - - /// In the `FstMap`, the dictionary itself associated - /// each key `&[u8]` to a `u64` that is in fact the address - /// of the value object in a data array. - /// - /// This method deserialize this object, and returns it. - pub(crate) fn read_value(&self, offset: u64) -> io::Result { - let buffer = self.values_mmap.as_slice(); - let mut cursor = &buffer[(offset as usize)..]; - V::deserialize(&mut cursor) - } - - /// Returns, if present the value associated to a given key. - pub fn get>(&self, key: K) -> Option { - self.fst_index - .get(key) - .map(|offset| self.read_value(offset).expect("The fst is corrupted. Failed to deserialize a value.")) - } - - - /// Returns a stream of all the sorted terms. - pub fn stream(&self) -> FstMapStreamer { - self.range().into_stream() - } - - - /// Returns a stream of all the sorted terms in the given field. - pub fn stream_field(&self, field: Field) -> FstMapStreamer { - let start_term = Term::from_field_text(field, ""); - let stop_term = Term::from_field_text(Field(field.0 + 1), ""); - self.range() - .ge(start_term.as_slice()) - .lt(stop_term.as_slice()) - .into_stream() - } - - /// Returns a range builder, to stream all of the terms - /// within an interval. - pub fn range(&self) -> FstMapStreamerBuilder { - FstMapStreamerBuilder::new(self, self.fst_index.range()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use directory::{RAMDirectory, Directory}; - use std::path::PathBuf; - use fst::Streamer; - - #[test] - fn test_fstmap() { - let mut directory = RAMDirectory::create(); - let path = PathBuf::from("fstmap"); - { - let write = directory.open_write(&path).unwrap(); - let mut fstmap_builder = FstMapBuilder::new(write).unwrap(); - fstmap_builder.insert("abc".as_bytes(), &34u32).unwrap(); - fstmap_builder.insert("abcd".as_bytes(), &346u32).unwrap(); - fstmap_builder.finish().unwrap(); - } - let source = directory.open_read(&path).unwrap(); - let fstmap: FstMap = FstMap::from_source(source).unwrap(); - assert_eq!(fstmap.get("abc"), Some(34u32)); - assert_eq!(fstmap.get("abcd"), Some(346u32)); - let mut stream = fstmap.stream(); - assert_eq!(stream.next().unwrap(), "abc".as_bytes()); - assert_eq!(stream.key(), "abc".as_bytes()); - assert_eq!(stream.value(), 34u32); - assert_eq!(stream.next().unwrap(), "abcd".as_bytes()); - assert_eq!(stream.key(), "abcd".as_bytes()); - assert_eq!(stream.value(), 346u32); - assert!(!stream.advance()); - } - -} diff --git a/src/termdict/fstmerger.rs b/src/termdict/merger.rs similarity index 97% rename from src/termdict/fstmerger.rs rename to src/termdict/merger.rs index a4342855a..e1d3fd155 100644 --- a/src/termdict/fstmerger.rs +++ b/src/termdict/merger.rs @@ -1,6 +1,6 @@ use std::collections::BinaryHeap; use core::SegmentReader; -use super::FstMapStreamer; +use super::TermStreamer; use common::BinarySerializable; use postings::TermInfo; use std::cmp::Ordering; @@ -9,7 +9,7 @@ use fst::Streamer; pub struct HeapItem<'a, V> where V: 'a + BinarySerializable { - pub streamer: FstMapStreamer<'a, V>, + pub streamer: TermStreamer<'a, V>, pub segment_ord: usize, } @@ -56,7 +56,7 @@ pub struct FstMerger<'a, V> impl<'a, V> FstMerger<'a, V> where V: 'a + BinarySerializable { - fn new(streams: Vec>) -> FstMerger<'a, V> { + fn new(streams: Vec>) -> FstMerger<'a, V> { FstMerger { heap: BinaryHeap::new(), current_streamers: streams diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index 3ef5c4c63..708538744 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -15,12 +15,12 @@ sorted. */ -mod fstmap; +mod termdict; mod streamer; -mod fstmerger; +mod merger; -pub use self::fstmap::FstMap; -pub(crate) use self::fstmap::FstMapBuilder; -pub use self::streamer::FstMapStreamer; -pub use self::streamer::FstMapStreamerBuilder; -pub use self::fstmerger::FstMerger; \ No newline at end of file +pub use self::termdict::TermDictionary; +pub(crate) use self::termdict::TermDictionaryBuilder; +pub use self::streamer::TermStreamer; +pub use self::streamer::TermStreamerBuilder; +pub use self::merger::FstMerger; \ No newline at end of file diff --git a/src/termdict/streamer.rs b/src/termdict/streamer.rs index 3bef03389..c5784e6bd 100644 --- a/src/termdict/streamer.rs +++ b/src/termdict/streamer.rs @@ -1,16 +1,16 @@ use fst::{self, IntoStreamer, Streamer}; use fst::map::{StreamBuilder, Stream}; use common::BinarySerializable; -use super::FstMap; +use super::TermDictionary; -pub struct FstMapStreamerBuilder<'a, V> +pub struct TermStreamerBuilder<'a, V> where V: 'a + BinarySerializable { - fst_map: &'a FstMap, + fst_map: &'a TermDictionary, stream_builder: StreamBuilder<'a>, } -impl<'a, V> FstMapStreamerBuilder<'a, V> +impl<'a, V> TermStreamerBuilder<'a, V> where V: 'a + BinarySerializable { pub fn ge>(mut self, bound: T) -> Self { @@ -33,8 +33,8 @@ impl<'a, V> FstMapStreamerBuilder<'a, V> self } - pub fn into_stream(self) -> FstMapStreamer<'a, V> { - FstMapStreamer { + pub fn into_stream(self) -> TermStreamer<'a, V> { + TermStreamer { fst_map: self.fst_map, stream: self.stream_builder.into_stream(), buffer: Vec::with_capacity(100), @@ -42,10 +42,10 @@ impl<'a, V> FstMapStreamerBuilder<'a, V> } } - pub fn new(fst_map: &'a FstMap, + pub fn new(fst_map: &'a TermDictionary, stream_builder: StreamBuilder<'a>) - -> FstMapStreamerBuilder<'a, V> { - FstMapStreamerBuilder { + -> TermStreamerBuilder<'a, V> { + TermStreamerBuilder { fst_map: fst_map, stream_builder: stream_builder, } @@ -56,17 +56,17 @@ impl<'a, V> FstMapStreamerBuilder<'a, V> -pub struct FstMapStreamer<'a, V> +pub struct TermStreamer<'a, V> where V: 'a + BinarySerializable { - fst_map: &'a FstMap, + fst_map: &'a TermDictionary, stream: Stream<'a>, offset: u64, buffer: Vec, } -impl<'a, 'b, V> fst::Streamer<'b> for FstMapStreamer<'a, V> +impl<'a, 'b, V> fst::Streamer<'b> for TermStreamer<'a, V> where V: 'a + BinarySerializable { type Item = &'b [u8]; @@ -80,7 +80,7 @@ impl<'a, 'b, V> fst::Streamer<'b> for FstMapStreamer<'a, V> } } -impl<'a, V> FstMapStreamer<'a, V> +impl<'a, V> TermStreamer<'a, V> where V: 'a + BinarySerializable { pub fn advance(&mut self) -> bool { diff --git a/src/termdict/fstmap.rs b/src/termdict/termdict.rs similarity index 73% rename from src/termdict/fstmap.rs rename to src/termdict/termdict.rs index bcf8c3a50..232826234 100644 --- a/src/termdict/fstmap.rs +++ b/src/termdict/termdict.rs @@ -1,11 +1,12 @@ use std::io::{self, Write}; use fst; use fst::raw::Fst; -use super::{FstMapStreamerBuilder, FstMapStreamer}; +use super::{TermStreamerBuilder, TermStreamer}; use directory::ReadOnlySource; use common::BinarySerializable; use std::marker::PhantomData; use schema::{Field, Term}; +use postings::TermInfo; fn convert_fst_error(e: fst::Error) -> io::Error { @@ -13,43 +14,39 @@ fn convert_fst_error(e: fst::Error) -> io::Error { } -pub struct FstMapBuilder { +pub struct TermDictionaryBuilder where V: BinarySerializable { fst_builder: fst::MapBuilder, data: Vec, _phantom_: PhantomData, } -impl FstMapBuilder { - pub fn new(w: W) -> io::Result> { +impl TermDictionaryBuilder { + pub fn new(w: W) -> io::Result> { let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?; - Ok(FstMapBuilder { + Ok(TermDictionaryBuilder { fst_builder: fst_builder, data: Vec::new(), _phantom_: PhantomData, }) } - /// Horribly unsafe, nobody should ever do that... except me :) + /// Horribly unsafe internal API /// /// If used, it must be used by systematically alternating calls /// to insert_key and insert_value. - /// - /// TODO see if I can bend Rust typesystem to enforce that - /// in a nice way. - pub fn insert_key(&mut self, key: &[u8]) -> io::Result<()> { + pub(crate) fn insert_key(&mut self, key: &[u8]) -> io::Result<()> { self.fst_builder .insert(key, self.data.len() as u64) .map_err(convert_fst_error)?; Ok(()) } - /// Horribly unsafe, nobody should ever do that... except me :) - pub fn insert_value(&mut self, value: &V) -> io::Result<()> { + /// Horribly unsafe internal API + pub(crate) fn insert_value(&mut self, value: &V) -> io::Result<()> { value.serialize(&mut self.data)?; Ok(()) } - #[cfg(test)] pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()> { self.fst_builder .insert(key, self.data.len() as u64) @@ -68,7 +65,7 @@ impl FstMapBuilder { } } -pub struct FstMap { +pub struct TermDictionary where V: BinarySerializable { fst_index: fst::Map, values_mmap: ReadOnlySource, _phantom_: PhantomData, @@ -89,10 +86,9 @@ fn open_fst_index(source: ReadOnlySource) -> io::Result { Ok(fst::Map::from(fst)) } -impl FstMap - where V: BinarySerializable +impl TermDictionary where V: BinarySerializable { - pub fn from_source(source: ReadOnlySource) -> io::Result> { + pub fn from_source(source: ReadOnlySource) -> io::Result> { let total_len = source.len(); let length_offset = total_len - 4; let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..]; @@ -101,7 +97,7 @@ impl FstMap let fst_source = source.slice(0, split_len); let values_source = source.slice(split_len, length_offset); let fst_index = open_fst_index(fst_source)?; - Ok(FstMap { + Ok(TermDictionary { fst_index: fst_index, values_mmap: values_source, _phantom_: PhantomData, @@ -109,7 +105,6 @@ impl FstMap } - /// /// This method deserialize this object, and returns it. pub(crate) fn read_value(&self, offset: u64) -> io::Result { let buffer = self.values_mmap.as_slice(); @@ -129,13 +124,13 @@ impl FstMap /// Returns a stream of all the sorted terms. - pub fn stream(&self) -> FstMapStreamer { + pub fn stream(&self) -> TermStreamer { self.range().into_stream() } /// Returns a stream of all the sorted terms in the given field. - pub fn stream_field(&self, field: Field) -> FstMapStreamer { + pub fn stream_field(&self, field: Field) -> TermStreamer { let start_term = Term::from_field_text(field, ""); let stop_term = Term::from_field_text(Field(field.0 + 1), ""); self.range() @@ -146,8 +141,8 @@ impl FstMap /// Returns a range builder, to stream all of the terms /// within an interval. - pub fn range(&self) -> FstMapStreamerBuilder { - FstMapStreamerBuilder::new(self, self.fst_index.range()) + pub fn range(&self) -> TermStreamerBuilder { + TermStreamerBuilder::new(self, self.fst_index.range()) } } @@ -159,21 +154,21 @@ mod tests { use fst::Streamer; #[test] - fn test_fstmap() { + fn test_term_dictionary() { let mut directory = RAMDirectory::create(); - let path = PathBuf::from("fstmap"); + let path = PathBuf::from("TermDictionary"); { let write = directory.open_write(&path).unwrap(); - let mut fstmap_builder = FstMapBuilder::new(write).unwrap(); - fstmap_builder.insert("abc".as_bytes(), &34u32).unwrap(); - fstmap_builder.insert("abcd".as_bytes(), &346u32).unwrap(); - fstmap_builder.finish().unwrap(); + let mut term_dictionary_builder = TermDictionaryBuilder::new(write).unwrap(); + term_dictionary_builder.insert("abc".as_bytes(), &34u32).unwrap(); + term_dictionary_builder.insert("abcd".as_bytes(), &346u32).unwrap(); + term_dictionary_builder.finish().unwrap(); } let source = directory.open_read(&path).unwrap(); - let fstmap: FstMap = FstMap::from_source(source).unwrap(); - assert_eq!(fstmap.get("abc"), Some(34u32)); - assert_eq!(fstmap.get("abcd"), Some(346u32)); - let mut stream = fstmap.stream(); + let term_dict: TermDictionary = TermDictionary::from_source(source).unwrap(); + assert_eq!(term_dict.get("abc"), Some(34u32)); + assert_eq!(term_dict.get("abcd"), Some(346u32)); + let mut stream = term_dict.stream(); assert_eq!(stream.next().unwrap(), "abc".as_bytes()); assert_eq!(stream.key(), "abc".as_bytes()); assert_eq!(stream.value(), 34u32);