From 02bfa9be527efc05991fa20bfba9d5496236e959 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 19 May 2017 08:43:52 +0900 Subject: [PATCH] Moving to termdict --- src/core/searcher.rs | 2 +- src/core/segment_reader.rs | 2 +- src/datastruct/fstmap/mod.rs | 9 - src/datastruct/mod.rs | 1 - src/directory/mod.rs | 5 + src/fastfield/mod.rs | 46 ++--- src/indexer/merger.rs | 2 +- src/lib.rs | 6 +- src/postings/serializer.rs | 2 +- src/{datastruct/fstmap => termdict}/fstmap.rs | 26 +-- src/termdict/fstmap.rs.bk | 183 ++++++++++++++++++ .../fstmap => termdict}/fstmerger.rs | 0 src/termdict/mod.rs | 26 +++ .../fstmap => termdict}/streamer.rs | 0 14 files changed, 258 insertions(+), 52 deletions(-) delete mode 100644 src/datastruct/fstmap/mod.rs rename src/{datastruct/fstmap => termdict}/fstmap.rs (89%) create mode 100644 src/termdict/fstmap.rs.bk rename src/{datastruct/fstmap => termdict}/fstmerger.rs (100%) create mode 100644 src/termdict/mod.rs rename src/{datastruct/fstmap => termdict}/streamer.rs (100%) diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 45bdfccb7..daa547f00 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -7,7 +7,7 @@ use query::Query; use DocId; use DocAddress; use schema::Term; -use datastruct::fstmap::FstMerger; +use termdict::FstMerger; use std::fmt; use postings::TermInfo; diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 25f2a023f..9a7471881 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -14,7 +14,7 @@ use DocId; use std::str; use std::cmp; use postings::TermInfo; -use datastruct::fstmap::FstMap; +use termdict::FstMap; use std::sync::Arc; use std::fmt; use schema::Field; diff --git a/src/datastruct/fstmap/mod.rs b/src/datastruct/fstmap/mod.rs deleted file mode 100644 index 62d1a786d..000000000 --- a/src/datastruct/fstmap/mod.rs +++ /dev/null @@ -1,9 +0,0 @@ -mod fstmap; -mod streamer; -mod fstmerger; - -pub use self::fstmap::FstMap; -pub use self::fstmap::FstMapBuilder; -pub use self::streamer::FstMapStreamer; -pub use self::streamer::FstMapStreamerBuilder; -pub use self::fstmerger::FstMerger; diff --git a/src/datastruct/mod.rs b/src/datastruct/mod.rs index 96cc8242f..5ff10f7c7 100644 --- a/src/datastruct/mod.rs +++ b/src/datastruct/mod.rs @@ -1,4 +1,3 @@ -pub mod fstmap; mod skip; pub mod stacker; diff --git a/src/directory/mod.rs b/src/directory/mod.rs index f47cfdcbb..b107d78c5 100644 --- a/src/directory/mod.rs +++ b/src/directory/mod.rs @@ -1,3 +1,8 @@ +/*! + +WORM directory abstraction. + +*/ mod mmap_directory; mod ram_directory; mod directory; diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 061ed3910..84fe5307a 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -1,25 +1,27 @@ -//! # Fast fields -//! -//! Fast fields are the equivalent of `DocValues` in `Lucene`. -//! Fast fields is a non-compressed column-oriented fashion storage -//! of `tantivy`. -//! -//! It is designed for the fast random access of some document -//! fields given a document id. -//! -//! `FastField` are useful when a field is required for all or most of -//! the `DocSet` : for instance for scoring, grouping, filtering, or facetting. -//! -//! -//! Fields have to be declared as `FAST` in the schema. -//! Currently only 64-bits integers (signed or unsigned) are -//! supported. -//! -//! They are stored in a bitpacked fashion so that their -//! memory usage is directly linear with the amplitude of the -//! values stored. -//! -//! Read access performance is comparable to that of an array lookup. +/*! +Fast fields is a column oriented storage storage. + +It is the equivalent of `Lucene`'s `DocValues`. + +Fast fields is a column-oriented fashion storage of `tantivy`. + +It is designed for the fast random access of some document +fields given a document id. + +`FastField` are useful when a field is required for all or most of +the `DocSet` : for instance for scoring, grouping, filtering, or facetting. + + +Fields have to be declared as `FAST` in the schema. +Currently only 64-bits integers (signed or unsigned) are +supported. + +They are stored in a bitpacked fashion so that their +memory usage is directly linear with the amplitude of the +values stored. + +Read access performance is comparable to that of an array lookup. +*/ mod reader; mod writer; diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 8036c4977..f26177ea3 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -12,7 +12,7 @@ use postings::Postings; use postings::DocSet; use fastfield::DeleteBitSet; use schema::{Schema, Field}; -use datastruct::fstmap::FstMerger; +use termdict::FstMerger; use fastfield::FastFieldSerializer; use fastfield::FastFieldReader; use store::StoreWriter; diff --git a/src/lib.rs b/src/lib.rs index b0a7ddbb6..bfd098a96 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -111,9 +111,10 @@ mod datastruct; +pub mod termdict; + /// Query module pub mod query; -/// Directory module pub mod directory; /// Collector module pub mod collector; @@ -147,8 +148,7 @@ pub fn version() -> &'static str { } } -/// Tantivy's makes it possible to personalize when -/// the indexer should merge its segments +/// Defines tantivy's merging strategy pub mod merge_policy { pub use indexer::MergePolicy; pub use indexer::LogMergePolicy; diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 1f84999b9..048f23dca 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -1,5 +1,5 @@ use Result; -use datastruct::fstmap::FstMapBuilder; +use termdict::FstMapBuilder; use super::TermInfo; use schema::Field; use schema::FieldEntry; diff --git a/src/datastruct/fstmap/fstmap.rs b/src/termdict/fstmap.rs similarity index 89% rename from src/datastruct/fstmap/fstmap.rs rename to src/termdict/fstmap.rs index 8338cee37..bcf8c3a50 100644 --- a/src/datastruct/fstmap/fstmap.rs +++ b/src/termdict/fstmap.rs @@ -12,6 +12,7 @@ fn convert_fst_error(e: fst::Error) -> io::Error { io::Error::new(io::ErrorKind::Other, e) } + pub struct FstMapBuilder { fst_builder: fst::MapBuilder, data: Vec, @@ -75,15 +76,17 @@ pub struct FstMap { fn open_fst_index(source: ReadOnlySource) -> io::Result { - Ok(fst::Map::from(match source { - ReadOnlySource::Anonymous(data) => { - Fst::from_shared_bytes(data.data, data.start, data.len) - .map_err(convert_fst_error)? - } - ReadOnlySource::Mmap(mmap_readonly) => { - Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)? - } - })) + let fst = match source { + ReadOnlySource::Anonymous(data) => { + Fst::from_shared_bytes(data.data, data.start, data.len) + .map_err(convert_fst_error)? + } + ReadOnlySource::Mmap(mmap_readonly) => { + Fst::from_mmap(mmap_readonly) + .map_err(convert_fst_error)? + } + }; + Ok(fst::Map::from(fst)) } impl FstMap @@ -106,9 +109,6 @@ impl FstMap } - /// In the `FstMap`, the dictionary itself associated - /// each key `&[u8]` to a `u64` that is in fact the address - /// of the value object in a data array. /// /// This method deserialize this object, and returns it. pub(crate) fn read_value(&self, offset: u64) -> io::Result { @@ -125,7 +125,7 @@ impl FstMap self.read_value(offset) .expect("The fst is corrupted. Failed to deserialize a value.") }) - } + } /// Returns a stream of all the sorted terms. diff --git a/src/termdict/fstmap.rs.bk b/src/termdict/fstmap.rs.bk new file mode 100644 index 000000000..732d14084 --- /dev/null +++ b/src/termdict/fstmap.rs.bk @@ -0,0 +1,183 @@ +use std::io::{self, Write}; +use fst; +use fst::raw::Fst; +use super::{FstMapStreamerBuilder, FstMapStreamer}; +use directory::ReadOnlySource; +use common::BinarySerializable; +use std::marker::PhantomData; +use schema::{Field, Term}; + + +fn convert_fst_error(e: fst::Error) -> io::Error { + io::Error::new(io::ErrorKind::Other, e) +} + +pub struct FstMapBuilder { + fst_builder: fst::MapBuilder, + data: Vec, + _phantom_: PhantomData, +} + +impl FstMapBuilder { + pub fn new(w: W) -> io::Result> { + let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?; + Ok(FstMapBuilder { + fst_builder: fst_builder, + data: Vec::new(), + _phantom_: PhantomData, + }) + } + + /// Horribly unsafe, nobody should ever do that... except me :) + /// + /// If used, it must be used by systematically alternating calls + /// to insert_key and insert_value. + /// + /// TODO see if I can bend Rust typesystem to enforce that + /// in a nice way. + pub fn insert_key(&mut self, key: &[u8]) -> io::Result<()> { + self.fst_builder + .insert(key, self.data.len() as u64) + .map_err(convert_fst_error)?; + Ok(()) + } + + /// Horribly unsafe, nobody should ever do that... except me :) + pub fn insert_value(&mut self, value: &V) -> io::Result<()> { + value.serialize(&mut self.data)?; + Ok(()) + } + + #[cfg(test)] + pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()> { + self.fst_builder + .insert(key, self.data.len() as u64) + .map_err(convert_fst_error)?; + value.serialize(&mut self.data)?; + Ok(()) + } + + pub fn finish(self) -> io::Result { + let mut file = self.fst_builder.into_inner().map_err(convert_fst_error)?; + let footer_size = self.data.len() as u32; + file.write_all(&self.data)?; + (footer_size as u32).serialize(&mut file)?; + file.flush()?; + Ok(file) + } +} + +pub struct FstMap { + fst_index: fst::Map, + values_mmap: ReadOnlySource, + _phantom_: PhantomData, +} + + +fn open_fst_index(source: ReadOnlySource) -> io::Result { + Ok(fst::Map::from(match source { + ReadOnlySource::Anonymous(data) => { + Fst::from_shared_bytes(data.data, data.start, data.len) + .map_err(convert_fst_error)? + } + ReadOnlySource::Mmap(mmap_readonly) => { + Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)? + } + })) +} + +impl FstMap + where V: BinarySerializable +{ + pub fn from_source(source: ReadOnlySource) -> io::Result> { + let total_len = source.len(); + let length_offset = total_len - 4; + let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..]; + let footer_size = u32::deserialize(&mut split_len_buffer)? as usize; + let split_len = length_offset - footer_size; + let fst_source = source.slice(0, split_len); + let values_source = source.slice(split_len, length_offset); + let fst_index = open_fst_index(fst_source)?; + Ok(FstMap { + fst_index: fst_index, + values_mmap: values_source, + _phantom_: PhantomData, + }) + } + + + /// In the `FstMap`, the dictionary itself associated + /// each key `&[u8]` to a `u64` that is in fact the address + /// of the value object in a data array. + /// + /// This method deserialize this object, and returns it. + pub(crate) fn read_value(&self, offset: u64) -> io::Result { + let buffer = self.values_mmap.as_slice(); + let mut cursor = &buffer[(offset as usize)..]; + V::deserialize(&mut cursor) + } + + /// Returns, if present the value associated to a given key. + pub fn get>(&self, key: K) -> Option { + self.fst_index + .get(key) + .map(|offset| self.read_value(offset).expect("The fst is corrupted. Failed to deserialize a value.")) + } + + + /// Returns a stream of all the sorted terms. + pub fn stream(&self) -> FstMapStreamer { + self.range().into_stream() + } + + + /// Returns a stream of all the sorted terms in the given field. + pub fn stream_field(&self, field: Field) -> FstMapStreamer { + let start_term = Term::from_field_text(field, ""); + let stop_term = Term::from_field_text(Field(field.0 + 1), ""); + self.range() + .ge(start_term.as_slice()) + .lt(stop_term.as_slice()) + .into_stream() + } + + /// Returns a range builder, to stream all of the terms + /// within an interval. + pub fn range(&self) -> FstMapStreamerBuilder { + FstMapStreamerBuilder::new(self, self.fst_index.range()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use directory::{RAMDirectory, Directory}; + use std::path::PathBuf; + use fst::Streamer; + + #[test] + fn test_fstmap() { + let mut directory = RAMDirectory::create(); + let path = PathBuf::from("fstmap"); + { + let write = directory.open_write(&path).unwrap(); + let mut fstmap_builder = FstMapBuilder::new(write).unwrap(); + fstmap_builder.insert("abc".as_bytes(), &34u32).unwrap(); + fstmap_builder.insert("abcd".as_bytes(), &346u32).unwrap(); + fstmap_builder.finish().unwrap(); + } + let source = directory.open_read(&path).unwrap(); + let fstmap: FstMap = FstMap::from_source(source).unwrap(); + assert_eq!(fstmap.get("abc"), Some(34u32)); + assert_eq!(fstmap.get("abcd"), Some(346u32)); + let mut stream = fstmap.stream(); + assert_eq!(stream.next().unwrap(), "abc".as_bytes()); + assert_eq!(stream.key(), "abc".as_bytes()); + assert_eq!(stream.value(), 34u32); + assert_eq!(stream.next().unwrap(), "abcd".as_bytes()); + assert_eq!(stream.key(), "abcd".as_bytes()); + assert_eq!(stream.value(), 346u32); + assert!(!stream.advance()); + } + +} diff --git a/src/datastruct/fstmap/fstmerger.rs b/src/termdict/fstmerger.rs similarity index 100% rename from src/datastruct/fstmap/fstmerger.rs rename to src/termdict/fstmerger.rs diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs new file mode 100644 index 000000000..3ef5c4c63 --- /dev/null +++ b/src/termdict/mod.rs @@ -0,0 +1,26 @@ +/*! +The term dictionary contains all of the terms in +`tantivy index` in a sorted manner. + +It is implemented as a wrapper of the `Fst` crate in order +to add a value type. + +A finite state transducer itself associates +each term `&[u8]` to a `u64` that is in fact an address +in a buffer. The value is then accessible via +deserializing the value at this address. + +Keys (`&[u8]`) in this datastructure are +sorted. + +*/ + +mod fstmap; +mod streamer; +mod fstmerger; + +pub use self::fstmap::FstMap; +pub(crate) use self::fstmap::FstMapBuilder; +pub use self::streamer::FstMapStreamer; +pub use self::streamer::FstMapStreamerBuilder; +pub use self::fstmerger::FstMerger; \ No newline at end of file diff --git a/src/datastruct/fstmap/streamer.rs b/src/termdict/streamer.rs similarity index 100% rename from src/datastruct/fstmap/streamer.rs rename to src/termdict/streamer.rs