Renamed fstmap to termdict

This commit is contained in:
Paul Masurel
2017-05-19 09:26:18 +09:00
parent 02bfa9be52
commit c57ab6a335
10 changed files with 62 additions and 250 deletions

View File

@@ -14,7 +14,7 @@ use DocId;
use std::str;
use std::cmp;
use postings::TermInfo;
use termdict::FstMap;
use termdict::TermDictionary;
use std::sync::Arc;
use std::fmt;
use schema::Field;
@@ -43,7 +43,7 @@ use schema::TextIndexingOptions;
pub struct SegmentReader {
segment_id: SegmentId,
segment_meta: SegmentMeta,
terms: Arc<FstMap<TermInfo>>,
terms: Arc<TermDictionary>,
postings_data: ReadOnlySource,
store_reader: StoreReader,
fast_fields_reader: Arc<FastFieldsReader>,
@@ -135,7 +135,7 @@ impl SegmentReader {
pub fn open(segment: Segment) -> Result<SegmentReader> {
let source = try!(segment.open_read(SegmentComponent::TERMS));
let terms = try!(FstMap::from_source(source));
let terms = try!(TermDictionary::from_source(source));
let store_reader = StoreReader::from(try!(segment.open_read(SegmentComponent::STORE)));
let postings_shared_mmap = try!(segment.open_read(SegmentComponent::POSTINGS));
@@ -172,7 +172,7 @@ impl SegmentReader {
}
/// Return the term dictionary datastructure.
pub fn terms(&self) -> &FstMap<TermInfo> {
pub fn terms(&self) -> &TermDictionary {
&self.terms
}

View File

@@ -273,7 +273,7 @@ fn index_documents(heap: &mut Heap,
//
// Tantivy does not resize its hashtable. When it reaches
// capacity, we just stop indexing new document.
if segment_writer.is_termdic_saturated() {
if segment_writer.is_term_saturated() {
info!("Term dic saturated, flushing segment with maxdoc={}.",
segment_writer.max_doc());
break;

View File

@@ -98,8 +98,8 @@ impl<'a> SegmentWriter<'a> {
/// Return true if the term dictionary hashmap is reaching capacity.
/// It is one of the condition that triggers a `SegmentWriter` to
/// be finalized.
pub(crate) fn is_termdic_saturated(&self) -> bool {
self.multifield_postings.is_termdic_saturated()
pub(crate) fn is_term_saturated(&self) -> bool {
self.multifield_postings.is_term_saturated()
}

View File

@@ -109,7 +109,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
}
/// Return true iff the term dictionary is saturated.
pub fn is_termdic_saturated(&self) -> bool {
pub fn is_term_saturated(&self) -> bool {
self.term_index.is_saturated()
}
}

View File

@@ -1,5 +1,5 @@
use Result;
use termdict::FstMapBuilder;
use termdict::TermDictionaryBuilder;
use super::TermInfo;
use schema::Field;
use schema::FieldEntry;
@@ -50,7 +50,7 @@ use common::BinarySerializable;
/// A description of the serialization format is
/// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html).
pub struct PostingsSerializer {
terms_fst_builder: FstMapBuilder<WritePtr, TermInfo>,
terms_fst_builder: TermDictionaryBuilder<WritePtr, TermInfo>,
postings_write: WritePtr,
positions_write: WritePtr,
written_bytes_postings: usize,
@@ -74,7 +74,7 @@ impl PostingsSerializer {
positions_write: WritePtr,
schema: Schema)
-> Result<PostingsSerializer> {
let terms_fst_builder = try!(FstMapBuilder::new(terms_write));
let terms_fst_builder = try!(TermDictionaryBuilder::new(terms_write));
Ok(PostingsSerializer {
terms_fst_builder: terms_fst_builder,
postings_write: postings_write,

View File

@@ -1,183 +0,0 @@
use std::io::{self, Write};
use fst;
use fst::raw::Fst;
use super::{FstMapStreamerBuilder, FstMapStreamer};
use directory::ReadOnlySource;
use common::BinarySerializable;
use std::marker::PhantomData;
use schema::{Field, Term};
fn convert_fst_error(e: fst::Error) -> io::Error {
io::Error::new(io::ErrorKind::Other, e)
}
pub struct FstMapBuilder<W: Write, V: BinarySerializable> {
fst_builder: fst::MapBuilder<W>,
data: Vec<u8>,
_phantom_: PhantomData<V>,
}
impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
pub fn new(w: W) -> io::Result<FstMapBuilder<W, V>> {
let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?;
Ok(FstMapBuilder {
fst_builder: fst_builder,
data: Vec::new(),
_phantom_: PhantomData,
})
}
/// Horribly unsafe, nobody should ever do that... except me :)
///
/// If used, it must be used by systematically alternating calls
/// to insert_key and insert_value.
///
/// TODO see if I can bend Rust typesystem to enforce that
/// in a nice way.
pub fn insert_key(&mut self, key: &[u8]) -> io::Result<()> {
self.fst_builder
.insert(key, self.data.len() as u64)
.map_err(convert_fst_error)?;
Ok(())
}
/// Horribly unsafe, nobody should ever do that... except me :)
pub fn insert_value(&mut self, value: &V) -> io::Result<()> {
value.serialize(&mut self.data)?;
Ok(())
}
#[cfg(test)]
pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()> {
self.fst_builder
.insert(key, self.data.len() as u64)
.map_err(convert_fst_error)?;
value.serialize(&mut self.data)?;
Ok(())
}
pub fn finish(self) -> io::Result<W> {
let mut file = self.fst_builder.into_inner().map_err(convert_fst_error)?;
let footer_size = self.data.len() as u32;
file.write_all(&self.data)?;
(footer_size as u32).serialize(&mut file)?;
file.flush()?;
Ok(file)
}
}
pub struct FstMap<V: BinarySerializable> {
fst_index: fst::Map,
values_mmap: ReadOnlySource,
_phantom_: PhantomData<V>,
}
fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
Ok(fst::Map::from(match source {
ReadOnlySource::Anonymous(data) => {
Fst::from_shared_bytes(data.data, data.start, data.len)
.map_err(convert_fst_error)?
}
ReadOnlySource::Mmap(mmap_readonly) => {
Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)?
}
}))
}
impl<V> FstMap<V>
where V: BinarySerializable
{
pub fn from_source(source: ReadOnlySource) -> io::Result<FstMap<V>> {
let total_len = source.len();
let length_offset = total_len - 4;
let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..];
let footer_size = u32::deserialize(&mut split_len_buffer)? as usize;
let split_len = length_offset - footer_size;
let fst_source = source.slice(0, split_len);
let values_source = source.slice(split_len, length_offset);
let fst_index = open_fst_index(fst_source)?;
Ok(FstMap {
fst_index: fst_index,
values_mmap: values_source,
_phantom_: PhantomData,
})
}
/// In the `FstMap`, the dictionary itself associated
/// each key `&[u8]` to a `u64` that is in fact the address
/// of the value object in a data array.
///
/// This method deserialize this object, and returns it.
pub(crate) fn read_value(&self, offset: u64) -> io::Result<V> {
let buffer = self.values_mmap.as_slice();
let mut cursor = &buffer[(offset as usize)..];
V::deserialize(&mut cursor)
}
/// Returns, if present the value associated to a given key.
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<V> {
self.fst_index
.get(key)
.map(|offset| self.read_value(offset).expect("The fst is corrupted. Failed to deserialize a value."))
}
/// Returns a stream of all the sorted terms.
pub fn stream(&self) -> FstMapStreamer<V> {
self.range().into_stream()
}
/// Returns a stream of all the sorted terms in the given field.
pub fn stream_field(&self, field: Field) -> FstMapStreamer<V> {
let start_term = Term::from_field_text(field, "");
let stop_term = Term::from_field_text(Field(field.0 + 1), "");
self.range()
.ge(start_term.as_slice())
.lt(stop_term.as_slice())
.into_stream()
}
/// Returns a range builder, to stream all of the terms
/// within an interval.
pub fn range(&self) -> FstMapStreamerBuilder<V> {
FstMapStreamerBuilder::new(self, self.fst_index.range())
}
}
#[cfg(test)]
mod tests {
use super::*;
use directory::{RAMDirectory, Directory};
use std::path::PathBuf;
use fst::Streamer;
#[test]
fn test_fstmap() {
let mut directory = RAMDirectory::create();
let path = PathBuf::from("fstmap");
{
let write = directory.open_write(&path).unwrap();
let mut fstmap_builder = FstMapBuilder::new(write).unwrap();
fstmap_builder.insert("abc".as_bytes(), &34u32).unwrap();
fstmap_builder.insert("abcd".as_bytes(), &346u32).unwrap();
fstmap_builder.finish().unwrap();
}
let source = directory.open_read(&path).unwrap();
let fstmap: FstMap<u32> = FstMap::from_source(source).unwrap();
assert_eq!(fstmap.get("abc"), Some(34u32));
assert_eq!(fstmap.get("abcd"), Some(346u32));
let mut stream = fstmap.stream();
assert_eq!(stream.next().unwrap(), "abc".as_bytes());
assert_eq!(stream.key(), "abc".as_bytes());
assert_eq!(stream.value(), 34u32);
assert_eq!(stream.next().unwrap(), "abcd".as_bytes());
assert_eq!(stream.key(), "abcd".as_bytes());
assert_eq!(stream.value(), 346u32);
assert!(!stream.advance());
}
}

View File

@@ -1,6 +1,6 @@
use std::collections::BinaryHeap;
use core::SegmentReader;
use super::FstMapStreamer;
use super::TermStreamer;
use common::BinarySerializable;
use postings::TermInfo;
use std::cmp::Ordering;
@@ -9,7 +9,7 @@ use fst::Streamer;
pub struct HeapItem<'a, V>
where V: 'a + BinarySerializable
{
pub streamer: FstMapStreamer<'a, V>,
pub streamer: TermStreamer<'a, V>,
pub segment_ord: usize,
}
@@ -56,7 +56,7 @@ pub struct FstMerger<'a, V>
impl<'a, V> FstMerger<'a, V>
where V: 'a + BinarySerializable
{
fn new(streams: Vec<FstMapStreamer<'a, V>>) -> FstMerger<'a, V> {
fn new(streams: Vec<TermStreamer<'a, V>>) -> FstMerger<'a, V> {
FstMerger {
heap: BinaryHeap::new(),
current_streamers: streams

View File

@@ -15,12 +15,12 @@ sorted.
*/
mod fstmap;
mod termdict;
mod streamer;
mod fstmerger;
mod merger;
pub use self::fstmap::FstMap;
pub(crate) use self::fstmap::FstMapBuilder;
pub use self::streamer::FstMapStreamer;
pub use self::streamer::FstMapStreamerBuilder;
pub use self::fstmerger::FstMerger;
pub use self::termdict::TermDictionary;
pub(crate) use self::termdict::TermDictionaryBuilder;
pub use self::streamer::TermStreamer;
pub use self::streamer::TermStreamerBuilder;
pub use self::merger::FstMerger;

View File

@@ -1,16 +1,16 @@
use fst::{self, IntoStreamer, Streamer};
use fst::map::{StreamBuilder, Stream};
use common::BinarySerializable;
use super::FstMap;
use super::TermDictionary;
pub struct FstMapStreamerBuilder<'a, V>
pub struct TermStreamerBuilder<'a, V>
where V: 'a + BinarySerializable
{
fst_map: &'a FstMap<V>,
fst_map: &'a TermDictionary<V>,
stream_builder: StreamBuilder<'a>,
}
impl<'a, V> FstMapStreamerBuilder<'a, V>
impl<'a, V> TermStreamerBuilder<'a, V>
where V: 'a + BinarySerializable
{
pub fn ge<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
@@ -33,8 +33,8 @@ impl<'a, V> FstMapStreamerBuilder<'a, V>
self
}
pub fn into_stream(self) -> FstMapStreamer<'a, V> {
FstMapStreamer {
pub fn into_stream(self) -> TermStreamer<'a, V> {
TermStreamer {
fst_map: self.fst_map,
stream: self.stream_builder.into_stream(),
buffer: Vec::with_capacity(100),
@@ -42,10 +42,10 @@ impl<'a, V> FstMapStreamerBuilder<'a, V>
}
}
pub fn new(fst_map: &'a FstMap<V>,
pub fn new(fst_map: &'a TermDictionary<V>,
stream_builder: StreamBuilder<'a>)
-> FstMapStreamerBuilder<'a, V> {
FstMapStreamerBuilder {
-> TermStreamerBuilder<'a, V> {
TermStreamerBuilder {
fst_map: fst_map,
stream_builder: stream_builder,
}
@@ -56,17 +56,17 @@ impl<'a, V> FstMapStreamerBuilder<'a, V>
pub struct FstMapStreamer<'a, V>
pub struct TermStreamer<'a, V>
where V: 'a + BinarySerializable
{
fst_map: &'a FstMap<V>,
fst_map: &'a TermDictionary<V>,
stream: Stream<'a>,
offset: u64,
buffer: Vec<u8>,
}
impl<'a, 'b, V> fst::Streamer<'b> for FstMapStreamer<'a, V>
impl<'a, 'b, V> fst::Streamer<'b> for TermStreamer<'a, V>
where V: 'a + BinarySerializable
{
type Item = &'b [u8];
@@ -80,7 +80,7 @@ impl<'a, 'b, V> fst::Streamer<'b> for FstMapStreamer<'a, V>
}
}
impl<'a, V> FstMapStreamer<'a, V>
impl<'a, V> TermStreamer<'a, V>
where V: 'a + BinarySerializable
{
pub fn advance(&mut self) -> bool {

View File

@@ -1,11 +1,12 @@
use std::io::{self, Write};
use fst;
use fst::raw::Fst;
use super::{FstMapStreamerBuilder, FstMapStreamer};
use super::{TermStreamerBuilder, TermStreamer};
use directory::ReadOnlySource;
use common::BinarySerializable;
use std::marker::PhantomData;
use schema::{Field, Term};
use postings::TermInfo;
fn convert_fst_error(e: fst::Error) -> io::Error {
@@ -13,43 +14,39 @@ fn convert_fst_error(e: fst::Error) -> io::Error {
}
pub struct FstMapBuilder<W: Write, V: BinarySerializable> {
pub struct TermDictionaryBuilder<W: Write, V=TermInfo> where V: BinarySerializable {
fst_builder: fst::MapBuilder<W>,
data: Vec<u8>,
_phantom_: PhantomData<V>,
}
impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
pub fn new(w: W) -> io::Result<FstMapBuilder<W, V>> {
impl<W: Write, V: BinarySerializable> TermDictionaryBuilder<W, V> {
pub fn new(w: W) -> io::Result<TermDictionaryBuilder<W, V>> {
let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?;
Ok(FstMapBuilder {
Ok(TermDictionaryBuilder {
fst_builder: fst_builder,
data: Vec::new(),
_phantom_: PhantomData,
})
}
/// Horribly unsafe, nobody should ever do that... except me :)
/// Horribly unsafe internal API
///
/// If used, it must be used by systematically alternating calls
/// to insert_key and insert_value.
///
/// TODO see if I can bend Rust typesystem to enforce that
/// in a nice way.
pub fn insert_key(&mut self, key: &[u8]) -> io::Result<()> {
pub(crate) fn insert_key(&mut self, key: &[u8]) -> io::Result<()> {
self.fst_builder
.insert(key, self.data.len() as u64)
.map_err(convert_fst_error)?;
Ok(())
}
/// Horribly unsafe, nobody should ever do that... except me :)
pub fn insert_value(&mut self, value: &V) -> io::Result<()> {
/// Horribly unsafe internal API
pub(crate) fn insert_value(&mut self, value: &V) -> io::Result<()> {
value.serialize(&mut self.data)?;
Ok(())
}
#[cfg(test)]
pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()> {
self.fst_builder
.insert(key, self.data.len() as u64)
@@ -68,7 +65,7 @@ impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
}
}
pub struct FstMap<V: BinarySerializable> {
pub struct TermDictionary<V=TermInfo> where V: BinarySerializable {
fst_index: fst::Map,
values_mmap: ReadOnlySource,
_phantom_: PhantomData<V>,
@@ -89,10 +86,9 @@ fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
Ok(fst::Map::from(fst))
}
impl<V> FstMap<V>
where V: BinarySerializable
impl<V> TermDictionary<V> where V: BinarySerializable
{
pub fn from_source(source: ReadOnlySource) -> io::Result<FstMap<V>> {
pub fn from_source(source: ReadOnlySource) -> io::Result<TermDictionary<V>> {
let total_len = source.len();
let length_offset = total_len - 4;
let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..];
@@ -101,7 +97,7 @@ impl<V> FstMap<V>
let fst_source = source.slice(0, split_len);
let values_source = source.slice(split_len, length_offset);
let fst_index = open_fst_index(fst_source)?;
Ok(FstMap {
Ok(TermDictionary {
fst_index: fst_index,
values_mmap: values_source,
_phantom_: PhantomData,
@@ -109,7 +105,6 @@ impl<V> FstMap<V>
}
///
/// This method deserialize this object, and returns it.
pub(crate) fn read_value(&self, offset: u64) -> io::Result<V> {
let buffer = self.values_mmap.as_slice();
@@ -129,13 +124,13 @@ impl<V> FstMap<V>
/// Returns a stream of all the sorted terms.
pub fn stream(&self) -> FstMapStreamer<V> {
pub fn stream(&self) -> TermStreamer<V> {
self.range().into_stream()
}
/// Returns a stream of all the sorted terms in the given field.
pub fn stream_field(&self, field: Field) -> FstMapStreamer<V> {
pub fn stream_field(&self, field: Field) -> TermStreamer<V> {
let start_term = Term::from_field_text(field, "");
let stop_term = Term::from_field_text(Field(field.0 + 1), "");
self.range()
@@ -146,8 +141,8 @@ impl<V> FstMap<V>
/// Returns a range builder, to stream all of the terms
/// within an interval.
pub fn range(&self) -> FstMapStreamerBuilder<V> {
FstMapStreamerBuilder::new(self, self.fst_index.range())
pub fn range(&self) -> TermStreamerBuilder<V> {
TermStreamerBuilder::new(self, self.fst_index.range())
}
}
@@ -159,21 +154,21 @@ mod tests {
use fst::Streamer;
#[test]
fn test_fstmap() {
fn test_term_dictionary() {
let mut directory = RAMDirectory::create();
let path = PathBuf::from("fstmap");
let path = PathBuf::from("TermDictionary");
{
let write = directory.open_write(&path).unwrap();
let mut fstmap_builder = FstMapBuilder::new(write).unwrap();
fstmap_builder.insert("abc".as_bytes(), &34u32).unwrap();
fstmap_builder.insert("abcd".as_bytes(), &346u32).unwrap();
fstmap_builder.finish().unwrap();
let mut term_dictionary_builder = TermDictionaryBuilder::new(write).unwrap();
term_dictionary_builder.insert("abc".as_bytes(), &34u32).unwrap();
term_dictionary_builder.insert("abcd".as_bytes(), &346u32).unwrap();
term_dictionary_builder.finish().unwrap();
}
let source = directory.open_read(&path).unwrap();
let fstmap: FstMap<u32> = FstMap::from_source(source).unwrap();
assert_eq!(fstmap.get("abc"), Some(34u32));
assert_eq!(fstmap.get("abcd"), Some(346u32));
let mut stream = fstmap.stream();
let term_dict: TermDictionary<u32> = TermDictionary::from_source(source).unwrap();
assert_eq!(term_dict.get("abc"), Some(34u32));
assert_eq!(term_dict.get("abcd"), Some(346u32));
let mut stream = term_dict.stream();
assert_eq!(stream.next().unwrap(), "abc".as_bytes());
assert_eq!(stream.key(), "abc".as_bytes());
assert_eq!(stream.value(), 34u32);