Moving to termdict

This commit is contained in:
Paul Masurel
2017-05-19 08:43:52 +09:00
parent b3f62b8acc
commit 02bfa9be52
14 changed files with 258 additions and 52 deletions

View File

@@ -7,7 +7,7 @@ use query::Query;
use DocId;
use DocAddress;
use schema::Term;
use datastruct::fstmap::FstMerger;
use termdict::FstMerger;
use std::fmt;
use postings::TermInfo;

View File

@@ -14,7 +14,7 @@ use DocId;
use std::str;
use std::cmp;
use postings::TermInfo;
use datastruct::fstmap::FstMap;
use termdict::FstMap;
use std::sync::Arc;
use std::fmt;
use schema::Field;

View File

@@ -1,9 +0,0 @@
mod fstmap;
mod streamer;
mod fstmerger;
pub use self::fstmap::FstMap;
pub use self::fstmap::FstMapBuilder;
pub use self::streamer::FstMapStreamer;
pub use self::streamer::FstMapStreamerBuilder;
pub use self::fstmerger::FstMerger;

View File

@@ -1,4 +1,3 @@
pub mod fstmap;
mod skip;
pub mod stacker;

View File

@@ -1,3 +1,8 @@
/*!
WORM directory abstraction.
*/
mod mmap_directory;
mod ram_directory;
mod directory;

View File

@@ -1,25 +1,27 @@
//! # Fast fields
//!
//! Fast fields are the equivalent of `DocValues` in `Lucene`.
//! Fast fields is a non-compressed column-oriented fashion storage
//! of `tantivy`.
//!
//! It is designed for the fast random access of some document
//! fields given a document id.
//!
//! `FastField` are useful when a field is required for all or most of
//! the `DocSet` : for instance for scoring, grouping, filtering, or facetting.
//!
//!
//! Fields have to be declared as `FAST` in the schema.
//! Currently only 64-bits integers (signed or unsigned) are
//! supported.
//!
//! They are stored in a bitpacked fashion so that their
//! memory usage is directly linear with the amplitude of the
//! values stored.
//!
//! Read access performance is comparable to that of an array lookup.
/*!
Fast fields is a column oriented storage storage.
It is the equivalent of `Lucene`'s `DocValues`.
Fast fields is a column-oriented fashion storage of `tantivy`.
It is designed for the fast random access of some document
fields given a document id.
`FastField` are useful when a field is required for all or most of
the `DocSet` : for instance for scoring, grouping, filtering, or facetting.
Fields have to be declared as `FAST` in the schema.
Currently only 64-bits integers (signed or unsigned) are
supported.
They are stored in a bitpacked fashion so that their
memory usage is directly linear with the amplitude of the
values stored.
Read access performance is comparable to that of an array lookup.
*/
mod reader;
mod writer;

View File

@@ -12,7 +12,7 @@ use postings::Postings;
use postings::DocSet;
use fastfield::DeleteBitSet;
use schema::{Schema, Field};
use datastruct::fstmap::FstMerger;
use termdict::FstMerger;
use fastfield::FastFieldSerializer;
use fastfield::FastFieldReader;
use store::StoreWriter;

View File

@@ -111,9 +111,10 @@ mod datastruct;
pub mod termdict;
/// Query module
pub mod query;
/// Directory module
pub mod directory;
/// Collector module
pub mod collector;
@@ -147,8 +148,7 @@ pub fn version() -> &'static str {
}
}
/// Tantivy's makes it possible to personalize when
/// the indexer should merge its segments
/// Defines tantivy's merging strategy
pub mod merge_policy {
pub use indexer::MergePolicy;
pub use indexer::LogMergePolicy;

View File

@@ -1,5 +1,5 @@
use Result;
use datastruct::fstmap::FstMapBuilder;
use termdict::FstMapBuilder;
use super::TermInfo;
use schema::Field;
use schema::FieldEntry;

View File

@@ -12,6 +12,7 @@ fn convert_fst_error(e: fst::Error) -> io::Error {
io::Error::new(io::ErrorKind::Other, e)
}
pub struct FstMapBuilder<W: Write, V: BinarySerializable> {
fst_builder: fst::MapBuilder<W>,
data: Vec<u8>,
@@ -75,15 +76,17 @@ pub struct FstMap<V: BinarySerializable> {
fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
Ok(fst::Map::from(match source {
ReadOnlySource::Anonymous(data) => {
Fst::from_shared_bytes(data.data, data.start, data.len)
.map_err(convert_fst_error)?
}
ReadOnlySource::Mmap(mmap_readonly) => {
Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)?
}
}))
let fst = match source {
ReadOnlySource::Anonymous(data) => {
Fst::from_shared_bytes(data.data, data.start, data.len)
.map_err(convert_fst_error)?
}
ReadOnlySource::Mmap(mmap_readonly) => {
Fst::from_mmap(mmap_readonly)
.map_err(convert_fst_error)?
}
};
Ok(fst::Map::from(fst))
}
impl<V> FstMap<V>
@@ -106,9 +109,6 @@ impl<V> FstMap<V>
}
/// In the `FstMap`, the dictionary itself associated
/// each key `&[u8]` to a `u64` that is in fact the address
/// of the value object in a data array.
///
/// This method deserialize this object, and returns it.
pub(crate) fn read_value(&self, offset: u64) -> io::Result<V> {
@@ -125,7 +125,7 @@ impl<V> FstMap<V>
self.read_value(offset)
.expect("The fst is corrupted. Failed to deserialize a value.")
})
}
}
/// Returns a stream of all the sorted terms.

183
src/termdict/fstmap.rs.bk Normal file
View File

@@ -0,0 +1,183 @@
use std::io::{self, Write};
use fst;
use fst::raw::Fst;
use super::{FstMapStreamerBuilder, FstMapStreamer};
use directory::ReadOnlySource;
use common::BinarySerializable;
use std::marker::PhantomData;
use schema::{Field, Term};
fn convert_fst_error(e: fst::Error) -> io::Error {
io::Error::new(io::ErrorKind::Other, e)
}
pub struct FstMapBuilder<W: Write, V: BinarySerializable> {
fst_builder: fst::MapBuilder<W>,
data: Vec<u8>,
_phantom_: PhantomData<V>,
}
impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
pub fn new(w: W) -> io::Result<FstMapBuilder<W, V>> {
let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?;
Ok(FstMapBuilder {
fst_builder: fst_builder,
data: Vec::new(),
_phantom_: PhantomData,
})
}
/// Horribly unsafe, nobody should ever do that... except me :)
///
/// If used, it must be used by systematically alternating calls
/// to insert_key and insert_value.
///
/// TODO see if I can bend Rust typesystem to enforce that
/// in a nice way.
pub fn insert_key(&mut self, key: &[u8]) -> io::Result<()> {
self.fst_builder
.insert(key, self.data.len() as u64)
.map_err(convert_fst_error)?;
Ok(())
}
/// Horribly unsafe, nobody should ever do that... except me :)
pub fn insert_value(&mut self, value: &V) -> io::Result<()> {
value.serialize(&mut self.data)?;
Ok(())
}
#[cfg(test)]
pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()> {
self.fst_builder
.insert(key, self.data.len() as u64)
.map_err(convert_fst_error)?;
value.serialize(&mut self.data)?;
Ok(())
}
pub fn finish(self) -> io::Result<W> {
let mut file = self.fst_builder.into_inner().map_err(convert_fst_error)?;
let footer_size = self.data.len() as u32;
file.write_all(&self.data)?;
(footer_size as u32).serialize(&mut file)?;
file.flush()?;
Ok(file)
}
}
pub struct FstMap<V: BinarySerializable> {
fst_index: fst::Map,
values_mmap: ReadOnlySource,
_phantom_: PhantomData<V>,
}
fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
Ok(fst::Map::from(match source {
ReadOnlySource::Anonymous(data) => {
Fst::from_shared_bytes(data.data, data.start, data.len)
.map_err(convert_fst_error)?
}
ReadOnlySource::Mmap(mmap_readonly) => {
Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)?
}
}))
}
impl<V> FstMap<V>
where V: BinarySerializable
{
pub fn from_source(source: ReadOnlySource) -> io::Result<FstMap<V>> {
let total_len = source.len();
let length_offset = total_len - 4;
let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..];
let footer_size = u32::deserialize(&mut split_len_buffer)? as usize;
let split_len = length_offset - footer_size;
let fst_source = source.slice(0, split_len);
let values_source = source.slice(split_len, length_offset);
let fst_index = open_fst_index(fst_source)?;
Ok(FstMap {
fst_index: fst_index,
values_mmap: values_source,
_phantom_: PhantomData,
})
}
/// In the `FstMap`, the dictionary itself associated
/// each key `&[u8]` to a `u64` that is in fact the address
/// of the value object in a data array.
///
/// This method deserialize this object, and returns it.
pub(crate) fn read_value(&self, offset: u64) -> io::Result<V> {
let buffer = self.values_mmap.as_slice();
let mut cursor = &buffer[(offset as usize)..];
V::deserialize(&mut cursor)
}
/// Returns, if present the value associated to a given key.
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<V> {
self.fst_index
.get(key)
.map(|offset| self.read_value(offset).expect("The fst is corrupted. Failed to deserialize a value."))
}
/// Returns a stream of all the sorted terms.
pub fn stream(&self) -> FstMapStreamer<V> {
self.range().into_stream()
}
/// Returns a stream of all the sorted terms in the given field.
pub fn stream_field(&self, field: Field) -> FstMapStreamer<V> {
let start_term = Term::from_field_text(field, "");
let stop_term = Term::from_field_text(Field(field.0 + 1), "");
self.range()
.ge(start_term.as_slice())
.lt(stop_term.as_slice())
.into_stream()
}
/// Returns a range builder, to stream all of the terms
/// within an interval.
pub fn range(&self) -> FstMapStreamerBuilder<V> {
FstMapStreamerBuilder::new(self, self.fst_index.range())
}
}
#[cfg(test)]
mod tests {
use super::*;
use directory::{RAMDirectory, Directory};
use std::path::PathBuf;
use fst::Streamer;
#[test]
fn test_fstmap() {
let mut directory = RAMDirectory::create();
let path = PathBuf::from("fstmap");
{
let write = directory.open_write(&path).unwrap();
let mut fstmap_builder = FstMapBuilder::new(write).unwrap();
fstmap_builder.insert("abc".as_bytes(), &34u32).unwrap();
fstmap_builder.insert("abcd".as_bytes(), &346u32).unwrap();
fstmap_builder.finish().unwrap();
}
let source = directory.open_read(&path).unwrap();
let fstmap: FstMap<u32> = FstMap::from_source(source).unwrap();
assert_eq!(fstmap.get("abc"), Some(34u32));
assert_eq!(fstmap.get("abcd"), Some(346u32));
let mut stream = fstmap.stream();
assert_eq!(stream.next().unwrap(), "abc".as_bytes());
assert_eq!(stream.key(), "abc".as_bytes());
assert_eq!(stream.value(), 34u32);
assert_eq!(stream.next().unwrap(), "abcd".as_bytes());
assert_eq!(stream.key(), "abcd".as_bytes());
assert_eq!(stream.value(), 346u32);
assert!(!stream.advance());
}
}

26
src/termdict/mod.rs Normal file
View File

@@ -0,0 +1,26 @@
/*!
The term dictionary contains all of the terms in
`tantivy index` in a sorted manner.
It is implemented as a wrapper of the `Fst` crate in order
to add a value type.
A finite state transducer itself associates
each term `&[u8]` to a `u64` that is in fact an address
in a buffer. The value is then accessible via
deserializing the value at this address.
Keys (`&[u8]`) in this datastructure are
sorted.
*/
mod fstmap;
mod streamer;
mod fstmerger;
pub use self::fstmap::FstMap;
pub(crate) use self::fstmap::FstMapBuilder;
pub use self::streamer::FstMapStreamer;
pub use self::streamer::FstMapStreamerBuilder;
pub use self::fstmerger::FstMerger;