mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-25 20:50:43 +00:00
Moving to termdict
This commit is contained in:
@@ -7,7 +7,7 @@ use query::Query;
|
||||
use DocId;
|
||||
use DocAddress;
|
||||
use schema::Term;
|
||||
use datastruct::fstmap::FstMerger;
|
||||
use termdict::FstMerger;
|
||||
use std::fmt;
|
||||
use postings::TermInfo;
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ use DocId;
|
||||
use std::str;
|
||||
use std::cmp;
|
||||
use postings::TermInfo;
|
||||
use datastruct::fstmap::FstMap;
|
||||
use termdict::FstMap;
|
||||
use std::sync::Arc;
|
||||
use std::fmt;
|
||||
use schema::Field;
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
mod fstmap;
|
||||
mod streamer;
|
||||
mod fstmerger;
|
||||
|
||||
pub use self::fstmap::FstMap;
|
||||
pub use self::fstmap::FstMapBuilder;
|
||||
pub use self::streamer::FstMapStreamer;
|
||||
pub use self::streamer::FstMapStreamerBuilder;
|
||||
pub use self::fstmerger::FstMerger;
|
||||
@@ -1,4 +1,3 @@
|
||||
pub mod fstmap;
|
||||
mod skip;
|
||||
pub mod stacker;
|
||||
|
||||
|
||||
@@ -1,3 +1,8 @@
|
||||
/*!
|
||||
|
||||
WORM directory abstraction.
|
||||
|
||||
*/
|
||||
mod mmap_directory;
|
||||
mod ram_directory;
|
||||
mod directory;
|
||||
|
||||
@@ -1,25 +1,27 @@
|
||||
//! # Fast fields
|
||||
//!
|
||||
//! Fast fields are the equivalent of `DocValues` in `Lucene`.
|
||||
//! Fast fields is a non-compressed column-oriented fashion storage
|
||||
//! of `tantivy`.
|
||||
//!
|
||||
//! It is designed for the fast random access of some document
|
||||
//! fields given a document id.
|
||||
//!
|
||||
//! `FastField` are useful when a field is required for all or most of
|
||||
//! the `DocSet` : for instance for scoring, grouping, filtering, or facetting.
|
||||
//!
|
||||
//!
|
||||
//! Fields have to be declared as `FAST` in the schema.
|
||||
//! Currently only 64-bits integers (signed or unsigned) are
|
||||
//! supported.
|
||||
//!
|
||||
//! They are stored in a bitpacked fashion so that their
|
||||
//! memory usage is directly linear with the amplitude of the
|
||||
//! values stored.
|
||||
//!
|
||||
//! Read access performance is comparable to that of an array lookup.
|
||||
/*!
|
||||
Fast fields are a column-oriented storage.
|
||||
|
||||
It is the equivalent of `Lucene`'s `DocValues`.
|
||||
|
||||
Fast fields are the column-oriented storage of `tantivy`.
|
||||
|
||||
It is designed for the fast random access of some document
|
||||
fields given a document id.
|
||||
|
||||
`FastField` are useful when a field is required for all or most of
|
||||
the `DocSet`: for instance for scoring, grouping, filtering, or faceting.
|
||||
|
||||
|
||||
Fields have to be declared as `FAST` in the schema.
|
||||
Currently only 64-bit integers (signed or unsigned) are
|
||||
supported.
|
||||
|
||||
They are stored in a bitpacked fashion so that their
|
||||
memory usage is directly linear with the amplitude of the
|
||||
values stored.
|
||||
|
||||
Read access performance is comparable to that of an array lookup.
|
||||
*/
|
||||
|
||||
mod reader;
|
||||
mod writer;
|
||||
|
||||
@@ -12,7 +12,7 @@ use postings::Postings;
|
||||
use postings::DocSet;
|
||||
use fastfield::DeleteBitSet;
|
||||
use schema::{Schema, Field};
|
||||
use datastruct::fstmap::FstMerger;
|
||||
use termdict::FstMerger;
|
||||
use fastfield::FastFieldSerializer;
|
||||
use fastfield::FastFieldReader;
|
||||
use store::StoreWriter;
|
||||
|
||||
@@ -111,9 +111,10 @@ mod datastruct;
|
||||
|
||||
|
||||
|
||||
pub mod termdict;
|
||||
|
||||
/// Query module
|
||||
pub mod query;
|
||||
/// Directory module
|
||||
pub mod directory;
|
||||
/// Collector module
|
||||
pub mod collector;
|
||||
@@ -147,8 +148,7 @@ pub fn version() -> &'static str {
|
||||
}
|
||||
}
|
||||
|
||||
/// Tantivy's makes it possible to personalize when
|
||||
/// the indexer should merge its segments
|
||||
/// Defines tantivy's merging strategy
|
||||
pub mod merge_policy {
|
||||
pub use indexer::MergePolicy;
|
||||
pub use indexer::LogMergePolicy;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use Result;
|
||||
use datastruct::fstmap::FstMapBuilder;
|
||||
use termdict::FstMapBuilder;
|
||||
use super::TermInfo;
|
||||
use schema::Field;
|
||||
use schema::FieldEntry;
|
||||
|
||||
@@ -12,6 +12,7 @@ fn convert_fst_error(e: fst::Error) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, e)
|
||||
}
|
||||
|
||||
|
||||
pub struct FstMapBuilder<W: Write, V: BinarySerializable> {
|
||||
fst_builder: fst::MapBuilder<W>,
|
||||
data: Vec<u8>,
|
||||
@@ -75,15 +76,17 @@ pub struct FstMap<V: BinarySerializable> {
|
||||
|
||||
|
||||
fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
|
||||
Ok(fst::Map::from(match source {
|
||||
ReadOnlySource::Anonymous(data) => {
|
||||
Fst::from_shared_bytes(data.data, data.start, data.len)
|
||||
.map_err(convert_fst_error)?
|
||||
}
|
||||
ReadOnlySource::Mmap(mmap_readonly) => {
|
||||
Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)?
|
||||
}
|
||||
}))
|
||||
let fst = match source {
|
||||
ReadOnlySource::Anonymous(data) => {
|
||||
Fst::from_shared_bytes(data.data, data.start, data.len)
|
||||
.map_err(convert_fst_error)?
|
||||
}
|
||||
ReadOnlySource::Mmap(mmap_readonly) => {
|
||||
Fst::from_mmap(mmap_readonly)
|
||||
.map_err(convert_fst_error)?
|
||||
}
|
||||
};
|
||||
Ok(fst::Map::from(fst))
|
||||
}
|
||||
|
||||
impl<V> FstMap<V>
|
||||
@@ -106,9 +109,6 @@ impl<V> FstMap<V>
|
||||
}
|
||||
|
||||
|
||||
/// In the `FstMap`, the dictionary itself associates
|
||||
/// each key `&[u8]` to a `u64` that is in fact the address
|
||||
/// of the value object in a data array.
|
||||
///
|
||||
/// This method deserializes this object and returns it.
|
||||
pub(crate) fn read_value(&self, offset: u64) -> io::Result<V> {
|
||||
@@ -125,7 +125,7 @@ impl<V> FstMap<V>
|
||||
self.read_value(offset)
|
||||
.expect("The fst is corrupted. Failed to deserialize a value.")
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Returns a stream of all the sorted terms.
|
||||
183
src/termdict/fstmap.rs.bk
Normal file
183
src/termdict/fstmap.rs.bk
Normal file
@@ -0,0 +1,183 @@
|
||||
use std::io::{self, Write};
|
||||
use fst;
|
||||
use fst::raw::Fst;
|
||||
use super::{FstMapStreamerBuilder, FstMapStreamer};
|
||||
use directory::ReadOnlySource;
|
||||
use common::BinarySerializable;
|
||||
use std::marker::PhantomData;
|
||||
use schema::{Field, Term};
|
||||
|
||||
|
||||
fn convert_fst_error(e: fst::Error) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, e)
|
||||
}
|
||||
|
||||
pub struct FstMapBuilder<W: Write, V: BinarySerializable> {
|
||||
fst_builder: fst::MapBuilder<W>,
|
||||
data: Vec<u8>,
|
||||
_phantom_: PhantomData<V>,
|
||||
}
|
||||
|
||||
impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
|
||||
pub fn new(w: W) -> io::Result<FstMapBuilder<W, V>> {
|
||||
let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?;
|
||||
Ok(FstMapBuilder {
|
||||
fst_builder: fst_builder,
|
||||
data: Vec::new(),
|
||||
_phantom_: PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
/// Horribly unsafe, nobody should ever do that... except me :)
|
||||
///
|
||||
/// If used, it must be used by systematically alternating calls
|
||||
/// to insert_key and insert_value.
|
||||
///
|
||||
/// TODO see if I can bend Rust typesystem to enforce that
|
||||
/// in a nice way.
|
||||
pub fn insert_key(&mut self, key: &[u8]) -> io::Result<()> {
|
||||
self.fst_builder
|
||||
.insert(key, self.data.len() as u64)
|
||||
.map_err(convert_fst_error)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Horribly unsafe, nobody should ever do that... except me :)
|
||||
pub fn insert_value(&mut self, value: &V) -> io::Result<()> {
|
||||
value.serialize(&mut self.data)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()> {
|
||||
self.fst_builder
|
||||
.insert(key, self.data.len() as u64)
|
||||
.map_err(convert_fst_error)?;
|
||||
value.serialize(&mut self.data)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn finish(self) -> io::Result<W> {
|
||||
let mut file = self.fst_builder.into_inner().map_err(convert_fst_error)?;
|
||||
let footer_size = self.data.len() as u32;
|
||||
file.write_all(&self.data)?;
|
||||
(footer_size as u32).serialize(&mut file)?;
|
||||
file.flush()?;
|
||||
Ok(file)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FstMap<V: BinarySerializable> {
|
||||
fst_index: fst::Map,
|
||||
values_mmap: ReadOnlySource,
|
||||
_phantom_: PhantomData<V>,
|
||||
}
|
||||
|
||||
|
||||
fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
|
||||
Ok(fst::Map::from(match source {
|
||||
ReadOnlySource::Anonymous(data) => {
|
||||
Fst::from_shared_bytes(data.data, data.start, data.len)
|
||||
.map_err(convert_fst_error)?
|
||||
}
|
||||
ReadOnlySource::Mmap(mmap_readonly) => {
|
||||
Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)?
|
||||
}
|
||||
}))
|
||||
}
|
||||
|
||||
impl<V> FstMap<V>
|
||||
where V: BinarySerializable
|
||||
{
|
||||
pub fn from_source(source: ReadOnlySource) -> io::Result<FstMap<V>> {
|
||||
let total_len = source.len();
|
||||
let length_offset = total_len - 4;
|
||||
let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..];
|
||||
let footer_size = u32::deserialize(&mut split_len_buffer)? as usize;
|
||||
let split_len = length_offset - footer_size;
|
||||
let fst_source = source.slice(0, split_len);
|
||||
let values_source = source.slice(split_len, length_offset);
|
||||
let fst_index = open_fst_index(fst_source)?;
|
||||
Ok(FstMap {
|
||||
fst_index: fst_index,
|
||||
values_mmap: values_source,
|
||||
_phantom_: PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/// In the `FstMap`, the dictionary itself associated
|
||||
/// each key `&[u8]` to a `u64` that is in fact the address
|
||||
/// of the value object in a data array.
|
||||
///
|
||||
/// This method deserialize this object, and returns it.
|
||||
pub(crate) fn read_value(&self, offset: u64) -> io::Result<V> {
|
||||
let buffer = self.values_mmap.as_slice();
|
||||
let mut cursor = &buffer[(offset as usize)..];
|
||||
V::deserialize(&mut cursor)
|
||||
}
|
||||
|
||||
/// Returns, if present the value associated to a given key.
|
||||
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<V> {
|
||||
self.fst_index
|
||||
.get(key)
|
||||
.map(|offset| self.read_value(offset).expect("The fst is corrupted. Failed to deserialize a value."))
|
||||
}
|
||||
|
||||
|
||||
/// Returns a stream of all the sorted terms.
|
||||
pub fn stream(&self) -> FstMapStreamer<V> {
|
||||
self.range().into_stream()
|
||||
}
|
||||
|
||||
|
||||
/// Returns a stream of all the sorted terms in the given field.
|
||||
pub fn stream_field(&self, field: Field) -> FstMapStreamer<V> {
|
||||
let start_term = Term::from_field_text(field, "");
|
||||
let stop_term = Term::from_field_text(Field(field.0 + 1), "");
|
||||
self.range()
|
||||
.ge(start_term.as_slice())
|
||||
.lt(stop_term.as_slice())
|
||||
.into_stream()
|
||||
}
|
||||
|
||||
/// Returns a range builder, to stream all of the terms
|
||||
/// within an interval.
|
||||
pub fn range(&self) -> FstMapStreamerBuilder<V> {
|
||||
FstMapStreamerBuilder::new(self, self.fst_index.range())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use directory::{RAMDirectory, Directory};
    use std::path::PathBuf;
    use fst::Streamer;

    /// Writes two entries into a RAM directory, reopens the file as an
    /// `FstMap`, and checks both point lookups and ordered streaming.
    #[test]
    fn test_fstmap() {
        let mut ram_directory = RAMDirectory::create();
        let fstmap_path = PathBuf::from("fstmap");
        {
            // Build phase: the keys are inserted in sorted order.
            let writer = ram_directory.open_write(&fstmap_path).unwrap();
            let mut builder = FstMapBuilder::new(writer).unwrap();
            let entries = [("abc", 34u32), ("abcd", 346u32)];
            for &(key, value) in entries.iter() {
                builder.insert(key.as_bytes(), &value).unwrap();
            }
            builder.finish().unwrap();
        }

        // Read phase: point lookups.
        let read_source = ram_directory.open_read(&fstmap_path).unwrap();
        let fstmap: FstMap<u32> = FstMap::from_source(read_source).unwrap();
        assert_eq!(fstmap.get("abc"), Some(34u32));
        assert_eq!(fstmap.get("abcd"), Some(346u32));

        // Read phase: the stream yields the entries in key order.
        let mut term_stream = fstmap.stream();
        assert_eq!(term_stream.next().unwrap(), "abc".as_bytes());
        assert_eq!(term_stream.key(), "abc".as_bytes());
        assert_eq!(term_stream.value(), 34u32);
        assert_eq!(term_stream.next().unwrap(), "abcd".as_bytes());
        assert_eq!(term_stream.key(), "abcd".as_bytes());
        assert_eq!(term_stream.value(), 346u32);
        assert!(!term_stream.advance());
    }
}
|
||||
26
src/termdict/mod.rs
Normal file
26
src/termdict/mod.rs
Normal file
@@ -0,0 +1,26 @@
|
||||
/*!
|
||||
The term dictionary contains all of the terms in
|
||||
a `tantivy` index, in a sorted manner.
|
||||
|
||||
It is implemented as a wrapper around the `fst` crate in order
|
||||
to add a value type.
|
||||
|
||||
A finite state transducer itself associates
|
||||
each term `&[u8]` to a `u64` that is in fact an address
|
||||
in a buffer. The value is then accessible via
|
||||
deserializing the value at this address.
|
||||
|
||||
Keys (`&[u8]`) in this data structure are
|
||||
sorted.
|
||||
|
||||
*/
|
||||
|
||||
mod fstmap;
|
||||
mod streamer;
|
||||
mod fstmerger;
|
||||
|
||||
pub use self::fstmap::FstMap;
|
||||
pub(crate) use self::fstmap::FstMapBuilder;
|
||||
pub use self::streamer::FstMapStreamer;
|
||||
pub use self::streamer::FstMapStreamerBuilder;
|
||||
pub use self::fstmerger::FstMerger;
|
||||
Reference in New Issue
Block a user