Make it possible to stream the terms matching an Automaton (#297)

* rustfmt and some English grammar

* sort cargo.toml crates

* WIP: something to show

* Remove example for now

* Implement desired method

* Resolving Generic Type Arguments

* Resolve Generic Types

* Banging around on the tests

* DANGER! Change unsafe usage based on compiler warnings

* Unscrew up my rebase

* Clean Up Type Spam

Default Types FTW

* typo

* better variable names

* Remove Duplicate Levenshtein crate
This commit is contained in:
Dru Sellers
2018-05-11 14:41:14 -05:00
committed by Paul Masurel
parent 82d87416c2
commit 08d2cc6c7b
7 changed files with 87 additions and 19 deletions

View File

@@ -17,7 +17,7 @@ byteorder = "1.0"
lazy_static = "0.2.1"
tinysegmenter = "0.1.0"
regex = "0.2"
fst = {version="0.2", default-features=false}
fst = {version="0.3", default-features=false}
atomicwrites = {version="0.1", optional=true}
tempfile = "2.1"
log = "0.3.6"
@@ -28,6 +28,7 @@ serde_derive = "1.0"
serde_json = "1.0"
num_cpus = "1.2"
itertools = "0.5.9"
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
lz4 = "1.20"
bit-set = "0.4.0"
uuid = { version = "0.6", features = ["v4", "serde"] }

View File

@@ -40,9 +40,11 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadE
// instead.
return Ok(None);
}
MmapReadOnly::open(&file)
.map(Some)
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
unsafe {
MmapReadOnly::open(&file)
.map(Some)
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
}
}
#[derive(Default, Clone, Debug, Serialize, Deserialize)]

View File

@@ -42,7 +42,7 @@ impl ReadOnlySource {
pub fn as_slice(&self) -> &[u8] {
match *self {
#[cfg(feature = "mmap")]
ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { mmap_read_only.as_slice() },
ReadOnlySource::Mmap(ref mmap_read_only) => mmap_read_only.as_slice(),
ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
}
}

View File

@@ -139,6 +139,7 @@ extern crate fst;
extern crate futures;
extern crate futures_cpupool;
extern crate itertools;
extern crate levenshtein_automata;
extern crate lz4;
extern crate num_cpus;
extern crate owning_ref;

View File

@@ -1,7 +1,7 @@
/*!
The term dictionary is one of the key datastructure of
The term dictionary is one of the key data structures of
tantivy. It associates sorted `terms` to a `TermInfo` struct
that serves as an address in their respective posting list.
that serves as an address to their respective posting list.
The term dictionary API makes it possible to iterate through
a range of keys in a sorted manner.
@@ -9,12 +9,12 @@ a range of keys in a sorted manner.
# Implementations
There is currently two implementations of the term dictionary.
There are currently two implementations of the term dictionary.
## Default implementation : `fstdict`
The default one relies heavily on the `fst` crate.
It associate each terms `&[u8]` representation to a `u64`
It associates each term's `&[u8]` representation to a `u64`
that is in fact an address in a buffer. The value is then accessible
via deserializing the value at this address.
@@ -419,4 +419,46 @@ mod tests {
}
}
#[test]
fn test_automaton_search() {
    use levenshtein_automata::LevenshteinAutomatonBuilder;

    // Terms are inserted in the order listed here.
    const COUNTRIES: [&'static str; 7] = [
        "San Marino",
        "Serbia",
        "Slovakia",
        "Slovenia",
        "Spain",
        "Sweden",
        "Switzerland",
    ];

    let mut directory = RAMDirectory::create();
    let path = PathBuf::from("TermDictionary");

    // Write the dictionary inside its own scope so the builder (and the
    // writer it owns) is gone before the file is opened for reading.
    {
        let write = directory.open_write(&path).unwrap();
        let mut dict_builder = TermDictionaryBuilder::new(write, FieldType::Str(TEXT)).unwrap();
        for country in &COUNTRIES {
            dict_builder
                .insert(country.as_bytes(), &make_term_info(0u64))
                .unwrap();
        }
        dict_builder.finish().unwrap();
    }

    let term_dict: TermDictionary = TermDictionary::from_source(directory.open_read(&path).unwrap());

    // Build a DFA for the misspelled query "Spaen"; the assertions below
    // expect "Spain" to be the one and only term it accepts.
    let dfa_builder = LevenshteinAutomatonBuilder::new(2, true);
    let dfa = dfa_builder.build_dfa("Spaen");
    let mut matches = term_dict.search(dfa).into_stream();

    // First (and only) hit.
    assert!(matches.advance());
    assert_eq!("Spain".as_bytes(), matches.key());
    // The stream must be exhausted afterwards.
    assert!(!matches.advance());
}
}

View File

@@ -1,18 +1,26 @@
use super::TermDictionary;
use fst::automaton::AlwaysMatch;
use fst::map::{Stream, StreamBuilder};
use fst::Automaton;
use fst::{IntoStreamer, Streamer};
use postings::TermInfo;
use termdict::TermOrdinal;
/// `TermStreamerBuilder` is an helper object used to define
/// `TermStreamerBuilder` is a helper object used to define
/// a range of terms that should be streamed.
pub struct TermStreamerBuilder<'a> {
pub struct TermStreamerBuilder<'a, A = AlwaysMatch>
where
A: Automaton,
{
fst_map: &'a TermDictionary,
stream_builder: StreamBuilder<'a>,
stream_builder: StreamBuilder<'a, A>,
}
impl<'a> TermStreamerBuilder<'a> {
pub(crate) fn new(fst_map: &'a TermDictionary, stream_builder: StreamBuilder<'a>) -> Self {
impl<'a, A> TermStreamerBuilder<'a, A>
where
A: Automaton,
{
pub(crate) fn new(fst_map: &'a TermDictionary, stream_builder: StreamBuilder<'a, A>) -> Self {
TermStreamerBuilder {
fst_map,
stream_builder,
@@ -45,7 +53,7 @@ impl<'a> TermStreamerBuilder<'a> {
/// Creates the stream corresponding to the range
/// of terms defined using the `TermStreamerBuilder`.
pub fn into_stream(self) -> TermStreamer<'a> {
pub fn into_stream(self) -> TermStreamer<'a, A> {
TermStreamer {
fst_map: self.fst_map,
stream: self.stream_builder.into_stream(),
@@ -58,15 +66,21 @@ impl<'a> TermStreamerBuilder<'a> {
/// `TermStreamer` acts as a cursor over a range of terms of a segment.
/// Terms are guaranteed to be sorted.
pub struct TermStreamer<'a> {
pub struct TermStreamer<'a, A = AlwaysMatch>
where
A: Automaton,
{
fst_map: &'a TermDictionary,
stream: Stream<'a>,
stream: Stream<'a, A>,
term_ord: TermOrdinal,
current_key: Vec<u8>,
current_value: TermInfo,
}
impl<'a> TermStreamer<'a> {
impl<'a, A> TermStreamer<'a, A>
where
A: Automaton,
{
/// Advances the stream to the next item.
/// Before the first call to `.advance()`, the stream
/// is in an uninitialized state.

View File

@@ -5,6 +5,7 @@ use common::CountingWriter;
use directory::ReadOnlySource;
use fst;
use fst::raw::Fst;
use fst::Automaton;
use postings::TermInfo;
use schema::FieldType;
use std::io::{self, Write};
@@ -101,7 +102,7 @@ fn open_fst_index(source: ReadOnlySource) -> fst::Map {
/// The term dictionary contains all of the terms in
/// `tantivy index` in a sorted manner.
///
/// The `Fst` crate is used to assoicated terms to their
/// The `Fst` crate is used to associate terms to their
/// respective `TermOrdinal`. The `TermInfoStore` then makes it
/// possible to fetch the associated `TermInfo`.
pub struct TermDictionary {
@@ -199,4 +200,11 @@ impl TermDictionary {
pub fn stream<'a>(&'a self) -> TermStreamer<'a> {
// Take the builder returned by `range()` and convert it straight into a
// streamer, without configuring it any further.
self.range().into_stream()
}
/// Returns a `TermStreamerBuilder` driven by the given `automaton`.
///
/// The automaton is handed to the underlying fst index's `search`, and the
/// resulting stream builder is wrapped together with this dictionary.
pub fn search<'a, A: Automaton>(&'a self, automaton: A) -> TermStreamerBuilder<'a, A> {
    TermStreamerBuilder::new(self, self.fst_index.search(automaton))
}
}