mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-05 10:00:41 +00:00
Make it possible to stream the terms matching an Automaton (#297)
* rustfmt and some English grammar * sort cargo.toml crates * WIP: something to show * Remove example for now * Implement desired method * Resolving Generic Type Arguments * Resolve Generic Types * Banging around on the tests * DANGER! Change unsafe usage based on compiler warnings * Unscrew up my rebase * Clean Up Type Spam Default Types FTW * typo * better variable names * Remove Duplicate Levenshtein crate
This commit is contained in:
committed by
Paul Masurel
parent
82d87416c2
commit
08d2cc6c7b
@@ -17,7 +17,7 @@ byteorder = "1.0"
|
||||
lazy_static = "0.2.1"
|
||||
tinysegmenter = "0.1.0"
|
||||
regex = "0.2"
|
||||
fst = {version="0.2", default-features=false}
|
||||
fst = {version="0.3", default-features=false}
|
||||
atomicwrites = {version="0.1", optional=true}
|
||||
tempfile = "2.1"
|
||||
log = "0.3.6"
|
||||
@@ -28,6 +28,7 @@ serde_derive = "1.0"
|
||||
serde_json = "1.0"
|
||||
num_cpus = "1.2"
|
||||
itertools = "0.5.9"
|
||||
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
|
||||
lz4 = "1.20"
|
||||
bit-set = "0.4.0"
|
||||
uuid = { version = "0.6", features = ["v4", "serde"] }
|
||||
|
||||
@@ -40,9 +40,11 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadE
|
||||
// instead.
|
||||
return Ok(None);
|
||||
}
|
||||
MmapReadOnly::open(&file)
|
||||
.map(Some)
|
||||
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
|
||||
unsafe {
|
||||
MmapReadOnly::open(&file)
|
||||
.map(Some)
|
||||
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Debug, Serialize, Deserialize)]
|
||||
|
||||
@@ -42,7 +42,7 @@ impl ReadOnlySource {
|
||||
pub fn as_slice(&self) -> &[u8] {
|
||||
match *self {
|
||||
#[cfg(feature = "mmap")]
|
||||
ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { mmap_read_only.as_slice() },
|
||||
ReadOnlySource::Mmap(ref mmap_read_only) => mmap_read_only.as_slice(),
|
||||
ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -139,6 +139,7 @@ extern crate fst;
|
||||
extern crate futures;
|
||||
extern crate futures_cpupool;
|
||||
extern crate itertools;
|
||||
extern crate levenshtein_automata;
|
||||
extern crate lz4;
|
||||
extern crate num_cpus;
|
||||
extern crate owning_ref;
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
/*!
|
||||
The term dictionary is one of the key datastructure of
|
||||
The term dictionary is one of the key data structures of
|
||||
tantivy. It associates sorted `terms` to a `TermInfo` struct
|
||||
that serves as an address in their respective posting list.
|
||||
that serves as an address to their respective posting list.
|
||||
|
||||
The term dictionary API makes it possible to iterate through
|
||||
a range of keys in a sorted manner.
|
||||
@@ -9,12 +9,12 @@ a range of keys in a sorted manner.
|
||||
|
||||
# Implementations
|
||||
|
||||
There is currently two implementations of the term dictionary.
|
||||
There are currently two implementations of the term dictionary.
|
||||
|
||||
## Default implementation : `fstdict`
|
||||
|
||||
The default one relies heavily on the `fst` crate.
|
||||
It associate each terms `&[u8]` representation to a `u64`
|
||||
It associates each term's `&[u8]` representation to a `u64`
|
||||
that is in fact an address in a buffer. The value is then accessible
|
||||
via deserializing the value at this address.
|
||||
|
||||
@@ -419,4 +419,46 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_automaton_search() {
|
||||
use levenshtein_automata::LevenshteinAutomatonBuilder;
|
||||
|
||||
const COUNTRIES: [&'static str; 7] = [
|
||||
"San Marino",
|
||||
"Serbia",
|
||||
"Slovakia",
|
||||
"Slovenia",
|
||||
"Spain",
|
||||
"Sweden",
|
||||
"Switzerland",
|
||||
];
|
||||
|
||||
let mut directory = RAMDirectory::create();
|
||||
let path = PathBuf::from("TermDictionary");
|
||||
{
|
||||
let write = directory.open_write(&path).unwrap();
|
||||
let field_type = FieldType::Str(TEXT);
|
||||
let mut term_dictionary_builder =
|
||||
TermDictionaryBuilder::new(write, field_type).unwrap();
|
||||
for term in COUNTRIES.iter() {
|
||||
term_dictionary_builder
|
||||
.insert(term.as_bytes(), &make_term_info(0u64))
|
||||
.unwrap();
|
||||
}
|
||||
term_dictionary_builder.finish().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
let term_dict: TermDictionary = TermDictionary::from_source(source);
|
||||
|
||||
// We can now build an entire dfa.
|
||||
let lev_automaton_builder = LevenshteinAutomatonBuilder::new(2, true);
|
||||
let automaton = lev_automaton_builder.build_dfa("Spaen");
|
||||
|
||||
let mut range = term_dict.search(automaton).into_stream();
|
||||
|
||||
// get the first finding
|
||||
assert!(range.advance());
|
||||
assert_eq!("Spain".as_bytes(), range.key());
|
||||
assert!(!range.advance());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,18 +1,26 @@
|
||||
use super::TermDictionary;
|
||||
use fst::automaton::AlwaysMatch;
|
||||
use fst::map::{Stream, StreamBuilder};
|
||||
use fst::Automaton;
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use postings::TermInfo;
|
||||
use termdict::TermOrdinal;
|
||||
|
||||
/// `TermStreamerBuilder` is an helper object used to define
|
||||
/// `TermStreamerBuilder` is a helper object used to define
|
||||
/// a range of terms that should be streamed.
|
||||
pub struct TermStreamerBuilder<'a> {
|
||||
pub struct TermStreamerBuilder<'a, A = AlwaysMatch>
|
||||
where
|
||||
A: Automaton,
|
||||
{
|
||||
fst_map: &'a TermDictionary,
|
||||
stream_builder: StreamBuilder<'a>,
|
||||
stream_builder: StreamBuilder<'a, A>,
|
||||
}
|
||||
|
||||
impl<'a> TermStreamerBuilder<'a> {
|
||||
pub(crate) fn new(fst_map: &'a TermDictionary, stream_builder: StreamBuilder<'a>) -> Self {
|
||||
impl<'a, A> TermStreamerBuilder<'a, A>
|
||||
where
|
||||
A: Automaton,
|
||||
{
|
||||
pub(crate) fn new(fst_map: &'a TermDictionary, stream_builder: StreamBuilder<'a, A>) -> Self {
|
||||
TermStreamerBuilder {
|
||||
fst_map,
|
||||
stream_builder,
|
||||
@@ -45,7 +53,7 @@ impl<'a> TermStreamerBuilder<'a> {
|
||||
|
||||
/// Creates the stream corresponding to the range
|
||||
/// of terms defined using the `TermStreamerBuilder`.
|
||||
pub fn into_stream(self) -> TermStreamer<'a> {
|
||||
pub fn into_stream(self) -> TermStreamer<'a, A> {
|
||||
TermStreamer {
|
||||
fst_map: self.fst_map,
|
||||
stream: self.stream_builder.into_stream(),
|
||||
@@ -58,15 +66,21 @@ impl<'a> TermStreamerBuilder<'a> {
|
||||
|
||||
/// `TermStreamer` acts as a cursor over a range of terms of a segment.
|
||||
/// Terms are guaranteed to be sorted.
|
||||
pub struct TermStreamer<'a> {
|
||||
pub struct TermStreamer<'a, A = AlwaysMatch>
|
||||
where
|
||||
A: Automaton,
|
||||
{
|
||||
fst_map: &'a TermDictionary,
|
||||
stream: Stream<'a>,
|
||||
stream: Stream<'a, A>,
|
||||
term_ord: TermOrdinal,
|
||||
current_key: Vec<u8>,
|
||||
current_value: TermInfo,
|
||||
}
|
||||
|
||||
impl<'a> TermStreamer<'a> {
|
||||
impl<'a, A> TermStreamer<'a, A>
|
||||
where
|
||||
A: Automaton,
|
||||
{
|
||||
/// Advances the stream to the next item.
|
||||
/// Before the first call to `.advance()`, the stream
|
||||
/// is in an uninitialized state.
|
||||
|
||||
@@ -5,6 +5,7 @@ use common::CountingWriter;
|
||||
use directory::ReadOnlySource;
|
||||
use fst;
|
||||
use fst::raw::Fst;
|
||||
use fst::Automaton;
|
||||
use postings::TermInfo;
|
||||
use schema::FieldType;
|
||||
use std::io::{self, Write};
|
||||
@@ -101,7 +102,7 @@ fn open_fst_index(source: ReadOnlySource) -> fst::Map {
|
||||
/// The term dictionary contains all of the terms in
|
||||
/// `tantivy index` in a sorted manner.
|
||||
///
|
||||
/// The `Fst` crate is used to assoicated terms to their
|
||||
/// The `Fst` crate is used to associate terms to their
|
||||
/// respective `TermOrdinal`. The `TermInfoStore` then makes it
|
||||
/// possible to fetch the associated `TermInfo`.
|
||||
pub struct TermDictionary {
|
||||
@@ -199,4 +200,11 @@ impl TermDictionary {
|
||||
pub fn stream<'a>(&'a self) -> TermStreamer<'a> {
|
||||
self.range().into_stream()
|
||||
}
|
||||
|
||||
/// Returns a search builder, to stream all of the terms
|
||||
/// matched by the given automaton.
|
||||
pub fn search<'a, A: Automaton>(&'a self, automaton: A) -> TermStreamerBuilder<'a, A> {
|
||||
let stream_builder = self.fst_index.search(automaton);
|
||||
TermStreamerBuilder::<A>::new(self, stream_builder)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user