mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-01 16:10:42 +00:00
Cargo fmt
This commit is contained in:
@@ -1,7 +1,29 @@
|
||||
/*!
|
||||
The term dictionary main role is to associate the sorted [`Term`s](../struct.Term.html) to
|
||||
a [`TermInfo`](../postings/struct.TermInfo.html) struct that contains some meta-information
|
||||
about the term.
|
||||
|
||||
Internally, the term dictionary relies on the `fst` crate to store
|
||||
a sorted mapping that associate each term to its rank in the lexicographical order.
|
||||
For instance, in a dictionary containing the sorted terms "abba", "bjork", "blur" and "donovan",
|
||||
the `TermOrdinal` are respectively `0`, `1`, `2`, and `3`.
|
||||
|
||||
For `u64`-terms, tantivy explicitely uses a `BigEndian` representation to ensure that the
|
||||
lexicographical order matches the natural order of integers.
|
||||
|
||||
`i64`-terms are transformed to `u64` using a continuous mapping `val ⟶ val - i64::min_value()`
|
||||
and then treated as a `u64`.
|
||||
|
||||
`f64`-terms are transformed to `u64` using a mapping that preserve order, and are then treated
|
||||
as `u64`.
|
||||
|
||||
A second datastructure makes it possible to access a [`TermInfo`](../postings/struct.TermInfo.html).
|
||||
*/
|
||||
|
||||
use tantivy_fst::automaton::AlwaysMatch;
|
||||
|
||||
mod fst_termdict;
|
||||
// mod traits;
|
||||
use fst_termdict as termdict;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
@@ -11,12 +33,12 @@ pub type TermOrdinal = u64;
|
||||
|
||||
/// The term dictionary contains all of the terms in
|
||||
/// `tantivy index` in a sorted manner.
|
||||
pub type TermDictionary = self::fst_termdict::TermDictionary;
|
||||
pub type TermDictionary = self::termdict::TermDictionary;
|
||||
|
||||
/// Builder for the new term dictionary.
|
||||
///
|
||||
/// Inserting must be done in the order of the `keys`.
|
||||
pub type TermDictionaryBuilder<W> = self::fst_termdict::TermDictionaryBuilder<W>;
|
||||
pub type TermDictionaryBuilder<W> = self::termdict::TermDictionaryBuilder<W>;
|
||||
|
||||
/// Given a list of sorted term streams,
|
||||
/// returns an iterator over sorted unique terms.
|
||||
@@ -25,8 +47,8 @@ pub type TermDictionaryBuilder<W> = self::fst_termdict::TermDictionaryBuilder<W>
|
||||
/// - the term
|
||||
/// - a slice with the ordinal of the segments containing
|
||||
/// the terms.
|
||||
pub type TermMerger<'a> = self::fst_termdict::TermMerger<'a>;
|
||||
pub type TermMerger<'a> = self::termdict::TermMerger<'a>;
|
||||
|
||||
/// `TermStreamer` acts as a cursor over a range of terms of a segment.
|
||||
/// Terms are guaranteed to be sorted.
|
||||
pub type TermStreamer<'a, A = AlwaysMatch> = self::fst_termdict::TermStreamer<'a, A>;
|
||||
pub type TermStreamer<'a, A = AlwaysMatch> = self::termdict::TermStreamer<'a, A>;
|
||||
|
||||
@@ -1,421 +1,420 @@
|
||||
use super::{TermDictionary, TermDictionaryBuilder, TermStreamer};
|
||||
use crate::core::Index;
|
||||
use crate::directory::{Directory, FileSlice, RAMDirectory};
|
||||
use crate::postings::TermInfo;
|
||||
use crate::schema::{Schema, TEXT};
|
||||
use std::path::PathBuf;
|
||||
use std::str;
|
||||
|
||||
use super::{TermDictionary, TermDictionaryBuilder, TermStreamer};
|
||||
use crate::core::Index;
|
||||
use crate::directory::{Directory, FileSlice, RAMDirectory};
|
||||
use crate::postings::TermInfo;
|
||||
use crate::schema::{Schema, TEXT};
|
||||
use std::path::PathBuf;
|
||||
use std::str;
|
||||
const BLOCK_SIZE: usize = 1_500;
|
||||
|
||||
const BLOCK_SIZE: usize = 1_500;
|
||||
|
||||
fn make_term_info(term_ord: u64) -> TermInfo {
|
||||
let offset = |term_ord: u64| term_ord * 100 + term_ord * term_ord;
|
||||
TermInfo {
|
||||
doc_freq: term_ord as u32,
|
||||
postings_start_offset: offset(term_ord),
|
||||
postings_stop_offset: offset(term_ord + 1),
|
||||
positions_idx: offset(term_ord) * 2u64,
|
||||
}
|
||||
fn make_term_info(term_ord: u64) -> TermInfo {
|
||||
let offset = |term_ord: u64| term_ord * 100 + term_ord * term_ord;
|
||||
TermInfo {
|
||||
doc_freq: term_ord as u32,
|
||||
postings_start_offset: offset(term_ord),
|
||||
postings_stop_offset: offset(term_ord + 1),
|
||||
positions_idx: offset(term_ord) * 2u64,
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_empty_term_dictionary() {
|
||||
let empty = TermDictionary::empty();
|
||||
assert!(empty.stream().next().is_none());
|
||||
#[test]
|
||||
fn test_empty_term_dictionary() {
|
||||
let empty = TermDictionary::empty();
|
||||
assert!(empty.stream().next().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_term_ordinals() -> crate::Result<()> {
|
||||
const COUNTRIES: [&'static str; 7] = [
|
||||
"San Marino",
|
||||
"Serbia",
|
||||
"Slovakia",
|
||||
"Slovenia",
|
||||
"Spain",
|
||||
"Sweden",
|
||||
"Switzerland",
|
||||
];
|
||||
let directory = RAMDirectory::create();
|
||||
let path = PathBuf::from("TermDictionary");
|
||||
{
|
||||
let write = directory.open_write(&path)?;
|
||||
let mut term_dictionary_builder = TermDictionaryBuilder::create(write)?;
|
||||
for term in COUNTRIES.iter() {
|
||||
term_dictionary_builder.insert(term.as_bytes(), &make_term_info(0u64))?;
|
||||
}
|
||||
term_dictionary_builder.finish()?;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_term_ordinals() -> crate::Result<()> {
|
||||
const COUNTRIES: [&'static str; 7] = [
|
||||
"San Marino",
|
||||
"Serbia",
|
||||
"Slovakia",
|
||||
"Slovenia",
|
||||
"Spain",
|
||||
"Sweden",
|
||||
"Switzerland",
|
||||
];
|
||||
let directory = RAMDirectory::create();
|
||||
let path = PathBuf::from("TermDictionary");
|
||||
{
|
||||
let write = directory.open_write(&path)?;
|
||||
let mut term_dictionary_builder = TermDictionaryBuilder::create(write)?;
|
||||
for term in COUNTRIES.iter() {
|
||||
term_dictionary_builder.insert(term.as_bytes(), &make_term_info(0u64))?;
|
||||
}
|
||||
term_dictionary_builder.finish()?;
|
||||
}
|
||||
let term_file = directory.open_read(&path)?;
|
||||
let term_dict: TermDictionary = TermDictionary::open(term_file)?;
|
||||
for (term_ord, term) in COUNTRIES.iter().enumerate() {
|
||||
assert_eq!(term_dict.term_ord(term).unwrap(), term_ord as u64);
|
||||
let mut bytes = vec![];
|
||||
assert!(term_dict.ord_to_term(term_ord as u64, &mut bytes));
|
||||
assert_eq!(bytes, term.as_bytes());
|
||||
}
|
||||
Ok(())
|
||||
let term_file = directory.open_read(&path)?;
|
||||
let term_dict: TermDictionary = TermDictionary::open(term_file)?;
|
||||
for (term_ord, term) in COUNTRIES.iter().enumerate() {
|
||||
assert_eq!(term_dict.term_ord(term).unwrap(), term_ord as u64);
|
||||
let mut bytes = vec![];
|
||||
assert!(term_dict.ord_to_term(term_ord as u64, &mut bytes));
|
||||
assert_eq!(bytes, term.as_bytes());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_term_dictionary_simple() -> crate::Result<()> {
|
||||
let directory = RAMDirectory::create();
|
||||
let path = PathBuf::from("TermDictionary");
|
||||
{
|
||||
let write = directory.open_write(&path)?;
|
||||
let mut term_dictionary_builder = TermDictionaryBuilder::create(write)?;
|
||||
term_dictionary_builder.insert("abc".as_bytes(), &make_term_info(34u64))?;
|
||||
term_dictionary_builder.insert("abcd".as_bytes(), &make_term_info(346u64))?;
|
||||
term_dictionary_builder.finish()?;
|
||||
}
|
||||
let file = directory.open_read(&path)?;
|
||||
let term_dict: TermDictionary = TermDictionary::open(file)?;
|
||||
assert_eq!(term_dict.get("abc").unwrap().doc_freq, 34u32);
|
||||
assert_eq!(term_dict.get("abcd").unwrap().doc_freq, 346u32);
|
||||
let mut stream = term_dict.stream();
|
||||
{
|
||||
{
|
||||
let (k, v) = stream.next().unwrap();
|
||||
assert_eq!(k.as_ref(), "abc".as_bytes());
|
||||
assert_eq!(v.doc_freq, 34u32);
|
||||
}
|
||||
assert_eq!(stream.key(), "abc".as_bytes());
|
||||
assert_eq!(stream.value().doc_freq, 34u32);
|
||||
}
|
||||
{
|
||||
{
|
||||
let (k, v) = stream.next().unwrap();
|
||||
assert_eq!(k, "abcd".as_bytes());
|
||||
assert_eq!(v.doc_freq, 346u32);
|
||||
}
|
||||
assert_eq!(stream.key(), "abcd".as_bytes());
|
||||
assert_eq!(stream.value().doc_freq, 346u32);
|
||||
}
|
||||
assert!(!stream.advance());
|
||||
Ok(())
|
||||
#[test]
|
||||
fn test_term_dictionary_simple() -> crate::Result<()> {
|
||||
let directory = RAMDirectory::create();
|
||||
let path = PathBuf::from("TermDictionary");
|
||||
{
|
||||
let write = directory.open_write(&path)?;
|
||||
let mut term_dictionary_builder = TermDictionaryBuilder::create(write)?;
|
||||
term_dictionary_builder.insert("abc".as_bytes(), &make_term_info(34u64))?;
|
||||
term_dictionary_builder.insert("abcd".as_bytes(), &make_term_info(346u64))?;
|
||||
term_dictionary_builder.finish()?;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_term_iterator() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let file = directory.open_read(&path)?;
|
||||
let term_dict: TermDictionary = TermDictionary::open(file)?;
|
||||
assert_eq!(term_dict.get("abc").unwrap().doc_freq, 34u32);
|
||||
assert_eq!(term_dict.get("abcd").unwrap().doc_freq, 346u32);
|
||||
let mut stream = term_dict.stream();
|
||||
{
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(text_field=>"a b d f"));
|
||||
index_writer.commit()?;
|
||||
index_writer.add_document(doc!(text_field=>"a b c d f"));
|
||||
index_writer.commit()?;
|
||||
index_writer.add_document(doc!(text_field => "e f"));
|
||||
index_writer.commit()?;
|
||||
let (k, v) = stream.next().unwrap();
|
||||
assert_eq!(k.as_ref(), "abc".as_bytes());
|
||||
assert_eq!(v.doc_freq, 34u32);
|
||||
}
|
||||
let searcher = index.reader()?.searcher();
|
||||
|
||||
let field_searcher = searcher.field(text_field)?;
|
||||
let mut term_it = field_searcher.terms();
|
||||
let mut term_string = String::new();
|
||||
while term_it.advance() {
|
||||
//let term = Term::from_bytes(term_it.key());
|
||||
term_string.push_str(str::from_utf8(term_it.key()).expect("test"));
|
||||
}
|
||||
assert_eq!(&*term_string, "abcdef");
|
||||
Ok(())
|
||||
assert_eq!(stream.key(), "abc".as_bytes());
|
||||
assert_eq!(stream.value().doc_freq, 34u32);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_term_dictionary_stream() -> crate::Result<()> {
|
||||
let ids: Vec<_> = (0u32..10_000u32)
|
||||
.map(|i| (format!("doc{:0>6}", i), i))
|
||||
.collect();
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
|
||||
for &(ref id, ref i) in &ids {
|
||||
term_dictionary_builder
|
||||
.insert(id.as_bytes(), &make_term_info(*i as u64))
|
||||
.unwrap();
|
||||
}
|
||||
term_dictionary_builder.finish().unwrap()
|
||||
};
|
||||
let term_file = FileSlice::from(buffer);
|
||||
let term_dictionary: TermDictionary = TermDictionary::open(term_file)?;
|
||||
{
|
||||
{
|
||||
let mut streamer = term_dictionary.stream();
|
||||
let mut i = 0;
|
||||
while let Some((streamer_k, streamer_v)) = streamer.next() {
|
||||
let &(ref key, ref v) = &ids[i];
|
||||
assert_eq!(streamer_k.as_ref(), key.as_bytes());
|
||||
assert_eq!(streamer_v, &make_term_info(*v as u64));
|
||||
i += 1;
|
||||
}
|
||||
let (k, v) = stream.next().unwrap();
|
||||
assert_eq!(k, "abcd".as_bytes());
|
||||
assert_eq!(v.doc_freq, 346u32);
|
||||
}
|
||||
|
||||
let &(ref key, ref val) = &ids[2047];
|
||||
assert_eq!(
|
||||
term_dictionary.get(key.as_bytes()),
|
||||
Some(make_term_info(*val as u64))
|
||||
);
|
||||
Ok(())
|
||||
assert_eq!(stream.key(), "abcd".as_bytes());
|
||||
assert_eq!(stream.value().doc_freq, 346u32);
|
||||
}
|
||||
assert!(!stream.advance());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_stream_high_range_prefix_suffix() -> crate::Result<()> {
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
|
||||
// term requires more than 16bits
|
||||
term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))?;
|
||||
term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2))?;
|
||||
term_dictionary_builder.insert("abr", &make_term_info(3))?;
|
||||
term_dictionary_builder.finish()?
|
||||
};
|
||||
let term_dict_file = FileSlice::from(buffer);
|
||||
let term_dictionary: TermDictionary = TermDictionary::open(term_dict_file)?;
|
||||
let mut kv_stream = term_dictionary.stream();
|
||||
assert!(kv_stream.advance());
|
||||
assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes());
|
||||
assert_eq!(kv_stream.value(), &make_term_info(1));
|
||||
assert!(kv_stream.advance());
|
||||
assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxyz".as_bytes());
|
||||
assert_eq!(kv_stream.value(), &make_term_info(2));
|
||||
assert!(kv_stream.advance());
|
||||
assert_eq!(kv_stream.key(), "abr".as_bytes());
|
||||
assert_eq!(kv_stream.value(), &make_term_info(3));
|
||||
assert!(!kv_stream.advance());
|
||||
Ok(())
|
||||
#[test]
|
||||
fn test_term_iterator() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(text_field=>"a b d f"));
|
||||
index_writer.commit()?;
|
||||
index_writer.add_document(doc!(text_field=>"a b c d f"));
|
||||
index_writer.commit()?;
|
||||
index_writer.add_document(doc!(text_field => "e f"));
|
||||
index_writer.commit()?;
|
||||
}
|
||||
let searcher = index.reader()?.searcher();
|
||||
|
||||
#[test]
|
||||
fn test_stream_range() -> crate::Result<()> {
|
||||
let ids: Vec<_> = (0u32..10_000u32)
|
||||
.map(|i| (format!("doc{:0>6}", i), i))
|
||||
.collect();
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
|
||||
for &(ref id, ref i) in &ids {
|
||||
term_dictionary_builder
|
||||
.insert(id.as_bytes(), &make_term_info(*i as u64))
|
||||
.unwrap();
|
||||
}
|
||||
term_dictionary_builder.finish().unwrap()
|
||||
};
|
||||
|
||||
let file = FileSlice::from(buffer);
|
||||
|
||||
let term_dictionary: TermDictionary = TermDictionary::open(file)?;
|
||||
{
|
||||
for i in (0..20).chain(6000..8_000) {
|
||||
let &(ref target_key, _) = &ids[i];
|
||||
let mut streamer = term_dictionary
|
||||
.range()
|
||||
.ge(target_key.as_bytes())
|
||||
.into_stream();
|
||||
for j in 0..3 {
|
||||
let (streamer_k, streamer_v) = streamer.next().unwrap();
|
||||
let &(ref key, ref v) = &ids[i + j];
|
||||
assert_eq!(str::from_utf8(streamer_k.as_ref()).unwrap(), key);
|
||||
assert_eq!(streamer_v.doc_freq, *v);
|
||||
assert_eq!(streamer_v, &make_term_info(*v as u64));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
for i in (0..20).chain(BLOCK_SIZE - 10..BLOCK_SIZE + 10) {
|
||||
let &(ref target_key, _) = &ids[i];
|
||||
let mut streamer = term_dictionary
|
||||
.range()
|
||||
.gt(target_key.as_bytes())
|
||||
.into_stream();
|
||||
for j in 0..3 {
|
||||
let (streamer_k, streamer_v) = streamer.next().unwrap();
|
||||
let &(ref key, ref v) = &ids[i + j + 1];
|
||||
assert_eq!(streamer_k.as_ref(), key.as_bytes());
|
||||
assert_eq!(streamer_v.doc_freq, *v);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
for i in (0..20).chain(BLOCK_SIZE - 10..BLOCK_SIZE + 10) {
|
||||
for j in 0..3 {
|
||||
let &(ref fst_key, _) = &ids[i];
|
||||
let &(ref last_key, _) = &ids[i + j];
|
||||
let mut streamer = term_dictionary
|
||||
.range()
|
||||
.ge(fst_key.as_bytes())
|
||||
.lt(last_key.as_bytes())
|
||||
.into_stream();
|
||||
for _ in 0..j {
|
||||
assert!(streamer.next().is_some());
|
||||
}
|
||||
assert!(streamer.next().is_none());
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
let field_searcher = searcher.field(text_field)?;
|
||||
let mut term_it = field_searcher.terms();
|
||||
let mut term_string = String::new();
|
||||
while term_it.advance() {
|
||||
//let term = Term::from_bytes(term_it.key());
|
||||
term_string.push_str(str::from_utf8(term_it.key()).expect("test"));
|
||||
}
|
||||
assert_eq!(&*term_string, "abcdef");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_empty_string() -> crate::Result<()> {
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
|
||||
#[test]
|
||||
fn test_term_dictionary_stream() -> crate::Result<()> {
|
||||
let ids: Vec<_> = (0u32..10_000u32)
|
||||
.map(|i| (format!("doc{:0>6}", i), i))
|
||||
.collect();
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
|
||||
for &(ref id, ref i) in &ids {
|
||||
term_dictionary_builder
|
||||
.insert(&[], &make_term_info(1 as u64))
|
||||
.insert(id.as_bytes(), &make_term_info(*i as u64))
|
||||
.unwrap();
|
||||
term_dictionary_builder
|
||||
.insert(&[1u8], &make_term_info(2 as u64))
|
||||
.unwrap();
|
||||
term_dictionary_builder.finish().unwrap()
|
||||
};
|
||||
let file = FileSlice::from(buffer);
|
||||
let term_dictionary: TermDictionary = TermDictionary::open(file)?;
|
||||
let mut stream = term_dictionary.stream();
|
||||
assert!(stream.advance());
|
||||
assert!(stream.key().is_empty());
|
||||
assert!(stream.advance());
|
||||
assert_eq!(stream.key(), &[1u8]);
|
||||
assert!(!stream.advance());
|
||||
Ok(())
|
||||
}
|
||||
term_dictionary_builder.finish().unwrap()
|
||||
};
|
||||
let term_file = FileSlice::from(buffer);
|
||||
let term_dictionary: TermDictionary = TermDictionary::open(term_file)?;
|
||||
{
|
||||
let mut streamer = term_dictionary.stream();
|
||||
let mut i = 0;
|
||||
while let Some((streamer_k, streamer_v)) = streamer.next() {
|
||||
let &(ref key, ref v) = &ids[i];
|
||||
assert_eq!(streamer_k.as_ref(), key.as_bytes());
|
||||
assert_eq!(streamer_v, &make_term_info(*v as u64));
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_stream_range_boundaries() -> crate::Result<()> {
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder = TermDictionaryBuilder::create(Vec::new())?;
|
||||
for i in 0u8..10u8 {
|
||||
let number_arr = [i; 1];
|
||||
term_dictionary_builder.insert(&number_arr, &make_term_info(i as u64))?;
|
||||
}
|
||||
term_dictionary_builder.finish()?
|
||||
};
|
||||
let file = FileSlice::from(buffer);
|
||||
let term_dictionary: TermDictionary = TermDictionary::open(file)?;
|
||||
let &(ref key, ref val) = &ids[2047];
|
||||
assert_eq!(
|
||||
term_dictionary.get(key.as_bytes()),
|
||||
Some(make_term_info(*val as u64))
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
let value_list = |mut streamer: TermStreamer<'_>, backwards: bool| {
|
||||
let mut res: Vec<u32> = vec![];
|
||||
while let Some((_, ref v)) = streamer.next() {
|
||||
res.push(v.doc_freq);
|
||||
}
|
||||
if backwards {
|
||||
res.reverse();
|
||||
}
|
||||
res
|
||||
};
|
||||
{
|
||||
let range = term_dictionary.range().backward().into_stream();
|
||||
assert_eq!(
|
||||
value_list(range, true),
|
||||
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
|
||||
);
|
||||
#[test]
|
||||
fn test_stream_high_range_prefix_suffix() -> crate::Result<()> {
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
|
||||
// term requires more than 16bits
|
||||
term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))?;
|
||||
term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2))?;
|
||||
term_dictionary_builder.insert("abr", &make_term_info(3))?;
|
||||
term_dictionary_builder.finish()?
|
||||
};
|
||||
let term_dict_file = FileSlice::from(buffer);
|
||||
let term_dictionary: TermDictionary = TermDictionary::open(term_dict_file)?;
|
||||
let mut kv_stream = term_dictionary.stream();
|
||||
assert!(kv_stream.advance());
|
||||
assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes());
|
||||
assert_eq!(kv_stream.value(), &make_term_info(1));
|
||||
assert!(kv_stream.advance());
|
||||
assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxyz".as_bytes());
|
||||
assert_eq!(kv_stream.value(), &make_term_info(2));
|
||||
assert!(kv_stream.advance());
|
||||
assert_eq!(kv_stream.key(), "abr".as_bytes());
|
||||
assert_eq!(kv_stream.value(), &make_term_info(3));
|
||||
assert!(!kv_stream.advance());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_stream_range() -> crate::Result<()> {
|
||||
let ids: Vec<_> = (0u32..10_000u32)
|
||||
.map(|i| (format!("doc{:0>6}", i), i))
|
||||
.collect();
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
|
||||
for &(ref id, ref i) in &ids {
|
||||
term_dictionary_builder
|
||||
.insert(id.as_bytes(), &make_term_info(*i as u64))
|
||||
.unwrap();
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().ge([2u8]).into_stream();
|
||||
assert_eq!(
|
||||
value_list(range, false),
|
||||
vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().ge([2u8]).backward().into_stream();
|
||||
assert_eq!(
|
||||
value_list(range, true),
|
||||
vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().gt([2u8]).into_stream();
|
||||
assert_eq!(
|
||||
value_list(range, false),
|
||||
vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().gt([2u8]).backward().into_stream();
|
||||
assert_eq!(
|
||||
value_list(range, true),
|
||||
vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().lt([6u8]).into_stream();
|
||||
assert_eq!(
|
||||
value_list(range, false),
|
||||
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().lt([6u8]).backward().into_stream();
|
||||
assert_eq!(
|
||||
value_list(range, true),
|
||||
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().le([6u8]).into_stream();
|
||||
assert_eq!(
|
||||
value_list(range, false),
|
||||
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().le([6u8]).backward().into_stream();
|
||||
assert_eq!(
|
||||
value_list(range, true),
|
||||
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().ge([0u8]).lt([5u8]).into_stream();
|
||||
assert_eq!(value_list(range, false), vec![0u32, 1u32, 2u32, 3u32, 4u32]);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary
|
||||
term_dictionary_builder.finish().unwrap()
|
||||
};
|
||||
|
||||
let file = FileSlice::from(buffer);
|
||||
|
||||
let term_dictionary: TermDictionary = TermDictionary::open(file)?;
|
||||
{
|
||||
for i in (0..20).chain(6000..8_000) {
|
||||
let &(ref target_key, _) = &ids[i];
|
||||
let mut streamer = term_dictionary
|
||||
.range()
|
||||
.ge([0u8])
|
||||
.lt([5u8])
|
||||
.backward()
|
||||
.ge(target_key.as_bytes())
|
||||
.into_stream();
|
||||
assert_eq!(value_list(range, true), vec![0u32, 1u32, 2u32, 3u32, 4u32]);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_automaton_search() -> crate::Result<()> {
|
||||
use crate::query::DFAWrapper;
|
||||
use levenshtein_automata::LevenshteinAutomatonBuilder;
|
||||
|
||||
const COUNTRIES: [&'static str; 7] = [
|
||||
"San Marino",
|
||||
"Serbia",
|
||||
"Slovakia",
|
||||
"Slovenia",
|
||||
"Spain",
|
||||
"Sweden",
|
||||
"Switzerland",
|
||||
];
|
||||
|
||||
let directory = RAMDirectory::create();
|
||||
let path = PathBuf::from("TermDictionary");
|
||||
{
|
||||
let write = directory.open_write(&path)?;
|
||||
let mut term_dictionary_builder = TermDictionaryBuilder::create(write)?;
|
||||
for term in COUNTRIES.iter() {
|
||||
term_dictionary_builder.insert(term.as_bytes(), &make_term_info(0u64))?;
|
||||
for j in 0..3 {
|
||||
let (streamer_k, streamer_v) = streamer.next().unwrap();
|
||||
let &(ref key, ref v) = &ids[i + j];
|
||||
assert_eq!(str::from_utf8(streamer_k.as_ref()).unwrap(), key);
|
||||
assert_eq!(streamer_v.doc_freq, *v);
|
||||
assert_eq!(streamer_v, &make_term_info(*v as u64));
|
||||
}
|
||||
term_dictionary_builder.finish()?;
|
||||
}
|
||||
let file = directory.open_read(&path)?;
|
||||
let term_dict: TermDictionary = TermDictionary::open(file)?;
|
||||
|
||||
// We can now build an entire dfa.
|
||||
let lev_automaton_builder = LevenshteinAutomatonBuilder::new(2, true);
|
||||
let automaton = DFAWrapper(lev_automaton_builder.build_dfa("Spaen"));
|
||||
|
||||
let mut range = term_dict.search(automaton).into_stream();
|
||||
|
||||
// get the first finding
|
||||
assert!(range.advance());
|
||||
assert_eq!("Spain".as_bytes(), range.key());
|
||||
assert!(!range.advance());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
{
|
||||
for i in (0..20).chain(BLOCK_SIZE - 10..BLOCK_SIZE + 10) {
|
||||
let &(ref target_key, _) = &ids[i];
|
||||
let mut streamer = term_dictionary
|
||||
.range()
|
||||
.gt(target_key.as_bytes())
|
||||
.into_stream();
|
||||
for j in 0..3 {
|
||||
let (streamer_k, streamer_v) = streamer.next().unwrap();
|
||||
let &(ref key, ref v) = &ids[i + j + 1];
|
||||
assert_eq!(streamer_k.as_ref(), key.as_bytes());
|
||||
assert_eq!(streamer_v.doc_freq, *v);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
for i in (0..20).chain(BLOCK_SIZE - 10..BLOCK_SIZE + 10) {
|
||||
for j in 0..3 {
|
||||
let &(ref fst_key, _) = &ids[i];
|
||||
let &(ref last_key, _) = &ids[i + j];
|
||||
let mut streamer = term_dictionary
|
||||
.range()
|
||||
.ge(fst_key.as_bytes())
|
||||
.lt(last_key.as_bytes())
|
||||
.into_stream();
|
||||
for _ in 0..j {
|
||||
assert!(streamer.next().is_some());
|
||||
}
|
||||
assert!(streamer.next().is_none());
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_empty_string() -> crate::Result<()> {
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
|
||||
term_dictionary_builder
|
||||
.insert(&[], &make_term_info(1 as u64))
|
||||
.unwrap();
|
||||
term_dictionary_builder
|
||||
.insert(&[1u8], &make_term_info(2 as u64))
|
||||
.unwrap();
|
||||
term_dictionary_builder.finish().unwrap()
|
||||
};
|
||||
let file = FileSlice::from(buffer);
|
||||
let term_dictionary: TermDictionary = TermDictionary::open(file)?;
|
||||
let mut stream = term_dictionary.stream();
|
||||
assert!(stream.advance());
|
||||
assert!(stream.key().is_empty());
|
||||
assert!(stream.advance());
|
||||
assert_eq!(stream.key(), &[1u8]);
|
||||
assert!(!stream.advance());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_stream_range_boundaries() -> crate::Result<()> {
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder = TermDictionaryBuilder::create(Vec::new())?;
|
||||
for i in 0u8..10u8 {
|
||||
let number_arr = [i; 1];
|
||||
term_dictionary_builder.insert(&number_arr, &make_term_info(i as u64))?;
|
||||
}
|
||||
term_dictionary_builder.finish()?
|
||||
};
|
||||
let file = FileSlice::from(buffer);
|
||||
let term_dictionary: TermDictionary = TermDictionary::open(file)?;
|
||||
|
||||
let value_list = |mut streamer: TermStreamer<'_>, backwards: bool| {
|
||||
let mut res: Vec<u32> = vec![];
|
||||
while let Some((_, ref v)) = streamer.next() {
|
||||
res.push(v.doc_freq);
|
||||
}
|
||||
if backwards {
|
||||
res.reverse();
|
||||
}
|
||||
res
|
||||
};
|
||||
{
|
||||
let range = term_dictionary.range().backward().into_stream();
|
||||
assert_eq!(
|
||||
value_list(range, true),
|
||||
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().ge([2u8]).into_stream();
|
||||
assert_eq!(
|
||||
value_list(range, false),
|
||||
vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().ge([2u8]).backward().into_stream();
|
||||
assert_eq!(
|
||||
value_list(range, true),
|
||||
vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().gt([2u8]).into_stream();
|
||||
assert_eq!(
|
||||
value_list(range, false),
|
||||
vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().gt([2u8]).backward().into_stream();
|
||||
assert_eq!(
|
||||
value_list(range, true),
|
||||
vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().lt([6u8]).into_stream();
|
||||
assert_eq!(
|
||||
value_list(range, false),
|
||||
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().lt([6u8]).backward().into_stream();
|
||||
assert_eq!(
|
||||
value_list(range, true),
|
||||
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().le([6u8]).into_stream();
|
||||
assert_eq!(
|
||||
value_list(range, false),
|
||||
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().le([6u8]).backward().into_stream();
|
||||
assert_eq!(
|
||||
value_list(range, true),
|
||||
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().ge([0u8]).lt([5u8]).into_stream();
|
||||
assert_eq!(value_list(range, false), vec![0u32, 1u32, 2u32, 3u32, 4u32]);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary
|
||||
.range()
|
||||
.ge([0u8])
|
||||
.lt([5u8])
|
||||
.backward()
|
||||
.into_stream();
|
||||
assert_eq!(value_list(range, true), vec![0u32, 1u32, 2u32, 3u32, 4u32]);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_automaton_search() -> crate::Result<()> {
|
||||
use crate::query::DFAWrapper;
|
||||
use levenshtein_automata::LevenshteinAutomatonBuilder;
|
||||
|
||||
const COUNTRIES: [&'static str; 7] = [
|
||||
"San Marino",
|
||||
"Serbia",
|
||||
"Slovakia",
|
||||
"Slovenia",
|
||||
"Spain",
|
||||
"Sweden",
|
||||
"Switzerland",
|
||||
];
|
||||
|
||||
let directory = RAMDirectory::create();
|
||||
let path = PathBuf::from("TermDictionary");
|
||||
{
|
||||
let write = directory.open_write(&path)?;
|
||||
let mut term_dictionary_builder = TermDictionaryBuilder::create(write)?;
|
||||
for term in COUNTRIES.iter() {
|
||||
term_dictionary_builder.insert(term.as_bytes(), &make_term_info(0u64))?;
|
||||
}
|
||||
term_dictionary_builder.finish()?;
|
||||
}
|
||||
let file = directory.open_read(&path)?;
|
||||
let term_dict: TermDictionary = TermDictionary::open(file)?;
|
||||
|
||||
// We can now build an entire dfa.
|
||||
let lev_automaton_builder = LevenshteinAutomatonBuilder::new(2, true);
|
||||
let automaton = DFAWrapper(lev_automaton_builder.build_dfa("Spaen"));
|
||||
|
||||
let mut range = term_dict.search(automaton).into_stream();
|
||||
|
||||
// get the first finding
|
||||
assert!(range.advance());
|
||||
assert_eq!("Spain".as_bytes(), range.key());
|
||||
assert!(!range.advance());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user