Compare commits


4 Commits

Author          SHA1          Message                              Date
Paul Masurel    45da5829bc    added unit test                      2019-09-10 08:31:27 +09:00
Paul Masurel    e2f7aab39f    Added exact fuzzy query              2019-09-07 15:39:04 +09:00
Paul Masurel    1b9cbdb672    blop                                 2019-09-07 15:05:21 +09:00
Paul Masurel    a8f3cf9679    Added an incremental search crate    2019-09-07 13:23:58 +09:00
61 changed files with 1048 additions and 561 deletions

View File

@@ -7,8 +7,7 @@ Tantivy 0.11.0
- Better handling of whitespaces.
- Closes #498 - add support for Elastic-style unbounded range queries for alphanumeric types eg. "title:>hello", "weight:>=70.5", "height:<200" (@petr-tik)
- API change around `Box<BoxableTokenizer>`. See detail in #629
- Avoid rebuilding Regex automaton whenever a regex query is reused. #639 (@brainlock)
- Add footer with some metadata to index files. #605 (@fdb-hiroshima)
- Avoid rebuilding Regex automaton whenever a regex query is reused. #630 (@brainlock)
## How to update?
@@ -16,12 +15,6 @@ Tantivy 0.11.0
- Regex are now compiled when the `RegexQuery` instance is built. As a result, it can now return
an error and handling the `Result` is required.
Tantivy 0.10.2
=====================
- Closes #656. Solving memory leak.
Tantivy 0.10.1
=====================

View File

@@ -15,13 +15,13 @@ edition = "2018"
[dependencies]
base64 = "0.10.0"
byteorder = "1.0"
crc32fast = "1.2.0"
once_cell = "1.0"
regex = {version = "1.3.0", default-features = false, features = ["std"]}
tantivy-fst = "0.1"
memmap = {version = "0.7", optional=true}
lz4 = {version="1.20", optional=true}
snap = {version="0.2"}
derive_builder = "0.7"
atomicwrites = {version="0.2.2", optional=true}
tempfile = "3.0"
log = "0.4"
@@ -31,7 +31,7 @@ serde_json = "1.0"
num_cpus = "1.2"
fs2={version="0.4", optional=true}
itertools = "0.8"
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
levenshtein_automata = "0.1"
notify = {version="4", optional=true}
bit-set = "0.5"
uuid = { version = "0.7.2", features = ["v4", "serde"] }
@@ -82,7 +82,7 @@ unstable = [] # useful for benches.
wasm-bindgen = ["uuid/wasm-bindgen"]
[workspace]
members = ["query-grammar"]
members = ["query-grammar", "incremental-search"]
[badges]
travis-ci = { repository = "tantivy-search/tantivy" }

View File

@@ -1,3 +1,3 @@
test:
echo "Run test only... No examples."
cargo test --tests --lib
cargo test --all --tests --lib

View File

@@ -0,0 +1,10 @@
[package]
name = "incremental-search"
version = "0.11.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
edition = "2018"
[dependencies]
derive_builder = "0.7"
tantivy = {path = ".."}

View File

@@ -0,0 +1,395 @@
use std::fmt;
use std::u64;
#[derive(Clone, Copy, Eq, PartialEq)]
pub(crate) struct TinySet(u64);
impl fmt::Debug for TinySet {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.into_iter().collect::<Vec<u32>>().fmt(f)
}
}
pub struct TinySetIterator(TinySet);
impl Iterator for TinySetIterator {
type Item = u32;
fn next(&mut self) -> Option<Self::Item> {
self.0.pop_lowest()
}
}
impl IntoIterator for TinySet {
type Item = u32;
type IntoIter = TinySetIterator;
fn into_iter(self) -> Self::IntoIter {
TinySetIterator(self)
}
}
impl TinySet {
/// Returns an empty `TinySet`.
pub fn empty() -> TinySet {
TinySet(0u64)
}
/// Returns the complement of the set in `[0, 64[`.
fn complement(self) -> TinySet {
TinySet(!self.0)
}
/// Returns true iff the `TinySet` contains the element `el`.
pub fn contains(self, el: u32) -> bool {
!self.intersect(TinySet::singleton(el)).is_empty()
}
/// Returns the intersection of `self` and `other`
pub fn intersect(self, other: TinySet) -> TinySet {
TinySet(self.0 & other.0)
}
/// Creates a new `TinySet` containing only one element
/// within `[0; 64[`
#[inline(always)]
pub fn singleton(el: u32) -> TinySet {
TinySet(1u64 << u64::from(el))
}
/// Inserts a new element within `[0, 64[`.
#[inline(always)]
pub fn insert(self, el: u32) -> TinySet {
self.union(TinySet::singleton(el))
}
/// Inserts a new element within `[0, 64[`.
/// Returns true iff the element was not already in the set.
#[inline(always)]
pub fn insert_mut(&mut self, el: u32) -> bool {
let old = *self;
*self = old.insert(el);
old != *self
}
/// Returns the union of two tinysets
#[inline(always)]
pub fn union(self, other: TinySet) -> TinySet {
TinySet(self.0 | other.0)
}
/// Returns true iff the `TinySet` is empty.
#[inline(always)]
pub fn is_empty(self) -> bool {
self.0 == 0u64
}
/// Returns the lowest element in the `TinySet`
/// and removes it.
#[inline(always)]
pub fn pop_lowest(&mut self) -> Option<u32> {
if self.is_empty() {
None
} else {
let lowest = self.0.trailing_zeros() as u32;
self.0 ^= TinySet::singleton(lowest).0;
Some(lowest)
}
}
/// Returns a `TinySet` that contains all values up
/// to `upper_bound`, excluded.
///
/// The limit is assumed to be strictly lower than 64.
pub fn range_lower(upper_bound: u32) -> TinySet {
TinySet((1u64 << u64::from(upper_bound % 64u32)) - 1u64)
}
/// Returns a `TinySet` that contains all values greater
/// than or equal to the given limit, included (and up to 63).
///
/// The limit is assumed to be strictly lower than 64.
pub fn range_greater_or_equal(from_included: u32) -> TinySet {
TinySet::range_lower(from_included).complement()
}
pub fn clear(&mut self) {
self.0 = 0u64;
}
pub fn len(self) -> u32 {
self.0.count_ones()
}
}
#[derive(Clone)]
pub struct BitSet {
tinysets: Box<[TinySet]>,
len: usize, //< Technically it should be a u32, but we
// count multiple inserts, so `usize`
// guards us from overflow.
max_value: u32,
}
fn num_buckets(max_val: u32) -> u32 {
(max_val + 63u32) / 64u32
}
impl BitSet {
/// Create a new `BitSet` that may contain elements
/// within `[0, max_val[`.
pub fn with_max_value(max_value: u32) -> BitSet {
let num_buckets = num_buckets(max_value);
let tinysets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
BitSet {
tinysets,
len: 0,
max_value,
}
}
/// Removes all elements from the `BitSet`.
pub fn clear(&mut self) {
for tinyset in self.tinysets.iter_mut() {
*tinyset = TinySet::empty();
}
// Also reset the cached element count.
self.len = 0;
}
/// Returns the number of elements in the `BitSet`.
pub fn len(&self) -> usize {
self.len
}
/// Inserts an element in the `BitSet`
pub fn insert(&mut self, el: u32) {
// We do not check that `el` is lower than `max_value`.
let higher = el / 64u32;
let lower = el % 64u32;
self.len += if self.tinysets[higher as usize].insert_mut(lower) {
1
} else {
0
};
}
/// Returns true iff the element is in the `BitSet`.
pub fn contains(&self, el: u32) -> bool {
self.tinyset(el / 64u32).contains(el % 64)
}
/// Returns the index of the first non-empty bucket
/// equal to or greater than `bucket`.
///
/// Reminder: the tiny set with the bucket `bucket` represents the
/// elements from `bucket * 64` to `(bucket + 1) * 64` (excluded).
pub(crate) fn first_non_empty_bucket(&self, bucket: u32) -> Option<u32> {
self.tinysets[bucket as usize..]
.iter()
.cloned()
.position(|tinyset| !tinyset.is_empty())
.map(|delta_bucket| bucket + delta_bucket as u32)
}
pub fn max_value(&self) -> u32 {
self.max_value
}
/// Returns the tiny bitset representing the
/// set restricted to the number range from
/// `bucket * 64` to `(bucket + 1) * 64`.
pub(crate) fn tinyset(&self, bucket: u32) -> TinySet {
self.tinysets[bucket as usize]
}
}
#[cfg(test)]
mod tests {
use super::BitSet;
use super::TinySet;
use crate::docset::DocSet;
use crate::query::BitSetDocSet;
use crate::tests;
use crate::tests::generate_nonunique_unsorted;
use std::collections::BTreeSet;
use std::collections::HashSet;
#[test]
fn test_tiny_set() {
assert!(TinySet::empty().is_empty());
{
let mut u = TinySet::empty().insert(1u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert!(u.pop_lowest().is_none())
}
{
let mut u = TinySet::empty().insert(1u32).insert(1u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert!(u.pop_lowest().is_none())
}
{
let mut u = TinySet::empty().insert(2u32);
assert_eq!(u.pop_lowest(), Some(2u32));
u.insert_mut(1u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert!(u.pop_lowest().is_none());
}
{
let mut u = TinySet::empty().insert(63u32);
assert_eq!(u.pop_lowest(), Some(63u32));
assert!(u.pop_lowest().is_none());
}
}
#[test]
fn test_bitset() {
let test_against_hashset = |els: &[u32], max_value: u32| {
let mut hashset: HashSet<u32> = HashSet::new();
let mut bitset = BitSet::with_max_value(max_value);
for &el in els {
assert!(el < max_value);
hashset.insert(el);
bitset.insert(el);
}
for el in 0..max_value {
assert_eq!(hashset.contains(&el), bitset.contains(el));
}
assert_eq!(bitset.max_value(), max_value);
};
test_against_hashset(&[], 0);
test_against_hashset(&[], 1);
test_against_hashset(&[0u32], 1);
test_against_hashset(&[0u32], 100);
test_against_hashset(&[1u32, 2u32], 4);
test_against_hashset(&[99u32], 100);
test_against_hashset(&[63u32], 64);
test_against_hashset(&[62u32, 63u32], 64);
}
#[test]
fn test_bitset_large() {
let arr = generate_nonunique_unsorted(100_000, 5_000);
let mut btreeset: BTreeSet<u32> = BTreeSet::new();
let mut bitset = BitSet::with_max_value(100_000);
for el in arr {
btreeset.insert(el);
bitset.insert(el);
}
for i in 0..100_000 {
assert_eq!(btreeset.contains(&i), bitset.contains(i));
}
assert_eq!(btreeset.len(), bitset.len());
let mut bitset_docset = BitSetDocSet::from(bitset);
for el in btreeset.into_iter() {
bitset_docset.advance();
assert_eq!(bitset_docset.doc(), el);
}
assert!(!bitset_docset.advance());
}
#[test]
fn test_bitset_num_buckets() {
use super::num_buckets;
assert_eq!(num_buckets(0u32), 0);
assert_eq!(num_buckets(1u32), 1);
assert_eq!(num_buckets(64u32), 1);
assert_eq!(num_buckets(65u32), 2);
assert_eq!(num_buckets(128u32), 2);
assert_eq!(num_buckets(129u32), 3);
}
#[test]
fn test_tinyset_range() {
assert_eq!(
TinySet::range_lower(3).into_iter().collect::<Vec<u32>>(),
[0, 1, 2]
);
assert!(TinySet::range_lower(0).is_empty());
assert_eq!(
TinySet::range_lower(63).into_iter().collect::<Vec<u32>>(),
(0u32..63u32).collect::<Vec<_>>()
);
assert_eq!(
TinySet::range_lower(1).into_iter().collect::<Vec<u32>>(),
[0]
);
assert_eq!(
TinySet::range_lower(2).into_iter().collect::<Vec<u32>>(),
[0, 1]
);
assert_eq!(
TinySet::range_greater_or_equal(3)
.into_iter()
.collect::<Vec<u32>>(),
(3u32..64u32).collect::<Vec<_>>()
);
}
#[test]
fn test_bitset_len() {
let mut bitset = BitSet::with_max_value(1_000);
assert_eq!(bitset.len(), 0);
bitset.insert(3u32);
assert_eq!(bitset.len(), 1);
bitset.insert(103u32);
assert_eq!(bitset.len(), 2);
bitset.insert(3u32);
assert_eq!(bitset.len(), 2);
bitset.insert(103u32);
assert_eq!(bitset.len(), 2);
bitset.insert(104u32);
assert_eq!(bitset.len(), 3);
}
#[test]
fn test_bitset_clear() {
let mut bitset = BitSet::with_max_value(1_000);
let els = tests::sample(1_000, 0.01f64);
for &el in &els {
bitset.insert(el);
}
assert!(els.iter().all(|el| bitset.contains(*el)));
bitset.clear();
for el in 0u32..1000u32 {
assert!(!bitset.contains(el));
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::BitSet;
use super::TinySet;
use test;
#[bench]
fn bench_tinyset_pop(b: &mut test::Bencher) {
b.iter(|| {
let mut tinyset = TinySet::singleton(test::black_box(31u32));
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
});
}
#[bench]
fn bench_tinyset_sum(b: &mut test::Bencher) {
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
b.iter(|| {
assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
});
}
#[bench]
fn bench_tinyarr_sum(b: &mut test::Bencher) {
let v = [10u32, 14u32, 21u32];
b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
}
#[bench]
fn bench_bitset_initialize(b: &mut test::Bencher) {
b.iter(|| BitSet::with_max_value(1_000_000));
}
}
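
For orientation, here is a minimal usage sketch of the `TinySet`/`BitSet` API defined above. The `tinyset_demo` function is hypothetical and would have to live inside the crate, since `TinySet` is `pub(crate)`:

// Illustration only: exercises the TinySet/BitSet API defined above.
fn tinyset_demo() {
    // A TinySet packs a set of integers in [0, 64[ into a single u64.
    let mut s = TinySet::empty().insert(3).insert(10);
    assert!(s.contains(3));
    assert_eq!(s.len(), 2);
    // pop_lowest drains the elements in increasing order.
    assert_eq!(s.pop_lowest(), Some(3));
    assert_eq!(s.pop_lowest(), Some(10));
    assert_eq!(s.pop_lowest(), None);

    // range_lower(4) is {0, 1, 2, 3}; range_greater_or_equal is its complement.
    let low = TinySet::range_lower(4);
    assert_eq!(low.into_iter().collect::<Vec<u32>>(), vec![0, 1, 2, 3]);

    // A BitSet is a boxed slice of TinySets: element el lives in
    // bucket el / 64, at bit el % 64.
    let mut bs = BitSet::with_max_value(1_000);
    bs.insert(63); // last bit of the first bucket
    bs.insert(64); // first bit of the second bucket
    assert!(bs.contains(63) && bs.contains(64));
    assert_eq!(bs.len(), 2);
}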

View File

@@ -0,0 +1,266 @@
use tantivy::query::{BooleanQuery, FuzzyTermQuery, EmptyQuery};
use derive_builder::Builder;
use std::str::FromStr;
use tantivy::query::{FuzzyConfiguration, FuzzyConfigurationBuilder, Query, Occur};
use tantivy::schema::Field;
use tantivy::{Searcher, TantivyError, DocAddress, Term, Document};
use tantivy::collector::TopDocs;
use std::ops::Deref;
#[derive(Debug)]
pub struct IncrementalSearchQuery {
pub terms: Vec<String>,
pub last_is_prefix: bool,
}
impl IncrementalSearchQuery {
pub fn fuzzy_configurations(&self) -> Vec<FuzzyConfigurations> {
if self.terms.is_empty() {
return Vec::default();
}
let single_term_confs: Vec<FuzzyConfigurationBuilder> = (0u8..3u8)
.map(|d: u8| {
let mut builder = FuzzyConfigurationBuilder::default();
builder.distance(d).transposition_cost_one(true);
builder
})
.collect();
let mut configurations: Vec<Vec<FuzzyConfigurationBuilder>> = single_term_confs
.iter()
.map(|conf| vec![conf.clone()])
.collect();
let mut new_configurations = Vec::new();
for _ in 1..self.terms.len() {
new_configurations.clear();
for single_term_conf in &single_term_confs {
for configuration in &configurations {
let mut new_configuration: Vec<FuzzyConfigurationBuilder> = configuration.clone();
new_configuration.push(single_term_conf.clone());
new_configurations.push(new_configuration);
}
}
std::mem::swap(&mut configurations, &mut new_configurations);
}
if self.last_is_prefix {
for configuration in &mut configurations {
if let Some(last_conf) = configuration.last_mut() {
last_conf.prefix(true);
}
}
}
let mut fuzzy_configurations: Vec<FuzzyConfigurations> = configurations
.into_iter()
.map(FuzzyConfigurations::from)
.collect();
fuzzy_configurations.sort_by(|left, right| left.cost.partial_cmp(&right.cost).unwrap());
fuzzy_configurations
}
fn search_query(&self, fields: &[Field], configurations: FuzzyConfigurations) -> Box<dyn Query> {
if self.terms.is_empty() {
Box::new(EmptyQuery)
} else if self.terms.len() == 1 {
build_query_for_fields(fields, &self.terms[0], &configurations.configurations[0])
} else {
Box::new(BooleanQuery::from(self.terms.iter()
.zip(configurations.configurations.iter())
.map(|(term, configuration)|
(Occur::Must, build_query_for_fields(fields, &term, &configuration))
)
.collect::<Vec<_>>()))
}
}
}
#[derive(Debug)]
pub struct FuzzyConfigurations {
configurations: Vec<FuzzyConfiguration>,
cost: f64,
}
fn compute_cost(fuzzy_confs: &[FuzzyConfiguration]) -> f64 {
fuzzy_confs
.iter()
.map(|fuzzy_conf| {
let weight = if fuzzy_conf.prefix { 30f64 } else { 5f64 };
weight * f64::from(fuzzy_conf.distance)
})
.sum()
}
impl From<Vec<FuzzyConfigurationBuilder>> for FuzzyConfigurations {
fn from(fuzzy_conf_builder: Vec<FuzzyConfigurationBuilder>) -> FuzzyConfigurations {
let configurations = fuzzy_conf_builder
.into_iter()
.map(|conf| conf.build().unwrap())
.collect::<Vec<FuzzyConfiguration>>();
let cost = compute_cost(&configurations);
FuzzyConfigurations {
configurations,
cost,
}
}
}
#[derive(Debug)]
pub struct ParseIncrementalQueryError;
impl Into<TantivyError> for ParseIncrementalQueryError {
fn into(self) -> TantivyError {
TantivyError::InvalidArgument(format!("Invalid query: {:?}", self))
}
}
impl FromStr for IncrementalSearchQuery {
type Err = ParseIncrementalQueryError;
fn from_str(query_str: &str) -> Result<Self, Self::Err> {
let terms: Vec<String> = query_str
.split_whitespace()
.map(ToString::to_string)
.collect();
Ok(IncrementalSearchQuery {
terms,
last_is_prefix: query_str
.chars()
.last()
.map(|c| !c.is_whitespace())
.unwrap_or(false),
})
}
}
fn build_query_for_fields(fields: &[Field], term_text: &str, conf: &FuzzyConfiguration) -> Box<dyn Query> {
assert!(fields.len() > 0);
if fields.len() > 1 {
let term_queries: Vec<(Occur, Box<dyn Query>)> = fields
.iter()
.map(|&field| {
let term = Term::from_field_text(field, term_text);
let query = FuzzyTermQuery::new_from_configuration(term, conf.clone());
let boxed_query: Box<dyn Query> = Box::new(query);
(Occur::Must, boxed_query)
})
.collect();
Box::new(BooleanQuery::from(term_queries))
} else {
let term = Term::from_field_text(fields[0], term_text);
Box::new(FuzzyTermQuery::new_from_configuration(term, conf.clone()))
}
}
pub struct IncrementalSearchResult {
pub docs: Vec<Document>
}
#[derive(Builder, Default)]
pub struct IncrementalSearch {
nhits: usize,
#[builder(default)]
search_fields: Vec<Field>,
#[builder(default)]
return_fields: Vec<Field>,
}
impl IncrementalSearch {
pub fn search<S: Deref<Target=Searcher>>(
&self,
query: &str,
searcher: &S,
) -> tantivy::Result<IncrementalSearchResult> {
let searcher = searcher.deref();
let inc_search_query: IncrementalSearchQuery =
FromStr::from_str(query).map_err(Into::<TantivyError>::into)?;
let mut results: Vec<DocAddress> = Vec::default();
let mut remaining = self.nhits;
for fuzzy_conf in inc_search_query.fuzzy_configurations() {
if remaining == 0 {
break;
}
let query = inc_search_query.search_query(&self.search_fields[..], fuzzy_conf);
let new_docs = searcher.search(query.as_ref(), &TopDocs::with_limit(remaining))?;
// TODO(pmasurel) remove already added docs.
results.extend(new_docs.into_iter()
.map(|(_, doc_address)| doc_address));
remaining = self.nhits - results.len();
if remaining == 0 {
break;
}
}
let docs: Vec<Document> = results.into_iter()
.map(|doc_address: DocAddress| searcher.doc(doc_address))
.collect::<tantivy::Result<_>>()?;
Ok(IncrementalSearchResult {
docs
})
}
}
#[cfg(test)]
mod tests {
use tantivy::doc;
use crate::{IncrementalSearch, IncrementalSearchBuilder, IncrementalSearchQuery};
use std::str::FromStr;
use tantivy::schema::{SchemaBuilder, TEXT, STORED};
use tantivy::Index;
#[test]
fn test_incremental_search() {
let incremental_search = IncrementalSearchBuilder::default()
.nhits(10)
.build()
.unwrap();
}
#[test]
fn test_incremental_search_query_parse_empty() {
let query = IncrementalSearchQuery::from_str("").unwrap();
assert_eq!(query.terms, Vec::<String>::new());
assert!(!query.last_is_prefix);
}
#[test]
fn test_incremental_search_query_parse_trailing_whitespace() {
let query = IncrementalSearchQuery::from_str("hello happy tax pa ").unwrap();
assert_eq!(query.terms, vec!["hello", "happy", "tax", "pa"]);
assert!(!query.last_is_prefix);
}
#[test]
fn test_incremental_search_query_parse_unicode_whitespace() {
let query = IncrementalSearchQuery::from_str("hello happy tax pa ").unwrap();
assert_eq!(query.terms, vec!["hello", "happy", "tax", "pa"]);
assert!(!query.last_is_prefix);
}
#[test]
fn test_incremental_search_query_parse() {
let query = IncrementalSearchQuery::from_str("hello happy tax pa").unwrap();
assert_eq!(query.terms, vec!["hello", "happy", "tax", "pa"]);
assert!(query.last_is_prefix);
}
#[test]
fn test_blop() {
let mut schema_builder = SchemaBuilder::new();
let body = schema_builder.add_text_field("body", TEXT | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 30_000_000).unwrap();
index_writer.add_document(doc!(body=> "hello happy tax payer"));
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let incremental_search: IncrementalSearch = IncrementalSearchBuilder::default()
.nhits(1)
.search_fields(vec![body])
.build()
.unwrap();
let top_docs = incremental_search.search("hello hapy t", &searcher).unwrap();
assert_eq!(top_docs.docs.len(), 1);
}
}
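
Two properties of `fuzzy_configurations` above are worth spelling out: each term is assigned a distance in {0, 1, 2}, so a query with n terms produces 3^n candidate configurations, and they are tried in ascending order of the `compute_cost` weighting (30 per edit for a prefix configuration, 5 per edit otherwise). Below, that formula is replayed standalone on a hypothetical three-term query; the `cost` helper is illustrative, not part of the crate:

// Standalone replay of the compute_cost formula above:
// weight = 30 if the configuration is a prefix, 5 otherwise;
// cost = sum(weight * distance) over the per-term configurations.
fn cost(confs: &[(u8, bool)]) -> f64 {
    confs
        .iter()
        .map(|&(distance, prefix)| {
            let weight = if prefix { 30f64 } else { 5f64 };
            weight * f64::from(distance)
        })
        .sum()
}

fn main() {
    // "hello hapy t": exact match on "hello", one edit on "hapy",
    // and a distance-1 prefix match on the trailing "t".
    assert_eq!(cost(&[(0, false), (1, false), (1, true)]), 35.0);
    // The all-exact configuration costs 0 and is therefore tried first.
    assert_eq!(cost(&[(0, false), (0, false), (0, true)]), 0.0);
}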

View File

@@ -1,6 +1,3 @@
use std::fmt;
use std::fmt::Write;
/// Defines whether a term in a query must be present,
/// should be present or must not be present.
#[derive(Debug, Clone, Hash, Copy, Eq, PartialEq)]
@@ -21,7 +18,7 @@ impl Occur {
/// - `Should` => '?',
/// - `Must` => '+'
/// - `Not` => '-'
fn to_char(self) -> char {
pub fn to_char(self) -> char {
match self {
Occur::Should => '?',
Occur::Must => '+',
@@ -50,9 +47,3 @@ impl Occur {
}
}
}
impl fmt::Display for Occur {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.write_char(self.to_char())
}
}

View File

@@ -151,7 +151,7 @@ impl fmt::Debug for UserInputAST {
Ok(())
}
UserInputAST::Unary(ref occur, ref subquery) => {
write!(formatter, "{}({:?})", occur, subquery)
write!(formatter, "{}({:?})", occur.to_char(), subquery)
}
UserInputAST::Leaf(ref subquery) => write!(formatter, "{:?}", subquery),
}

View File

@@ -123,4 +123,5 @@ mod tests {
assert_eq!(count_collector.harvest(), 2);
}
}
}

View File

@@ -599,18 +599,19 @@ mod tests {
);
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use crate::collector::FacetCollector;
use crate::query::AllQuery;
use crate::schema::{Facet, Schema};
use crate::Index;
use rand::seq::SliceRandom;
use rand::thread_rng;
use collector::FacetCollector;
use query::AllQuery;
use rand::{thread_rng, Rng};
use schema::Facet;
use schema::Schema;
use test::Bencher;
use Index;
#[bench]
fn bench_facet_collector(b: &mut Bencher) {
@@ -627,7 +628,7 @@ mod bench {
}
}
// 40425 docs
docs[..].shuffle(&mut thread_rng());
thread_rng().shuffle(&mut docs[..]);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
for doc in docs {
@@ -636,7 +637,7 @@ mod bench {
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let searcher = reader.searcher();
let searcher = index.searcher();
let facet_collector = FacetCollector::for_field(facet_field);
searcher.search(&AllQuery, &facet_collector).unwrap();
});

View File

@@ -592,4 +592,5 @@ mod tests {
let query = query_parser.parse_query(query).unwrap();
(index, query)
}
}

View File

@@ -2,7 +2,7 @@ use crate::common::BinarySerializable;
use crate::common::CountingWriter;
use crate::common::VInt;
use crate::directory::ReadOnlySource;
use crate::directory::{TerminatingWrite, WritePtr};
use crate::directory::WritePtr;
use crate::schema::Field;
use crate::space_usage::FieldUsage;
use crate::space_usage::PerFieldSpaceUsage;
@@ -42,7 +42,7 @@ pub struct CompositeWrite<W = WritePtr> {
offsets: HashMap<FileAddr, u64>,
}
impl<W: TerminatingWrite + Write> CompositeWrite<W> {
impl<W: Write> CompositeWrite<W> {
/// Creates a new writer that writes a composite file
/// into a given `Write`.
pub fn wrap(w: W) -> CompositeWrite<W> {
@@ -91,7 +91,8 @@ impl<W: TerminatingWrite + Write> CompositeWrite<W> {
let footer_len = (self.write.written_bytes() - footer_offset) as u32;
footer_len.serialize(&mut self.write)?;
self.write.terminate()
self.write.flush()?;
Ok(())
}
}
@@ -230,4 +231,5 @@ mod test {
}
}
}
}

View File

@@ -1,5 +1,3 @@
use crate::directory::AntiCallToken;
use crate::directory::TerminatingWrite;
use std::io;
use std::io::Write;
@@ -44,13 +42,6 @@ impl<W: Write> Write for CountingWriter<W> {
}
}
impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
fn terminate_ref(&mut self, token: AntiCallToken) -> io::Result<()> {
self.flush()?;
self.underlying.terminate_ref(token)
}
}
#[cfg(test)]
mod test {

View File

@@ -199,7 +199,10 @@ pub mod test {
fn test_serialize_string() {
assert_eq!(serialize_test(String::from("")), 1);
assert_eq!(serialize_test(String::from("ぽよぽよ")), 1 + 3 * 4);
assert_eq!(serialize_test(String::from("富士さん見える。")), 1 + 3 * 8);
assert_eq!(
serialize_test(String::from("富士さん見える。")),
1 + 3 * 8
);
}
#[test]

View File

@@ -26,10 +26,9 @@ use crate::IndexWriter;
use crate::Result;
use num_cpus;
use std::borrow::BorrowMut;
use std::collections::HashSet;
use std::fmt;
#[cfg(feature = "mmap")]
use std::path::{Path, PathBuf};
use std::path::Path;
use std::sync::Arc;
fn load_metas(directory: &dyn Directory, inventory: &SegmentMetaInventory) -> Result<IndexMeta> {
@@ -369,11 +368,6 @@ impl Index {
.map(SegmentMeta::id)
.collect())
}
/// Returns the set of corrupted files
pub fn validate_checksum(&self) -> Result<HashSet<PathBuf>> {
self.directory.list_damaged().map_err(Into::into)
}
}
impl fmt::Debug for Index {
@@ -601,4 +595,5 @@ mod tests {
assert_eq!(searcher.num_docs(), 8_000);
assert!(mem_right_after_merge_finished < mem_right_after_commit);
}
}

View File

@@ -118,8 +118,6 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
///
/// Specifically, subsequent writes or flushes should
/// have no effect on the returned `ReadOnlySource` object.
///
/// You should only use this to read files created with [`open_write`]
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError>;
/// Removes a file
@@ -159,8 +157,6 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
/// atomic_write.
///
/// This should only be used for small files.
///
/// You should only use this to read files created with [`atomic_write`]
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError>;
/// Atomically replace the content of a file with data.

View File

@@ -1,213 +0,0 @@
use crate::directory::read_only_source::ReadOnlySource;
use crate::directory::{AntiCallToken, TerminatingWrite};
use byteorder::{ByteOrder, LittleEndian};
use crc32fast::Hasher;
use std::io;
use std::io::Write;
const COMMON_FOOTER_SIZE: usize = 4 * 5;
#[derive(Debug, Clone, PartialEq)]
pub struct Footer {
pub tantivy_version: (u32, u32, u32),
pub meta: String,
pub versioned_footer: VersionedFooter,
}
impl Footer {
pub fn new(versioned_footer: VersionedFooter) -> Self {
let tantivy_version = (
env!("CARGO_PKG_VERSION_MAJOR").parse().unwrap(),
env!("CARGO_PKG_VERSION_MINOR").parse().unwrap(),
env!("CARGO_PKG_VERSION_PATCH").parse().unwrap(),
);
Footer {
tantivy_version,
meta: format!(
"tantivy {}.{}.{}, index v{}",
tantivy_version.0,
tantivy_version.1,
tantivy_version.2,
versioned_footer.version()
),
versioned_footer,
}
}
pub fn to_bytes(&self) -> Vec<u8> {
let mut res = self.versioned_footer.to_bytes();
res.extend_from_slice(self.meta.as_bytes());
let len = res.len();
res.resize(len + COMMON_FOOTER_SIZE, 0);
let mut common_footer = &mut res[len..];
LittleEndian::write_u32(&mut common_footer, self.meta.len() as u32);
LittleEndian::write_u32(&mut common_footer[4..], self.tantivy_version.0);
LittleEndian::write_u32(&mut common_footer[8..], self.tantivy_version.1);
LittleEndian::write_u32(&mut common_footer[12..], self.tantivy_version.2);
LittleEndian::write_u32(&mut common_footer[16..], (len + COMMON_FOOTER_SIZE) as u32);
res
}
pub fn from_bytes(data: &[u8]) -> Result<Self, io::Error> {
let len = data.len();
if len < COMMON_FOOTER_SIZE + 4 {
// 4 bytes for index version, stored in versioned footer
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
format!("File corrupted. The footer len must be over 24, while the entire file len is {}", len)
)
);
}
let size = LittleEndian::read_u32(&data[len - 4..]) as usize;
if len < size as usize {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
format!(
"File corrupted. The footer len is {}, while the entire file len is {}",
size, len
),
));
}
let footer = &data[len - size as usize..];
let meta_len = LittleEndian::read_u32(&footer[size - 20..]) as usize;
let tantivy_major = LittleEndian::read_u32(&footer[size - 16..]);
let tantivy_minor = LittleEndian::read_u32(&footer[size - 12..]);
let tantivy_patch = LittleEndian::read_u32(&footer[size - 8..]);
Ok(Footer {
tantivy_version: (tantivy_major, tantivy_minor, tantivy_patch),
meta: String::from_utf8_lossy(&footer[size - meta_len - 20..size - 20]).into_owned(),
versioned_footer: VersionedFooter::from_bytes(&footer[..size - meta_len - 20])?,
})
}
pub fn extract_footer(source: ReadOnlySource) -> Result<(Footer, ReadOnlySource), io::Error> {
let footer = Footer::from_bytes(source.as_slice())?;
let reader = source.slice_to(source.as_slice().len() - footer.size());
Ok((footer, reader))
}
pub fn size(&self) -> usize {
self.versioned_footer.size() as usize + self.meta.len() + 20
}
}
#[derive(Debug, Clone, PartialEq)]
pub enum VersionedFooter {
UnknownVersion { version: u32, size: u32 },
V0(u32), // crc
}
impl VersionedFooter {
pub fn to_bytes(&self) -> Vec<u8> {
match self {
VersionedFooter::V0(crc) => {
let mut res = vec![0; 8];
LittleEndian::write_u32(&mut res, 0);
LittleEndian::write_u32(&mut res[4..], *crc);
res
}
VersionedFooter::UnknownVersion { .. } => {
panic!("Unsupported index should never get serialized");
}
}
}
pub fn from_bytes(footer: &[u8]) -> Result<Self, io::Error> {
assert!(footer.len() >= 4);
let version = LittleEndian::read_u32(footer);
match version {
0 => {
if footer.len() == 8 {
Ok(VersionedFooter::V0(LittleEndian::read_u32(&footer[4..])))
} else {
Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
format!(
"File corrupted. The versioned footer len is {}, while it should be 8",
footer.len()
),
))
}
}
version => Ok(VersionedFooter::UnknownVersion {
version,
size: footer.len() as u32,
}),
}
}
pub fn size(&self) -> u32 {
match self {
VersionedFooter::V0(_) => 8,
VersionedFooter::UnknownVersion { size, .. } => *size,
}
}
pub fn version(&self) -> u32 {
match self {
VersionedFooter::V0(_) => 0,
VersionedFooter::UnknownVersion { version, .. } => *version,
}
}
pub fn crc(&self) -> Option<u32> {
match self {
VersionedFooter::V0(crc) => Some(*crc),
VersionedFooter::UnknownVersion { .. } => None,
}
}
}
pub(crate) struct FooterProxy<W: TerminatingWrite> {
/// always Some except after terminate call
hasher: Option<Hasher>,
/// always Some except after terminate call
writer: Option<W>,
}
impl<W: TerminatingWrite> FooterProxy<W> {
pub fn new(writer: W) -> Self {
FooterProxy {
hasher: Some(Hasher::new()),
writer: Some(writer),
}
}
}
impl<W: TerminatingWrite> Write for FooterProxy<W> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
let count = self.writer.as_mut().unwrap().write(buf)?;
self.hasher.as_mut().unwrap().update(&buf[..count]);
Ok(count)
}
fn flush(&mut self) -> io::Result<()> {
self.writer.as_mut().unwrap().flush()
}
}
impl<W: TerminatingWrite> TerminatingWrite for FooterProxy<W> {
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
let crc = self.hasher.take().unwrap().finalize();
let footer = Footer::new(VersionedFooter::V0(crc)).to_bytes();
let mut writer = self.writer.take().unwrap();
writer.write_all(&footer)?;
writer.terminate()
}
}
#[cfg(test)]
mod tests {
use crate::directory::footer::{Footer, VersionedFooter};
#[test]
fn test_serialize_deserialize_footer() {
let crc = 123456;
let footer = Footer::new(VersionedFooter::V0(crc));
let footer_bytes = footer.to_bytes();
assert_eq!(Footer::from_bytes(&footer_bytes).unwrap(), footer);
}
}
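
As a reading aid, this is the byte layout `Footer::to_bytes` above produces for the `V0` variant (all integers little-endian); the size helper is a sketch restating `Footer::size`, not extra API:

// Layout written by Footer::to_bytes for VersionedFooter::V0:
//   [0..4]     version tag (0u32)
//   [4..8]     crc32 of the preceding file body
//   [8..8+m]   meta string bytes, where m = meta.len()
//   last 20 bytes (COMMON_FOOTER_SIZE):
//     meta len | major | minor | patch | total footer size, each a u32
fn v0_footer_size(meta_len: usize) -> usize {
    8 + meta_len + 20 // agrees with Footer::size() for the V0 variant
}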

View File

@@ -1,6 +1,5 @@
use crate::core::MANAGED_FILEPATH;
use crate::directory::error::{DeleteError, IOError, LockError, OpenReadError, OpenWriteError};
use crate::directory::footer::{Footer, FooterProxy};
use crate::directory::DirectoryLock;
use crate::directory::Lock;
use crate::directory::META_LOCK;
@@ -9,7 +8,6 @@ use crate::directory::{WatchCallback, WatchHandle};
use crate::error::DataCorruption;
use crate::Directory;
use crate::Result;
use crc32fast::Hasher;
use serde_json;
use std::collections::HashSet;
use std::io;
@@ -209,59 +207,17 @@ impl ManagedDirectory {
}
Ok(())
}
/// Verify checksum of a managed file
pub fn validate_checksum(&self, path: &Path) -> result::Result<bool, OpenReadError> {
let reader = self.directory.open_read(path)?;
let (footer, data) = Footer::extract_footer(reader)
.map_err(|err| IOError::with_path(path.to_path_buf(), err))?;
let mut hasher = Hasher::new();
hasher.update(data.as_slice());
let crc = hasher.finalize();
Ok(footer
.versioned_footer
.crc()
.map(|v| v == crc)
.unwrap_or(false))
}
/// List files for which checksum does not match content
pub fn list_damaged(&self) -> result::Result<HashSet<PathBuf>, OpenReadError> {
let mut hashset = HashSet::new();
let managed_paths = self
.meta_informations
.read()
.expect("Managed directory rlock poisoned in list damaged.")
.managed_paths
.clone();
for path in managed_paths.into_iter() {
if !self.validate_checksum(&path)? {
hashset.insert(path);
}
}
Ok(hashset)
}
}
impl Directory for ManagedDirectory {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
let read_only_source = self.directory.open_read(path)?;
let (_footer, reader) = Footer::extract_footer(read_only_source)
.map_err(|err| IOError::with_path(path.to_path_buf(), err))?;
Ok(reader)
self.directory.open_read(path)
}
fn open_write(&mut self, path: &Path) -> result::Result<WritePtr, OpenWriteError> {
self.register_file_as_managed(path)
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
Ok(io::BufWriter::new(Box::new(FooterProxy::new(
self.directory
.open_write(path)?
.into_inner()
.map_err(|_| ())
.expect("buffer should be empty"),
))))
self.directory.open_write(path)
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
@@ -303,9 +259,8 @@ impl Clone for ManagedDirectory {
#[cfg(test)]
mod tests_mmap_specific {
use crate::directory::{Directory, ManagedDirectory, MmapDirectory, TerminatingWrite};
use crate::directory::{Directory, ManagedDirectory, MmapDirectory};
use std::collections::HashSet;
use std::fs::OpenOptions;
use std::io::Write;
use std::path::{Path, PathBuf};
use tempfile::TempDir;
@@ -320,8 +275,8 @@ mod tests_mmap_specific {
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
let write_file = managed_directory.open_write(test_path1).unwrap();
write_file.terminate().unwrap();
let mut write_file = managed_directory.open_write(test_path1).unwrap();
write_file.flush().unwrap();
managed_directory
.atomic_write(test_path2, &[0u8, 1u8])
.unwrap();
@@ -355,9 +310,9 @@ mod tests_mmap_specific {
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
let mut write = managed_directory.open_write(test_path1).unwrap();
write.write_all(&[0u8, 1u8]).unwrap();
write.terminate().unwrap();
managed_directory
.atomic_write(test_path1, &vec![0u8, 1u8])
.unwrap();
assert!(managed_directory.exists(test_path1));
let _mmap_read = managed_directory.open_read(test_path1).unwrap();
@@ -376,38 +331,4 @@ mod tests_mmap_specific {
}
}
#[test]
fn test_checksum() {
let test_path1: &'static Path = Path::new("some_path_for_test");
let test_path2: &'static Path = Path::new("other_test_path");
let tempdir = TempDir::new().unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
let mut write = managed_directory.open_write(test_path1).unwrap();
write.write_all(&[0u8, 1u8]).unwrap();
write.terminate().unwrap();
let mut write = managed_directory.open_write(test_path2).unwrap();
write.write_all(&[3u8, 4u8, 5u8]).unwrap();
write.terminate().unwrap();
assert!(managed_directory.list_damaged().unwrap().is_empty());
let mut corrupted_path = tempdir_path.clone();
corrupted_path.push(test_path2);
let mut file = OpenOptions::new()
.write(true)
.open(&corrupted_path)
.unwrap();
file.write_all(&[255u8]).unwrap();
file.flush().unwrap();
drop(file);
let damaged = managed_directory.list_damaged().unwrap();
assert_eq!(damaged.len(), 1);
assert!(damaged.contains(test_path2));
}
}
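
The checksum validation removed above boils down to: open the file, split off the footer, re-hash the remaining body with `crc32fast`, and compare against the crc stored in the `V0` footer. A minimal sketch of that comparison, using only the `Hasher` calls shown in this diff:

use crc32fast::Hasher;

// Re-hash the file body and compare against the crc recorded in the footer.
fn body_matches_crc(body: &[u8], stored_crc: u32) -> bool {
    let mut hasher = Hasher::new();
    hasher.update(body);
    hasher.finalize() == stored_crc
}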

View File

@@ -11,7 +11,6 @@ use crate::directory::error::{
DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError,
};
use crate::directory::read_only_source::BoxedData;
use crate::directory::AntiCallToken;
use crate::directory::Directory;
use crate::directory::DirectoryLock;
use crate::directory::Lock;
@@ -19,7 +18,7 @@ use crate::directory::ReadOnlySource;
use crate::directory::WatchCallback;
use crate::directory::WatchCallbackList;
use crate::directory::WatchHandle;
use crate::directory::{TerminatingWrite, WritePtr};
use crate::directory::WritePtr;
use atomicwrites;
use memmap::Mmap;
use std::collections::HashMap;
@@ -142,28 +141,42 @@ impl MmapCache {
}
}
struct WatcherWrapper {
struct InnerWatcherWrapper {
_watcher: Mutex<notify::RecommendedWatcher>,
watcher_router: Arc<WatchCallbackList>,
watcher_router: WatchCallbackList,
}
impl InnerWatcherWrapper {
pub fn new(path: &Path) -> Result<(Self, Receiver<notify::RawEvent>), notify::Error> {
let (tx, watcher_recv): (Sender<RawEvent>, Receiver<RawEvent>) = channel();
// We need to initialize the
let mut watcher = notify::raw_watcher(tx)?;
watcher.watch(path, RecursiveMode::Recursive)?;
let inner = InnerWatcherWrapper {
_watcher: Mutex::new(watcher),
watcher_router: Default::default(),
};
Ok((inner, watcher_recv))
}
}
#[derive(Clone)]
struct WatcherWrapper {
inner: Arc<InnerWatcherWrapper>,
}
impl WatcherWrapper {
pub fn new(path: &Path) -> Result<Self, OpenDirectoryError> {
let (tx, watcher_recv): (Sender<RawEvent>, Receiver<RawEvent>) = channel();
// We need to initialize the
let watcher = notify::raw_watcher(tx)
.and_then(|mut watcher| {
watcher.watch(path, RecursiveMode::Recursive)?;
Ok(watcher)
})
.map_err(|err| match err {
notify::Error::PathNotFound => OpenDirectoryError::DoesNotExist(path.to_owned()),
_ => {
panic!("Unknown error while starting watching directory {:?}", path);
}
})?;
let watcher_router: Arc<WatchCallbackList> = Default::default();
let watcher_router_clone = watcher_router.clone();
let (inner, watcher_recv) = InnerWatcherWrapper::new(path).map_err(|err| match err {
notify::Error::PathNotFound => OpenDirectoryError::DoesNotExist(path.to_owned()),
_ => {
panic!("Unknown error while starting watching directory {:?}", path);
}
})?;
let watcher_wrapper = WatcherWrapper {
inner: Arc::new(inner),
};
let watcher_wrapper_clone = watcher_wrapper.clone();
thread::Builder::new()
.name("meta-file-watch-thread".to_string())
.spawn(move || {
@@ -174,7 +187,7 @@ impl WatcherWrapper {
// We might want to be more accurate than this at one point.
if let Some(filename) = changed_path.file_name() {
if filename == *META_FILEPATH {
watcher_router_clone.broadcast();
watcher_wrapper_clone.inner.watcher_router.broadcast();
}
}
}
@@ -187,15 +200,13 @@ impl WatcherWrapper {
}
}
}
})?;
Ok(WatcherWrapper {
_watcher: Mutex::new(watcher),
watcher_router,
})
})
.expect("Failed to spawn thread to watch meta.json");
Ok(watcher_wrapper)
}
pub fn watch(&mut self, watch_callback: WatchCallback) -> WatchHandle {
self.watcher_router.subscribe(watch_callback)
self.inner.watcher_router.subscribe(watch_callback)
}
}
@@ -254,7 +265,7 @@ impl MmapDirectoryInner {
}
}
if let Some(watch_wrapper) = self.watcher.write().unwrap().as_mut() {
Ok(watch_wrapper.watch(watch_callback))
return Ok(watch_wrapper.watch(watch_callback));
} else {
unreachable!("At this point, watch wrapper is supposed to be initialized");
}
@@ -401,12 +412,6 @@ impl Seek for SafeFileWriter {
}
}
impl TerminatingWrite for SafeFileWriter {
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
self.flush()
}
}
impl Directory for MmapDirectory {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
debug!("Open Read {:?}", path);

View File

@@ -9,7 +9,6 @@ mod mmap_directory;
mod directory;
mod directory_lock;
mod footer;
mod managed_directory;
mod ram_directory;
mod read_only_source;
@@ -25,49 +24,18 @@ pub use self::ram_directory::RAMDirectory;
pub use self::read_only_source::ReadOnlySource;
pub(crate) use self::watch_event_router::WatchCallbackList;
pub use self::watch_event_router::{WatchCallback, WatchHandle};
use std::io::{self, BufWriter, Write};
use std::io::{BufWriter, Write};
#[cfg(feature = "mmap")]
pub use self::mmap_directory::MmapDirectory;
pub use self::managed_directory::ManagedDirectory;
/// Struct used to prevent calling [`terminate_ref`](trait.TerminatingWrite#method.terminate_ref) directly
pub struct AntiCallToken(());
/// Trait used to indicate that no more writes need to be done on a writer
pub trait TerminatingWrite: Write {
/// Indicates that the writer will no longer be used. Internally calls `terminate_ref`.
fn terminate(mut self) -> io::Result<()>
where
Self: Sized,
{
self.terminate_ref(AntiCallToken(()))
}
/// You should implement this function to define custom behavior.
/// This function should flush any buffer it may hold.
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()>;
}
impl<W: TerminatingWrite + ?Sized> TerminatingWrite for Box<W> {
fn terminate_ref(&mut self, token: AntiCallToken) -> io::Result<()> {
self.as_mut().terminate_ref(token)
}
}
impl<W: TerminatingWrite> TerminatingWrite for BufWriter<W> {
fn terminate_ref(&mut self, a: AntiCallToken) -> io::Result<()> {
self.flush()?;
self.get_mut().terminate_ref(a)
}
}
/// Write object for Directory.
///
/// `WritePtr` are required to implement both Write
/// and Seek.
pub type WritePtr = BufWriter<Box<dyn TerminatingWrite>>;
pub type WritePtr = BufWriter<Box<dyn Write>>;
#[cfg(test)]
mod tests;

View File

@@ -1,9 +1,8 @@
use crate::core::META_FILEPATH;
use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError};
use crate::directory::AntiCallToken;
use crate::directory::WatchCallbackList;
use crate::directory::WritePtr;
use crate::directory::{Directory, ReadOnlySource, WatchCallback, WatchHandle};
use crate::directory::{TerminatingWrite, WritePtr};
use fail::fail_point;
use std::collections::HashMap;
use std::fmt;
@@ -72,12 +71,6 @@ impl Write for VecWriter {
}
}
impl TerminatingWrite for VecWriter {
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
self.flush()
}
}
#[derive(Default)]
struct InnerDirectory {
fs: HashMap<PathBuf, ReadOnlySource>,

View File

@@ -127,7 +127,7 @@ fn test_watch(directory: &mut dyn Directory) {
assert!(directory
.atomic_write(Path::new("meta.json"), b"random_test_data_2")
.is_ok());
for _ in 0..1_000 {
for _ in 0..100 {
if counter.load(Ordering::SeqCst) > i {
break;
}

View File

@@ -152,4 +152,5 @@ mod tests {
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(2, counter.load(Ordering::SeqCst));
}
}

View File

@@ -429,6 +429,7 @@ mod tests {
}
}
}
}
#[cfg(all(test, feature = "unstable"))]
@@ -436,9 +437,9 @@ mod bench {
use super::tests::FIELD;
use super::tests::{generate_permutation, SCHEMA};
use super::*;
use crate::common::CompositeFile;
use crate::directory::{Directory, RAMDirectory, WritePtr};
use crate::fastfield::FastFieldReader;
use common::CompositeFile;
use directory::{Directory, RAMDirectory, WritePtr};
use fastfield::FastFieldReader;
use std::collections::HashMap;
use std::path::Path;
use test::{self, Bencher};
@@ -536,4 +537,5 @@ mod bench {
});
}
}
}

View File

@@ -5,8 +5,8 @@ use crate::postings::UnorderedTermId;
use crate::schema::{Document, Field};
use crate::termdict::TermOrdinal;
use crate::DocId;
use fnv::FnvHashMap;
use itertools::Itertools;
use std::collections::HashMap;
use std::io;
/// Writer for multi-valued (as in, more than one value per document)
@@ -102,7 +102,7 @@ impl MultiValueIntFastFieldWriter {
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
mapping_opt: Option<&FnvHashMap<UnorderedTermId, TermOrdinal>>,
mapping_opt: Option<&HashMap<UnorderedTermId, TermOrdinal>>,
) -> io::Result<()> {
{
// writing the offset index

View File

@@ -6,7 +6,6 @@ use crate::fastfield::{BytesFastFieldWriter, FastFieldSerializer};
use crate::postings::UnorderedTermId;
use crate::schema::{Cardinality, Document, Field, FieldType, Schema};
use crate::termdict::TermOrdinal;
use fnv::FnvHashMap;
use std::collections::HashMap;
use std::io;
@@ -117,7 +116,7 @@ impl FastFieldsWriter {
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
mapping: &HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>,
mapping: &HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>,
) -> io::Result<()> {
for field_writer in &self.single_value_writers {
field_writer.serialize(serializer)?;

View File

@@ -8,7 +8,6 @@ use crate::core::SegmentId;
use crate::core::SegmentMeta;
use crate::core::SegmentReader;
use crate::directory::DirectoryLock;
use crate::directory::TerminatingWrite;
use crate::docset::DocSet;
use crate::error::TantivyError;
use crate::fastfield::write_delete_bitset;
@@ -169,7 +168,6 @@ pub(crate) fn advance_deletes(
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
write_delete_bitset(&delete_bitset, &mut delete_file)?;
delete_file.terminate()?;
}
}
segment_entry.set_meta(segment.meta().clone());
@@ -1179,4 +1177,5 @@ mod tests {
assert!(clear_again.is_ok());
assert!(commit_again.is_ok());
}
}

View File

@@ -134,4 +134,5 @@ mod tests {
}
assert_eq!(segment_ids(&segment_register), vec![segment_id_merged]);
}
}

View File

@@ -296,4 +296,5 @@ mod tests {
assert_eq!(initial_table_size(10_000_000).unwrap(), 17);
assert_eq!(initial_table_size(1_000_000_000).unwrap(), 19);
}
}

View File

@@ -274,15 +274,13 @@ pub mod tests {
mod bench {
use super::*;
use rand::rngs::StdRng;
use rand::Rng;
use rand::SeedableRng;
use rand::{Rng, XorShiftRng};
use test::Bencher;
fn generate_array_with_seed(n: usize, ratio: f64, seed_val: u8) -> Vec<u32> {
let mut seed: [u8; 32] = [0; 32];
seed[31] = seed_val;
let mut rng = StdRng::from_seed(seed);
let seed: &[u8; 16] = &[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, seed_val];
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
(0u32..).filter(|_| rng.gen_bool(ratio)).take(n).collect()
}

View File

@@ -622,23 +622,23 @@ pub mod tests {
assert!(!postings_unopt.advance());
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::tests::*;
use crate::docset::SkipResult;
use crate::query::Intersection;
use crate::schema::IndexRecordOption;
use crate::tests;
use crate::DocSet;
use docset::SkipResult;
use query::Intersection;
use schema::IndexRecordOption;
use test::{self, Bencher};
use tests;
use DocSet;
#[bench]
fn bench_segment_postings(b: &mut Bencher) {
let reader = INDEX.reader().unwrap();
let searcher = reader.searcher();
let searcher = INDEX.searcher();
let segment_reader = searcher.segment_reader(0);
b.iter(|| {
@@ -652,8 +652,7 @@ mod bench {
#[bench]
fn bench_segment_intersection(b: &mut Bencher) {
let reader = INDEX.reader().unwrap();
let searcher = reader.searcher();
let searcher = INDEX.searcher();
let segment_reader = searcher.segment_reader(0);
b.iter(|| {
let segment_postings_a = segment_reader
@@ -683,8 +682,7 @@ mod bench {
}
fn bench_skip_next(p: f64, b: &mut Bencher) {
let reader = INDEX.reader().unwrap();
let searcher = reader.searcher();
let searcher = INDEX.searcher();
let segment_reader = searcher.segment_reader(0);
let docs = tests::sample(segment_reader.num_docs(), p);
@@ -739,8 +737,7 @@ mod bench {
#[bench]
fn bench_iterate_segment_postings(b: &mut Bencher) {
let reader = INDEX.reader().unwrap();
let searcher = reader.searcher();
let searcher = INDEX.searcher();
let segment_reader = searcher.segment_reader(0);
b.iter(|| {
let n: u32 = test::black_box(17);

View File

@@ -12,7 +12,6 @@ use crate::tokenizer::TokenStream;
use crate::tokenizer::{Token, MAX_TOKEN_LEN};
use crate::DocId;
use crate::Result;
use fnv::FnvHashMap;
use std::collections::HashMap;
use std::io;
use std::marker::PhantomData;
@@ -128,12 +127,12 @@ impl MultiFieldPostingsWriter {
pub fn serialize(
&self,
serializer: &mut InvertedIndexSerializer,
) -> Result<HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>> {
) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> =
self.term_index.iter().collect();
term_offsets.sort_unstable_by_key(|&(k, _, _)| k);
let mut unordered_term_mappings: HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>> =
let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>> =
HashMap::new();
let field_offsets = make_field_partition(&term_offsets);
@@ -148,7 +147,7 @@ impl MultiFieldPostingsWriter {
let unordered_term_ids = term_offsets[start..stop]
.iter()
.map(|&(_, _, bucket)| bucket);
let mapping: FnvHashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
let mapping: HashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
.enumerate()
.map(|(term_ord, unord_term_id)| {
(unord_term_id as UnorderedTermId, term_ord as TermOrdinal)

View File

@@ -141,7 +141,10 @@ impl<'a> FieldSerializer<'a> {
FieldType::Str(ref text_options) => {
if let Some(text_indexing_options) = text_options.get_indexing_options() {
let index_option = text_indexing_options.index_option();
(index_option.has_freq(), index_option.has_positions())
(
index_option.is_termfreq_enabled(),
index_option.is_position_enabled(),
)
} else {
(false, false)
}

View File

@@ -310,7 +310,6 @@ mod bench {
use super::super::MemoryArena;
use super::ExpUnrolledLinkedList;
use byteorder::{NativeEndian, WriteBytesExt};
use std::iter;
use test::Bencher;
const NUM_STACK: usize = 10_000;
@@ -336,10 +335,11 @@ mod bench {
fn bench_push_stack(bench: &mut Bencher) {
bench.iter(|| {
let mut heap = MemoryArena::new();
let mut stacks: Vec<ExpUnrolledLinkedList> =
iter::repeat_with(ExpUnrolledLinkedList::new)
.take(NUM_STACK)
.collect();
let mut stacks = Vec::with_capacity(100);
for _ in 0..NUM_STACK {
let mut stack = ExpUnrolledLinkedList::new();
stacks.push(stack);
}
for s in 0..NUM_STACK {
for i in 0u32..STACK_SIZE {
let t = s * 392017 % NUM_STACK;

View File

@@ -130,4 +130,5 @@ mod tests {
assert!(!scorer.advance());
}
}
}

View File

@@ -216,6 +216,7 @@ mod tests {
assert!(!docset.advance());
}
}
}
#[cfg(all(test, feature = "unstable"))]
@@ -223,12 +224,13 @@ mod bench {
use super::BitSet;
use super::BitSetDocSet;
use crate::test;
use crate::tests;
use crate::DocSet;
use test;
use tests;
use DocSet;
#[bench]
fn bench_bitset_1pct_insert(b: &mut test::Bencher) {
use tests;
let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000);
b.iter(|| {
let mut bitset = BitSet::with_max_value(1_000_000);
@@ -240,6 +242,7 @@ mod bench {
#[bench]
fn bench_bitset_1pct_clone(b: &mut test::Bencher) {
use tests;
let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000);
let mut bitset = BitSet::with_max_value(1_000_000);
for el in els {

View File

@@ -137,4 +137,5 @@ mod tests {
fn test_idf() {
assert_nearly_equals(idf(1, 2), 0.6931472);
}
}

View File

@@ -247,7 +247,9 @@ mod tests {
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let query_parser = QueryParser::for_index(&index, vec![title, text]);
let query = query_parser.parse_query("Оксана Лифенко").unwrap();
let query = query_parser
.parse_query("Оксана Лифенко")
.unwrap();
let weight = query.weight(&searcher, true).unwrap();
let mut scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
scorer.advance();

View File

@@ -175,4 +175,5 @@ mod tests {
sample_skip,
);
}
}

View File

@@ -1,12 +1,14 @@
use crate::error::TantivyError::InvalidArgument;
use crate::query::{AutomatonWeight, Query, Weight};
use crate::schema::Term;
use crate::termdict::WrappedDFA;
use crate::Result;
use crate::Searcher;
use levenshtein_automata::{LevenshteinAutomatonBuilder, DFA};
use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder, DFA};
use once_cell::sync::Lazy;
use std::collections::HashMap;
use std::ops::Range;
use derive_builder::Builder;
/// The range of Levenshtein distances for which we will build DFAs for our terms.
/// The computation is exponential, so it is best to keep it to low single digits.
@@ -24,6 +26,38 @@ static LEV_BUILDER: Lazy<HashMap<(u8, bool), LevenshteinAutomatonBuilder>> = Laz
lev_builder_cache
});
#[derive(Builder, Default, Clone, Debug)]
pub struct FuzzyConfiguration {
/// How many changes are we going to allow
pub distance: u8,
/// Should a transposition cost 1 or 2?
#[builder(default)]
pub transposition_cost_one: bool,
/// Should the term be matched as a prefix?
#[builder(default)]
pub prefix: bool,
/// If true, only terms at a Levenshtein distance of exactly `distance` will match.
/// If false, terms at a distance lower than or equal to `distance` will match.
#[builder(default)]
pub exact_distance: bool,
}
fn build_dfa(fuzzy_configuration: &FuzzyConfiguration, term_text: &str) -> Result<DFA> {
let automaton_builder = LEV_BUILDER
.get(&(fuzzy_configuration.distance, fuzzy_configuration.transposition_cost_one))
.ok_or_else(|| {
InvalidArgument(format!(
"Levenshtein distance of {} is not allowed. Choose a value in the {:?} range",
fuzzy_configuration.distance, VALID_LEVENSHTEIN_DISTANCE_RANGE
))
})?;
if fuzzy_configuration.prefix {
Ok(automaton_builder.build_prefix_dfa(term_text))
} else {
Ok(automaton_builder.build_dfa(term_text))
}
}
/// A Fuzzy Query matches all of the documents
/// containing a specific term that is within
/// Levenshtein distance
@@ -41,32 +75,19 @@ static LEV_BUILDER: Lazy<HashMap<(u8, bool), LevenshteinAutomatonBuilder>> = Laz
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ));
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// ));
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
/// index_writer.commit().unwrap();
/// }
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// {
///
/// let term = Term::from_field_text(title, "Diary");
/// let query = FuzzyTermQuery::new(term, 1, true);
/// let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count)).unwrap();
/// assert_eq!(count, 2);
/// assert_eq!(top_docs.len(), 2);
/// }
///
/// let term = Term::from_field_text(title, "Diary");
/// let query = FuzzyTermQuery::new(term, 1, true);
/// let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count)).unwrap();
/// assert_eq!(count, 2);
/// assert_eq!(top_docs.len(), 2);
/// Ok(())
/// }
/// ```
@@ -74,54 +95,58 @@ static LEV_BUILDER: Lazy<HashMap<(u8, bool), LevenshteinAutomatonBuilder>> = Laz
pub struct FuzzyTermQuery {
/// What term are we searching
term: Term,
/// How many changes are we going to allow
distance: u8,
/// Should a transposition cost 1 or 2?
transposition_cost_one: bool,
///
prefix: bool,
configuration: FuzzyConfiguration
}
impl FuzzyTermQuery {
pub fn new_from_configuration(term: Term, configuration: FuzzyConfiguration) -> FuzzyTermQuery {
FuzzyTermQuery {
term,
configuration
}
}
/// Creates a new Fuzzy Query
pub fn new(term: Term, distance: u8, transposition_cost_one: bool) -> FuzzyTermQuery {
FuzzyTermQuery {
term,
distance,
transposition_cost_one,
prefix: false,
}
}
/// Creates a new Fuzzy Query that matches the term as a prefix
pub fn new_prefix(term: Term, distance: u8, transposition_cost_one: bool) -> FuzzyTermQuery {
FuzzyTermQuery {
term,
distance,
transposition_cost_one,
prefix: true,
}
}
fn specialized_weight(&self) -> Result<AutomatonWeight<DFA>> {
// LEV_BUILDER is a HashMap, whose `get` method returns an Option
match LEV_BUILDER.get(&(self.distance, false)) {
// Unwrap the option and build the Ok(AutomatonWeight)
Some(automaton_builder) => {
let automaton = automaton_builder.build_dfa(self.term.text());
Ok(AutomatonWeight::new(self.term.field(), automaton))
configuration: FuzzyConfiguration {
distance,
transposition_cost_one,
prefix: false,
exact_distance: false
}
None => Err(InvalidArgument(format!(
"Levenshtein distance of {} is not allowed. Choose a value in the {:?} range",
self.distance, VALID_LEVENSHTEIN_DISTANCE_RANGE
))),
}
}
}
impl Query for FuzzyTermQuery {
fn weight(&self, _searcher: &Searcher, _scoring_enabled: bool) -> Result<Box<dyn Weight>> {
Ok(Box::new(self.specialized_weight()?))
let dfa = build_dfa(&self.configuration, self.term.text())?;
// TODO optimize for distance = 0 and possibly prefix
if self.configuration.exact_distance {
let target_distance = self.configuration.distance;
let wrapped_dfa = WrappedDFA {
dfa,
condition: move |distance: Distance| distance == Distance::Exact(target_distance),
};
Ok(Box::new(AutomatonWeight::new(
self.term.field(),
wrapped_dfa,
)))
} else {
let wrapped_dfa = WrappedDFA {
dfa,
condition: move |distance: Distance| match distance {
Distance::Exact(_) => true,
Distance::AtLeast(_) => false,
},
};
Ok(Box::new(AutomatonWeight::new(
self.term.field(),
wrapped_dfa,
)))
}
}
}
@@ -134,6 +159,7 @@ mod test {
use crate::tests::assert_nearly_equals;
use crate::Index;
use crate::Term;
use super::FuzzyConfigurationBuilder;
#[test]
pub fn test_fuzzy_term() {
@@ -155,7 +181,6 @@ mod test {
let searcher = reader.searcher();
{
let term = Term::from_field_text(country_field, "japon");
let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
let top_docs = searcher
.search(&fuzzy_query, &TopDocs::with_limit(2))
@@ -164,5 +189,73 @@ mod test {
let (score, _) = top_docs[0];
assert_nearly_equals(1f32, score);
}
{
let term = Term::from_field_text(country_field, "japon");
let fuzzy_conf = FuzzyConfigurationBuilder::default()
.distance(2)
.exact_distance(true)
.build()
.unwrap();
let fuzzy_query = FuzzyTermQuery::new_from_configuration(term, fuzzy_conf);
let top_docs = searcher
.search(&fuzzy_query, &TopDocs::with_limit(2))
.unwrap();
assert!(top_docs.is_empty());
}
{
let term = Term::from_field_text(country_field, "japon");
let fuzzy_conf = FuzzyConfigurationBuilder::default()
.distance(1)
.exact_distance(true)
.build()
.unwrap();
let fuzzy_query = FuzzyTermQuery::new_from_configuration(term, fuzzy_conf);
let top_docs = searcher
.search(&fuzzy_query, &TopDocs::with_limit(2))
.unwrap();
assert_eq!(top_docs.len(), 1);
}
{
let term = Term::from_field_text(country_field, "jpp");
let fuzzy_conf = FuzzyConfigurationBuilder::default()
.distance(1)
.prefix(true)
.build()
.unwrap();
let fuzzy_query = FuzzyTermQuery::new_from_configuration(term, fuzzy_conf);
let top_docs = searcher
.search(&fuzzy_query, &TopDocs::with_limit(2))
.unwrap();
assert_eq!(top_docs.len(), 1);
}
{
let term = Term::from_field_text(country_field, "jpaan");
let fuzzy_conf = FuzzyConfigurationBuilder::default()
.distance(1)
.exact_distance(true)
.transposition_cost_one(true)
.build()
.unwrap();
let fuzzy_query = FuzzyTermQuery::new_from_configuration(term, fuzzy_conf);
let top_docs = searcher
.search(&fuzzy_query, &TopDocs::with_limit(2))
.unwrap();
assert_eq!(top_docs.len(), 1);
}
{
let term = Term::from_field_text(country_field, "jpaan");
let fuzzy_conf = FuzzyConfigurationBuilder::default()
.distance(2)
.exact_distance(true)
.transposition_cost_one(false)
.build()
.unwrap();
let fuzzy_query = FuzzyTermQuery::new_from_configuration(term, fuzzy_conf);
let top_docs = searcher
.search(&fuzzy_query, &TopDocs::with_limit(2))
.unwrap();
assert_eq!(top_docs.len(), 1);
}
}
}

View File

@@ -45,7 +45,7 @@ pub fn intersect_scorers(mut scorers: Vec<Box<dyn Scorer>>) -> Box<dyn Scorer> {
})
}
/// Creates a `DocSet` that iterates through the intersection of two or more `DocSet`s.
pub struct Intersection<TDocSet: DocSet, TOtherDocSet: DocSet = Box<dyn Scorer>> {
left: TDocSet,
right: TDocSet,

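The struct above implements the classic leapfrog intersection over sorted doc ids. A self-contained sketch of the idea on plain slices (tantivy's real `DocSet`s advance and skip rather than index, so this is an illustration, not the actual implementation):

/// Intersects two sorted, deduplicated doc id lists.
fn intersect(left: &[u32], right: &[u32]) -> Vec<u32> {
    let (mut i, mut j) = (0, 0);
    let mut hits = Vec::new();
    while i < left.len() && j < right.len() {
        if left[i] < right[j] {
            i += 1; // left is behind: advance it
        } else if left[i] > right[j] {
            j += 1; // right is behind: advance it
        } else {
            hits.push(left[i]); // both sides contain the doc
            i += 1;
            j += 1;
        }
    }
    hits
}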
View File

@@ -5,7 +5,7 @@ use Score;
use SkipResult;
/// Creates a `DocSet` that iterates through the intersection of two `DocSet`s.
pub struct IntersectionTwoTerms<TDocSet> {
left: TDocSet,
right: TDocSet

View File

@@ -40,7 +40,7 @@ pub use self::boolean_query::BooleanQuery;
pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight};
pub use self::exclude::Exclude;
pub use self::explanation::Explanation;
pub use self::fuzzy_query::FuzzyTermQuery;
pub use self::fuzzy_query::{FuzzyTermQuery, FuzzyConfiguration, FuzzyConfigurationBuilder};
pub use self::intersection::intersect_scorers;
pub use self::phrase_query::PhraseQuery;
pub use self::query::Query;

View File

@@ -479,4 +479,5 @@ mod tests {
91
);
}
}

View File

@@ -190,4 +190,5 @@ mod tests {
skip_docs,
);
}
}

View File

@@ -28,7 +28,7 @@ where
}
}
/// Creates a `DocSet` that iterates through the union of two or more `DocSet`s.
pub struct Union<TScorer, TScoreCombiner = DoNothingCombiner> {
docsets: Vec<TScorer>,
bitsets: Box<[TinySet; HORIZON_NUM_TINYBITSETS]>,
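For contrast, a deliberately naive sketch of what `Union` computes; the real implementation avoids the global sort by bucketing doc ids into the `TinySet` windows named in the struct above:

/// Unions any number of sorted doc id lists. Sketch only: the real
/// `Union` streams doc ids through fixed-size `TinySet` windows.
fn union(docsets: &[Vec<u32>]) -> Vec<u32> {
    let mut all: Vec<u32> = docsets.iter().flatten().copied().collect();
    all.sort_unstable();
    all.dedup();
    all
}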
@@ -409,17 +409,20 @@ mod tests {
vec![1, 2, 3, 7, 8, 9, 99, 100, 101, 500, 20000],
);
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use crate::query::score_combiner::DoNothingCombiner;
use crate::query::{ConstScorer, Union, VecDocSet};
use crate::tests;
use crate::DocId;
use crate::DocSet;
use query::score_combiner::DoNothingCombiner;
use query::ConstScorer;
use query::Union;
use query::VecDocSet;
use test::Bencher;
use tests;
use DocId;
use DocSet;
#[bench]
fn bench_union_3_high(bench: &mut Bencher) {

View File

@@ -82,4 +82,5 @@ pub mod tests {
}
assert_eq!(postings.fill_buffer(&mut buffer[..]), 9);
}
}

View File

@@ -178,4 +178,5 @@ mod tests {
doc.add_text(text_field, "My title");
assert_eq!(doc.field_values().len(), 1);
}
}

View File

@@ -29,6 +29,22 @@ pub enum IndexRecordOption {
}
impl IndexRecordOption {
/// Returns true iff the term frequency will be encoded.
pub fn is_termfreq_enabled(self) -> bool {
match self {
IndexRecordOption::WithFreqsAndPositions | IndexRecordOption::WithFreqs => true,
_ => false,
}
}
/// Returns true iff the term positions within the document are stored as well.
pub fn is_position_enabled(self) -> bool {
match self {
IndexRecordOption::WithFreqsAndPositions => true,
_ => false,
}
}
/// Returns true iff this option includes encoding
/// term frequencies.
pub fn has_freq(self) -> bool {

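A small, hypothetical usage example showing how the two new predicates line up with the three `IndexRecordOption` variants:

use tantivy::schema::IndexRecordOption;

fn describe(opt: IndexRecordOption) -> &'static str {
    match (opt.is_termfreq_enabled(), opt.is_position_enabled()) {
        (false, false) => "doc ids only",
        (true, false) => "doc ids and term frequencies",
        (true, true) => "doc ids, frequencies and positions",
        (false, true) => unreachable!("positions are only stored with frequencies"),
    }
}

fn main() {
    assert_eq!(describe(IndexRecordOption::Basic), "doc ids only");
    assert_eq!(describe(IndexRecordOption::WithFreqs), "doc ids and term frequencies");
    assert_eq!(
        describe(IndexRecordOption::WithFreqsAndPositions),
        "doc ids, frequencies and positions"
    );
}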
View File

@@ -174,4 +174,5 @@ mod tests {
assert!(!is_valid_field_name("シャボン玉"));
assert!(is_valid_field_name("my_text_field"));
}
}

View File

@@ -22,10 +22,10 @@ impl Term {
/// Builds a term given a field, and an i64 value
///
/// Assuming the term has a field id of 1, and an i64 value of 3234,
/// the Term will have 12 bytes.
///
/// The first four bytes are dedicated to storing the field id as a u32.
/// The 8 following bytes encode the i64 value, remapped to an order-preserving u64.
pub fn from_field_i64(field: Field, val: i64) -> Term {
let val_u64: u64 = common::i64_to_u64(val);
Term::from_field_u64(field, val_u64)
@@ -33,11 +33,11 @@ impl Term {
/// Builds a term given a field, and an f64 value
///
/// Assuming the term has a field id of 1, and an f64 value of 1.5,
/// the Term will have 12 bytes.
///
/// The first four bytes are dedicated to storing the field id as a u32.
/// The 8 following bytes encode the f64, remapped to an order-preserving u64.
pub fn from_field_f64(field: Field, val: f64) -> Term {
let val_u64: u64 = common::f64_to_u64(val);
Term::from_field_u64(field, val_u64)
@@ -46,10 +46,10 @@ impl Term {
/// Builds a term given a field, and a DateTime value
///
/// Assuming the term has a field id of 1, and a timestamp i64 value of 3234,
/// the Term will have 12 bytes.
///
/// The first four bytes are dedicated to storing the field id as a u32.
/// The 8 following bytes encode the DateTime as an i64 timestamp.
pub fn from_field_date(field: Field, val: &DateTime) -> Term {
let val_timestamp = val.timestamp();
Term::from_field_i64(field, val_timestamp)
@@ -82,10 +82,10 @@ impl Term {
/// Builds a term given a field, and a u64 value
///
/// Assuming the term has a field id of 1, and a u64 value of 3234,
/// the Term will have 12 bytes.
///
/// The first four bytes are dedicated to storing the field id as a u32.
/// The 8 following bytes encode the u64 value.
pub fn from_field_u64(field: Field, val: u64) -> Term {
let mut term = Term(vec![0u8; INT_TERM_LEN]);
term.set_field(field);
@@ -182,7 +182,7 @@ where
///
/// # Panics
/// ... or returns an invalid value
/// if the term is not a `f64` field.
pub fn get_f64(&self) -> f64 {
common::u64_to_f64(BigEndian::read_u64(&self.0.as_ref()[4..]))
}
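The `i64_to_u64` and `f64_to_u64` conversions above exist so that the byte order of the encoded term agrees with the natural order of the original values. The following is a sketch of the standard remapping trick; the bodies are illustrative and not copied from tantivy's `common` module:

/// Maps an i64 to a u64 so that the u64 ordering matches the i64 ordering.
/// Flipping the sign bit moves negative values below positive ones.
fn i64_to_u64(val: i64) -> u64 {
    (val as u64) ^ (1u64 << 63)
}

/// Maps an f64 to a u64 with the same ordering (for non-NaN values):
/// positive floats get their sign bit set; negative floats are bitwise negated.
fn f64_to_u64(val: f64) -> u64 {
    let bits = val.to_bits();
    if bits & (1u64 << 63) == 0 {
        bits | (1u64 << 63) // positive: set the sign bit
    } else {
        !bits // negative: flip everything, reversing their order
    }
}

fn main() {
    assert!(i64_to_u64(-1) < i64_to_u64(0));
    assert!(i64_to_u64(0) < i64_to_u64(1));
    assert!(f64_to_u64(-1.5) < f64_to_u64(0.0));
    assert!(f64_to_u64(0.0) < f64_to_u64(1.5));
}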

View File

@@ -120,16 +120,17 @@ pub mod tests {
);
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::tests::write_lorem_ipsum_store;
use crate::directory::Directory;
use crate::directory::RAMDirectory;
use crate::store::StoreReader;
use directory::Directory;
use directory::RAMDirectory;
use std::path::Path;
use store::StoreReader;
use test::Bencher;
#[bench]

View File

@@ -165,4 +165,5 @@ mod tests {
assert_eq!(output.len(), 65);
assert_eq!(output[0], 128u8 + 3u8);
}
}

View File

@@ -3,7 +3,6 @@ use super::skiplist::SkipListBuilder;
use super::StoreReader;
use crate::common::CountingWriter;
use crate::common::{BinarySerializable, VInt};
use crate::directory::TerminatingWrite;
use crate::directory::WritePtr;
use crate::schema::Document;
use crate::DocId;
@@ -110,6 +109,6 @@ impl StoreWriter {
self.offset_index_writer.write(&mut self.writer)?;
header_offset.serialize(&mut self.writer)?;
self.doc.serialize(&mut self.writer)?;
self.writer.terminate()
self.writer.flush()
}
}

View File

@@ -31,14 +31,43 @@ mod termdict;
pub use self::merger::TermMerger;
pub use self::streamer::{TermStreamer, TermStreamerBuilder};
pub use self::termdict::{TermDictionary, TermDictionaryBuilder};
use levenshtein_automata::{Distance, DFA, SINK_STATE};
use tantivy_fst::Automaton;
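/// Wraps a Levenshtein `DFA` together with a predicate deciding, from the
/// reported `Distance`, whether a final state counts as a match.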
pub(crate) struct WrappedDFA<Cond> {
pub dfa: DFA,
pub condition: Cond,
}
impl<Cond: Fn(Distance) -> bool> Automaton for WrappedDFA<Cond> {
type State = u32;
fn start(&self) -> Self::State {
self.dfa.initial_state()
}
fn is_match(&self, state: &Self::State) -> bool {
let distance = self.dfa.distance(*state);
(self.condition)(distance)
}
fn can_match(&self, state: &Self::State) -> bool {
*state != SINK_STATE
}
fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
self.dfa.transition(*state, byte)
}
}
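To make the role of the `condition` closure concrete, here is a sketch that walks a Levenshtein DFA by hand, byte by byte, exactly as `WrappedDFA::accept` does, and then inspects the distance that `is_match` would feed to the closure. The word pair is arbitrary:

use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder, SINK_STATE};

fn main() {
    // Distance 2, transpositions count as one edit.
    let builder = LevenshteinAutomatonBuilder::new(2, true);
    let dfa = builder.build_dfa("japan");

    // Walk the DFA over a candidate term, as `WrappedDFA::accept` does.
    let mut state = dfa.initial_state();
    for &byte in b"japon" {
        state = dfa.transition(state, byte);
        if state == SINK_STATE {
            break; // no completion of this prefix can match
        }
    }

    // `WrappedDFA::is_match` would now hand this distance to its condition.
    match dfa.distance(state) {
        Distance::Exact(d) => println!("matched at distance {}", d),
        Distance::AtLeast(d) => println!("no match: distance is at least {}", d),
    }
}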
#[cfg(test)]
mod tests {
use super::{TermDictionary, TermDictionaryBuilder, TermStreamer};
use super::{TermDictionary, TermDictionaryBuilder, TermStreamer, WrappedDFA};
use crate::core::Index;
use crate::directory::{Directory, RAMDirectory, ReadOnlySource};
use crate::postings::TermInfo;
use crate::schema::{Document, FieldType, Schema, TEXT};
use levenshtein_automata::Distance;
use std::path::PathBuf;
use std::str;
@@ -423,9 +452,14 @@ mod tests {
// We can now build an entire dfa.
let lev_automaton_builder = LevenshteinAutomatonBuilder::new(2, true);
let automaton = lev_automaton_builder.build_dfa("Spaen");
let mut range = term_dict.search(automaton).into_stream();
let wrapped_dfa = WrappedDFA {
dfa: lev_automaton_builder.build_dfa("Spaen"),
condition: |distance| match distance {
Distance::Exact(_) => true,
Distance::AtLeast(_) => false,
},
};
let mut range = term_dict.search(wrapped_dfa).into_stream();
// get the first finding
assert!(range.advance());

View File

@@ -328,4 +328,5 @@ mod tests {
assert_eq!(term_info_store.get(i as u64), term_infos[i]);
}
}
}

View File

@@ -98,6 +98,10 @@ mod tests {
#[test]
fn test_lowercaser() {
assert_eq!(lowercase_helper("Tree"), vec!["tree".to_string()]);
assert_eq!(lowercase_helper("Русский"), vec!["русский".to_string()]);
assert_eq!(
lowercase_helper("Русский"),
vec!["русский".to_string()]
);
}
}

View File

@@ -281,4 +281,5 @@ pub mod tests {
assert!(tokens.is_empty());
}
}
}

View File

@@ -460,4 +460,5 @@ mod tests {
assert_eq!(it.next(), Some((8, 9)));
assert_eq!(it.next(), None);
}
}

View File

@@ -97,4 +97,5 @@ mod tests {
assert!(!token_chain.advance());
}
}

View File

@@ -1,7 +1,7 @@
use fail;
use std::io::Write;
use std::path::Path;
use tantivy::directory::{Directory, ManagedDirectory, RAMDirectory, TerminatingWrite};
use tantivy::directory::{Directory, ManagedDirectory, RAMDirectory};
use tantivy::doc;
use tantivy::schema::{Schema, TEXT};
use tantivy::{Index, Term};
@@ -17,7 +17,7 @@ fn test_failpoints_managed_directory_gc_if_delete_fails() {
managed_directory
.open_write(test_path)
.unwrap()
.terminate()
.flush()
.unwrap();
assert!(managed_directory.exists(test_path));
// triggering gc and setting the delete operation to fail.