Compare commits

..

1 Commits

Author SHA1 Message Date
Paul Masurel
61422d7cd5 Change in the default fast field tokenizer manager.
`{ fast: true }` now results in the use of a the default fast field tokenizer. (instead of no tokenizer)
The default tokenizer lowercases.

Fast field gets a different default tokenizer manager than the normal tokenizer.

The serialization of the fast field options is unchanged.
2023-07-18 19:20:10 +09:00
104 changed files with 1434 additions and 4294 deletions

View File

@@ -1,30 +1,3 @@
Tantivy 0.21
================================
#### Bugfixes
- Fix track fast field memory consumption, which led to higher memory consumption than the budget allowed during indexing [#2148](https://github.com/quickwit-oss/tantivy/issues/2148)[#2147](https://github.com/quickwit-oss/tantivy/issues/2147)(@PSeitz)
- Fix a regression from 0.20 where sort index by date wasn't working anymore [#2124](https://github.com/quickwit-oss/tantivy/issues/2124)(@PSeitz)
- Fix getting the root facet on the `FacetCollector`. [#2086](https://github.com/quickwit-oss/tantivy/issues/2086)(@adamreichold)
- Align numerical type priority order of columnar and query. [#2088](https://github.com/quickwit-oss/tantivy/issues/2088)(@fmassot)
#### Breaking Changes
- Remove support for Brotli and Snappy compression [#2123](https://github.com/quickwit-oss/tantivy/issues/2123)(@adamreichold)
#### Features/Improvements
- Implement lenient query parser [#2129](https://github.com/quickwit-oss/tantivy/pull/2129)(@trinity-1686a)
- order_by_u64_field and order_by_fast_field allow sorting in ascending and descending order [#2111](https://github.com/quickwit-oss/tantivy/issues/2111)(@naveenann)
- Allow dynamic filters in text analyzer builder [#2110](https://github.com/quickwit-oss/tantivy/issues/2110)(@fulmicoton @fmassot)
- **Aggregation**
- Add missing parameter for term aggregation [#2149](https://github.com/quickwit-oss/tantivy/issues/2149)[#2103](https://github.com/quickwit-oss/tantivy/issues/2103)(@PSeitz)
- Add missing parameter for percentiles [#2157](https://github.com/quickwit-oss/tantivy/issues/2157)(@PSeitz)
- Add missing parameter for stats,min,max,count,sum,avg [#2151](https://github.com/quickwit-oss/tantivy/issues/2151)(@PSeitz)
- Improve aggregation deserialization error message [#2150](https://github.com/quickwit-oss/tantivy/issues/2150)(@PSeitz)
- Add validation for type Bytes to term_agg [#2077](https://github.com/quickwit-oss/tantivy/issues/2077)(@PSeitz)
- Alternative mixed field collection [#2135](https://github.com/quickwit-oss/tantivy/issues/2135)(@PSeitz)
- Add missing query_terms impl for TermSetQuery. [#2120](https://github.com/quickwit-oss/tantivy/issues/2120)(@adamreichold)
- Minor improvements to OwnedBytes [#2134](https://github.com/quickwit-oss/tantivy/issues/2134)(@adamreichold)
- Remove allocations in split compound words [#2080](https://github.com/quickwit-oss/tantivy/issues/2080)(@PSeitz)
- Ngram tokenizer now returns an error with invalid arguments [#2102](https://github.com/quickwit-oss/tantivy/issues/2102)(@fmassot)
- Make TextAnalyzerBuilder public [#2097](https://github.com/quickwit-oss/tantivy/issues/2097)(@adamreichold)
- Return an error when tokenizer is not found while indexing [#2093](https://github.com/quickwit-oss/tantivy/issues/2093)(@naveenann)
- Delayed column opening during merge [#2132](https://github.com/quickwit-oss/tantivy/issues/2132)(@PSeitz)
Tantivy 0.20.2
================================

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.21.1"
version = "0.20.2"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -54,13 +54,13 @@ measure_time = "0.8.2"
async-trait = "0.1.53"
arc-swap = "1.5.0"
columnar = { version= "0.2", path="./columnar", package ="tantivy-columnar" }
sstable = { version= "0.2", path="./sstable", package ="tantivy-sstable", optional = true }
stacker = { version= "0.2", path="./stacker", package ="tantivy-stacker" }
query-grammar = { version= "0.21.0", path="./query-grammar", package = "tantivy-query-grammar" }
tantivy-bitpacker = { version= "0.5", path="./bitpacker" }
common = { version= "0.6", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version= "0.2", path="./tokenizer-api", package="tantivy-tokenizer-api" }
columnar = { version= "0.1", path="./columnar", package ="tantivy-columnar" }
sstable = { version= "0.1", path="./sstable", package ="tantivy-sstable", optional = true }
stacker = { version= "0.1", path="./stacker", package ="tantivy-stacker" }
query-grammar = { version= "0.20.0", path="./query-grammar", package = "tantivy-query-grammar" }
tantivy-bitpacker = { version= "0.4", path="./bitpacker" }
common = { version= "0.5", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version= "0.1", path="./tokenizer-api", package="tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
futures-util = { version = "0.3.28", optional = true }
@@ -73,17 +73,15 @@ maplit = "1.0.2"
matches = "0.1.9"
pretty_assertions = "1.2.1"
proptest = "1.0.0"
criterion = "0.5"
test-log = "0.2.10"
env_logger = "0.10.0"
pprof = { git = "https://github.com/PSeitz/pprof-rs/", rev = "53af24b", features = ["flamegraph", "criterion"] } # temp fork that works with criterion 0.5
futures = "0.3.21"
paste = "1.0.11"
more-asserts = "0.3.1"
rand_distr = "0.4.3"
[target.'cfg(not(windows))'.dev-dependencies]
criterion = "0.5"
pprof = { git = "https://github.com/PSeitz/pprof-rs/", rev = "53af24b", features = ["flamegraph", "criterion"] } # temp fork that works with criterion 0.5
[dev-dependencies.fail]
version = "0.5.0"
features = ["failpoints"]
@@ -128,7 +126,7 @@ members = ["query-grammar", "bitpacker", "common", "ownedbytes", "stacker", "sst
[[test]]
name = "failpoints"
path = "tests/failpoints/mod.rs"
required-features = ["failpoints"]
required-features = ["fail/failpoints"]
[[bench]]
name = "analyzer"

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-bitpacker"
version = "0.5.0"
version = "0.4.0"
edition = "2021"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"

View File

@@ -64,8 +64,10 @@ fn mem_usage<T>(items: &Vec<T>) -> usize {
impl BlockedBitpacker {
pub fn new() -> Self {
let mut compressed_blocks = vec![];
compressed_blocks.resize(8, 0);
Self {
compressed_blocks: vec![0; 8],
compressed_blocks,
buffer: vec![],
offset_and_bits: vec![],
}

View File

@@ -32,7 +32,6 @@ postprocessors = [
{ pattern = 'Michael Kleen', replace = "mkleen"}, # replace with github user
{ pattern = 'Adrien Guillo', replace = "guilload"}, # replace with github user
{ pattern = 'François Massot', replace = "fmassot"}, # replace with github user
{ pattern = 'Naveen Aiathurai', replace = "naveenann"}, # replace with github user
{ pattern = '', replace = ""}, # replace with github user
]

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-columnar"
version = "0.2.0"
version = "0.1.0"
edition = "2021"
license = "MIT"
homepage = "https://github.com/quickwit-oss/tantivy"
@@ -13,10 +13,10 @@ itertools = "0.11.0"
fnv = "1.0.7"
fastdivide = "0.4.0"
stacker = { version= "0.2", path = "../stacker", package="tantivy-stacker"}
sstable = { version= "0.2", path = "../sstable", package = "tantivy-sstable" }
common = { version= "0.6", path = "../common", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.5", path = "../bitpacker/" }
stacker = { version= "0.1", path = "../stacker", package="tantivy-stacker"}
sstable = { version= "0.1", path = "../sstable", package = "tantivy-sstable" }
common = { version= "0.5", path = "../common", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.4", path = "../bitpacker/" }
serde = "1.0.152"
[dev-dependencies]

View File

@@ -1,12 +1,9 @@
use std::cmp::Ordering;
use crate::{Column, DocId, RowId};
#[derive(Debug, Default, Clone)]
pub struct ColumnBlockAccessor<T> {
val_cache: Vec<T>,
docid_cache: Vec<DocId>,
missing_docids_cache: Vec<DocId>,
row_id_cache: Vec<RowId>,
}
@@ -23,20 +20,6 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
.values
.get_vals(&self.row_id_cache, &mut self.val_cache);
}
#[inline]
pub fn fetch_block_with_missing(&mut self, docs: &[u32], accessor: &Column<T>, missing: T) {
self.fetch_block(docs, accessor);
// We can compare docid_cache with docs to find missing docs
if docs.len() != self.docid_cache.len() || accessor.index.is_multivalue() {
self.missing_docids_cache.clear();
find_missing_docs(docs, &self.docid_cache, |doc| {
self.missing_docids_cache.push(doc);
self.val_cache.push(missing);
});
self.docid_cache
.extend_from_slice(&self.missing_docids_cache);
}
}
#[inline]
pub fn iter_vals(&self) -> impl Iterator<Item = T> + '_ {
@@ -51,82 +34,3 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
.zip(self.val_cache.iter().cloned())
}
}
/// Given two sorted lists of docids `docs` and `hits`, hits is a subset of `docs`.
/// Return all docs that are not in `hits`.
fn find_missing_docs<F>(docs: &[u32], hits: &[u32], mut callback: F)
where F: FnMut(u32) {
let mut docs_iter = docs.iter();
let mut hits_iter = hits.iter();
let mut doc = docs_iter.next();
let mut hit = hits_iter.next();
while let (Some(&current_doc), Some(&current_hit)) = (doc, hit) {
match current_doc.cmp(&current_hit) {
Ordering::Less => {
callback(current_doc);
doc = docs_iter.next();
}
Ordering::Equal => {
doc = docs_iter.next();
hit = hits_iter.next();
}
Ordering::Greater => {
hit = hits_iter.next();
}
}
}
while let Some(&current_doc) = doc {
callback(current_doc);
doc = docs_iter.next();
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_find_missing_docs() {
let docs: Vec<u32> = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
let hits: Vec<u32> = vec![2, 4, 6, 8, 10];
let mut missing_docs: Vec<u32> = Vec::new();
find_missing_docs(&docs, &hits, |missing_doc| {
missing_docs.push(missing_doc);
});
assert_eq!(missing_docs, vec![1, 3, 5, 7, 9]);
}
#[test]
fn test_find_missing_docs_empty() {
let docs: Vec<u32> = Vec::new();
let hits: Vec<u32> = vec![2, 4, 6, 8, 10];
let mut missing_docs: Vec<u32> = Vec::new();
find_missing_docs(&docs, &hits, |missing_doc| {
missing_docs.push(missing_doc);
});
assert_eq!(missing_docs, vec![]);
}
#[test]
fn test_find_missing_docs_all_missing() {
let docs: Vec<u32> = vec![1, 2, 3, 4, 5];
let hits: Vec<u32> = Vec::new();
let mut missing_docs: Vec<u32> = Vec::new();
find_missing_docs(&docs, &hits, |missing_doc| {
missing_docs.push(missing_doc);
});
assert_eq!(missing_docs, vec![1, 2, 3, 4, 5]);
}
}

View File

@@ -30,13 +30,6 @@ impl fmt::Debug for BytesColumn {
}
impl BytesColumn {
pub fn empty(num_docs: u32) -> BytesColumn {
BytesColumn {
dictionary: Arc::new(Dictionary::empty()),
term_ord_column: Column::build_empty_column(num_docs),
}
}
/// Fills the given `output` buffer with the term associated to the ordinal `ord`.
///
/// Returns `false` if the term does not exist (e.g. `term_ord` is greater or equal to the
@@ -84,7 +77,7 @@ impl From<StrColumn> for BytesColumn {
}
impl StrColumn {
pub fn wrap(bytes_column: BytesColumn) -> StrColumn {
pub(crate) fn wrap(bytes_column: BytesColumn) -> StrColumn {
StrColumn(bytes_column)
}

View File

@@ -130,7 +130,7 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
.select_batch_in_place(selected_docid_range.start, doc_ids);
}
/// Fills the output vector with the (possibly multiple values that are associated_with
/// Fils the output vector with the (possibly multiple values that are associated_with
/// `row_id`.
///
/// This method clears the `output` vector.

View File

@@ -37,10 +37,6 @@ impl From<MultiValueIndex> for ColumnIndex {
}
impl ColumnIndex {
#[inline]
pub fn is_multivalue(&self) -> bool {
matches!(self, ColumnIndex::Multivalued(_))
}
// Returns the cardinality of the column index.
//
// By convention, if the column contains no docs, we consider that it is

View File

@@ -38,6 +38,6 @@ impl Ord for BlankRange {
}
impl PartialOrd for BlankRange {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
Some(self.blank_size().cmp(&other.blank_size()))
}
}

View File

@@ -2,7 +2,7 @@ mod merge_dict_column;
mod merge_mapping;
mod term_merger;
use std::collections::{BTreeMap, HashSet};
use std::collections::{BTreeMap, HashMap, HashSet};
use std::io;
use std::net::Ipv6Addr;
use std::sync::Arc;
@@ -18,8 +18,7 @@ use crate::columnar::writer::CompatibleNumericalTypes;
use crate::columnar::ColumnarReader;
use crate::dynamic_column::DynamicColumn;
use crate::{
BytesColumn, Column, ColumnIndex, ColumnType, ColumnValues, DynamicColumnHandle, NumericalType,
NumericalValue,
BytesColumn, Column, ColumnIndex, ColumnType, ColumnValues, NumericalType, NumericalValue,
};
/// Column types are grouped into different categories.
@@ -29,16 +28,14 @@ use crate::{
/// In practise, today, only Numerical colummns are coerced into one type today.
///
/// See also [README.md].
///
/// The ordering has to match the ordering of the variants in [ColumnType].
#[derive(Copy, Clone, Eq, PartialOrd, Ord, PartialEq, Hash, Debug)]
#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)]
pub(crate) enum ColumnTypeCategory {
Numerical,
Bytes,
Str,
Bool,
IpAddr,
Str,
Numerical,
DateTime,
Bytes,
IpAddr,
}
impl From<ColumnType> for ColumnTypeCategory {
@@ -86,20 +83,9 @@ pub fn merge_columnar(
.iter()
.map(|reader| reader.num_rows())
.collect::<Vec<u32>>();
let columns_to_merge =
group_columns_for_merge(columnar_readers, required_columns, &merge_row_order)?;
for res in columns_to_merge {
let ((column_name, _column_type_category), grouped_columns) = res;
let grouped_columns = grouped_columns.open(&merge_row_order)?;
if grouped_columns.is_empty() {
continue;
}
let column_type = grouped_columns.column_type_after_merge();
let mut columns = grouped_columns.columns;
coerce_columns(column_type, &mut columns)?;
for ((column_name, column_type), columns) in columns_to_merge {
let mut column_serializer =
serializer.start_serialize_column(column_name.as_bytes(), column_type);
merge_column(
@@ -111,7 +97,6 @@ pub fn merge_columnar(
)?;
column_serializer.finalize()?;
}
serializer.finalize(merge_row_order.num_rows())?;
Ok(())
}
@@ -225,12 +210,40 @@ fn merge_column(
struct GroupedColumns {
required_column_type: Option<ColumnType>,
columns: Vec<Option<DynamicColumn>>,
column_category: ColumnTypeCategory,
}
impl GroupedColumns {
/// Check is column group can be skipped during serialization.
fn is_empty(&self) -> bool {
self.required_column_type.is_none() && self.columns.iter().all(Option::is_none)
fn for_category(column_category: ColumnTypeCategory, num_columnars: usize) -> Self {
GroupedColumns {
required_column_type: None,
columns: vec![None; num_columnars],
column_category,
}
}
/// Set the dynamic column for a given columnar.
fn set_column(&mut self, columnar_id: usize, column: DynamicColumn) {
self.columns[columnar_id] = Some(column);
}
/// Force the existence of a column, as well as its type.
fn require_type(&mut self, required_type: ColumnType) -> io::Result<()> {
if let Some(existing_required_type) = self.required_column_type {
if existing_required_type == required_type {
// This was just a duplicate in the `required_columns`.
// Nothing to do.
return Ok(());
} else {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"Required column conflicts with another required column of the same type \
category.",
));
}
}
self.required_column_type = Some(required_type);
Ok(())
}
/// Returns the column type after merge.
@@ -252,76 +265,11 @@ impl GroupedColumns {
}
// At the moment, only the numerical categorical column type has more than one possible
// column type.
assert!(self
.columns
.iter()
.flatten()
.all(|el| ColumnTypeCategory::from(el.column_type()) == ColumnTypeCategory::Numerical));
assert_eq!(self.column_category, ColumnTypeCategory::Numerical);
merged_numerical_columns_type(self.columns.iter().flatten()).into()
}
}
struct GroupedColumnsHandle {
required_column_type: Option<ColumnType>,
columns: Vec<Option<DynamicColumnHandle>>,
}
impl GroupedColumnsHandle {
fn new(num_columnars: usize) -> Self {
GroupedColumnsHandle {
required_column_type: None,
columns: vec![None; num_columnars],
}
}
fn open(self, merge_row_order: &MergeRowOrder) -> io::Result<GroupedColumns> {
let mut columns: Vec<Option<DynamicColumn>> = Vec::new();
for (columnar_id, column) in self.columns.iter().enumerate() {
if let Some(column) = column {
let column = column.open()?;
// We skip columns that end up with 0 documents.
// That way, we make sure they don't end up influencing the merge type or
// creating empty columns.
if is_empty_after_merge(merge_row_order, &column, columnar_id) {
columns.push(None);
} else {
columns.push(Some(column));
}
} else {
columns.push(None);
}
}
Ok(GroupedColumns {
required_column_type: self.required_column_type,
columns,
})
}
/// Set the dynamic column for a given columnar.
fn set_column(&mut self, columnar_id: usize, column: DynamicColumnHandle) {
self.columns[columnar_id] = Some(column);
}
/// Force the existence of a column, as well as its type.
fn require_type(&mut self, required_type: ColumnType) -> io::Result<()> {
if let Some(existing_required_type) = self.required_column_type {
if existing_required_type == required_type {
// This was just a duplicate in the `required_columns`.
// Nothing to do.
return Ok(());
} else {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"Required column conflicts with another required column of the same type \
category.",
));
}
}
self.required_column_type = Some(required_type);
Ok(())
}
}
/// Returns the type of the merged numerical column.
///
/// This function picks the first numerical type out of i64, u64, f64 (order matters
@@ -345,7 +293,7 @@ fn merged_numerical_columns_type<'a>(
fn is_empty_after_merge(
merge_row_order: &MergeRowOrder,
column: &DynamicColumn,
columnar_ord: usize,
columnar_id: usize,
) -> bool {
if column.num_values() == 0u32 {
// It was empty before the merge.
@@ -357,7 +305,7 @@ fn is_empty_after_merge(
false
}
MergeRowOrder::Shuffled(shuffled) => {
if let Some(alive_bitset) = &shuffled.alive_bitsets[columnar_ord] {
if let Some(alive_bitset) = &shuffled.alive_bitsets[columnar_id] {
let column_index = column.column_index();
match column_index {
ColumnIndex::Empty { .. } => true,
@@ -400,34 +348,56 @@ fn is_empty_after_merge(
}
}
/// Iterates over the columns of the columnar readers, grouped by column name.
/// Key functionality is that `open` of the Columns is done lazy per group.
fn group_columns_for_merge<'a>(
columnar_readers: &'a [&'a ColumnarReader],
required_columns: &'a [(String, ColumnType)],
_merge_row_order: &'a MergeRowOrder,
) -> io::Result<BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle>> {
let mut columns: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> = BTreeMap::new();
#[allow(clippy::type_complexity)]
fn group_columns_for_merge(
columnar_readers: &[&ColumnarReader],
required_columns: &[(String, ColumnType)],
merge_row_order: &MergeRowOrder,
) -> io::Result<BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>>> {
// Each column name may have multiple types of column associated.
// For merging we are interested in the same column type category since they can be merged.
let mut columns_grouped: HashMap<(String, ColumnTypeCategory), GroupedColumns> = HashMap::new();
for &(ref column_name, column_type) in required_columns {
columns
columns_grouped
.entry((column_name.clone(), column_type.into()))
.or_insert_with(|| GroupedColumnsHandle::new(columnar_readers.len()))
.or_insert_with(|| {
GroupedColumns::for_category(column_type.into(), columnar_readers.len())
})
.require_type(column_type)?;
}
for (columnar_id, columnar_reader) in columnar_readers.iter().enumerate() {
let column_name_and_handle = columnar_reader.iter_columns()?;
let column_name_and_handle = columnar_reader.list_columns()?;
// We skip columns that end up with 0 documents.
// That way, we make sure they don't end up influencing the merge type or
// creating empty columns.
for (column_name, handle) in column_name_and_handle {
let column_category: ColumnTypeCategory = handle.column_type().into();
columns
let column = handle.open()?;
if is_empty_after_merge(merge_row_order, &column, columnar_id) {
continue;
}
columns_grouped
.entry((column_name, column_category))
.or_insert_with(|| GroupedColumnsHandle::new(columnar_readers.len()))
.set_column(columnar_id, handle);
.or_insert_with(|| {
GroupedColumns::for_category(column_category, columnar_readers.len())
})
.set_column(columnar_id, column);
}
}
Ok(columns)
let mut merge_columns: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
Default::default();
for ((column_name, _), mut grouped_columns) in columns_grouped {
let column_type = grouped_columns.column_type_after_merge();
coerce_columns(column_type, &mut grouped_columns.columns)?;
merge_columns.insert((column_name, column_type), grouped_columns.columns);
}
Ok(merge_columns)
}
fn coerce_columns(

View File

@@ -1,5 +1,3 @@
use std::collections::BTreeMap;
use itertools::Itertools;
use super::*;
@@ -29,10 +27,22 @@ fn test_column_coercion_to_u64() {
let columnar2 = make_columnar("numbers", &[u64::MAX]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(columnars, &[], &merge_order).unwrap();
assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
assert!(column_map.contains_key(&("numbers".to_string(), ColumnType::U64)));
}
#[test]
fn test_column_no_coercion_if_all_the_same() {
let columnar1 = make_columnar("numbers", &[1u64]);
let columnar2 = make_columnar("numbers", &[2u64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(columnars, &[], &merge_order).unwrap();
assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnType::U64)));
}
#[test]
@@ -41,24 +51,24 @@ fn test_column_coercion_to_i64() {
let columnar2 = make_columnar("numbers", &[2u64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(columnars, &[], &merge_order).unwrap();
assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
assert!(column_map.contains_key(&("numbers".to_string(), ColumnType::I64)));
}
//#[test]
// fn test_impossible_coercion_returns_an_error() {
// let columnar1 = make_columnar("numbers", &[u64::MAX]);
// let merge_order = StackMergeOrder::stack(&[&columnar1]).into();
// let group_error = group_columns_for_merge_iter(
//&[&columnar1],
//&[("numbers".to_string(), ColumnType::I64)],
//&merge_order,
//)
//.unwrap_err();
// assert_eq!(group_error.kind(), io::ErrorKind::InvalidInput);
//}
#[test]
fn test_impossible_coercion_returns_an_error() {
let columnar1 = make_columnar("numbers", &[u64::MAX]);
let merge_order = StackMergeOrder::stack(&[&columnar1]).into();
let group_error = group_columns_for_merge(
&[&columnar1],
&[("numbers".to_string(), ColumnType::I64)],
&merge_order,
)
.unwrap_err();
assert_eq!(group_error.kind(), io::ErrorKind::InvalidInput);
}
#[test]
fn test_group_columns_with_required_column() {
@@ -66,7 +76,7 @@ fn test_group_columns_with_required_column() {
let columnar2 = make_columnar("numbers", &[2u64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(
&[&columnar1, &columnar2],
&[("numbers".to_string(), ColumnType::U64)],
@@ -74,7 +84,7 @@ fn test_group_columns_with_required_column() {
)
.unwrap();
assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
assert!(column_map.contains_key(&("numbers".to_string(), ColumnType::U64)));
}
#[test]
@@ -83,17 +93,17 @@ fn test_group_columns_required_column_with_no_existing_columns() {
let columnar2 = make_columnar("numbers", &[2u64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<_, _> = group_columns_for_merge(
columnars,
&[("required_col".to_string(), ColumnType::Str)],
&merge_order,
)
.unwrap();
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(
columnars,
&[("required_col".to_string(), ColumnType::Str)],
&merge_order,
)
.unwrap();
assert_eq!(column_map.len(), 2);
let columns = &column_map
.get(&("required_col".to_string(), ColumnTypeCategory::Str))
.unwrap()
.columns;
let columns = column_map
.get(&("required_col".to_string(), ColumnType::Str))
.unwrap();
assert_eq!(columns.len(), 2);
assert!(columns[0].is_none());
assert!(columns[1].is_none());
@@ -105,7 +115,7 @@ fn test_group_columns_required_column_is_above_all_columns_have_the_same_type_ru
let columnar2 = make_columnar("numbers", &[2i64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(
columnars,
&[("numbers".to_string(), ColumnType::U64)],
@@ -113,7 +123,7 @@ fn test_group_columns_required_column_is_above_all_columns_have_the_same_type_ru
)
.unwrap();
assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
assert!(column_map.contains_key(&("numbers".to_string(), ColumnType::U64)));
}
#[test]
@@ -122,23 +132,21 @@ fn test_missing_column() {
let columnar2 = make_columnar("numbers2", &[2u64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(columnars, &[], &merge_order).unwrap();
assert_eq!(column_map.len(), 2);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
assert!(column_map.contains_key(&("numbers".to_string(), ColumnType::I64)));
{
let columns = &column_map
.get(&("numbers".to_string(), ColumnTypeCategory::Numerical))
.unwrap()
.columns;
let columns = column_map
.get(&("numbers".to_string(), ColumnType::I64))
.unwrap();
assert!(columns[0].is_some());
assert!(columns[1].is_none());
}
{
let columns = &column_map
.get(&("numbers2".to_string(), ColumnTypeCategory::Numerical))
.unwrap()
.columns;
let columns = column_map
.get(&("numbers2".to_string(), ColumnType::U64))
.unwrap();
assert!(columns[0].is_none());
assert!(columns[1].is_some());
}

View File

@@ -102,41 +102,30 @@ impl ColumnarReader {
pub fn num_rows(&self) -> RowId {
self.num_rows
}
// Iterate over the columns in a sorted way
pub fn iter_columns(
&self,
) -> io::Result<impl Iterator<Item = (String, DynamicColumnHandle)> + '_> {
let mut stream = self.column_dictionary.stream()?;
Ok(std::iter::from_fn(move || {
if stream.advance() {
let key_bytes: &[u8] = stream.key();
let column_code: u8 = key_bytes.last().cloned().unwrap();
// TODO Error Handling. The API gets quite ugly when returning the error here, so
// instead we could just check the first N columns upfront.
let column_type: ColumnType = ColumnType::try_from_code(column_code)
.map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))
.unwrap();
let range = stream.value().clone();
let column_name =
// The last two bytes are respectively the 0u8 separator and the column_type.
String::from_utf8_lossy(&key_bytes[..key_bytes.len() - 2]).to_string();
let file_slice = self
.column_data
.slice(range.start as usize..range.end as usize);
let column_handle = DynamicColumnHandle {
file_slice,
column_type,
};
Some((column_name, column_handle))
} else {
None
}
}))
}
// TODO Add unit tests
pub fn list_columns(&self) -> io::Result<Vec<(String, DynamicColumnHandle)>> {
Ok(self.iter_columns()?.collect())
let mut stream = self.column_dictionary.stream()?;
let mut results = Vec::new();
while stream.advance() {
let key_bytes: &[u8] = stream.key();
let column_code: u8 = key_bytes.last().cloned().unwrap();
let column_type: ColumnType = ColumnType::try_from_code(column_code)
.map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))?;
let range = stream.value().clone();
let column_name =
// The last two bytes are respectively the 0u8 separator and the column_type.
String::from_utf8_lossy(&key_bytes[..key_bytes.len() - 2]).to_string();
let file_slice = self
.column_data
.slice(range.start as usize..range.end as usize);
let column_handle = DynamicColumnHandle {
file_slice,
column_type,
};
results.push((column_name, column_handle));
}
Ok(results)
}
fn stream_for_column_range(&self, column_name: &str) -> sstable::StreamerBuilder<RangeSSTable> {

View File

@@ -79,6 +79,7 @@ fn mutate_or_create_column<V, TMutator>(
impl ColumnarWriter {
pub fn mem_usage(&self) -> usize {
// TODO add dictionary builders.
self.arena.mem_usage()
+ self.numerical_field_hash_map.mem_usage()
+ self.bool_field_hash_map.mem_usage()
@@ -86,11 +87,6 @@ impl ColumnarWriter {
+ self.str_field_hash_map.mem_usage()
+ self.ip_addr_field_hash_map.mem_usage()
+ self.datetime_field_hash_map.mem_usage()
+ self
.dictionaries
.iter()
.map(|dict| dict.mem_usage())
.sum::<usize>()
}
/// Returns the list of doc ids from 0..num_docs sorted by the `sort_field`
@@ -105,10 +101,6 @@ impl ColumnarWriter {
let Some(numerical_col_writer) = self
.numerical_field_hash_map
.get::<NumericalColumnWriter>(sort_field.as_bytes())
.or_else(|| {
self.datetime_field_hash_map
.get::<NumericalColumnWriter>(sort_field.as_bytes())
})
else {
return Vec::new();
};

View File

@@ -32,7 +32,6 @@ pub struct OrderedId(pub u32);
#[derive(Default)]
pub(crate) struct DictionaryBuilder {
dict: FnvHashMap<Vec<u8>, UnorderedId>,
memory_consumption: usize,
}
impl DictionaryBuilder {
@@ -44,8 +43,6 @@ impl DictionaryBuilder {
}
let new_id = UnorderedId(self.dict.len() as u32);
self.dict.insert(term.to_vec(), new_id);
self.memory_consumption += term.len();
self.memory_consumption += 40; // Term Metadata + HashMap overhead
new_id
}
@@ -66,10 +63,6 @@ impl DictionaryBuilder {
sstable_builder.finish()?;
Ok(TermIdMapping { unordered_to_ord })
}
pub(crate) fn mem_usage(&self) -> usize {
self.memory_consumption
}
}
#[cfg(test)]

View File

@@ -228,7 +228,7 @@ static_dynamic_conversions!(StrColumn, Str);
static_dynamic_conversions!(BytesColumn, Bytes);
static_dynamic_conversions!(Column<Ipv6Addr>, IpAddr);
#[derive(Clone, Debug)]
#[derive(Clone)]
pub struct DynamicColumnHandle {
pub(crate) file_slice: FileSlice,
pub(crate) column_type: ColumnType,
@@ -247,7 +247,7 @@ impl DynamicColumnHandle {
}
/// Returns the `u64` fast field reader reader associated with `fields` of types
/// Str, u64, i64, f64, bool, or datetime.
/// Str, u64, i64, f64, or datetime.
///
/// If not, the fastfield reader will returns the u64-value associated with the original
/// FastValue.
@@ -258,12 +258,9 @@ impl DynamicColumnHandle {
let column: BytesColumn = crate::column::open_column_bytes(column_bytes)?;
Ok(Some(column.term_ord_column))
}
ColumnType::Bool => Ok(None),
ColumnType::IpAddr => Ok(None),
ColumnType::Bool
| ColumnType::I64
| ColumnType::U64
| ColumnType::F64
| ColumnType::DateTime => {
ColumnType::I64 | ColumnType::U64 | ColumnType::F64 | ColumnType::DateTime => {
let column = crate::column::open_column_u64::<u64>(column_bytes)?;
Ok(Some(column))
}

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-common"
version = "0.6.0"
version = "0.5.0"
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2021"
@@ -14,7 +14,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
[dependencies]
byteorder = "1.4.3"
ownedbytes = { version= "0.6", path="../ownedbytes" }
ownedbytes = { version= "0.5", path="../ownedbytes" }
async-trait = "0.1"
time = { version = "0.3.10", features = ["serde-well-known"] }
serde = { version = "1.0.136", features = ["derive"] }

View File

@@ -1,4 +1,3 @@
use std::fs::File;
use std::ops::{Deref, Range, RangeBounds};
use std::sync::Arc;
use std::{fmt, io};
@@ -33,62 +32,6 @@ pub trait FileHandle: 'static + Send + Sync + HasLen + fmt::Debug {
}
}
#[derive(Debug)]
/// A File with it's length included.
pub struct WrapFile {
file: File,
len: usize,
}
impl WrapFile {
/// Creates a new WrapFile and stores its length.
pub fn new(file: File) -> io::Result<Self> {
let len = file.metadata()?.len() as usize;
Ok(WrapFile { file, len })
}
}
#[async_trait]
impl FileHandle for WrapFile {
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
let file_len = self.len();
// Calculate the actual range to read, ensuring it stays within file boundaries
let start = range.start;
let end = range.end.min(file_len);
// Ensure the start is before the end of the range
if start >= end {
return Err(io::Error::new(io::ErrorKind::InvalidInput, "Invalid range"));
}
let mut buffer = vec![0; end - start];
#[cfg(unix)]
{
use std::os::unix::prelude::FileExt;
self.file.read_exact_at(&mut buffer, start as u64)?;
}
#[cfg(not(unix))]
{
use std::io::{Read, Seek};
let mut file = self.file.try_clone()?; // Clone the file to read from it separately
// Seek to the start position in the file
file.seek(io::SeekFrom::Start(start as u64))?;
// Read the data into the buffer
file.read_exact(&mut buffer)?;
}
Ok(OwnedBytes::new(buffer))
}
// todo implement async
}
impl HasLen for WrapFile {
fn len(&self) -> usize {
self.len
}
}
#[async_trait]
impl FileHandle for &'static [u8] {
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
@@ -124,30 +67,6 @@ impl fmt::Debug for FileSlice {
}
}
impl FileSlice {
pub fn stream_file_chunks(&self) -> impl Iterator<Item = io::Result<OwnedBytes>> + '_ {
let len = self.range.end;
let mut start = self.range.start;
std::iter::from_fn(move || {
/// Returns chunks of 1MB of data from the FileHandle.
const CHUNK_SIZE: usize = 1024 * 1024; // 1MB
if start < len {
let end = (start + CHUNK_SIZE).min(len);
let range = start..end;
let chunk = self.data.read_bytes(range);
start += CHUNK_SIZE;
match chunk {
Ok(chunk) => Some(Ok(chunk)),
Err(e) => Some(Err(e)),
}
} else {
None
}
})
}
}
/// Takes a range, a `RangeBounds` object, and returns
/// a `Range` that corresponds to the relative application of the
/// `RangeBounds` object to the original `Range`.

View File

@@ -27,15 +27,15 @@ pub trait GroupByIteratorExtended: Iterator {
where
Self: Sized,
F: FnMut(&Self::Item) -> K,
K: PartialEq + Clone,
Self::Item: Clone,
K: PartialEq + Copy,
Self::Item: Copy,
{
GroupByIterator::new(self, key)
}
}
impl<I: Iterator> GroupByIteratorExtended for I {}
pub struct GroupByIterator<I, F, K: Clone>
pub struct GroupByIterator<I, F, K: Copy>
where
I: Iterator,
F: FnMut(&I::Item) -> K,
@@ -50,7 +50,7 @@ where
inner: Rc<RefCell<GroupByShared<I, F, K>>>,
}
struct GroupByShared<I, F, K: Clone>
struct GroupByShared<I, F, K: Copy>
where
I: Iterator,
F: FnMut(&I::Item) -> K,
@@ -63,7 +63,7 @@ impl<I, F, K> GroupByIterator<I, F, K>
where
I: Iterator,
F: FnMut(&I::Item) -> K,
K: Clone,
K: Copy,
{
fn new(inner: I, group_by_fn: F) -> Self {
let inner = GroupByShared {
@@ -80,28 +80,28 @@ where
impl<I, F, K> Iterator for GroupByIterator<I, F, K>
where
I: Iterator,
I::Item: Clone,
I::Item: Copy,
F: FnMut(&I::Item) -> K,
K: Clone,
K: Copy,
{
type Item = (K, GroupIterator<I, F, K>);
fn next(&mut self) -> Option<Self::Item> {
let mut inner = self.inner.borrow_mut();
let value = inner.iter.peek()?.clone();
let value = *inner.iter.peek()?;
let key = (inner.group_by_fn)(&value);
let inner = self.inner.clone();
let group_iter = GroupIterator {
inner,
group_key: key.clone(),
group_key: key,
};
Some((key, group_iter))
}
}
pub struct GroupIterator<I, F, K: Clone>
pub struct GroupIterator<I, F, K: Copy>
where
I: Iterator,
F: FnMut(&I::Item) -> K,
@@ -110,10 +110,10 @@ where
group_key: K,
}
impl<I, F, K: PartialEq + Clone> Iterator for GroupIterator<I, F, K>
impl<I, F, K: PartialEq + Copy> Iterator for GroupIterator<I, F, K>
where
I: Iterator,
I::Item: Clone,
I::Item: Copy,
F: FnMut(&I::Item) -> K,
{
type Item = I::Item;
@@ -121,7 +121,7 @@ where
fn next(&mut self) -> Option<Self::Item> {
let mut inner = self.inner.borrow_mut();
// peek if next value is in group
let peek_val = inner.iter.peek()?.clone();
let peek_val = *inner.iter.peek()?;
if (inner.group_by_fn)(&peek_val) == self.group_key {
inner.iter.next()
} else {

View File

@@ -37,7 +37,7 @@ fn main() -> tantivy::Result<()> {
.set_index_option(IndexRecordOption::WithFreqs)
.set_tokenizer("raw"),
)
.set_fast(None)
.set_fast("default")
.set_stored();
schema_builder.add_text_field("category", text_fieldtype);
schema_builder.add_f64_field("stock", FAST);

View File

@@ -221,19 +221,5 @@ fn main() -> tantivy::Result<()> {
println!("{}", schema.to_json(&retrieved_doc));
}
// We can also get an explanation to understand
// how a found document got its score.
let query = query_parser.parse_query("title:sea^20 body:whale^70")?;
let (_score, doc_address) = searcher
.search(&query, &TopDocs::with_limit(1))?
.into_iter()
.next()
.unwrap();
let explanation = query.explain(&searcher, doc_address)?;
println!("{}", explanation.to_pretty_json());
Ok(())
}

View File

@@ -143,7 +143,7 @@ fn main() -> tantivy::Result<()> {
const SNEAKERS: ProductId = 23222;
let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
let mut writer = index.writer_with_num_threads(1, 10_000_000)?;
writer.add_document(doc!(product_id=>OLIVE_OIL, text=>"cooking olive oil from greece"))?;
writer.add_document(doc!(product_id=>GLOVES, text=>"kitchen gloves, perfect for cooking"))?;
writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?;

View File

@@ -1,7 +1,7 @@
[package]
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
name = "ownedbytes"
version = "0.6.0"
version = "0.5.0"
edition = "2021"
description = "Expose data as static slice"
license = "MIT"

View File

@@ -1,7 +1,7 @@
use std::convert::TryInto;
use std::ops::{Deref, Range};
use std::sync::Arc;
use std::{fmt, io};
use std::{fmt, io, mem};
pub use stable_deref_trait::StableDeref;
@@ -26,8 +26,8 @@ impl OwnedBytes {
data_holder: T,
) -> OwnedBytes {
let box_stable_deref = Arc::new(data_holder);
let bytes: &[u8] = box_stable_deref.deref();
let data = unsafe { &*(bytes as *const [u8]) };
let bytes: &[u8] = box_stable_deref.as_ref();
let data = unsafe { mem::transmute::<_, &'static [u8]>(bytes.deref()) };
OwnedBytes {
data,
box_stable_deref,
@@ -57,12 +57,6 @@ impl OwnedBytes {
self.data.len()
}
/// Returns true iff this `OwnedBytes` is empty.
#[inline]
pub fn is_empty(&self) -> bool {
self.data.is_empty()
}
/// Splits the OwnedBytes into two OwnedBytes `(left, right)`.
///
/// Left will hold `split_len` bytes.
@@ -74,14 +68,13 @@ impl OwnedBytes {
#[inline]
#[must_use]
pub fn split(self, split_len: usize) -> (OwnedBytes, OwnedBytes) {
let (left_data, right_data) = self.data.split_at(split_len);
let right_box_stable_deref = self.box_stable_deref.clone();
let left = OwnedBytes {
data: left_data,
data: &self.data[..split_len],
box_stable_deref: self.box_stable_deref,
};
let right = OwnedBytes {
data: right_data,
data: &self.data[split_len..],
box_stable_deref: right_box_stable_deref,
};
(left, right)
@@ -106,45 +99,55 @@ impl OwnedBytes {
///
/// `self` is truncated to `split_len`, left with the remaining bytes.
pub fn split_off(&mut self, split_len: usize) -> OwnedBytes {
let (left, right) = self.data.split_at(split_len);
let right_box_stable_deref = self.box_stable_deref.clone();
let right_piece = OwnedBytes {
data: right,
data: &self.data[split_len..],
box_stable_deref: right_box_stable_deref,
};
self.data = left;
self.data = &self.data[..split_len];
right_piece
}
/// Returns true iff this `OwnedBytes` is empty.
#[inline]
pub fn is_empty(&self) -> bool {
self.as_slice().is_empty()
}
/// Drops the left most `advance_len` bytes.
#[inline]
pub fn advance(&mut self, advance_len: usize) -> &[u8] {
let (data, rest) = self.data.split_at(advance_len);
self.data = rest;
data
pub fn advance(&mut self, advance_len: usize) {
self.data = &self.data[advance_len..]
}
/// Reads an `u8` from the `OwnedBytes` and advance by one byte.
#[inline]
pub fn read_u8(&mut self) -> u8 {
self.advance(1)[0]
}
assert!(!self.is_empty());
#[inline]
fn read_n<const N: usize>(&mut self) -> [u8; N] {
self.advance(N).try_into().unwrap()
}
/// Reads an `u32` encoded as little-endian from the `OwnedBytes` and advance by 4 bytes.
#[inline]
pub fn read_u32(&mut self) -> u32 {
u32::from_le_bytes(self.read_n())
let byte = self.as_slice()[0];
self.advance(1);
byte
}
/// Reads an `u64` encoded as little-endian from the `OwnedBytes` and advance by 8 bytes.
#[inline]
pub fn read_u64(&mut self) -> u64 {
u64::from_le_bytes(self.read_n())
assert!(self.len() > 7);
let octlet: [u8; 8] = self.as_slice()[..8].try_into().unwrap();
self.advance(8);
u64::from_le_bytes(octlet)
}
/// Reads an `u32` encoded as little-endian from the `OwnedBytes` and advance by 4 bytes.
#[inline]
pub fn read_u32(&mut self) -> u32 {
assert!(self.len() > 3);
let quad: [u8; 4] = self.as_slice()[..4].try_into().unwrap();
self.advance(4);
u32::from_le_bytes(quad)
}
}
@@ -198,33 +201,32 @@ impl Deref for OwnedBytes {
}
}
impl AsRef<[u8]> for OwnedBytes {
#[inline]
fn as_ref(&self) -> &[u8] {
self.as_slice()
}
}
impl io::Read for OwnedBytes {
#[inline]
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
let data_len = self.data.len();
let buf_len = buf.len();
if data_len >= buf_len {
let data = self.advance(buf_len);
buf.copy_from_slice(data);
Ok(buf_len)
} else {
buf[..data_len].copy_from_slice(self.data);
self.data = &[];
Ok(data_len)
}
let read_len = {
let data = self.as_slice();
if data.len() >= buf.len() {
let buf_len = buf.len();
buf.copy_from_slice(&data[..buf_len]);
buf.len()
} else {
let data_len = data.len();
buf[..data_len].copy_from_slice(data);
data_len
}
};
self.advance(read_len);
Ok(read_len)
}
#[inline]
fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
buf.extend(self.data);
let read_len = self.data.len();
self.data = &[];
let read_len = {
let data = self.as_slice();
buf.extend(data);
data.len()
};
self.advance(read_len);
Ok(read_len)
}
#[inline]
@@ -240,6 +242,13 @@ impl io::Read for OwnedBytes {
}
}
impl AsRef<[u8]> for OwnedBytes {
#[inline]
fn as_ref(&self) -> &[u8] {
self.as_slice()
}
}
#[cfg(test)]
mod tests {
use std::io::{self, Read};

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-query-grammar"
version = "0.21.0"
version = "0.20.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -12,4 +12,6 @@ keywords = ["search", "information", "retrieval"]
edition = "2021"
[dependencies]
nom = "7"
combine = {version="4", default-features=false, features=[] }
once_cell = "1.7.2"
regex ={ version = "1.5.4", default-features = false, features = ["std", "unicode"] }

View File

@@ -1,353 +0,0 @@
//! nom combinators for infallible operations
use std::convert::Infallible;
use nom::{AsChar, IResult, InputLength, InputTakeAtPosition};
pub(crate) type ErrorList = Vec<LenientErrorInternal>;
pub(crate) type JResult<I, O> = IResult<I, (O, ErrorList), Infallible>;
/// An error, with an end-of-string based offset
#[derive(Debug)]
pub(crate) struct LenientErrorInternal {
pub pos: usize,
pub message: String,
}
/// A recoverable error and the position it happened at
#[derive(Debug, PartialEq)]
pub struct LenientError {
pub pos: usize,
pub message: String,
}
impl LenientError {
pub(crate) fn from_internal(internal: LenientErrorInternal, str_len: usize) -> LenientError {
LenientError {
pos: str_len - internal.pos,
message: internal.message,
}
}
}
fn unwrap_infallible<T>(res: Result<T, nom::Err<Infallible>>) -> T {
match res {
Ok(val) => val,
Err(_) => unreachable!(),
}
}
// when rfcs#1733 get stabilized, this can make things clearer
// trait InfallibleParser<I, O> = nom::Parser<I, (O, ErrorList), std::convert::Infallible>;
/// A variant of the classical `opt` parser, except it returns an infallible error type.
///
/// It's less generic than the original to ease type resolution in the rest of the code.
pub(crate) fn opt_i<I: Clone, O, F>(mut f: F) -> impl FnMut(I) -> JResult<I, Option<O>>
where F: nom::Parser<I, O, nom::error::Error<I>> {
move |input: I| {
let i = input.clone();
match f.parse(input) {
Ok((i, o)) => Ok((i, (Some(o), Vec::new()))),
Err(_) => Ok((i, (None, Vec::new()))),
}
}
}
pub(crate) fn opt_i_err<'a, I: Clone + InputLength, O, F>(
mut f: F,
message: impl ToString + 'a,
) -> impl FnMut(I) -> JResult<I, Option<O>> + 'a
where
F: nom::Parser<I, O, nom::error::Error<I>> + 'a,
{
move |input: I| {
let i = input.clone();
match f.parse(input) {
Ok((i, o)) => Ok((i, (Some(o), Vec::new()))),
Err(_) => {
let errs = vec![LenientErrorInternal {
pos: i.input_len(),
message: message.to_string(),
}];
Ok((i, (None, errs)))
}
}
}
}
pub(crate) fn space0_infallible<T>(input: T) -> JResult<T, T>
where
T: InputTakeAtPosition + Clone,
<T as InputTakeAtPosition>::Item: AsChar + Clone,
{
opt_i(nom::character::complete::space0)(input)
.map(|(left, (spaces, errors))| (left, (spaces.expect("space0 can't fail"), errors)))
}
pub(crate) fn space1_infallible<T>(input: T) -> JResult<T, Option<T>>
where
T: InputTakeAtPosition + Clone + InputLength,
<T as InputTakeAtPosition>::Item: AsChar + Clone,
{
opt_i(nom::character::complete::space1)(input).map(|(left, (spaces, mut errors))| {
if spaces.is_none() {
errors.push(LenientErrorInternal {
pos: left.input_len(),
message: "missing space".to_string(),
})
}
(left, (spaces, errors))
})
}
pub(crate) fn fallible<I, O, E: nom::error::ParseError<I>, F>(
mut f: F,
) -> impl FnMut(I) -> IResult<I, O, E>
where F: nom::Parser<I, (O, ErrorList), Infallible> {
use nom::Err;
move |input: I| match f.parse(input) {
Ok((input, (output, _err))) => Ok((input, output)),
Err(Err::Incomplete(needed)) => Err(Err::Incomplete(needed)),
Err(Err::Error(val)) | Err(Err::Failure(val)) => match val {},
}
}
pub(crate) fn delimited_infallible<I, O1, O2, O3, F, G, H>(
mut first: F,
mut second: G,
mut third: H,
) -> impl FnMut(I) -> JResult<I, O2>
where
F: nom::Parser<I, (O1, ErrorList), Infallible>,
G: nom::Parser<I, (O2, ErrorList), Infallible>,
H: nom::Parser<I, (O3, ErrorList), Infallible>,
{
move |input: I| {
let (input, (_, mut err)) = first.parse(input)?;
let (input, (o2, mut err2)) = second.parse(input)?;
err.append(&mut err2);
let (input, (_, mut err3)) = third.parse(input)?;
err.append(&mut err3);
Ok((input, (o2, err)))
}
}
// Parse nothing. Just a lazy way to not implement terminated/preceded and use delimited instead
pub(crate) fn nothing(i: &str) -> JResult<&str, ()> {
Ok((i, ((), Vec::new())))
}
pub(crate) trait TupleInfallible<I, O> {
/// Parses the input and returns a tuple of results of each parser.
fn parse(&mut self, input: I) -> JResult<I, O>;
}
impl<Input, Output, F: nom::Parser<Input, (Output, ErrorList), Infallible>>
TupleInfallible<Input, (Output,)> for (F,)
{
fn parse(&mut self, input: Input) -> JResult<Input, (Output,)> {
self.0.parse(input).map(|(i, (o, e))| (i, ((o,), e)))
}
}
// these macros are heavily copied from nom, with some minor adaptations for our type
macro_rules! tuple_trait(
($name1:ident $ty1:ident, $name2: ident $ty2:ident, $($name:ident $ty:ident),*) => (
tuple_trait!(__impl $name1 $ty1, $name2 $ty2; $($name $ty),*);
);
(__impl $($name:ident $ty: ident),+; $name1:ident $ty1:ident, $($name2:ident $ty2:ident),*) => (
tuple_trait_impl!($($name $ty),+);
tuple_trait!(__impl $($name $ty),+ , $name1 $ty1; $($name2 $ty2),*);
);
(__impl $($name:ident $ty: ident),+; $name1:ident $ty1:ident) => (
tuple_trait_impl!($($name $ty),+);
tuple_trait_impl!($($name $ty),+, $name1 $ty1);
);
);
macro_rules! tuple_trait_impl(
($($name:ident $ty: ident),+) => (
impl<
Input: Clone, $($ty),+ ,
$($name: nom::Parser<Input, ($ty, ErrorList), Infallible>),+
> TupleInfallible<Input, ( $($ty),+ )> for ( $($name),+ ) {
fn parse(&mut self, input: Input) -> JResult<Input, ( $($ty),+ )> {
let mut error_list = Vec::new();
tuple_trait_inner!(0, self, input, (), error_list, $($name)+)
}
}
);
);
macro_rules! tuple_trait_inner(
($it:tt, $self:expr, $input:expr, (), $error_list:expr, $head:ident $($id:ident)+) => ({
let (i, (o, mut err)) = $self.$it.parse($input.clone())?;
$error_list.append(&mut err);
succ!($it, tuple_trait_inner!($self, i, ( o ), $error_list, $($id)+))
});
($it:tt, $self:expr, $input:expr, ($($parsed:tt)*), $error_list:expr, $head:ident $($id:ident)+) => ({
let (i, (o, mut err)) = $self.$it.parse($input.clone())?;
$error_list.append(&mut err);
succ!($it, tuple_trait_inner!($self, i, ($($parsed)* , o), $error_list, $($id)+))
});
($it:tt, $self:expr, $input:expr, ($($parsed:tt)*), $error_list:expr, $head:ident) => ({
let (i, (o, mut err)) = $self.$it.parse($input.clone())?;
$error_list.append(&mut err);
Ok((i, (($($parsed)* , o), $error_list)))
});
);
macro_rules! succ (
(0, $submac:ident ! ($($rest:tt)*)) => ($submac!(1, $($rest)*));
(1, $submac:ident ! ($($rest:tt)*)) => ($submac!(2, $($rest)*));
(2, $submac:ident ! ($($rest:tt)*)) => ($submac!(3, $($rest)*));
(3, $submac:ident ! ($($rest:tt)*)) => ($submac!(4, $($rest)*));
(4, $submac:ident ! ($($rest:tt)*)) => ($submac!(5, $($rest)*));
(5, $submac:ident ! ($($rest:tt)*)) => ($submac!(6, $($rest)*));
(6, $submac:ident ! ($($rest:tt)*)) => ($submac!(7, $($rest)*));
(7, $submac:ident ! ($($rest:tt)*)) => ($submac!(8, $($rest)*));
(8, $submac:ident ! ($($rest:tt)*)) => ($submac!(9, $($rest)*));
(9, $submac:ident ! ($($rest:tt)*)) => ($submac!(10, $($rest)*));
(10, $submac:ident ! ($($rest:tt)*)) => ($submac!(11, $($rest)*));
(11, $submac:ident ! ($($rest:tt)*)) => ($submac!(12, $($rest)*));
(12, $submac:ident ! ($($rest:tt)*)) => ($submac!(13, $($rest)*));
(13, $submac:ident ! ($($rest:tt)*)) => ($submac!(14, $($rest)*));
(14, $submac:ident ! ($($rest:tt)*)) => ($submac!(15, $($rest)*));
(15, $submac:ident ! ($($rest:tt)*)) => ($submac!(16, $($rest)*));
(16, $submac:ident ! ($($rest:tt)*)) => ($submac!(17, $($rest)*));
(17, $submac:ident ! ($($rest:tt)*)) => ($submac!(18, $($rest)*));
(18, $submac:ident ! ($($rest:tt)*)) => ($submac!(19, $($rest)*));
(19, $submac:ident ! ($($rest:tt)*)) => ($submac!(20, $($rest)*));
(20, $submac:ident ! ($($rest:tt)*)) => ($submac!(21, $($rest)*));
);
tuple_trait!(FnA A, FnB B, FnC C, FnD D, FnE E, FnF F, FnG G, FnH H, FnI I, FnJ J, FnK K, FnL L,
FnM M, FnN N, FnO O, FnP P, FnQ Q, FnR R, FnS S, FnT T, FnU U);
// Special case: implement `TupleInfallible` for `()`, the unit type.
// This can come up in macros which accept a variable number of arguments.
// Literally, `()` is an empty tuple, so it should simply parse nothing.
impl<I> TupleInfallible<I, ()> for () {
fn parse(&mut self, input: I) -> JResult<I, ()> {
Ok((input, ((), Vec::new())))
}
}
pub(crate) fn tuple_infallible<I, O, List: TupleInfallible<I, O>>(
mut l: List,
) -> impl FnMut(I) -> JResult<I, O> {
move |i: I| l.parse(i)
}
pub(crate) fn separated_list_infallible<I, O, O2, F, G>(
mut sep: G,
mut f: F,
) -> impl FnMut(I) -> JResult<I, Vec<O>>
where
I: Clone + InputLength,
F: nom::Parser<I, (O, ErrorList), Infallible>,
G: nom::Parser<I, (O2, ErrorList), Infallible>,
{
move |i: I| {
let mut res: Vec<O> = Vec::new();
let mut errors: ErrorList = Vec::new();
let (mut i, (o, mut err)) = unwrap_infallible(f.parse(i.clone()));
errors.append(&mut err);
res.push(o);
loop {
let (i_sep_parsed, (_, mut err_sep)) = unwrap_infallible(sep.parse(i.clone()));
let len_before = i_sep_parsed.input_len();
let (i_elem_parsed, (o, mut err_elem)) =
unwrap_infallible(f.parse(i_sep_parsed.clone()));
// infinite loop check: the parser must always consume
// if we consumed nothing here, don't produce an element.
if i_elem_parsed.input_len() == len_before {
return Ok((i, (res, errors)));
}
res.push(o);
errors.append(&mut err_sep);
errors.append(&mut err_elem);
i = i_elem_parsed;
}
}
}
pub(crate) trait Alt<I, O> {
/// Tests each parser in the tuple and returns the result of the first one that succeeds
fn choice(&mut self, input: I) -> Option<JResult<I, O>>;
}
macro_rules! alt_trait(
($first_cond:ident $first:ident, $($id_cond:ident $id: ident),+) => (
alt_trait!(__impl $first_cond $first; $($id_cond $id),+);
);
(__impl $($current_cond:ident $current:ident),*; $head_cond:ident $head:ident, $($id_cond:ident $id:ident),+) => (
alt_trait_impl!($($current_cond $current),*);
alt_trait!(__impl $($current_cond $current,)* $head_cond $head; $($id_cond $id),+);
);
(__impl $($current_cond:ident $current:ident),*; $head_cond:ident $head:ident) => (
alt_trait_impl!($($current_cond $current),*);
alt_trait_impl!($($current_cond $current,)* $head_cond $head);
);
);
macro_rules! alt_trait_impl(
($($id_cond:ident $id:ident),+) => (
impl<
Input: Clone, Output,
$(
// () are to make things easier on me, but I'm not entirely sure whether we can do better
// with rule E0207
$id_cond: nom::Parser<Input, (), ()>,
$id: nom::Parser<Input, (Output, ErrorList), Infallible>
),+
> Alt<Input, Output> for ( $(($id_cond, $id),)+ ) {
fn choice(&mut self, input: Input) -> Option<JResult<Input, Output>> {
match self.0.0.parse(input.clone()) {
Err(_) => alt_trait_inner!(1, self, input, $($id_cond $id),+),
Ok((input_left, _)) => Some(self.0.1.parse(input_left)),
}
}
}
);
);
macro_rules! alt_trait_inner(
($it:tt, $self:expr, $input:expr, $head_cond:ident $head:ident, $($id_cond:ident $id:ident),+) => (
match $self.$it.0.parse($input.clone()) {
Err(_) => succ!($it, alt_trait_inner!($self, $input, $($id_cond $id),+)),
Ok((input_left, _)) => Some($self.$it.1.parse(input_left)),
}
);
($it:tt, $self:expr, $input:expr, $head_cond:ident $head:ident) => (
None
);
);
alt_trait!(A1 A, B1 B, C1 C, D1 D, E1 E, F1 F, G1 G, H1 H, I1 I, J1 J, K1 K,
L1 L, M1 M, N1 N, O1 O, P1 P, Q1 Q, R1 R, S1 S, T1 T, U1 U);
/// An alt() like combinator. For each branch, it first tries a fallible parser, which commits to
/// this branch, or tells to check next branch, and the execute the infallible parser which follow.
///
/// In case no branch match, the default (fallible) parser is executed.
pub(crate) fn alt_infallible<I: Clone, O, F, List: Alt<I, O>>(
mut l: List,
mut default: F,
) -> impl FnMut(I) -> JResult<I, O>
where
F: nom::Parser<I, (O, ErrorList), Infallible>,
{
move |i: I| l.choice(i.clone()).unwrap_or_else(|| default.parse(i))
}

View File

@@ -1,26 +1,19 @@
#![allow(clippy::derive_partial_eq_without_eq)]
mod infallible;
mod occur;
mod query_grammar;
mod user_input_ast;
use combine::parser::Parser;
pub use crate::infallible::LenientError;
pub use crate::occur::Occur;
use crate::query_grammar::{parse_to_ast, parse_to_ast_lenient};
use crate::query_grammar::parse_to_ast;
pub use crate::user_input_ast::{
Delimiter, UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral,
};
pub struct Error;
/// Parse a query
pub fn parse_query(query: &str) -> Result<UserInputAst, Error> {
let (_remaining, user_input_ast) = parse_to_ast(query).map_err(|_| Error)?;
let (user_input_ast, _remaining) = parse_to_ast().parse(query).map_err(|_| Error)?;
Ok(user_input_ast)
}
/// Parse a query, trying to recover from syntax errors, and giving hints toward fixing errors.
pub fn parse_query_lenient(query: &str) -> (UserInputAst, Vec<LenientError>) {
parse_to_ast_lenient(query)
}

File diff suppressed because it is too large Load Diff

View File

@@ -3,7 +3,7 @@ use std::fmt::{Debug, Formatter};
use crate::Occur;
#[derive(PartialEq, Clone)]
#[derive(PartialEq)]
pub enum UserInputLeaf {
Literal(UserInputLiteral),
All,
@@ -18,28 +18,6 @@ pub enum UserInputLeaf {
},
}
impl UserInputLeaf {
pub(crate) fn set_field(self, field: Option<String>) -> Self {
match self {
UserInputLeaf::Literal(mut literal) => {
literal.field_name = field;
UserInputLeaf::Literal(literal)
}
UserInputLeaf::All => UserInputLeaf::All,
UserInputLeaf::Range {
field: _,
lower,
upper,
} => UserInputLeaf::Range {
field,
lower,
upper,
},
UserInputLeaf::Set { field: _, elements } => UserInputLeaf::Set { field, elements },
}
}
}
impl Debug for UserInputLeaf {
fn fmt(&self, formatter: &mut Formatter) -> Result<(), fmt::Error> {
match self {
@@ -50,7 +28,6 @@ impl Debug for UserInputLeaf {
ref upper,
} => {
if let Some(ref field) = field {
// TODO properly escape field (in case of \")
write!(formatter, "\"{field}\":")?;
}
lower.display_lower(formatter)?;
@@ -60,7 +37,6 @@ impl Debug for UserInputLeaf {
}
UserInputLeaf::Set { field, elements } => {
if let Some(ref field) = field {
// TODO properly escape field (in case of \")
write!(formatter, "\"{field}\": ")?;
}
write!(formatter, "IN [")?;
@@ -68,7 +44,6 @@ impl Debug for UserInputLeaf {
if i != 0 {
write!(formatter, " ")?;
}
// TODO properly escape element
write!(formatter, "\"{text}\"")?;
}
write!(formatter, "]")
@@ -85,7 +60,7 @@ pub enum Delimiter {
None,
}
#[derive(PartialEq, Clone)]
#[derive(PartialEq)]
pub struct UserInputLiteral {
pub field_name: Option<String>,
pub phrase: String,
@@ -97,20 +72,16 @@ pub struct UserInputLiteral {
impl fmt::Debug for UserInputLiteral {
fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
if let Some(ref field) = self.field_name {
// TODO properly escape field (in case of \")
write!(formatter, "\"{field}\":")?;
}
match self.delimiter {
Delimiter::SingleQuotes => {
// TODO properly escape element (in case of \')
write!(formatter, "'{}'", self.phrase)?;
}
Delimiter::DoubleQuotes => {
// TODO properly escape element (in case of \")
write!(formatter, "\"{}\"", self.phrase)?;
}
Delimiter::None => {
// TODO properly escape element
write!(formatter, "{}", self.phrase)?;
}
}
@@ -123,7 +94,7 @@ impl fmt::Debug for UserInputLiteral {
}
}
#[derive(PartialEq, Debug, Clone)]
#[derive(PartialEq)]
pub enum UserInputBound {
Inclusive(String),
Exclusive(String),
@@ -133,7 +104,6 @@ pub enum UserInputBound {
impl UserInputBound {
fn display_lower(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
match *self {
// TODO properly escape word if required
UserInputBound::Inclusive(ref word) => write!(formatter, "[\"{word}\""),
UserInputBound::Exclusive(ref word) => write!(formatter, "{{\"{word}\""),
UserInputBound::Unbounded => write!(formatter, "{{\"*\""),
@@ -142,7 +112,6 @@ impl UserInputBound {
fn display_upper(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
match *self {
// TODO properly escape word if required
UserInputBound::Inclusive(ref word) => write!(formatter, "\"{word}\"]"),
UserInputBound::Exclusive(ref word) => write!(formatter, "\"{word}\"}}"),
UserInputBound::Unbounded => write!(formatter, "\"*\"}}"),
@@ -158,7 +127,6 @@ impl UserInputBound {
}
}
#[derive(PartialEq, Clone)]
pub enum UserInputAst {
Clause(Vec<(Option<Occur>, UserInputAst)>),
Leaf(Box<UserInputLeaf>),
@@ -228,7 +196,6 @@ impl fmt::Debug for UserInputAst {
match *self {
UserInputAst::Clause(ref subqueries) => {
if subqueries.is_empty() {
// TODO this will break ast reserialization, is writing "( )" enought?
write!(formatter, "<emptyclause>")?;
} else {
write!(formatter, "(")?;

View File

@@ -44,49 +44,22 @@ use super::metric::{
/// The key is the user defined name of the aggregation.
pub type Aggregations = HashMap<String, Aggregation>;
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
/// Aggregation request.
///
/// An aggregation is either a bucket or a metric.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(try_from = "AggregationForDeserialization")]
pub struct Aggregation {
/// The aggregation variant, which can be either a bucket or a metric.
#[serde(flatten)]
pub agg: AggregationVariants,
/// The sub_aggregations, only valid for bucket type aggregations. Each bucket will aggregate
/// on the document set in the bucket.
#[serde(rename = "aggs")]
#[serde(default)]
#[serde(skip_serializing_if = "Aggregations::is_empty")]
pub sub_aggregation: Aggregations,
}
/// In order to display proper error message, we cannot rely on flattening
/// the json enum. Instead we introduce an intermediary struct to separate
/// the aggregation from the subaggregation.
#[derive(Deserialize)]
struct AggregationForDeserialization {
#[serde(flatten)]
pub aggs_remaining_json: serde_json::Value,
#[serde(rename = "aggs")]
#[serde(default)]
pub sub_aggregation: Aggregations,
}
impl TryFrom<AggregationForDeserialization> for Aggregation {
type Error = serde_json::Error;
fn try_from(value: AggregationForDeserialization) -> serde_json::Result<Self> {
let AggregationForDeserialization {
aggs_remaining_json,
sub_aggregation,
} = value;
let agg: AggregationVariants = serde_json::from_value(aggs_remaining_json)?;
Ok(Aggregation {
agg,
sub_aggregation,
})
}
}
impl Aggregation {
pub(crate) fn sub_aggregation(&self) -> &Aggregations {
&self.sub_aggregation
@@ -150,8 +123,7 @@ pub enum AggregationVariants {
}
impl AggregationVariants {
/// Returns the name of the field used by the aggregation.
pub fn get_fast_field_name(&self) -> &str {
fn get_fast_field_name(&self) -> &str {
match self {
AggregationVariants::Terms(terms) => terms.field.as_str(),
AggregationVariants::Range(range) => range.field.as_str(),

View File

@@ -13,7 +13,6 @@ use super::metric::{
};
use super::segment_agg_result::AggregationLimits;
use super::VecWithNames;
use crate::aggregation::{f64_to_fastfield_u64, Key};
use crate::SegmentReader;
#[derive(Default)]
@@ -36,82 +35,41 @@ pub struct AggregationWithAccessor {
/// based on search terms. That is not that case currently, but eventually this needs to be
/// Option or moved.
pub(crate) accessor: Column<u64>,
/// Load insert u64 for missing use case
pub(crate) missing_value_for_accessor: Option<u64>,
pub(crate) str_dict_column: Option<StrColumn>,
pub(crate) field_type: ColumnType,
/// In case there are multiple types of fast fields, e.g. string and numeric.
/// Only used for term aggregations currently.
pub(crate) accessor2: Option<(Column<u64>, ColumnType)>,
pub(crate) sub_aggregation: AggregationsWithAccessor,
pub(crate) limits: ResourceLimitGuard,
pub(crate) column_block_accessor: ColumnBlockAccessor<u64>,
/// Used for missing term aggregation, which checks all columns for existence.
/// By convention the missing aggregation is chosen, when this property is set
/// (instead bein set in `agg`).
/// If this needs to used by other aggregations, we need to refactor this.
pub(crate) accessors: Vec<Column<u64>>,
pub(crate) agg: Aggregation,
}
impl AggregationWithAccessor {
/// May return multiple accessors if the aggregation is e.g. on mixed field types.
fn try_from_agg(
agg: &Aggregation,
sub_aggregation: &Aggregations,
reader: &SegmentReader,
limits: AggregationLimits,
) -> crate::Result<Vec<AggregationWithAccessor>> {
let add_agg_with_accessor = |accessor: Column<u64>,
column_type: ColumnType,
aggs: &mut Vec<AggregationWithAccessor>|
-> crate::Result<()> {
let res = AggregationWithAccessor {
accessor,
accessors: Vec::new(),
field_type: column_type,
sub_aggregation: get_aggs_with_segment_accessor_and_validate(
sub_aggregation,
reader,
&limits,
)?,
agg: agg.clone(),
limits: limits.new_guard(),
missing_value_for_accessor: None,
str_dict_column: None,
column_block_accessor: Default::default(),
};
aggs.push(res);
Ok(())
};
let mut res: Vec<AggregationWithAccessor> = Vec::new();
) -> crate::Result<AggregationWithAccessor> {
let mut str_dict_column = None;
let mut accessor2 = None;
use AggregationVariants::*;
match &agg.agg {
let (accessor, field_type) = match &agg.agg {
Range(RangeAggregation {
field: field_name, ..
}) => {
let (accessor, column_type) =
get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
add_agg_with_accessor(accessor, column_type, &mut res)?;
}
}) => get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?,
Histogram(HistogramAggregation {
field: field_name, ..
}) => {
let (accessor, column_type) =
get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
add_agg_with_accessor(accessor, column_type, &mut res)?;
}
}) => get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?,
DateHistogram(DateHistogramAggregationReq {
field: field_name, ..
}) => {
let (accessor, column_type) =
get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
add_agg_with_accessor(accessor, column_type, &mut res)?;
}
}) => get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?,
Terms(TermsAggregation {
field: field_name,
missing,
..
field: field_name, ..
}) => {
let str_dict_column = reader.fast_fields().str(field_name)?;
str_dict_column = reader.fast_fields().str(field_name)?;
let allowed_column_types = [
ColumnType::I64,
ColumnType::U64,
@@ -122,144 +80,51 @@ impl AggregationWithAccessor {
// ColumnType::IpAddr Unsupported
// ColumnType::DateTime Unsupported
];
// In case the column is empty we want the shim column to match the missing type
let fallback_type = missing
.as_ref()
.map(|missing| match missing {
Key::Str(_) => ColumnType::Str,
Key::F64(_) => ColumnType::F64,
})
.unwrap_or(ColumnType::U64);
let column_and_types = get_all_ff_reader_or_empty(
reader,
field_name,
Some(&allowed_column_types),
fallback_type,
)?;
let missing_and_more_than_one_col = column_and_types.len() > 1 && missing.is_some();
let text_on_non_text_col = column_and_types.len() == 1
&& column_and_types[0].1.numerical_type().is_some()
&& missing
.as_ref()
.map(|m| matches!(m, Key::Str(_)))
.unwrap_or(false);
let use_special_missing_agg = missing_and_more_than_one_col || text_on_non_text_col;
if use_special_missing_agg {
let column_and_types =
get_all_ff_reader_or_empty(reader, field_name, None, fallback_type)?;
let accessors: Vec<Column> =
column_and_types.iter().map(|(a, _)| a.clone()).collect();
let agg_wit_acc = AggregationWithAccessor {
missing_value_for_accessor: None,
accessor: accessors[0].clone(),
accessors,
field_type: ColumnType::U64,
sub_aggregation: get_aggs_with_segment_accessor_and_validate(
sub_aggregation,
reader,
&limits,
)?,
agg: agg.clone(),
str_dict_column: str_dict_column.clone(),
limits: limits.new_guard(),
column_block_accessor: Default::default(),
};
res.push(agg_wit_acc);
}
for (accessor, column_type) in column_and_types {
let missing_value_term_agg = if use_special_missing_agg {
None
} else {
missing.clone()
};
let missing_value_for_accessor =
if let Some(missing) = missing_value_term_agg.as_ref() {
get_missing_val(column_type, missing, agg.agg.get_fast_field_name())?
} else {
None
};
let agg = AggregationWithAccessor {
missing_value_for_accessor,
accessor,
accessors: Vec::new(),
field_type: column_type,
sub_aggregation: get_aggs_with_segment_accessor_and_validate(
sub_aggregation,
reader,
&limits,
)?,
agg: agg.clone(),
str_dict_column: str_dict_column.clone(),
limits: limits.new_guard(),
column_block_accessor: Default::default(),
};
res.push(agg);
}
let mut columns =
get_all_ff_reader_or_empty(reader, field_name, Some(&allowed_column_types))?;
let first = columns.pop().unwrap();
accessor2 = columns.pop();
first
}
Average(AverageAggregation {
field: field_name, ..
})
| Count(CountAggregation {
field: field_name, ..
})
| Max(MaxAggregation {
field: field_name, ..
})
| Min(MinAggregation {
field: field_name, ..
})
| Stats(StatsAggregation {
field: field_name, ..
})
| Sum(SumAggregation {
field: field_name, ..
}) => {
let (accessor, column_type) =
Average(AverageAggregation { field: field_name })
| Count(CountAggregation { field: field_name })
| Max(MaxAggregation { field: field_name })
| Min(MinAggregation { field: field_name })
| Stats(StatsAggregation { field: field_name })
| Sum(SumAggregation { field: field_name }) => {
let (accessor, field_type) =
get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
add_agg_with_accessor(accessor, column_type, &mut res)?;
(accessor, field_type)
}
Percentiles(percentiles) => {
let (accessor, column_type) = get_ff_reader(
let (accessor, field_type) = get_ff_reader(
reader,
percentiles.field_name(),
Some(get_numeric_or_date_column_types()),
)?;
add_agg_with_accessor(accessor, column_type, &mut res)?;
(accessor, field_type)
}
};
Ok(res)
let sub_aggregation = sub_aggregation.clone();
Ok(AggregationWithAccessor {
accessor,
accessor2,
field_type,
sub_aggregation: get_aggs_with_segment_accessor_and_validate(
&sub_aggregation,
reader,
&limits,
)?,
agg: agg.clone(),
str_dict_column,
limits: limits.new_guard(),
column_block_accessor: Default::default(),
})
}
}
fn get_missing_val(
column_type: ColumnType,
missing: &Key,
field_name: &str,
) -> crate::Result<Option<u64>> {
let missing_val = match missing {
Key::Str(_) if column_type == ColumnType::Str => Some(u64::MAX),
// Allow fallback to number on text fields
Key::F64(_) if column_type == ColumnType::Str => Some(u64::MAX),
Key::F64(val) if column_type.numerical_type().is_some() => {
f64_to_fastfield_u64(*val, &column_type)
}
_ => {
return Err(crate::TantivyError::InvalidArgument(format!(
"Missing value {:?} for field {} is not supported for column type {:?}",
missing, field_name, column_type
)));
}
};
Ok(missing_val)
}
fn get_numeric_or_date_column_types() -> &'static [ColumnType] {
&[
ColumnType::F64,
@@ -276,15 +141,15 @@ pub(crate) fn get_aggs_with_segment_accessor_and_validate(
) -> crate::Result<AggregationsWithAccessor> {
let mut aggss = Vec::new();
for (key, agg) in aggs.iter() {
let aggs = AggregationWithAccessor::try_from_agg(
agg,
agg.sub_aggregation(),
reader,
limits.clone(),
)?;
for agg in aggs {
aggss.push((key.to_string(), agg));
}
aggss.push((
key.to_string(),
AggregationWithAccessor::try_from_agg(
agg,
agg.sub_aggregation(),
reader,
limits.clone(),
)?,
));
}
Ok(AggregationsWithAccessor::from_data(
VecWithNames::from_entries(aggss),
@@ -316,13 +181,15 @@ fn get_all_ff_reader_or_empty(
reader: &SegmentReader,
field_name: &str,
allowed_column_types: Option<&[ColumnType]>,
fallback_type: ColumnType,
) -> crate::Result<Vec<(columnar::Column<u64>, ColumnType)>> {
let ff_fields = reader.fast_fields();
let mut ff_field_with_type =
ff_fields.u64_lenient_for_type_all(allowed_column_types, field_name)?;
if ff_field_with_type.is_empty() {
ff_field_with_type.push((Column::build_empty_column(reader.num_docs()), fallback_type));
ff_field_with_type.push((
Column::build_empty_column(reader.num_docs()),
ColumnType::U64,
));
}
Ok(ff_field_with_type)
}

View File

@@ -558,10 +558,10 @@ fn test_aggregation_invalid_requests() -> crate::Result<()> {
assert_eq!(agg_req_1.is_err(), true);
// TODO: This should list valid values
assert!(agg_req_1
.unwrap_err()
.to_string()
.contains("unknown variant `doesnotmatchanyagg`, expected one of"));
assert_eq!(
agg_req_1.unwrap_err().to_string(),
"no variant of enum AggregationVariants found in flattened data"
);
// TODO: This should return an error
// let agg_res = avg_on_field("not_exist_field").unwrap_err();

View File

@@ -351,7 +351,6 @@ impl SegmentHistogramCollector {
let buckets_mem = self.buckets.memory_consumption();
self_mem + sub_aggs_mem + buckets_mem
}
/// Converts the collector result into a intermediate bucket result.
pub fn into_intermediate_bucket_result(
self,
agg_with_accessor: &AggregationWithAccessor,
@@ -454,12 +453,15 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
let final_buckets: Vec<BucketEntry> = buckets
.into_iter()
.merge_join_by(fill_gaps_buckets, |existing_bucket, fill_gaps_bucket| {
existing_bucket
.key
.partial_cmp(fill_gaps_bucket)
.unwrap_or(Ordering::Equal)
})
.merge_join_by(
fill_gaps_buckets.into_iter(),
|existing_bucket, fill_gaps_bucket| {
existing_bucket
.key
.partial_cmp(fill_gaps_bucket)
.unwrap_or(Ordering::Equal)
},
)
.map(|either| match either {
// Ignore the generated bucket
itertools::EitherOrBoth::Both(existing, _) => existing,

View File

@@ -25,15 +25,15 @@
mod histogram;
mod range;
mod term_agg;
mod term_missing_agg;
use std::collections::HashMap;
pub(crate) use histogram::SegmentHistogramCollector;
pub use histogram::*;
pub(crate) use range::SegmentRangeCollector;
pub use range::*;
use serde::{de, Deserialize, Deserializer, Serialize, Serializer};
pub use term_agg::*;
pub use term_missing_agg::*;
/// Order for buckets in a bucket aggregation.
#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize, Default)]

View File

@@ -262,7 +262,7 @@ impl SegmentRangeCollector {
pub(crate) fn from_req_and_validate(
req: &RangeAggregation,
sub_aggregation: &mut AggregationsWithAccessor,
limits: &ResourceLimitGuard,
limits: &mut ResourceLimitGuard,
field_type: ColumnType,
accessor_idx: usize,
) -> crate::Result<Self> {
@@ -465,7 +465,7 @@ mod tests {
SegmentRangeCollector::from_req_and_validate(
&req,
&mut Default::default(),
&AggregationLimits::default().new_guard(),
&mut AggregationLimits::default().new_guard(),
field_type,
0,
)

View File

@@ -1,6 +1,6 @@
use std::fmt::Debug;
use columnar::{BytesColumn, ColumnType, StrColumn};
use columnar::ColumnType;
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
@@ -9,6 +9,7 @@ use crate::aggregation::agg_limits::MemoryConsumption;
use crate::aggregation::agg_req_with_accessor::{
AggregationWithAccessor, AggregationsWithAccessor,
};
use crate::aggregation::f64_from_fastfield_u64;
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
IntermediateKey, IntermediateTermBucketEntry, IntermediateTermBucketResult,
@@ -16,7 +17,6 @@ use crate::aggregation::intermediate_agg_result::{
use crate::aggregation::segment_agg_result::{
build_segment_agg_collector, SegmentAggregationCollector,
};
use crate::aggregation::{f64_from_fastfield_u64, Key};
use crate::error::DataCorruption;
use crate::TantivyError;
@@ -146,28 +146,6 @@ pub struct TermsAggregation {
/// { "average_price": "asc" }
#[serde(skip_serializing_if = "Option::is_none", default)]
pub order: Option<CustomOrder>,
/// The missing parameter defines how documents that are missing a value should be treated.
/// By default they will be ignored but it is also possible to treat them as if they had a
/// value. Examples in JSON format:
/// { "missing": "NO_DATA" }
///
/// # Internal
///
/// Internally, `missing` requires some specialized handling in some scenarios.
///
/// Simple Case:
/// In the simplest case, we can just put the missing value in the termmap use that. In case of
/// text we put a special u64::MAX and replace it at the end with the actual missing value,
/// when loading the text.
/// Special Case 1:
/// If we have multiple columns on one field, we need to have a union on the indices on both
/// columns, to find docids without a value. That requires a special missing aggreggation.
/// Special Case 2: if the key is of type text and the column is numerical, we also need to use
/// the special missing aggregation, since there is no mechanism in the numerical column to
/// add text.
#[serde(skip_serializing_if = "Option::is_none", default)]
pub missing: Option<Key>,
}
/// Same as TermsAggregation, but with populated defaults.
@@ -198,7 +176,6 @@ pub(crate) struct TermsAggregationInternal {
pub min_doc_count: u64,
pub order: CustomOrder,
pub missing: Option<Key>,
}
impl TermsAggregationInternal {
@@ -218,7 +195,6 @@ impl TermsAggregationInternal {
.unwrap_or_else(|| order == CustomOrder::default()),
min_doc_count: req.min_doc_count.unwrap_or(1),
order,
missing: req.missing.clone(),
}
}
}
@@ -248,6 +224,110 @@ impl TermBuckets {
}
}
/// The composite collector is used, when we have different types under one field, to support a term
/// aggregation on both.
#[derive(Clone, Debug)]
pub struct SegmentTermCollectorComposite {
term_agg1: SegmentTermCollector, // field type 1, e.g. strings
term_agg2: SegmentTermCollector, // field type 2, e.g. u64
accessor_idx: usize,
}
impl SegmentAggregationCollector for SegmentTermCollectorComposite {
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &AggregationsWithAccessor,
results: &mut IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let agg_with_accessor = &agg_with_accessor.aggs.values[self.accessor_idx];
let bucket = self
.term_agg1
.into_intermediate_bucket_result(agg_with_accessor)?;
results.push(
name.to_string(),
IntermediateAggregationResult::Bucket(bucket),
)?;
let bucket = self
.term_agg2
.into_intermediate_bucket_result(agg_with_accessor)?;
results.push(name, IntermediateAggregationResult::Bucket(bucket))?;
Ok(())
}
#[inline]
fn collect(
&mut self,
doc: crate::DocId,
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
self.term_agg1.collect_block(&[doc], agg_with_accessor)?;
self.swap_accessor(&mut agg_with_accessor.aggs.values[self.accessor_idx]);
self.term_agg2.collect_block(&[doc], agg_with_accessor)?;
self.swap_accessor(&mut agg_with_accessor.aggs.values[self.accessor_idx]);
Ok(())
}
#[inline]
fn collect_block(
&mut self,
docs: &[crate::DocId],
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
self.term_agg1.collect_block(docs, agg_with_accessor)?;
self.swap_accessor(&mut agg_with_accessor.aggs.values[self.accessor_idx]);
self.term_agg2.collect_block(docs, agg_with_accessor)?;
self.swap_accessor(&mut agg_with_accessor.aggs.values[self.accessor_idx]);
Ok(())
}
fn flush(&mut self, agg_with_accessor: &mut AggregationsWithAccessor) -> crate::Result<()> {
self.term_agg1.flush(agg_with_accessor)?;
self.swap_accessor(&mut agg_with_accessor.aggs.values[self.accessor_idx]);
self.term_agg2.flush(agg_with_accessor)?;
self.swap_accessor(&mut agg_with_accessor.aggs.values[self.accessor_idx]);
Ok(())
}
}
impl SegmentTermCollectorComposite {
/// Swaps the accessor and field type with the second accessor and field type.
/// This way we can use the same code for both aggregations.
fn swap_accessor(&self, aggregations: &mut AggregationWithAccessor) {
if let Some(accessor) = aggregations.accessor2.as_mut() {
std::mem::swap(&mut accessor.0, &mut aggregations.accessor);
std::mem::swap(&mut accessor.1, &mut aggregations.field_type);
}
}
pub(crate) fn from_req_and_validate(
req: &TermsAggregation,
sub_aggregations: &mut AggregationsWithAccessor,
field_type: ColumnType,
field_type2: ColumnType,
accessor_idx: usize,
) -> crate::Result<Self> {
Ok(Self {
term_agg1: SegmentTermCollector::from_req_and_validate(
req,
sub_aggregations,
field_type,
accessor_idx,
)?,
term_agg2: SegmentTermCollector::from_req_and_validate(
req,
sub_aggregations,
field_type2,
accessor_idx,
)?,
accessor_idx,
})
}
}
/// The collector puts values from the fast field into the correct buckets and does a conversion to
/// the correct datatype.
#[derive(Clone, Debug)]
@@ -299,16 +379,9 @@ impl SegmentAggregationCollector for SegmentTermCollector {
let mem_pre = self.get_memory_consumption();
if let Some(missing) = bucket_agg_accessor.missing_value_for_accessor {
bucket_agg_accessor
.column_block_accessor
.fetch_block_with_missing(docs, &bucket_agg_accessor.accessor, missing);
} else {
bucket_agg_accessor
.column_block_accessor
.fetch_block(docs, &bucket_agg_accessor.accessor);
}
bucket_agg_accessor
.column_block_accessor
.fetch_block(docs, &bucket_agg_accessor.accessor);
for term_id in bucket_agg_accessor.column_block_accessor.iter_vals() {
let entry = self.term_buckets.entries.entry(term_id).or_default();
*entry += 1;
@@ -470,42 +543,19 @@ impl SegmentTermCollector {
let term_dict = agg_with_accessor
.str_dict_column
.as_ref()
.cloned()
.unwrap_or_else(|| {
StrColumn::wrap(BytesColumn::empty(agg_with_accessor.accessor.num_docs()))
});
.expect("internal error: term dictionary not found for term aggregation");
let mut buffer = String::new();
for (term_id, doc_count) in entries {
let intermediate_entry = into_intermediate_bucket_entry(term_id, doc_count)?;
// Special case for missing key
if term_id == u64::MAX {
let missing_key = self
.req
.missing
.as_ref()
.expect("Found placeholder term_id but `missing` is None");
match missing_key {
Key::Str(missing) => {
buffer.clear();
buffer.push_str(missing);
dict.insert(
IntermediateKey::Str(buffer.to_string()),
intermediate_entry,
);
}
Key::F64(val) => {
buffer.push_str(&val.to_string());
dict.insert(IntermediateKey::F64(*val), intermediate_entry);
}
}
} else {
if !term_dict.ord_to_str(term_id, &mut buffer)? {
return Err(TantivyError::InternalError(format!(
"Couldn't find term_id {term_id} in dict"
)));
}
dict.insert(IntermediateKey::Str(buffer.to_string()), intermediate_entry);
if !term_dict.ord_to_str(term_id, &mut buffer)? {
return Err(TantivyError::InternalError(format!(
"Couldn't find term_id {term_id} in dict"
)));
}
let intermediate_entry = into_intermediate_bucket_entry(term_id, doc_count)?;
dict.insert(IntermediateKey::Str(buffer.to_string()), intermediate_entry);
}
if self.req.min_doc_count == 0 {
// TODO: Handle rev streaming for descending sorting by keys
@@ -1243,13 +1293,13 @@ mod tests {
// searching for terma, but min_doc_count will return all terms
let res = exec_request_with_query(agg_req, &index, Some(("string2", "hit")))?;
assert_eq!(res["my_texts"]["buckets"][0]["key"], "A");
assert_eq!(res["my_texts"]["buckets"][0]["key"], "a");
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 2);
assert_eq!(
res["my_texts"]["buckets"][0]["elhistogram"]["buckets"],
json!([{ "doc_count": 1, "key": 1.0 }, { "doc_count": 1, "key": 2.0 } ])
);
assert_eq!(res["my_texts"]["buckets"][1]["key"], "B");
assert_eq!(res["my_texts"]["buckets"][1]["key"], "b");
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 1);
assert_eq!(
res["my_texts"]["buckets"][1]["elhistogram"]["buckets"],
@@ -1271,7 +1321,6 @@ mod tests {
];
let index = get_test_index_from_terms(false, &terms_per_segment)?;
assert_eq!(index.searchable_segments().unwrap().len(), 2);
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": {
@@ -1372,10 +1421,10 @@ mod tests {
let res = exec_request_with_query(agg_req, &index, None).unwrap();
println!("{}", serde_json::to_string_pretty(&res).unwrap());
assert_eq!(res["my_texts"]["buckets"][0]["key"], "Hallo Hallo");
assert_eq!(res["my_texts"]["buckets"][0]["key"], "hallo hallo");
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 1);
assert_eq!(res["my_texts"]["buckets"][1]["key"], "Hello Hello");
assert_eq!(res["my_texts"]["buckets"][1]["key"], "hello hello");
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 1);
Ok(())
@@ -1457,47 +1506,6 @@ mod tests {
Ok(())
}
#[test]
fn terms_empty_json() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with empty json
index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap();
// => Segment with json, but no field partially_empty
index_writer
.add_document(doc!(json => json!({"different_field": "blue"})))
.unwrap();
index_writer.commit().unwrap();
//// => Segment with field partially_empty
index_writer
.add_document(doc!(json => json!({"partially_empty": "blue"})))
.unwrap();
index_writer.add_document(doc!())?;
index_writer.commit().unwrap();
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": {
"terms": {
"field": "json.partially_empty"
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
assert_eq!(res["my_texts"]["buckets"][0]["key"], "blue");
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 1);
assert_eq!(res["my_texts"]["buckets"][1], serde_json::Value::Null);
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);
Ok(())
}
#[test]
fn terms_aggregation_bytes() -> crate::Result<()> {
@@ -1535,282 +1543,4 @@ mod tests {
Ok(())
}
#[test]
fn terms_aggregation_missing_multi_value() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", FAST);
let id_field = schema_builder.add_u64_field("id", FAST);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
text_field => "Hello Hello",
text_field => "Hello Hello",
id_field => 1u64,
id_field => 1u64,
))?;
// Missing
index_writer.add_document(doc!())?;
index_writer.add_document(doc!(
text_field => "Hello Hello",
))?;
index_writer.add_document(doc!(
text_field => "Hello Hello",
))?;
index_writer.commit()?;
// Empty segment special case
index_writer.add_document(doc!())?;
index_writer.commit()?;
// Full segment special case
index_writer.add_document(doc!(
text_field => "Hello Hello",
id_field => 1u64,
))?;
index_writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": {
"terms": {
"field": "text",
"missing": "Empty"
},
},
"my_texts2": {
"terms": {
"field": "text",
"missing": 1337
},
},
"my_ids": {
"terms": {
"field": "id",
"missing": 1337
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// text field
assert_eq!(res["my_texts"]["buckets"][0]["key"], "Hello Hello");
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 5);
assert_eq!(res["my_texts"]["buckets"][1]["key"], "Empty");
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 2);
assert_eq!(
res["my_texts"]["buckets"][2]["key"],
serde_json::Value::Null
);
// text field with numner as missing fallback
assert_eq!(res["my_texts2"]["buckets"][0]["key"], "Hello Hello");
assert_eq!(res["my_texts2"]["buckets"][0]["doc_count"], 5);
assert_eq!(res["my_texts2"]["buckets"][1]["key"], 1337.0);
assert_eq!(res["my_texts2"]["buckets"][1]["doc_count"], 2);
assert_eq!(
res["my_texts2"]["buckets"][2]["key"],
serde_json::Value::Null
);
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);
// id field
assert_eq!(res["my_ids"]["buckets"][0]["key"], 1337.0);
assert_eq!(res["my_ids"]["buckets"][0]["doc_count"], 4);
assert_eq!(res["my_ids"]["buckets"][1]["key"], 1.0);
assert_eq!(res["my_ids"]["buckets"][1]["doc_count"], 3);
assert_eq!(res["my_ids"]["buckets"][2]["key"], serde_json::Value::Null);
Ok(())
}
#[test]
fn terms_aggregation_missing_simple_id() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", FAST);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
id_field => 1u64,
))?;
// Missing
index_writer.add_document(doc!())?;
index_writer.add_document(doc!())?;
index_writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"my_ids": {
"terms": {
"field": "id",
"missing": 1337
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// id field
assert_eq!(res["my_ids"]["buckets"][0]["key"], 1337.0);
assert_eq!(res["my_ids"]["buckets"][0]["doc_count"], 2);
assert_eq!(res["my_ids"]["buckets"][1]["key"], 1.0);
assert_eq!(res["my_ids"]["buckets"][1]["doc_count"], 1);
assert_eq!(res["my_ids"]["buckets"][2]["key"], serde_json::Value::Null);
Ok(())
}
#[test]
fn terms_aggregation_missing1() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", FAST);
let id_field = schema_builder.add_u64_field("id", FAST);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
text_field => "Hello Hello",
id_field => 1u64,
))?;
// Missing
index_writer.add_document(doc!())?;
index_writer.add_document(doc!(
text_field => "Hello Hello",
))?;
index_writer.add_document(doc!(
text_field => "Hello Hello",
))?;
index_writer.commit()?;
// Empty segment special case
index_writer.add_document(doc!())?;
index_writer.commit()?;
// Full segment special case
index_writer.add_document(doc!(
text_field => "Hello Hello",
id_field => 1u64,
))?;
index_writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": {
"terms": {
"field": "text",
"missing": "Empty"
},
},
"my_texts2": {
"terms": {
"field": "text",
"missing": 1337
},
},
"my_ids": {
"terms": {
"field": "id",
"missing": 1337
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// text field
assert_eq!(res["my_texts"]["buckets"][0]["key"], "Hello Hello");
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4);
assert_eq!(res["my_texts"]["buckets"][1]["key"], "Empty");
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 2);
assert_eq!(
res["my_texts"]["buckets"][2]["key"],
serde_json::Value::Null
);
// text field with numner as missing fallback
assert_eq!(res["my_texts2"]["buckets"][0]["key"], "Hello Hello");
assert_eq!(res["my_texts2"]["buckets"][0]["doc_count"], 4);
assert_eq!(res["my_texts2"]["buckets"][1]["key"], 1337.0);
assert_eq!(res["my_texts2"]["buckets"][1]["doc_count"], 2);
assert_eq!(
res["my_texts2"]["buckets"][2]["key"],
serde_json::Value::Null
);
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);
// id field
assert_eq!(res["my_ids"]["buckets"][0]["key"], 1337.0);
assert_eq!(res["my_ids"]["buckets"][0]["doc_count"], 4);
assert_eq!(res["my_ids"]["buckets"][1]["key"], 1.0);
assert_eq!(res["my_ids"]["buckets"][1]["doc_count"], 2);
assert_eq!(res["my_ids"]["buckets"][2]["key"], serde_json::Value::Null);
Ok(())
}
#[test]
fn terms_aggregation_missing_empty() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("text", FAST);
schema_builder.add_u64_field("id", FAST);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
// Empty segment special case
index_writer.add_document(doc!())?;
index_writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": {
"terms": {
"field": "text",
"missing": "Empty"
},
},
"my_texts2": {
"terms": {
"field": "text",
"missing": 1337
},
},
"my_ids": {
"terms": {
"field": "id",
"missing": 1337
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// text field
assert_eq!(res["my_texts"]["buckets"][0]["key"], "Empty");
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 1);
assert_eq!(
res["my_texts"]["buckets"][1]["key"],
serde_json::Value::Null
);
// text field with number as missing fallback
assert_eq!(res["my_texts2"]["buckets"][0]["key"], 1337.0);
assert_eq!(res["my_texts2"]["buckets"][0]["doc_count"], 1);
assert_eq!(
res["my_texts2"]["buckets"][1]["key"],
serde_json::Value::Null
);
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);
// id field
assert_eq!(res["my_ids"]["buckets"][0]["key"], 1337.0);
assert_eq!(res["my_ids"]["buckets"][0]["doc_count"], 1);
assert_eq!(res["my_ids"]["buckets"][1]["key"], serde_json::Value::Null);
Ok(())
}
}

View File

@@ -1,476 +0,0 @@
use rustc_hash::FxHashMap;
use crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor;
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
IntermediateKey, IntermediateTermBucketEntry, IntermediateTermBucketResult,
};
use crate::aggregation::segment_agg_result::{
build_segment_agg_collector, SegmentAggregationCollector,
};
/// The specialized missing term aggregation.
#[derive(Default, Debug, Clone)]
pub struct TermMissingAgg {
missing_count: u32,
accessor_idx: usize,
sub_agg: Option<Box<dyn SegmentAggregationCollector>>,
}
impl TermMissingAgg {
pub(crate) fn new(
accessor_idx: usize,
sub_aggregations: &mut AggregationsWithAccessor,
) -> crate::Result<Self> {
let has_sub_aggregations = !sub_aggregations.is_empty();
let sub_agg = if has_sub_aggregations {
let sub_aggregation = build_segment_agg_collector(sub_aggregations)?;
Some(sub_aggregation)
} else {
None
};
Ok(Self {
accessor_idx,
sub_agg,
..Default::default()
})
}
}
impl SegmentAggregationCollector for TermMissingAgg {
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &AggregationsWithAccessor,
results: &mut IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let agg_with_accessor = &agg_with_accessor.aggs.values[self.accessor_idx];
let term_agg = agg_with_accessor
.agg
.agg
.as_term()
.expect("TermMissingAgg collector must be term agg req");
let missing = term_agg
.missing
.as_ref()
.expect("TermMissingAgg collector, but no missing found in agg req")
.clone();
let mut entries: FxHashMap<IntermediateKey, IntermediateTermBucketEntry> =
Default::default();
let mut missing_entry = IntermediateTermBucketEntry {
doc_count: self.missing_count,
sub_aggregation: Default::default(),
};
if let Some(sub_agg) = self.sub_agg {
let mut res = IntermediateAggregationResults::default();
sub_agg.add_intermediate_aggregation_result(
&agg_with_accessor.sub_aggregation,
&mut res,
)?;
missing_entry.sub_aggregation = res;
}
entries.insert(missing.into(), missing_entry);
let bucket = IntermediateBucketResult::Terms(IntermediateTermBucketResult {
entries,
sum_other_doc_count: 0,
doc_count_error_upper_bound: 0,
});
results.push(name, IntermediateAggregationResult::Bucket(bucket))?;
Ok(())
}
fn collect(
&mut self,
doc: crate::DocId,
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
let agg = &mut agg_with_accessor.aggs.values[self.accessor_idx];
let has_value = agg.accessors.iter().any(|acc| acc.index.has_value(doc));
if !has_value {
self.missing_count += 1;
if let Some(sub_agg) = self.sub_agg.as_mut() {
sub_agg.collect(doc, &mut agg.sub_aggregation)?;
}
}
Ok(())
}
fn collect_block(
&mut self,
docs: &[crate::DocId],
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
for doc in docs {
self.collect(*doc, agg_with_accessor)?;
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::exec_request_with_query;
use crate::schema::{Schema, FAST};
use crate::Index;
#[test]
fn terms_aggregation_missing_mixed_type_mult_seg_sub_agg() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", FAST);
let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(score => 1.0, json => json!({"mixed_type": 10.0})))
.unwrap();
index_writer.add_document(doc!(score => 5.0))?;
// index_writer.commit().unwrap();
//// => Segment with all values text
index_writer
.add_document(doc!(score => 1.0, json => json!({"mixed_type": "blue"})))
.unwrap();
index_writer.add_document(doc!(score => 5.0))?;
// index_writer.commit().unwrap();
// => Segment with mixed values
index_writer.add_document(doc!(json => json!({"mixed_type": "red"})))?;
index_writer.add_document(doc!(json => json!({"mixed_type": -20.5})))?;
index_writer.add_document(doc!(json => json!({"mixed_type": true})))?;
index_writer.add_document(doc!(score => 5.0))?;
index_writer.commit().unwrap();
let agg_req: Aggregations = serde_json::from_value(json!({
"replace_null": {
"terms": {
"field": "json.mixed_type",
"missing": "NULL"
},
"aggs": {
"sum_score": {
"sum": {
"field": "score"
}
}
}
},
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// text field
assert_eq!(res["replace_null"]["buckets"][0]["key"], "NULL");
assert_eq!(res["replace_null"]["buckets"][0]["doc_count"], 3);
assert_eq!(
res["replace_null"]["buckets"][0]["sum_score"]["value"],
15.0
);
assert_eq!(res["replace_null"]["sum_other_doc_count"], 0);
assert_eq!(res["replace_null"]["doc_count_error_upper_bound"], 0);
Ok(())
}
#[test]
fn terms_aggregation_missing_mixed_type_sub_agg_reg1() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", FAST);
let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer.add_document(doc!(score => 1.0, json => json!({"mixed_type": 10.0})))?;
index_writer.add_document(doc!(score => 5.0))?;
index_writer.add_document(doc!(score => 5.0))?;
index_writer.commit().unwrap();
let agg_req: Aggregations = serde_json::from_value(json!({
"replace_null": {
"terms": {
"field": "json.mixed_type",
"missing": "NULL"
},
"aggs": {
"sum_score": {
"sum": {
"field": "score"
}
}
}
},
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// text field
assert_eq!(res["replace_null"]["buckets"][0]["key"], "NULL");
assert_eq!(res["replace_null"]["buckets"][0]["doc_count"], 2);
assert_eq!(
res["replace_null"]["buckets"][0]["sum_score"]["value"],
10.0
);
assert_eq!(res["replace_null"]["sum_other_doc_count"], 0);
assert_eq!(res["replace_null"]["doc_count_error_upper_bound"], 0);
Ok(())
}
#[test]
fn terms_aggregation_missing_mult_seg_empty() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(score => 5.0))?;
index_writer.commit().unwrap();
index_writer.add_document(doc!(score => 5.0))?;
index_writer.commit().unwrap();
index_writer.add_document(doc!(score => 5.0))?;
index_writer.commit().unwrap();
let agg_req: Aggregations = serde_json::from_value(json!({
"replace_null": {
"terms": {
"field": "json.mixed_type",
"missing": "NULL"
},
"aggs": {
"sum_score": {
"sum": {
"field": "score"
}
}
}
},
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// text field
assert_eq!(res["replace_null"]["buckets"][0]["key"], "NULL");
assert_eq!(res["replace_null"]["buckets"][0]["doc_count"], 3);
assert_eq!(
res["replace_null"]["buckets"][0]["sum_score"]["value"],
15.0
);
assert_eq!(res["replace_null"]["sum_other_doc_count"], 0);
assert_eq!(res["replace_null"]["doc_count_error_upper_bound"], 0);
Ok(())
}
#[test]
fn terms_aggregation_missing_single_seg_empty() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(score => 5.0))?;
index_writer.add_document(doc!(score => 5.0))?;
index_writer.add_document(doc!(score => 5.0))?;
index_writer.commit().unwrap();
let agg_req: Aggregations = serde_json::from_value(json!({
"replace_null": {
"terms": {
"field": "json.mixed_type",
"missing": "NULL"
},
"aggs": {
"sum_score": {
"sum": {
"field": "score"
}
}
}
},
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// text field
assert_eq!(res["replace_null"]["buckets"][0]["key"], "NULL");
assert_eq!(res["replace_null"]["buckets"][0]["doc_count"], 3);
assert_eq!(
res["replace_null"]["buckets"][0]["sum_score"]["value"],
15.0
);
assert_eq!(res["replace_null"]["sum_other_doc_count"], 0);
assert_eq!(res["replace_null"]["doc_count_error_upper_bound"], 0);
Ok(())
}
#[test]
fn terms_aggregation_missing_mixed_type_mult_seg() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0})))
.unwrap();
index_writer.add_document(doc!())?;
index_writer.commit().unwrap();
//// => Segment with all values text
index_writer
.add_document(doc!(json => json!({"mixed_type": "blue"})))
.unwrap();
index_writer.add_document(doc!())?;
index_writer.commit().unwrap();
// => Segment with mixed values
index_writer
.add_document(doc!(json => json!({"mixed_type": "red"})))
.unwrap();
index_writer
.add_document(doc!(json => json!({"mixed_type": -20.5})))
.unwrap();
index_writer
.add_document(doc!(json => json!({"mixed_type": true})))
.unwrap();
index_writer.add_document(doc!())?;
index_writer.commit().unwrap();
let agg_req: Aggregations = serde_json::from_value(json!({
"replace_null": {
"terms": {
"field": "json.mixed_type",
"missing": "NULL"
},
},
"replace_num": {
"terms": {
"field": "json.mixed_type",
"missing": 1337
},
},
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// text field
assert_eq!(res["replace_null"]["buckets"][0]["key"], "NULL");
assert_eq!(res["replace_null"]["buckets"][0]["doc_count"], 3);
assert_eq!(res["replace_num"]["buckets"][0]["key"], 1337.0);
assert_eq!(res["replace_num"]["buckets"][0]["doc_count"], 3);
assert_eq!(res["replace_null"]["sum_other_doc_count"], 0);
assert_eq!(res["replace_null"]["doc_count_error_upper_bound"], 0);
Ok(())
}
#[test]
fn terms_aggregation_missing_str_on_numeric_field() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0})))
.unwrap();
index_writer.add_document(doc!())?;
index_writer.add_document(doc!())?;
index_writer
.add_document(doc!(json => json!({"mixed_type": -20.5})))
.unwrap();
index_writer.add_document(doc!())?;
index_writer.commit().unwrap();
let agg_req: Aggregations = serde_json::from_value(json!({
"replace_null": {
"terms": {
"field": "json.mixed_type",
"missing": "NULL"
},
},
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// text field
assert_eq!(res["replace_null"]["buckets"][0]["key"], "NULL");
assert_eq!(res["replace_null"]["buckets"][0]["doc_count"], 3);
assert_eq!(res["replace_null"]["sum_other_doc_count"], 0);
assert_eq!(res["replace_null"]["doc_count_error_upper_bound"], 0);
Ok(())
}
#[test]
fn terms_aggregation_missing_mixed_type_one_seg() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0})))
.unwrap();
index_writer.add_document(doc!())?;
//// => Segment with all values text
index_writer
.add_document(doc!(json => json!({"mixed_type": "blue"})))
.unwrap();
index_writer.add_document(doc!())?;
// => Segment with mixed values
index_writer
.add_document(doc!(json => json!({"mixed_type": "red"})))
.unwrap();
index_writer
.add_document(doc!(json => json!({"mixed_type": -20.5})))
.unwrap();
index_writer
.add_document(doc!(json => json!({"mixed_type": true})))
.unwrap();
index_writer.add_document(doc!())?;
index_writer.commit().unwrap();
let agg_req: Aggregations = serde_json::from_value(json!({
"replace_null": {
"terms": {
"field": "json.mixed_type",
"missing": "NULL"
},
},
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// text field
assert_eq!(res["replace_null"]["buckets"][0]["key"], "NULL");
assert_eq!(res["replace_null"]["buckets"][0]["doc_count"], 3);
assert_eq!(res["replace_null"]["sum_other_doc_count"], 0);
assert_eq!(res["replace_null"]["doc_count_error_upper_bound"], 0);
Ok(())
}
}

View File

@@ -111,6 +111,9 @@ impl IntermediateAggregationResults {
}
/// Convert intermediate result and its aggregation request to the final result.
///
/// Internal function, AggregationsInternal is used instead Aggregations, which is optimized
/// for internal processing, by splitting metric and buckets into separate groups.
pub(crate) fn into_final_result_internal(
self,
req: &Aggregations,
@@ -118,14 +121,7 @@ impl IntermediateAggregationResults {
) -> crate::Result<AggregationResults> {
let mut results: FxHashMap<String, AggregationResult> = FxHashMap::default();
for (key, agg_res) in self.aggs_res.into_iter() {
let req = req.get(key.as_str()).unwrap_or_else(|| {
panic!(
"Could not find key {:?} in request keys {:?}. This probably means that \
add_intermediate_aggregation_result passed the wrong agg object.",
key,
req.keys().collect::<Vec<_>>()
)
});
let req = req.get(key.as_str()).unwrap();
results.insert(key, agg_res.into_final_result(req, limits)?);
}
// Handle empty results
@@ -467,7 +463,7 @@ impl IntermediateBucketResult {
let buckets: Result<Vec<IntermediateHistogramBucketEntry>, TantivyError> =
buckets_left
.drain(..)
.merge_join_by(buckets_right, |left, right| {
.merge_join_by(buckets_right.into_iter(), |left, right| {
left.key.partial_cmp(&right.key).unwrap_or(Ordering::Equal)
})
.map(|either| match either {

View File

@@ -20,21 +20,12 @@ use super::{IntermediateStats, SegmentStatsCollector};
pub struct AverageAggregation {
/// The field name to compute the average on.
pub field: String,
/// The missing parameter defines how documents that are missing a value should be treated.
/// By default they will be ignored but it is also possible to treat them as if they had a
/// value. Examples in JSON format:
/// { "field": "my_numbers", "missing": "10.0" }
#[serde(default)]
pub missing: Option<f64>,
}
impl AverageAggregation {
/// Creates a new [`AverageAggregation`] instance from a field name.
pub fn from_field_name(field_name: String) -> Self {
Self {
field: field_name,
missing: None,
}
Self { field: field_name }
}
/// Returns the field name the aggregation is computed on.
pub fn field_name(&self) -> &str {

View File

@@ -18,23 +18,14 @@ use super::{IntermediateStats, SegmentStatsCollector};
/// ```
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct CountAggregation {
/// The field name to compute the count on.
/// The field name to compute the minimum on.
pub field: String,
/// The missing parameter defines how documents that are missing a value should be treated.
/// By default they will be ignored but it is also possible to treat them as if they had a
/// value. Examples in JSON format:
/// { "field": "my_numbers", "missing": "10.0" }
#[serde(default)]
pub missing: Option<f64>,
}
impl CountAggregation {
/// Creates a new [`CountAggregation`] instance from a field name.
pub fn from_field_name(field_name: String) -> Self {
Self {
field: field_name,
missing: None,
}
Self { field: field_name }
}
/// Returns the field name the aggregation is computed on.
pub fn field_name(&self) -> &str {
@@ -60,7 +51,7 @@ impl IntermediateCount {
pub fn merge_fruits(&mut self, other: IntermediateCount) {
self.stats.merge_fruits(other.stats);
}
/// Computes the final count value.
/// Computes the final minimum value.
pub fn finalize(&self) -> Option<f64> {
Some(self.stats.finalize().count as f64)
}

View File

@@ -20,21 +20,12 @@ use super::{IntermediateStats, SegmentStatsCollector};
pub struct MaxAggregation {
/// The field name to compute the maximum on.
pub field: String,
/// The missing parameter defines how documents that are missing a value should be treated.
/// By default they will be ignored but it is also possible to treat them as if they had a
/// value. Examples in JSON format:
/// { "field": "my_numbers", "missing": "10.0" }
#[serde(default)]
pub missing: Option<f64>,
}
impl MaxAggregation {
/// Creates a new [`MaxAggregation`] instance from a field name.
pub fn from_field_name(field_name: String) -> Self {
Self {
field: field_name,
missing: None,
}
Self { field: field_name }
}
/// Returns the field name the aggregation is computed on.
pub fn field_name(&self) -> &str {
@@ -65,55 +56,3 @@ impl IntermediateMax {
self.stats.finalize().max
}
}
#[cfg(test)]
mod tests {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::exec_request_with_query;
use crate::schema::{Schema, FAST};
use crate::Index;
#[test]
fn test_max_agg_with_missing() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with empty json
index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap();
// => Segment with json, but no field partially_empty
index_writer
.add_document(doc!(json => json!({"different_field": "blue"})))
.unwrap();
index_writer.commit().unwrap();
//// => Segment with field partially_empty
index_writer
.add_document(doc!(json => json!({"partially_empty": 10.0})))
.unwrap();
index_writer.add_document(doc!())?;
index_writer.commit().unwrap();
let agg_req: Aggregations = serde_json::from_value(json!({
"my_stats": {
"max": {
"field": "json.partially_empty",
"missing": 100.0,
}
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
assert_eq!(
res["my_stats"],
json!({
"value": 100.0,
})
);
Ok(())
}
}

View File

@@ -20,21 +20,12 @@ use super::{IntermediateStats, SegmentStatsCollector};
pub struct MinAggregation {
/// The field name to compute the minimum on.
pub field: String,
/// The missing parameter defines how documents that are missing a value should be treated.
/// By default they will be ignored but it is also possible to treat them as if they had a
/// value. Examples in JSON format:
/// { "field": "my_numbers", "missing": "10.0" }
#[serde(default)]
pub missing: Option<f64>,
}
impl MinAggregation {
/// Creates a new [`MinAggregation`] instance from a field name.
pub fn from_field_name(field_name: String) -> Self {
Self {
field: field_name,
missing: None,
}
Self { field: field_name }
}
/// Returns the field name the aggregation is computed on.
pub fn field_name(&self) -> &str {

View File

@@ -11,7 +11,7 @@ use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
};
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64, AggregationError};
use crate::aggregation::{f64_from_fastfield_u64, AggregationError};
use crate::{DocId, TantivyError};
/// # Percentiles
@@ -80,12 +80,6 @@ pub struct PercentilesAggregationReq {
/// Whether to return the percentiles as a hash map
#[serde(default = "default_as_true")]
pub keyed: bool,
/// The missing parameter defines how documents that are missing a value should be treated.
/// By default they will be ignored but it is also possible to treat them as if they had a
/// value. Examples in JSON format:
/// { "field": "my_numbers", "missing": "10.0" }
#[serde(skip_serializing_if = "Option::is_none", default)]
pub missing: Option<f64>,
}
fn default_percentiles() -> &'static [f64] {
&[1.0, 5.0, 25.0, 50.0, 75.0, 95.0, 99.0]
@@ -101,7 +95,6 @@ impl PercentilesAggregationReq {
field: field_name,
percents: None,
keyed: default_as_true(),
missing: None,
}
}
/// Returns the field name the aggregation is computed on.
@@ -134,7 +127,6 @@ pub(crate) struct SegmentPercentilesCollector {
pub(crate) percentiles: PercentilesCollector,
pub(crate) accessor_idx: usize,
val_cache: Vec<u64>,
missing: Option<u64>,
}
#[derive(Clone, Serialize, Deserialize)]
@@ -235,16 +227,11 @@ impl SegmentPercentilesCollector {
accessor_idx: usize,
) -> crate::Result<Self> {
req.validate()?;
let missing = req
.missing
.and_then(|val| f64_to_fastfield_u64(val, &field_type));
Ok(Self {
field_type,
percentiles: PercentilesCollector::new(),
accessor_idx,
val_cache: Default::default(),
missing,
})
}
#[inline]
@@ -253,17 +240,9 @@ impl SegmentPercentilesCollector {
docs: &[DocId],
agg_accessor: &mut AggregationWithAccessor,
) {
if let Some(missing) = self.missing.as_ref() {
agg_accessor.column_block_accessor.fetch_block_with_missing(
docs,
&agg_accessor.accessor,
*missing,
);
} else {
agg_accessor
.column_block_accessor
.fetch_block(docs, &agg_accessor.accessor);
}
agg_accessor
.column_block_accessor
.fetch_block(docs, &agg_accessor.accessor);
for val in agg_accessor.column_block_accessor.iter_vals() {
let val1 = f64_from_fastfield_u64(val, &self.field_type);
@@ -298,22 +277,9 @@ impl SegmentAggregationCollector for SegmentPercentilesCollector {
) -> crate::Result<()> {
let field = &agg_with_accessor.aggs.values[self.accessor_idx].accessor;
if let Some(missing) = self.missing {
let mut has_val = false;
for val in field.values_for_doc(doc) {
let val1 = f64_from_fastfield_u64(val, &self.field_type);
self.percentiles.collect(val1);
has_val = true;
}
if !has_val {
self.percentiles
.collect(f64_from_fastfield_u64(missing, &self.field_type));
}
} else {
for val in field.values_for_doc(doc) {
let val1 = f64_from_fastfield_u64(val, &self.field_type);
self.percentiles.collect(val1);
}
for val in field.values_for_doc(doc) {
let val1 = f64_from_fastfield_u64(val, &self.field_type);
self.percentiles.collect(val1);
}
Ok(())
@@ -343,12 +309,10 @@ mod tests {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::tests::{
exec_request_with_query, get_test_index_from_values, get_test_index_from_values_and_terms,
get_test_index_from_values, get_test_index_from_values_and_terms,
};
use crate::aggregation::AggregationCollector;
use crate::query::AllQuery;
use crate::schema::{Schema, FAST};
use crate::Index;
#[test]
fn test_aggregation_percentiles_empty_index() -> crate::Result<()> {
@@ -499,7 +463,7 @@ mod tests {
fn test_aggregation_percentiles(merge_segments: bool) -> crate::Result<()> {
use rand_distr::Distribution;
let num_values_in_segment = [100, 30_000, 8000];
let num_values_in_segment = vec![100, 30_000, 8000];
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
let mut rng = StdRng::from_seed([1u8; 32]);
@@ -581,110 +545,4 @@ mod tests {
Ok(())
}
#[test]
fn test_percentiles_missing_sub_agg() -> crate::Result<()> {
// This test verifies the `collect` method (in contrast to `collect_block`), which is
// called when the sub-aggregations are flushed.
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("texts", FAST);
let score_field_f64 = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
// writing the segment
index_writer.add_document(doc!(
score_field_f64 => 10.0f64,
text_field => "a"
))?;
index_writer.add_document(doc!(
score_field_f64 => 10.0f64,
text_field => "a"
))?;
index_writer.add_document(doc!(text_field => "a"))?;
index_writer.commit()?;
}
let agg_req: Aggregations = {
serde_json::from_value(json!({
"range_with_stats": {
"terms": {
"field": "texts"
},
"aggs": {
"percentiles": {
"percentiles": {
"field": "score",
"missing": 5.0
}
}
}
}
}))
.unwrap()
};
let res = exec_request_with_query(agg_req, &index, None)?;
assert_eq!(res["range_with_stats"]["buckets"][0]["doc_count"], 3);
assert_eq!(
res["range_with_stats"]["buckets"][0]["percentiles"]["values"]["1.0"],
5.0028295751107414
);
assert_eq!(
res["range_with_stats"]["buckets"][0]["percentiles"]["values"]["99.0"],
10.07469668951144
);
Ok(())
}
#[test]
fn test_percentiles_missing() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("texts", FAST);
let score_field_f64 = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
// writing the segment
index_writer.add_document(doc!(
score_field_f64 => 10.0f64,
text_field => "a"
))?;
index_writer.add_document(doc!(
score_field_f64 => 10.0f64,
text_field => "a"
))?;
index_writer.add_document(doc!(text_field => "a"))?;
index_writer.commit()?;
}
let agg_req: Aggregations = {
serde_json::from_value(json!({
"percentiles": {
"percentiles": {
"field": "score",
"missing": 5.0
}
}
}))
.unwrap()
};
let res = exec_request_with_query(agg_req, &index, None)?;
assert_eq!(res["percentiles"]["values"]["1.0"], 5.0028295751107414);
assert_eq!(res["percentiles"]["values"]["99.0"], 10.07469668951144);
Ok(())
}
}

View File

@@ -5,11 +5,11 @@ use super::*;
use crate::aggregation::agg_req_with_accessor::{
AggregationWithAccessor, AggregationsWithAccessor,
};
use crate::aggregation::f64_from_fastfield_u64;
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
};
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64};
use crate::{DocId, TantivyError};
/// A multi-value metric aggregation that computes a collection of statistics on numeric values that
@@ -29,21 +29,12 @@ use crate::{DocId, TantivyError};
pub struct StatsAggregation {
/// The field name to compute the stats on.
pub field: String,
/// The missing parameter defines how documents that are missing a value should be treated.
/// By default they will be ignored but it is also possible to treat them as if they had a
/// value. Examples in JSON format:
/// { "field": "my_numbers", "missing": "10.0" }
#[serde(default)]
pub missing: Option<f64>,
}
impl StatsAggregation {
/// Creates a new [`StatsAggregation`] instance from a field name.
pub fn from_field_name(field_name: String) -> Self {
StatsAggregation {
field: field_name,
missing: None,
}
StatsAggregation { field: field_name }
}
/// Returns the field name the aggregation is computed on.
pub fn field_name(&self) -> &str {
@@ -162,7 +153,6 @@ pub(crate) enum SegmentStatsType {
#[derive(Clone, Debug, PartialEq)]
pub(crate) struct SegmentStatsCollector {
missing: Option<u64>,
field_type: ColumnType,
pub(crate) collecting_for: SegmentStatsType,
pub(crate) stats: IntermediateStats,
@@ -175,15 +165,12 @@ impl SegmentStatsCollector {
field_type: ColumnType,
collecting_for: SegmentStatsType,
accessor_idx: usize,
missing: Option<f64>,
) -> Self {
let missing = missing.and_then(|val| f64_to_fastfield_u64(val, &field_type));
Self {
field_type,
collecting_for,
stats: IntermediateStats::default(),
accessor_idx,
missing,
val_cache: Default::default(),
}
}
@@ -193,17 +180,10 @@ impl SegmentStatsCollector {
docs: &[DocId],
agg_accessor: &mut AggregationWithAccessor,
) {
if let Some(missing) = self.missing.as_ref() {
agg_accessor.column_block_accessor.fetch_block_with_missing(
docs,
&agg_accessor.accessor,
*missing,
);
} else {
agg_accessor
.column_block_accessor
.fetch_block(docs, &agg_accessor.accessor);
}
agg_accessor
.column_block_accessor
.fetch_block(docs, &agg_accessor.accessor);
for val in agg_accessor.column_block_accessor.iter_vals() {
let val1 = f64_from_fastfield_u64(val, &self.field_type);
self.stats.collect(val1);
@@ -254,22 +234,10 @@ impl SegmentAggregationCollector for SegmentStatsCollector {
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
let field = &agg_with_accessor.aggs.values[self.accessor_idx].accessor;
if let Some(missing) = self.missing {
let mut has_val = false;
for val in field.values_for_doc(doc) {
let val1 = f64_from_fastfield_u64(val, &self.field_type);
self.stats.collect(val1);
has_val = true;
}
if !has_val {
self.stats
.collect(f64_from_fastfield_u64(missing, &self.field_type));
}
} else {
for val in field.values_for_doc(doc) {
let val1 = f64_from_fastfield_u64(val, &self.field_type);
self.stats.collect(val1);
}
for val in field.values_for_doc(doc) {
let val1 = f64_from_fastfield_u64(val, &self.field_type);
self.stats.collect(val1);
}
Ok(())
@@ -294,13 +262,11 @@ mod tests {
use crate::aggregation::agg_req::{Aggregation, Aggregations};
use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::tests::{
exec_request_with_query, get_test_index_2_segments, get_test_index_from_values,
};
use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values};
use crate::aggregation::AggregationCollector;
use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, FAST};
use crate::{Index, Term};
use crate::schema::IndexRecordOption;
use crate::Term;
#[test]
fn test_aggregation_stats_empty_index() -> crate::Result<()> {
@@ -487,159 +453,4 @@ mod tests {
Ok(())
}
#[test]
fn test_stats_json() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with empty json
index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap();
// => Segment with json, but no field partially_empty
index_writer
.add_document(doc!(json => json!({"different_field": "blue"})))
.unwrap();
index_writer.commit().unwrap();
//// => Segment with field partially_empty
index_writer
.add_document(doc!(json => json!({"partially_empty": 10.0})))
.unwrap();
index_writer.add_document(doc!())?;
index_writer.commit().unwrap();
let agg_req: Aggregations = serde_json::from_value(json!({
"my_stats": {
"stats": {
"field": "json.partially_empty"
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
assert_eq!(
res["my_stats"],
json!({
"avg": 10.0,
"count": 1,
"max": 10.0,
"min": 10.0,
"sum": 10.0
})
);
Ok(())
}
#[test]
fn test_stats_json_missing() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with empty json
index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap();
// => Segment with json, but no field partially_empty
index_writer
.add_document(doc!(json => json!({"different_field": "blue"})))
.unwrap();
index_writer.commit().unwrap();
//// => Segment with field partially_empty
index_writer
.add_document(doc!(json => json!({"partially_empty": 10.0})))
.unwrap();
index_writer.add_document(doc!())?;
index_writer.commit().unwrap();
let agg_req: Aggregations = serde_json::from_value(json!({
"my_stats": {
"stats": {
"field": "json.partially_empty",
"missing": 0.0
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
assert_eq!(
res["my_stats"],
json!({
"avg": 2.5,
"count": 4,
"max": 10.0,
"min": 0.0,
"sum": 10.0
})
);
Ok(())
}
#[test]
fn test_stats_json_missing_sub_agg() -> crate::Result<()> {
// This test verifies the `collect` method (in contrast to `collect_block`), which is
// called when the sub-aggregations are flushed.
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("texts", FAST);
let score_field_f64 = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
// writing the segment
index_writer.add_document(doc!(
score_field_f64 => 10.0f64,
text_field => "a"
))?;
index_writer.add_document(doc!(text_field => "a"))?;
index_writer.commit()?;
}
let agg_req: Aggregations = {
serde_json::from_value(json!({
"range_with_stats": {
"terms": {
"field": "texts"
},
"aggs": {
"my_stats": {
"stats": {
"field": "score",
"missing": 0.0
}
}
}
}
}))
.unwrap()
};
let res = exec_request_with_query(agg_req, &index, None)?;
assert_eq!(
res["range_with_stats"]["buckets"][0]["my_stats"]["count"],
2
);
assert_eq!(
res["range_with_stats"]["buckets"][0]["my_stats"]["min"],
0.0
);
assert_eq!(
res["range_with_stats"]["buckets"][0]["my_stats"]["avg"],
5.0
);
Ok(())
}
}

View File

@@ -20,21 +20,12 @@ use super::{IntermediateStats, SegmentStatsCollector};
pub struct SumAggregation {
/// The field name to compute the minimum on.
pub field: String,
/// The missing parameter defines how documents that are missing a value should be treated.
/// By default they will be ignored but it is also possible to treat them as if they had a
/// value. Examples in JSON format:
/// { "field": "my_numbers", "missing": "10.0" }
#[serde(default)]
pub missing: Option<f64>,
}
impl SumAggregation {
/// Creates a new [`SumAggregation`] instance from a field name.
pub fn from_field_name(field_name: String) -> Self {
Self {
field: field_name,
missing: None,
}
Self { field: field_name }
}
/// Returns the field name the aggregation is computed on.
pub fn field_name(&self) -> &str {

View File

@@ -411,7 +411,7 @@ mod tests {
.set_index_option(IndexRecordOption::Basic)
.set_fieldnorms(false),
)
.set_fast(None)
.set_fast("default")
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype.clone());
let text_field_id = schema_builder.add_text_field("text_id", text_fieldtype);
@@ -466,7 +466,7 @@ mod tests {
.set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_fast(None)
.set_fast("default")
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let date_field = schema_builder.add_date_field("date", FAST);

View File

@@ -15,7 +15,7 @@ use super::metric::{
SegmentPercentilesCollector, SegmentStatsCollector, SegmentStatsType, StatsAggregation,
SumAggregation,
};
use crate::aggregation::bucket::TermMissingAgg;
use crate::aggregation::bucket::SegmentTermCollectorComposite;
pub(crate) trait SegmentAggregationCollector: CollectorClone + Debug {
fn add_intermediate_aggregation_result(
@@ -82,24 +82,29 @@ pub(crate) fn build_single_agg_segment_collector(
use AggregationVariants::*;
match &req.agg.agg {
Terms(terms_req) => {
if req.accessors.is_empty() {
if let Some(acc2) = req.accessor2.as_ref() {
Ok(Box::new(
SegmentTermCollectorComposite::from_req_and_validate(
terms_req,
&mut req.sub_aggregation,
req.field_type,
acc2.1,
accessor_idx,
)?,
))
} else {
Ok(Box::new(SegmentTermCollector::from_req_and_validate(
terms_req,
&mut req.sub_aggregation,
req.field_type,
accessor_idx,
)?))
} else {
Ok(Box::new(TermMissingAgg::new(
accessor_idx,
&mut req.sub_aggregation,
)?))
}
}
Range(range_req) => Ok(Box::new(SegmentRangeCollector::from_req_and_validate(
range_req,
&mut req.sub_aggregation,
&req.limits,
&mut req.limits,
req.field_type,
accessor_idx,
)?)),
@@ -115,43 +120,35 @@ pub(crate) fn build_single_agg_segment_collector(
req.field_type,
accessor_idx,
)?)),
Average(AverageAggregation { missing, .. }) => {
Ok(Box::new(SegmentStatsCollector::from_req(
req.field_type,
SegmentStatsType::Average,
accessor_idx,
*missing,
)))
}
Count(CountAggregation { missing, .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
Average(AverageAggregation { .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
req.field_type,
SegmentStatsType::Average,
accessor_idx,
))),
Count(CountAggregation { .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
req.field_type,
SegmentStatsType::Count,
accessor_idx,
*missing,
))),
Max(MaxAggregation { missing, .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
Max(MaxAggregation { .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
req.field_type,
SegmentStatsType::Max,
accessor_idx,
*missing,
))),
Min(MinAggregation { missing, .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
Min(MinAggregation { .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
req.field_type,
SegmentStatsType::Min,
accessor_idx,
*missing,
))),
Stats(StatsAggregation { missing, .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
Stats(StatsAggregation { .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
req.field_type,
SegmentStatsType::Stats,
accessor_idx,
*missing,
))),
Sum(SumAggregation { missing, .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
Sum(SumAggregation { .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
req.field_type,
SegmentStatsType::Sum,
accessor_idx,
*missing,
))),
Percentiles(percentiles_req) => Ok(Box::new(
SegmentPercentilesCollector::from_req_and_validate(

View File

@@ -16,7 +16,7 @@ use crate::{DocId, Score, SegmentOrdinal, SegmentReader};
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer(15_000_000).unwrap();
/// let mut index_writer = index.writer(3_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind")).unwrap();
/// index_writer.add_document(doc!(title => "The Diary of Muadib")).unwrap();
/// index_writer.add_document(doc!(title => "A Dairy Cow")).unwrap();

View File

@@ -89,7 +89,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(15_000_000)?;
/// let mut index_writer = index.writer(3_000_000)?;
/// // a document can be associated with any number of facets
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",

View File

@@ -38,7 +38,7 @@ use crate::{DocId, Score, SegmentReader, TantivyError};
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64))?;
/// index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64))?;
/// index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64))?;
@@ -216,7 +216,7 @@ where
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// index_writer.add_document(doc!(title => "The Name of the Wind", barcode => &b"010101"[..]))?;
/// index_writer.add_document(doc!(title => "The Diary of Muadib", barcode => &b"110011"[..]))?;
/// index_writer.add_document(doc!(title => "A Dairy Cow", barcode => &b"110111"[..]))?;

View File

@@ -233,7 +233,7 @@ mod tests {
let val_field = schema_builder.add_i64_field("val_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests()?;
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
writer.add_document(doc!(val_field=>12i64))?;
writer.add_document(doc!(val_field=>-30i64))?;
writer.add_document(doc!(val_field=>-12i64))?;
@@ -255,7 +255,7 @@ mod tests {
let val_field = schema_builder.add_i64_field("val_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests()?;
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
writer.add_document(doc!(val_field=>12i64))?;
writer.commit()?;
writer.add_document(doc!(val_field=>-30i64))?;
@@ -280,7 +280,7 @@ mod tests {
let date_field = schema_builder.add_date_field("date_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests()?;
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
writer.add_document(
doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1986, Month::March, 9)?.with_hms(0, 0, 0)?)),

View File

@@ -44,7 +44,7 @@
//! # let title = schema_builder.add_text_field("title", TEXT);
//! # let schema = schema_builder.build();
//! # let index = Index::create_in_ram(schema);
//! # let mut index_writer = index.writer(15_000_000)?;
//! # let mut index_writer = index.writer(3_000_000)?;
//! # index_writer.add_document(doc!(
//! # title => "The Name of the Wind",
//! # ))?;

View File

@@ -120,7 +120,7 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// let mut index_writer = index.writer(15_000_000)?;
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(title => "The Name of the Wind"))?;
/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?;
/// index_writer.add_document(doc!(title => "A Dairy Cow"))?;

View File

@@ -26,7 +26,7 @@ pub fn test_filter_collector() -> crate::Result<()> {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_utc(OffsetDateTime::parse("1898-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_utc(OffsetDateTime::parse("2020-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_utc(OffsetDateTime::parse("2019-04-20T00:00:00+00:00", &Rfc3339).unwrap())))?;

View File

@@ -105,7 +105,7 @@ where
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// index_writer.add_document(doc!(title => "The Name of the Wind"))?;
/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?;
/// index_writer.add_document(doc!(title => "A Dairy Cow"))?;
@@ -210,7 +210,7 @@ impl TopDocs {
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// index_writer.add_document(doc!(title => "The Name of the Wind"))?;
/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?;
/// index_writer.add_document(doc!(title => "A Dairy Cow"))?;
@@ -261,7 +261,7 @@ impl TopDocs {
/// # let schema = schema_builder.build();
/// #
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64))?;
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64))?;
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64))?;
@@ -349,7 +349,7 @@ impl TopDocs {
/// # let schema = schema_builder.build();
/// #
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # index_writer.add_document(doc!(title => "MadCow Inc.", revenue => 92_000_000i64))?;
/// # index_writer.add_document(doc!(title => "Zozo Cow KKK", revenue => 119_000_000i64))?;
/// # index_writer.add_document(doc!(title => "Declining Cow", revenue => -63_000_000i64))?;
@@ -449,7 +449,7 @@ impl TopDocs {
/// fn create_index() -> tantivy::Result<Index> {
/// let schema = create_schema();
/// let index = Index::create_in_ram(schema);
/// let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// let product_name = index.schema().get_field("product_name").unwrap();
/// let popularity: Field = index.schema().get_field("popularity").unwrap();
/// index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64))?;
@@ -556,7 +556,7 @@ impl TopDocs {
/// # fn main() -> tantivy::Result<()> {
/// # let schema = create_schema();
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # let product_name = index.schema().get_field("product_name").unwrap();
/// #
/// let popularity: Field = index.schema().get_field("popularity").unwrap();
@@ -752,7 +752,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
index_writer.add_document(doc!(text_field=>"Hello happy tax payer."))?;
index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"))?;
index_writer.add_document(doc!(text_field=>"I like Droopy"))?;
@@ -1122,7 +1122,7 @@ mod tests {
mut doc_adder: impl FnMut(&mut IndexWriter),
) -> (Index, Box<dyn Query>) {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 15_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
doc_adder(&mut index_writer);
index_writer.commit().unwrap();
let query_parser = QueryParser::for_index(&index, vec![query_field]);

View File

@@ -16,7 +16,7 @@ use crate::directory::error::OpenReadError;
use crate::directory::MmapDirectory;
use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_LOCK};
use crate::error::{DataCorruption, TantivyError};
use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_BUDGET_NUM_BYTES_MIN};
use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_ARENA_NUM_BYTES_MIN};
use crate::indexer::segment_updater::save_metas;
use crate::reader::{IndexReader, IndexReaderBuilder};
use crate::schema::{Field, FieldType, Schema};
@@ -120,8 +120,8 @@ impl IndexBuilder {
Self {
schema: None,
index_settings: IndexSettings::default(),
tokenizer_manager: TokenizerManager::default(),
fast_field_tokenizer_manager: TokenizerManager::default(),
tokenizer_manager: TokenizerManager::default_for_indexing(),
fast_field_tokenizer_manager: TokenizerManager::default_for_fast_fields(),
}
}
@@ -400,8 +400,8 @@ impl Index {
settings: metas.index_settings.clone(),
directory,
schema,
tokenizers: TokenizerManager::default(),
fast_field_tokenizers: TokenizerManager::default(),
tokenizers: TokenizerManager::default_for_indexing(),
fast_field_tokenizers: TokenizerManager::default_for_fast_fields(),
executor: Arc::new(Executor::single_thread()),
inventory,
}
@@ -523,9 +523,9 @@ impl Index {
/// - `num_threads` defines the number of indexing workers that
/// should work at the same time.
///
/// - `overall_memory_budget_in_bytes` sets the amount of memory
/// - `overall_memory_arena_in_bytes` sets the amount of memory
/// allocated for all indexing thread.
/// Each thread will receive a budget of `overall_memory_budget_in_bytes / num_threads`.
/// Each thread will receive a budget of `overall_memory_arena_in_bytes / num_threads`.
///
/// # Errors
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`.
@@ -534,7 +534,7 @@ impl Index {
pub fn writer_with_num_threads(
&self,
num_threads: usize,
overall_memory_budget_in_bytes: usize,
overall_memory_arena_in_bytes: usize,
) -> crate::Result<IndexWriter> {
let directory_lock = self
.directory
@@ -550,7 +550,7 @@ impl Index {
),
)
})?;
let memory_arena_in_bytes_per_thread = overall_memory_budget_in_bytes / num_threads;
let memory_arena_in_bytes_per_thread = overall_memory_arena_in_bytes / num_threads;
IndexWriter::new(
self,
num_threads,
@@ -561,11 +561,11 @@ impl Index {
/// Helper to create an index writer for tests.
///
/// That index writer only simply has a single thread and a memory budget of 15 MB.
/// That index writer only simply has a single thread and a memory arena of 10 MB.
/// Using a single thread gives us a deterministic allocation of DocId.
#[cfg(test)]
pub fn writer_for_tests(&self) -> crate::Result<IndexWriter> {
self.writer_with_num_threads(1, 15_000_000)
self.writer_with_num_threads(1, 10_000_000)
}
/// Creates a multithreaded writer
@@ -579,13 +579,13 @@ impl Index {
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// If the memory arena per thread is too small or too big, returns
/// `TantivyError::InvalidArgument`
pub fn writer(&self, memory_budget_in_bytes: usize) -> crate::Result<IndexWriter> {
pub fn writer(&self, memory_arena_num_bytes: usize) -> crate::Result<IndexWriter> {
let mut num_threads = std::cmp::min(num_cpus::get(), MAX_NUM_THREAD);
let memory_budget_num_bytes_per_thread = memory_budget_in_bytes / num_threads;
if memory_budget_num_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN {
num_threads = (memory_budget_in_bytes / MEMORY_BUDGET_NUM_BYTES_MIN).max(1);
let memory_arena_num_bytes_per_thread = memory_arena_num_bytes / num_threads;
if memory_arena_num_bytes_per_thread < MEMORY_ARENA_NUM_BYTES_MIN {
num_threads = (memory_arena_num_bytes / MEMORY_ARENA_NUM_BYTES_MIN).max(1);
}
self.writer_with_num_threads(num_threads, memory_budget_in_bytes)
self.writer_with_num_threads(num_threads, memory_arena_num_bytes)
}
/// Accessor to the index settings

View File

@@ -60,7 +60,7 @@ impl IndexingPositionsPerPath {
fn get_position(&mut self, term: &Term) -> &mut IndexingPosition {
self.positions_per_path
.entry(murmurhash2(term.serialized_term()))
.or_default()
.or_insert_with(Default::default)
}
}
@@ -619,21 +619,21 @@ mod tests {
#[test]
fn test_split_json_path_escaped_dot() {
let json_path = split_json_path(r"toto\.titi");
let json_path = split_json_path(r#"toto\.titi"#);
assert_eq!(&json_path, &["toto.titi"]);
let json_path_2 = split_json_path(r"k8s\.container\.name");
let json_path_2 = split_json_path(r#"k8s\.container\.name"#);
assert_eq!(&json_path_2, &["k8s.container.name"]);
}
#[test]
fn test_split_json_path_escaped_backslash() {
let json_path = split_json_path(r"toto\\titi");
assert_eq!(&json_path, &[r"toto\titi"]);
let json_path = split_json_path(r#"toto\\titi"#);
assert_eq!(&json_path, &[r#"toto\titi"#]);
}
#[test]
fn test_split_json_path_escaped_normal_letter() {
let json_path = split_json_path(r"toto\titi");
let json_path = split_json_path(r#"toto\titi"#);
assert_eq!(&json_path, &[r#"tototiti"#]);
}
}

View File

@@ -283,7 +283,7 @@ fn test_single_segment_index_writer() -> crate::Result<()> {
let directory = RamDirectory::default();
let mut single_segment_index_writer = Index::builder()
.schema(schema)
.single_segment_index_writer(directory, 15_000_000)?;
.single_segment_index_writer(directory, 10_000_000)?;
for _ in 0..10 {
let doc = doc!(text_field=>"hello");
single_segment_index_writer.add_document(doc)?;

View File

@@ -73,7 +73,7 @@ impl From<io::Error> for TryAcquireLockError {
fn try_acquire_lock(
filepath: &Path,
directory: &dyn Directory,
directory: &mut dyn Directory,
) -> Result<DirectoryLock, TryAcquireLockError> {
let mut write = directory.open_write(filepath).map_err(|e| match e {
OpenWriteError::FileAlreadyExists(_) => TryAcquireLockError::FileExists,
@@ -191,10 +191,10 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
///
/// The method is blocking or not depending on the [`Lock`] object.
fn acquire_lock(&self, lock: &Lock) -> Result<DirectoryLock, LockError> {
let box_directory = self.box_clone();
let mut box_directory = self.box_clone();
let mut retry_policy = retry_policy(lock.is_blocking);
loop {
match try_acquire_lock(&lock.filepath, &*box_directory) {
match try_acquire_lock(&lock.filepath, &mut *box_directory) {
Ok(result) => {
return Ok(result);
}

View File

@@ -446,7 +446,8 @@ mod tests {
#[test]
fn test_text_fastfield() {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT | FAST);
let text_options: TextOptions = TextOptions::from(TEXT).set_fast("raw");
let text_field = schema_builder.add_text_field("text", text_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -939,7 +940,7 @@ mod tests {
.unwrap()
.first_or_default_col(0);
let numbers = [100, 200, 300];
let numbers = vec![100, 200, 300];
let test_range = |range: RangeInclusive<u64>| {
let expexted_count = numbers.iter().filter(|num| range.contains(num)).count();
let mut vec = vec![];
@@ -1013,7 +1014,7 @@ mod tests {
.unwrap()
.first_or_default_col(0);
let numbers = [1000, 1001, 1003];
let numbers = vec![1000, 1001, 1003];
let test_range = |range: RangeInclusive<u64>| {
let expexted_count = numbers.iter().filter(|num| range.contains(num)).count();
let mut vec = vec![];
@@ -1082,7 +1083,7 @@ mod tests {
#[test]
fn test_fast_field_in_json_field_expand_dots_disabled() {
let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default().set_fast(None);
let json_option = JsonObjectOptions::default().set_fast("default");
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -1098,7 +1099,7 @@ mod tests {
.unwrap()
.is_none());
let column = fast_field_reader
.column_opt::<i64>(r"json.attr\.age")
.column_opt::<i64>(r#"json.attr\.age"#)
.unwrap()
.unwrap();
let vals: Vec<i64> = column.values_for_doc(0u32).collect();
@@ -1108,7 +1109,7 @@ mod tests {
#[test]
fn test_fast_field_in_json_field_with_tokenizer() {
let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default().set_fast(Some("default"));
let json_option = JsonObjectOptions::default().set_fast("default");
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -1134,7 +1135,7 @@ mod tests {
fn test_fast_field_in_json_field_expand_dots_enabled() {
let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default()
.set_fast(None)
.set_fast("default")
.set_expand_dots_enabled();
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
@@ -1202,10 +1203,10 @@ mod tests {
#[test]
fn test_fast_field_tokenizer() {
let mut schema_builder = Schema::builder();
let opt = TextOptions::default().set_fast(Some("custom_lowercase"));
let opt = TextOptions::default().set_fast("custom_lowercase");
let text_field = schema_builder.add_text_field("text", opt);
let schema = schema_builder.build();
let ff_tokenizer_manager = TokenizerManager::default();
let ff_tokenizer_manager = TokenizerManager::default_for_fast_fields();
ff_tokenizer_manager.register(
"custom_lowercase",
TextAnalyzer::builder(RawTokenizer::default())
@@ -1238,7 +1239,7 @@ mod tests {
.set_index_option(crate::schema::IndexRecordOption::WithFreqs)
.set_tokenizer("raw"),
)
.set_fast(Some("default"))
.set_fast("default")
.set_stored();
let log_field = schema_builder.add_text_field("log_level", text_fieldtype);
@@ -1271,7 +1272,7 @@ mod tests {
fn test_shadowing_fast_field_with_expand_dots() {
let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default()
.set_fast(None)
.set_fast("default")
.set_expand_dots_enabled();
let json_field = schema_builder.add_json_field("jsonfield", json_option.clone());
let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option);

View File

@@ -276,7 +276,7 @@ impl FastFieldReaders {
}
/// Returns the all `u64` column used to represent any `u64`-mapped typed (String/Bytes term
/// ids, i64, u64, f64, bool, DateTime).
/// ids, i64, u64, f64, DateTime).
///
/// In case of JSON, there may be two columns. One for term and one for numerical types. (This
/// may change later to 3 types if JSON handles DateTime)
@@ -349,7 +349,7 @@ mod tests {
schema_builder.add_json_field(
"json_expand_dots_enabled",
JsonObjectOptions::default()
.set_fast(None)
.set_fast("default")
.set_expand_dots_enabled(),
);
let dynamic_field = schema_builder.add_json_field("_dyna", FAST);

View File

@@ -18,6 +18,8 @@ const JSON_DEPTH_LIMIT: usize = 20;
pub struct FastFieldsWriter {
columnar_writer: ColumnarWriter,
fast_field_names: Vec<Option<String>>, //< TODO see if we can hash the field name hash too.
// Field -> Fast field tokenizer mapping.
// All text fast fields should have a tokenizer.
per_field_tokenizer: Vec<Option<TextAnalyzer>>,
date_precisions: Vec<DateTimePrecision>,
expand_dots: Vec<bool>,
@@ -61,7 +63,7 @@ impl FastFieldsWriter {
if let Some(tokenizer_name) = json_object_options.get_fast_field_tokenizer_name() {
let text_analyzer = tokenizer_manager.get(tokenizer_name).ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"Tokenizer {tokenizer_name:?} not found"
"Tokenizer `{tokenizer_name}` not found"
))
})?;
per_field_tokenizer[field_id.field_id() as usize] = Some(text_analyzer);
@@ -157,9 +159,6 @@ impl FastFieldsWriter {
&token.text,
);
})
} else {
self.columnar_writer
.record_str(doc_id, field_name.as_str(), text_val);
}
}
Value::Bytes(bytes_val) => {
@@ -201,18 +200,20 @@ impl FastFieldsWriter {
self.json_path_buffer.clear();
self.json_path_buffer.push_str(field_name);
let text_analyzer =
let text_analyzer_opt =
&mut self.per_field_tokenizer[field_value.field().field_id() as usize];
record_json_obj_to_columnar_writer(
doc_id,
json_obj,
expand_dots,
JSON_DEPTH_LIMIT,
&mut self.json_path_buffer,
&mut self.columnar_writer,
text_analyzer,
);
if let Some(text_analyzer) = text_analyzer_opt {
record_json_obj_to_columnar_writer(
doc_id,
json_obj,
expand_dots,
JSON_DEPTH_LIMIT,
&mut self.json_path_buffer,
&mut self.columnar_writer,
text_analyzer,
);
}
}
Value::IpAddr(ip_addr) => {
self.columnar_writer
@@ -263,7 +264,7 @@ fn record_json_obj_to_columnar_writer(
remaining_depth_limit: usize,
json_path_buffer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter,
tokenizer: &mut Option<TextAnalyzer>,
text_analyzer: &mut TextAnalyzer,
) {
for (key, child) in json_obj {
let len_path = json_path_buffer.len();
@@ -288,7 +289,7 @@ fn record_json_obj_to_columnar_writer(
remaining_depth_limit,
json_path_buffer,
columnar_writer,
tokenizer,
text_analyzer,
);
// popping our sub path.
json_path_buffer.truncate(len_path);
@@ -302,7 +303,7 @@ fn record_json_value_to_columnar_writer(
mut remaining_depth_limit: usize,
json_path_writer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter,
tokenizer: &mut Option<TextAnalyzer>,
text_analyzer: &mut TextAnalyzer,
) {
if remaining_depth_limit == 0 {
return;
@@ -321,14 +322,10 @@ fn record_json_value_to_columnar_writer(
}
}
serde_json::Value::String(text) => {
if let Some(text_analyzer) = tokenizer.as_mut() {
let mut token_stream = text_analyzer.token_stream(text);
token_stream.process(&mut |token| {
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
})
} else {
columnar_writer.record_str(doc, json_path_writer.as_str(), text);
}
let mut token_stream = text_analyzer.token_stream(text);
token_stream.process(&mut |token| {
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
});
}
serde_json::Value::Array(arr) => {
for el in arr {
@@ -339,7 +336,7 @@ fn record_json_value_to_columnar_writer(
remaining_depth_limit,
json_path_writer,
columnar_writer,
tokenizer,
text_analyzer,
);
}
}
@@ -351,7 +348,7 @@ fn record_json_value_to_columnar_writer(
remaining_depth_limit,
json_path_writer,
columnar_writer,
tokenizer,
text_analyzer,
);
}
}
@@ -371,6 +368,9 @@ mod tests {
) -> ColumnarReader {
let mut columnar_writer = ColumnarWriter::default();
let mut json_path = String::new();
let mut text_analyzer = crate::tokenizer::TokenizerManager::default_for_fast_fields()
.get(crate::schema::DEFAULT_FAST_FIELD_TOKENIZER)
.unwrap();
for (doc, json_doc) in json_docs.iter().enumerate() {
record_json_value_to_columnar_writer(
doc as u32,
@@ -379,7 +379,7 @@ mod tests {
JSON_DEPTH_LIMIT,
&mut json_path,
&mut columnar_writer,
&mut None,
&mut text_analyzer,
);
}
let mut buffer = Vec::new();
@@ -399,6 +399,7 @@ mod tests {
});
let columnar_reader = test_columnar_from_jsons_aux(&[json_doc], false);
let columns = columnar_reader.list_columns().unwrap();
assert_eq!(columns.len(), 5);
{
assert_eq!(columns[0].0, "arr");
let column_arr_opt: Option<StrColumn> = columns[0].1.open().unwrap().into();
@@ -434,7 +435,9 @@ mod tests {
{
assert_eq!(columns[4].0, "text");
let column_text_opt: Option<StrColumn> = columns[4].1.open().unwrap().into();
assert!(column_text_opt.unwrap().term_ords(0).eq([0].into_iter()));
let column_text = column_text_opt.unwrap();
let term_ords: Vec<u64> = column_text.term_ords(0).collect();
assert_eq!(&term_ords[..], &[0]);
}
}

View File

@@ -2,7 +2,6 @@ use std::collections::HashSet;
use rand::{thread_rng, Rng};
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::schema::*;
use crate::{doc, schema, Index, IndexSettings, IndexSortByField, Order, Searcher};
@@ -31,7 +30,7 @@ fn test_functional_store() -> crate::Result<()> {
let mut rng = thread_rng();
let mut index_writer = index.writer_with_num_threads(3, MEMORY_BUDGET_NUM_BYTES_MIN)?;
let mut index_writer = index.writer_with_num_threads(3, 12_000_000)?;
let mut doc_set: Vec<u64> = Vec::new();

View File

@@ -152,11 +152,8 @@ pub(crate) fn get_doc_id_mapping_from_field(
#[cfg(test)]
mod tests_indexsorting {
use common::DateTime;
use crate::collector::TopDocs;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::schema::{Schema, *};
use crate::{DocAddress, Index, IndexSettings, IndexSortByField, Order};
@@ -447,93 +444,48 @@ mod tests_indexsorting {
Ok(())
}
#[test]
fn test_sort_index_fast_field() -> crate::Result<()> {
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "my_number".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
get_text_options(),
)?;
assert_eq!(
index.settings().sort_by_field.as_ref().unwrap().field,
"my_number".to_string()
);
// #[test]
// fn test_sort_index_fast_field() -> crate::Result<()> {
// let index = create_test_index(
// Some(IndexSettings {
// sort_by_field: Some(IndexSortByField {
// field: "my_number".to_string(),
// order: Order::Asc,
// }),
// ..Default::default()
// }),
// get_text_options(),
// )?;
// assert_eq!(
// index.settings().sort_by_field.as_ref().unwrap().field,
// "my_number".to_string()
// );
let searcher = index.reader()?.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
let fast_fields = segment_reader.fast_fields();
// let searcher = index.reader()?.searcher();
// assert_eq!(searcher.segment_readers().len(), 1);
// let segment_reader = searcher.segment_reader(0);
// let fast_fields = segment_reader.fast_fields();
// let my_number = index.schema().get_field("my_number").unwrap();
let fast_field = fast_fields
.u64("my_number")
.unwrap()
.first_or_default_col(999);
assert_eq!(fast_field.get_val(0), 10u64);
assert_eq!(fast_field.get_val(1), 20u64);
assert_eq!(fast_field.get_val(2), 30u64);
// let fast_field = fast_fields.u64(my_number).unwrap();
// assert_eq!(fast_field.get_val(0), 10u64);
// assert_eq!(fast_field.get_val(1), 20u64);
// assert_eq!(fast_field.get_val(2), 30u64);
let multifield = fast_fields.u64("multi_numbers").unwrap();
let vals: Vec<u64> = multifield.values_for_doc(0u32).collect();
assert_eq!(vals, &[] as &[u64]);
let vals: Vec<_> = multifield.values_for_doc(1u32).collect();
assert_eq!(vals, &[5, 6]);
// let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
// let multifield = fast_fields.u64s(multi_numbers).unwrap();
// let mut vals = vec![];
// multifield.get_vals(0u32, &mut vals);
// assert_eq!(vals, &[] as &[u64]);
// let mut vals = vec![];
// multifield.get_vals(1u32, &mut vals);
// assert_eq!(vals, &[5, 6]);
let vals: Vec<_> = multifield.values_for_doc(2u32).collect();
assert_eq!(vals, &[3]);
Ok(())
}
#[test]
fn test_with_sort_by_date_field() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date", INDEXED | STORED | FAST);
let schema = schema_builder.build();
let settings = IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "date".to_string(),
order: Order::Desc,
}),
..Default::default()
};
let index = Index::builder()
.schema(schema)
.settings(settings)
.create_in_ram()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
date_field => DateTime::from_timestamp_secs(1000),
))?;
index_writer.add_document(doc!(
date_field => DateTime::from_timestamp_secs(999),
))?;
index_writer.add_document(doc!(
date_field => DateTime::from_timestamp_secs(1001),
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
let fast_fields = segment_reader.fast_fields();
let fast_field = fast_fields
.date("date")
.unwrap()
.first_or_default_col(DateTime::from_timestamp_secs(0));
assert_eq!(fast_field.get_val(0), DateTime::from_timestamp_secs(1001));
assert_eq!(fast_field.get_val(1), DateTime::from_timestamp_secs(1000));
assert_eq!(fast_field.get_val(2), DateTime::from_timestamp_secs(999));
Ok(())
}
// let mut vals = vec![];
// multifield.get_vals(2u32, &mut vals);
// assert_eq!(vals, &[3]);
// Ok(())
// }
#[test]
fn test_doc_mapping() {

View File

@@ -27,9 +27,9 @@ use crate::{FutureResult, Opstamp};
// in the `memory_arena` goes below MARGIN_IN_BYTES.
pub const MARGIN_IN_BYTES: usize = 1_000_000;
// We impose the memory per thread to be at least 15 MB, as the baseline consumption is 12MB.
pub const MEMORY_BUDGET_NUM_BYTES_MIN: usize = ((MARGIN_IN_BYTES as u32) * 15u32) as usize;
pub const MEMORY_BUDGET_NUM_BYTES_MAX: usize = u32::MAX as usize - MARGIN_IN_BYTES;
// We impose the memory per thread to be at least 3 MB.
pub const MEMORY_ARENA_NUM_BYTES_MIN: usize = ((MARGIN_IN_BYTES as u32) * 3u32) as usize;
pub const MEMORY_ARENA_NUM_BYTES_MAX: usize = u32::MAX as usize - MARGIN_IN_BYTES;
// We impose the number of index writer threads to be at most this.
pub const MAX_NUM_THREAD: usize = 8;
@@ -57,8 +57,7 @@ pub struct IndexWriter {
index: Index,
// The memory budget per thread, after which a commit is triggered.
memory_budget_in_bytes_per_thread: usize,
memory_arena_in_bytes_per_thread: usize,
workers_join_handle: Vec<JoinHandle<crate::Result<()>>>,
@@ -168,7 +167,7 @@ fn index_documents(
memory_budget: usize,
segment: Segment,
grouped_document_iterator: &mut dyn Iterator<Item = AddBatch>,
segment_updater: &SegmentUpdater,
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor,
) -> crate::Result<()> {
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone())?;
@@ -265,19 +264,19 @@ impl IndexWriter {
pub(crate) fn new(
index: &Index,
num_threads: usize,
memory_budget_in_bytes_per_thread: usize,
memory_arena_in_bytes_per_thread: usize,
directory_lock: DirectoryLock,
) -> crate::Result<IndexWriter> {
if memory_budget_in_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN {
if memory_arena_in_bytes_per_thread < MEMORY_ARENA_NUM_BYTES_MIN {
let err_msg = format!(
"The memory arena in bytes per thread needs to be at least \
{MEMORY_BUDGET_NUM_BYTES_MIN}."
{MEMORY_ARENA_NUM_BYTES_MIN}."
);
return Err(TantivyError::InvalidArgument(err_msg));
}
if memory_budget_in_bytes_per_thread >= MEMORY_BUDGET_NUM_BYTES_MAX {
if memory_arena_in_bytes_per_thread >= MEMORY_ARENA_NUM_BYTES_MAX {
let err_msg = format!(
"The memory arena in bytes per thread cannot exceed {MEMORY_BUDGET_NUM_BYTES_MAX}"
"The memory arena in bytes per thread cannot exceed {MEMORY_ARENA_NUM_BYTES_MAX}"
);
return Err(TantivyError::InvalidArgument(err_msg));
}
@@ -296,7 +295,7 @@ impl IndexWriter {
let mut index_writer = IndexWriter {
_directory_lock: Some(directory_lock),
memory_budget_in_bytes_per_thread,
memory_arena_in_bytes_per_thread,
index: index.clone(),
index_writer_status: IndexWriterStatus::from(document_receiver),
operation_sender: document_sender,
@@ -393,11 +392,11 @@ impl IndexWriter {
let document_receiver_clone = self.operation_receiver()?;
let index_writer_bomb = self.index_writer_status.create_bomb();
let segment_updater = self.segment_updater.clone();
let mut segment_updater = self.segment_updater.clone();
let mut delete_cursor = self.delete_queue.cursor();
let mem_budget = self.memory_budget_in_bytes_per_thread;
let mem_budget = self.memory_arena_in_bytes_per_thread;
let index = self.index.clone();
let join_handle: JoinHandle<crate::Result<()>> = thread::Builder::new()
.name(format!("thrd-tantivy-index{}", self.worker_id))
@@ -429,7 +428,7 @@ impl IndexWriter {
mem_budget,
index.new_segment(),
&mut document_iterator,
&segment_updater,
&mut segment_updater,
delete_cursor.clone(),
)?;
}
@@ -555,7 +554,7 @@ impl IndexWriter {
let new_index_writer: IndexWriter = IndexWriter::new(
&self.index,
self.num_threads,
self.memory_budget_in_bytes_per_thread,
self.memory_arena_in_bytes_per_thread,
directory_lock,
)?;
@@ -811,7 +810,6 @@ mod tests {
use crate::collector::TopDocs;
use crate::directory::error::LockError;
use crate::error::*;
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::indexer::NoMergePolicy;
use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery};
use crate::schema::{
@@ -943,7 +941,7 @@ mod tests {
fn test_empty_operations_group() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer_for_tests().unwrap();
let index_writer = index.writer(3_000_000).unwrap();
let operations1 = vec![];
let batch_opstamp1 = index_writer.run(operations1).unwrap();
assert_eq!(batch_opstamp1, 0u64);
@@ -956,8 +954,8 @@ mod tests {
fn test_lockfile_stops_duplicates() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let _index_writer = index.writer_for_tests().unwrap();
match index.writer_for_tests() {
let _index_writer = index.writer(3_000_000).unwrap();
match index.writer(3_000_000) {
Err(TantivyError::LockFailure(LockError::LockBusy, _)) => {}
_ => panic!("Expected a `LockFailure` error"),
}
@@ -981,7 +979,7 @@ mod tests {
fn test_set_merge_policy() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer_for_tests().unwrap();
let index_writer = index.writer(3_000_000).unwrap();
assert_eq!(
format!("{:?}", index_writer.get_merge_policy()),
"LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, \
@@ -1000,11 +998,11 @@ mod tests {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
{
let _index_writer = index.writer_for_tests().unwrap();
let _index_writer = index.writer(3_000_000).unwrap();
// the lock should be released when the
// index_writer leaves the scope.
}
let _index_writer_two = index.writer_for_tests().unwrap();
let _index_writer_two = index.writer(3_000_000).unwrap();
}
#[test]
@@ -1024,7 +1022,7 @@ mod tests {
{
// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer = index.writer(3_000_000)?;
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.rollback()?;
assert_eq!(index_writer.commit_opstamp(), 0u64);
@@ -1056,7 +1054,7 @@ mod tests {
reader.searcher().doc_freq(&term_a).unwrap()
};
// writing the segment
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer(12_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.commit()?;
// this should create 1 segment
@@ -1096,7 +1094,7 @@ mod tests {
reader.searcher().doc_freq(&term_a).unwrap()
};
// writing the segment
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer(12_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"a"))?;
@@ -1142,7 +1140,7 @@ mod tests {
reader.searcher().doc_freq(&term_a).unwrap()
};
// writing the segment
let mut index_writer = index.writer(MEMORY_BUDGET_NUM_BYTES_MIN).unwrap();
let mut index_writer = index.writer(12_000_000).unwrap();
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
index_writer.add_document(doc!(text_field=>"a"))?;
@@ -1198,8 +1196,7 @@ mod tests {
{
// writing the segment
let mut index_writer =
index.writer_with_num_threads(4, MEMORY_BUDGET_NUM_BYTES_MIN * 4)?;
let mut index_writer = index.writer_with_num_threads(4, 12_000_000)?;
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"))?;
@@ -1248,9 +1245,7 @@ mod tests {
let term = Term::from_field_text(text_field, s);
searcher.doc_freq(&term).unwrap()
};
let mut index_writer = index
.writer_with_num_threads(4, MEMORY_BUDGET_NUM_BYTES_MIN * 4)
.unwrap();
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let add_tstamp = index_writer.add_document(doc!(text_field => "a")).unwrap();
let commit_tstamp = index_writer.commit().unwrap();
@@ -1267,9 +1262,7 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index
.writer_with_num_threads(4, MEMORY_BUDGET_NUM_BYTES_MIN * 4)
.unwrap();
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let add_tstamp = index_writer.add_document(doc!(text_field => "a")).unwrap();
@@ -1318,9 +1311,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
// writing the segment
let mut index_writer = index
.writer_with_num_threads(4, MEMORY_BUDGET_NUM_BYTES_MIN * 4)
.unwrap();
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let res = index_writer.delete_all_documents();
assert!(res.is_ok());
@@ -1347,9 +1338,7 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index
.writer_with_num_threads(4, MEMORY_BUDGET_NUM_BYTES_MIN * 4)
.unwrap();
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
// add one simple doc
assert!(index_writer.add_document(doc!(text_field => "a")).is_ok());
@@ -1382,9 +1371,7 @@ mod tests {
fn test_delete_all_documents_empty_index() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index
.writer_with_num_threads(4, MEMORY_BUDGET_NUM_BYTES_MIN * 4)
.unwrap();
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let clear = index_writer.delete_all_documents();
let commit = index_writer.commit();
assert!(clear.is_ok());
@@ -1395,9 +1382,7 @@ mod tests {
fn test_delete_all_documents_index_twice() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index
.writer_with_num_threads(4, MEMORY_BUDGET_NUM_BYTES_MIN * 4)
.unwrap();
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let clear = index_writer.delete_all_documents();
let commit = index_writer.commit();
assert!(clear.is_ok());
@@ -1703,8 +1688,7 @@ mod tests {
let old_reader = index.reader()?;
// Every 3rd doc has only id field
let id_is_full_doc = |id| id % 3 != 0;
let id_exists = |id| id % 3 != 0; // 0 does not exist
let multi_text_field_text1 = "test1 test2 test3 test1 test2 test3";
// rotate left
@@ -1720,7 +1704,7 @@ mod tests {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
let ip = ip_from_id(id);
if !id_is_full_doc(id) {
if !id_exists(id) {
// every 3rd doc has no ip field
index_writer.add_document(doc!(
id_field=>id,
@@ -1840,7 +1824,7 @@ mod tests {
let num_docs_with_values = expected_ids_and_num_occurrences
.iter()
.filter(|(id, _id_occurrences)| id_is_full_doc(**id))
.filter(|(id, _id_occurrences)| id_exists(**id))
.map(|(_, id_occurrences)| *id_occurrences as usize)
.sum::<usize>();
@@ -1864,7 +1848,7 @@ mod tests {
if force_end_merge && num_segments_before_merge > 1 && num_segments_after_merge == 1 {
let mut expected_multi_ips: Vec<_> = id_list
.iter()
.filter(|id| id_is_full_doc(**id))
.filter(|id| id_exists(**id))
.flat_map(|id| vec![ip_from_id(*id), ip_from_id(*id)])
.collect();
assert_eq!(num_ips, expected_multi_ips.len() as u32);
@@ -1902,7 +1886,7 @@ mod tests {
let expected_ips = expected_ids_and_num_occurrences
.keys()
.flat_map(|id| {
if !id_is_full_doc(*id) {
if !id_exists(*id) {
None
} else {
Some(Ipv6Addr::from_u128(*id as u128))
@@ -1914,7 +1898,7 @@ mod tests {
let expected_ips = expected_ids_and_num_occurrences
.keys()
.filter_map(|id| {
if !id_is_full_doc(*id) {
if !id_exists(*id) {
None
} else {
Some(Ipv6Addr::from_u128(*id as u128))
@@ -1949,7 +1933,7 @@ mod tests {
let id = id_reader.first(doc).unwrap();
let vals: Vec<u64> = ff_reader.values_for_doc(doc).collect();
if id_is_full_doc(id) {
if id_exists(id) {
assert_eq!(vals.len(), 2);
assert_eq!(vals[0], vals[1]);
assert!(expected_ids_and_num_occurrences.contains_key(&vals[0]));
@@ -1959,7 +1943,7 @@ mod tests {
}
let bool_vals: Vec<bool> = bool_ff_reader.values_for_doc(doc).collect();
if id_is_full_doc(id) {
if id_exists(id) {
assert_eq!(bool_vals.len(), 2);
assert_ne!(bool_vals[0], bool_vals[1]);
} else {
@@ -1988,7 +1972,7 @@ mod tests {
.as_u64()
.unwrap();
assert!(expected_ids_and_num_occurrences.contains_key(&id));
if id_is_full_doc(id) {
if id_exists(id) {
let id2 = store_reader
.get(doc_id)
.unwrap()
@@ -2035,7 +2019,7 @@ mod tests {
let (existing_id, count) = (*id, *count);
let get_num_hits = |field| do_search(&existing_id.to_string(), field).len() as u64;
assert_eq!(get_num_hits(id_field), count);
if !id_is_full_doc(existing_id) {
if !id_exists(existing_id) {
continue;
}
assert_eq!(get_num_hits(text_field), count);
@@ -2085,7 +2069,7 @@ mod tests {
//
for (existing_id, count) in &expected_ids_and_num_occurrences {
let (existing_id, count) = (*existing_id, *count);
if !id_is_full_doc(existing_id) {
if !id_exists(existing_id) {
continue;
}
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
@@ -2102,84 +2086,34 @@ mod tests {
}
}
// Range query
// assert data is like expected
//
// Take half as sample
let mut sample: Vec<_> = expected_ids_and_num_occurrences.iter().collect();
sample.sort_by_key(|(k, _num_occurences)| *k);
// sample.truncate(sample.len() / 2);
if !sample.is_empty() {
let (left_sample, right_sample) = sample.split_at(sample.len() / 2);
let expected_count = |sample: &[(&u64, &u64)]| {
sample
.iter()
.filter(|(id, _)| id_is_full_doc(**id))
.map(|(_id, num_occurences)| **num_occurences)
.sum::<u64>()
};
fn gen_query_inclusive<T1: ToString, T2: ToString>(
field: &str,
from: T1,
to: T2,
) -> String {
for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) {
let (existing_id, count) = (*existing_id, *count);
if !id_exists(existing_id) {
continue;
}
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string())
}
};
let ip = ip_from_id(existing_id);
// Query first half
if !left_sample.is_empty() {
let expected_count = expected_count(left_sample);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
// Range query on single value field
let query = gen_query_inclusive("ip", ip, ip);
assert_eq!(do_search_ip_field(&query), count);
let start_range = *left_sample[0].0;
let end_range = *left_sample.last().unwrap().0;
let query = gen_query_inclusive("id_opt", start_range, end_range);
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip, ip);
// Range query on ip field
let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", "*", ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ips", "*", ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
}
// Query second half
if !right_sample.is_empty() {
let expected_count = expected_count(right_sample);
let start_range = *right_sample[0].0;
let end_range = *right_sample.last().unwrap().0;
// Range query on id opt field
let query =
gen_query_inclusive("id_opt", start_range.to_string(), end_range.to_string());
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
// Range query on ip field
let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", ip1, "*");
assert_eq!(do_search_ip_field(&query), expected_count);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ips", ip1, "*");
assert_eq!(do_search_ip_field(&query), expected_count);
}
assert_eq!(do_search_ip_field(&query), count);
}
// ip range query on fast field
//
for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) {
let (existing_id, count) = (*existing_id, *count);
if !id_is_full_doc(existing_id) {
if !id_exists(existing_id) {
continue;
}
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
@@ -2207,7 +2141,7 @@ mod tests {
.first_or_default_col(9999);
for doc_id in segment_reader.doc_ids_alive() {
let id = ff_reader.get_val(doc_id);
if !id_is_full_doc(id) {
if !id_exists(id) {
continue;
}
let facet_ords: Vec<u64> = facet_reader.facet_ords(doc_id).collect();
@@ -2245,12 +2179,6 @@ mod tests {
Ok(index)
}
#[test]
fn test_fast_field_range() {
let ops: Vec<_> = (0..1000).map(|id| IndexingOp::AddDoc { id }).collect();
assert!(test_operation_strategy(&ops, false, true).is_ok());
}
#[test]
fn test_sort_index_on_opt_field_regression() {
assert!(test_operation_strategy(
@@ -2498,13 +2426,6 @@ mod tests {
test_operation_strategy(&ops[..], false, true).unwrap();
}
#[test]
fn test_merge_regression_1() {
use IndexingOp::*;
let ops = &[AddDoc { id: 15 }, Commit, AddDoc { id: 9 }, Commit, Merge];
test_operation_strategy(&ops[..], false, true).unwrap();
}
#[test]
fn test_range_query_bug_1() {
use IndexingOp::*;

View File

@@ -178,7 +178,7 @@ impl IndexMerger {
alive_bitset_opt: Vec<Option<AliveBitSet>>,
) -> crate::Result<IndexMerger> {
let mut readers = vec![];
for (segment, new_alive_bitset_opt) in segments.iter().zip(alive_bitset_opt) {
for (segment, new_alive_bitset_opt) in segments.iter().zip(alive_bitset_opt.into_iter()) {
if segment.meta().num_docs() > 0 {
let reader =
SegmentReader::open_with_custom_alive_set(segment, new_alive_bitset_opt)?;

View File

@@ -89,7 +89,7 @@ mod tests_mmap {
let parse_query = QueryParser::for_index(&index, Vec::new());
{
let query = parse_query
.parse_query(r"json.k8s\.container\.name:prometheus")
.parse_query(r#"json.k8s\.container\.name:prometheus"#)
.unwrap();
let num_docs = searcher.search(&query, &Count).unwrap();
assert_eq!(num_docs, 1);
@@ -127,7 +127,7 @@ mod tests_mmap {
}
{
let query = parse_query
.parse_query(r"json.k8s\.container\.name:prometheus")
.parse_query(r#"json.k8s\.container\.name:prometheus"#)
.unwrap();
let num_docs = searcher.search(&query, &Count).unwrap();
assert_eq!(num_docs, 1);

View File

@@ -26,8 +26,6 @@ use crate::{DocId, Document, Opstamp, SegmentComponent, TantivyError};
fn compute_initial_table_size(per_thread_memory_budget: usize) -> crate::Result<usize> {
let table_memory_upper_bound = per_thread_memory_budget / 3;
(10..20) // We cap it at 2^19 = 512K capacity.
// TODO: There are cases where this limit causes a
// reallocation in the hashmap. Check if this affects performance.
.map(|power| 1 << power)
.take_while(|capacity| compute_table_memory_size(*capacity) < table_memory_upper_bound)
.last()

View File

@@ -119,7 +119,7 @@ pub mod tests {
serializer.close_term()?;
serializer.close()?;
let position_delta = OwnedBytes::new(positions_buffer);
let mut output_delta_pos_buffer = [0u32; 5];
let mut output_delta_pos_buffer = vec![0u32; 5];
let mut position_reader = PositionReader::open(position_delta)?;
position_reader.read(0, &mut output_delta_pos_buffer[..]);
assert_eq!(

View File

@@ -225,7 +225,7 @@ pub mod tests {
{
let mut segment_writer =
SegmentWriter::for_segment(15_000_000, segment.clone()).unwrap();
SegmentWriter::for_segment(3_000_000, segment.clone()).unwrap();
{
// checking that position works if the field has two values
let op = AddOperation {

View File

@@ -32,7 +32,7 @@ use crate::schema::{IndexRecordOption, Term};
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(15_000_000)?;
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ))?;

View File

@@ -98,7 +98,7 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
let sub_scorer: Box<dyn Scorer> = subweight.scorer(reader, boost)?;
per_occur_scorers
.entry(*occur)
.or_default()
.or_insert_with(Vec::new)
.push(sub_scorer);
}
Ok(per_occur_scorers)
@@ -193,7 +193,7 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
return Ok(Explanation::new("BooleanQuery with no scoring", 1.0));
}
let mut explanation = Explanation::new("BooleanClause. sum of ...", scorer.score());
let mut explanation = Explanation::new("BooleanClause. Sum of ...", scorer.score());
for (occur, subweight) in &self.weights {
if is_positive_occur(*occur) {
if let Ok(child_explanation) = subweight.explain(reader, doc) {

View File

@@ -297,7 +297,7 @@ mod tests {
let text = schema_builder.add_text_field("text", STRING);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer = index.writer_with_num_threads(1, 5_000_000)?;
index_writer.add_document(doc!(text=>"a"))?;
index_writer.add_document(doc!(text=>"b"))?;
index_writer.commit()?;

View File

@@ -2,6 +2,7 @@ use std::fmt;
use crate::docset::BUFFER_LEN;
use crate::fastfield::AliveBitSet;
use crate::query::explanation::does_not_match;
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Score, SegmentReader, Term};
@@ -72,9 +73,13 @@ impl Weight for BoostWeight {
}
fn explain(&self, reader: &SegmentReader, doc: u32) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
}
let mut explanation =
Explanation::new(format!("Boost x{} of ...", self.boost), scorer.score());
let underlying_explanation = self.weight.explain(reader, doc)?;
let score = underlying_explanation.value() * self.boost;
let mut explanation = Explanation::new(format!("Boost x{} of ...", self.boost), score);
explanation.add_detail(underlying_explanation);
Ok(explanation)
}

View File

@@ -23,7 +23,7 @@ use crate::{Score, Term};
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(15_000_000)?;
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of Girl",
/// ))?;

View File

@@ -46,7 +46,7 @@ impl Automaton for DfaWrapper {
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(15_000_000)?;
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ))?;

View File

@@ -23,15 +23,13 @@ impl Eq for ScoreTerm {}
impl PartialOrd for ScoreTerm {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
self.score.partial_cmp(&other.score)
}
}
impl Ord for ScoreTerm {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.score
.partial_cmp(&other.score)
.unwrap_or(std::cmp::Ordering::Equal)
self.partial_cmp(other).unwrap_or(std::cmp::Ordering::Equal)
}
}

View File

@@ -75,7 +75,7 @@ pub mod tests {
let index = create_index(&["a b b d c g c", "a b a b c"])?;
let text_field = index.schema().get_field("text").unwrap();
let searcher = index.reader()?.searcher();
let terms: Vec<Term> = ["a", "b", "c"]
let terms: Vec<Term> = vec!["a", "b", "c"]
.iter()
.map(|text| Term::from_field_text(text_field, text))
.collect();

View File

@@ -5,7 +5,6 @@ use std::str::{FromStr, ParseBoolError};
use base64::engine::general_purpose::STANDARD as BASE64;
use base64::Engine;
use itertools::Itertools;
use query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
use rustc_hash::FxHashMap;
@@ -167,7 +166,7 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
/// * phrase terms: Quoted terms become phrase searches on fields that have positions indexed. e.g.,
/// `title:"Barack Obama"` will only find documents that have "barack" immediately followed by
/// "obama". Single quotes can also be used. If the text to be searched contains quotation mark,
/// it is possible to escape them with a `\`.
/// it is possible to escape them with a \.
///
/// * range terms: Range searches can be done by specifying the start and end bound. These can be
/// inclusive or exclusive. e.g., `title:[a TO c}` will find all documents whose title contains a
@@ -228,25 +227,6 @@ fn all_negative(ast: &LogicalAst) -> bool {
}
}
// Make an all-negative ast into a normal ast. Must not be used on an already okay ast.
fn make_non_negative(ast: &mut LogicalAst) {
match ast {
LogicalAst::Leaf(_) => (),
LogicalAst::Boost(ref mut child_ast, _) => make_non_negative(child_ast),
LogicalAst::Clause(children) => children.push((Occur::Should, LogicalLiteral::All.into())),
}
}
/// Similar to the try/? macro, but returns a tuple of (None, Vec<Error>) instead of Err(Error)
macro_rules! try_tuple {
($expr:expr) => {{
match $expr {
Ok(val) => val,
Err(e) => return (None, vec![e.into()]),
}
}};
}
impl QueryParser {
/// Creates a `QueryParser`, given
/// * schema - index Schema
@@ -328,24 +308,17 @@ impl QueryParser {
///
/// Note that `parse_query` returns an error if the input
/// is not a valid query.
///
/// There is currently no lenient mode for the query parser
/// which makes it a bad choice for a public/broad user search engine.
///
/// Implementing a lenient mode for this query parser is tracked
/// in [Issue 5](https://github.com/fulmicoton/tantivy/issues/5)
pub fn parse_query(&self, query: &str) -> Result<Box<dyn Query>, QueryParserError> {
let logical_ast = self.parse_query_to_logical_ast(query)?;
Ok(convert_to_query(&self.fuzzy, logical_ast))
}
/// Parse a query leniently
///
/// This variant parses invalid query on a best effort basis. If some part of the query can't
/// reasonably be executed (range query without field, searching on a non existing field,
/// searching without precising field when no default field is provided...), they may get
/// turned into a "match-nothing" subquery.
///
/// In case it encountered such issues, they are reported as a Vec of errors.
pub fn parse_query_lenient(&self, query: &str) -> (Box<dyn Query>, Vec<QueryParserError>) {
let (logical_ast, errors) = self.parse_query_to_logical_ast_lenient(query);
(convert_to_query(&self.fuzzy, logical_ast), errors)
}
/// Build a query from an already parsed user input AST
///
/// This can be useful if the user input AST parsed using [`query_grammar`]
@@ -355,70 +328,31 @@ impl QueryParser {
&self,
user_input_ast: UserInputAst,
) -> Result<Box<dyn Query>, QueryParserError> {
let (logical_ast, mut err) = self.compute_logical_ast_lenient(user_input_ast);
if !err.is_empty() {
return Err(err.swap_remove(0));
}
let logical_ast = self.compute_logical_ast(user_input_ast)?;
Ok(convert_to_query(&self.fuzzy, logical_ast))
}
/// Build leniently a query from an already parsed user input AST.
///
/// See also [`QueryParser::build_query_from_user_input_ast`]
pub fn build_query_from_user_input_ast_lenient(
&self,
user_input_ast: UserInputAst,
) -> (Box<dyn Query>, Vec<QueryParserError>) {
let (logical_ast, errors) = self.compute_logical_ast_lenient(user_input_ast);
(convert_to_query(&self.fuzzy, logical_ast), errors)
}
/// Parse the user query into an AST.
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAst, QueryParserError> {
let user_input_ast = query_grammar::parse_query(query)
.map_err(|_| QueryParserError::SyntaxError(query.to_string()))?;
let (ast, mut err) = self.compute_logical_ast_lenient(user_input_ast);
if !err.is_empty() {
return Err(err.swap_remove(0));
}
Ok(ast)
self.compute_logical_ast(user_input_ast)
}
/// Parse the user query into an AST.
fn parse_query_to_logical_ast_lenient(
&self,
query: &str,
) -> (LogicalAst, Vec<QueryParserError>) {
let (user_input_ast, errors) = query_grammar::parse_query_lenient(query);
let mut errors: Vec<_> = errors
.into_iter()
.map(|error| {
QueryParserError::SyntaxError(format!(
"{} at position {}",
error.message, error.pos
))
})
.collect();
let (ast, mut ast_errors) = self.compute_logical_ast_lenient(user_input_ast);
errors.append(&mut ast_errors);
(ast, errors)
}
fn compute_logical_ast_lenient(
fn compute_logical_ast(
&self,
user_input_ast: UserInputAst,
) -> (LogicalAst, Vec<QueryParserError>) {
let (mut ast, mut err) = self.compute_logical_ast_with_occur_lenient(user_input_ast);
) -> Result<LogicalAst, QueryParserError> {
let ast = self.compute_logical_ast_with_occur(user_input_ast)?;
if let LogicalAst::Clause(children) = &ast {
if children.is_empty() {
return (ast, err);
return Ok(ast);
}
}
if all_negative(&ast) {
err.push(QueryParserError::AllButQueryForbidden);
make_non_negative(&mut ast);
return Err(QueryParserError::AllButQueryForbidden);
}
(ast, err)
Ok(ast)
}
fn compute_boundary_term(
@@ -637,37 +571,26 @@ impl QueryParser {
}
}
fn compute_logical_ast_with_occur_lenient(
fn compute_logical_ast_with_occur(
&self,
user_input_ast: UserInputAst,
) -> (LogicalAst, Vec<QueryParserError>) {
) -> Result<LogicalAst, QueryParserError> {
match user_input_ast {
UserInputAst::Clause(sub_queries) => {
let default_occur = self.default_occur();
let mut logical_sub_queries: Vec<(Occur, LogicalAst)> = Vec::new();
let mut errors = Vec::new();
for (occur_opt, sub_ast) in sub_queries {
let (sub_ast, mut sub_errors) =
self.compute_logical_ast_with_occur_lenient(sub_ast);
let sub_ast = self.compute_logical_ast_with_occur(sub_ast)?;
let occur = occur_opt.unwrap_or(default_occur);
logical_sub_queries.push((occur, sub_ast));
errors.append(&mut sub_errors);
}
(LogicalAst::Clause(logical_sub_queries), errors)
Ok(LogicalAst::Clause(logical_sub_queries))
}
UserInputAst::Boost(ast, boost) => {
let (ast, errors) = self.compute_logical_ast_with_occur_lenient(*ast);
(ast.boost(boost as Score), errors)
}
UserInputAst::Leaf(leaf) => {
let (ast, errors) = self.compute_logical_ast_from_leaf_lenient(*leaf);
// if the error is not recoverable, replace it with an empty clause. We will end up
// trimming those later
(
ast.unwrap_or_else(|| LogicalAst::Clause(Vec::new())),
errors,
)
let ast = self.compute_logical_ast_with_occur(*ast)?;
Ok(ast.boost(boost as Score))
}
UserInputAst::Leaf(leaf) => self.compute_logical_ast_from_leaf(*leaf),
}
}
@@ -735,31 +658,23 @@ impl QueryParser {
Ok(triplets)
}
fn compute_logical_ast_from_leaf_lenient(
fn compute_logical_ast_from_leaf(
&self,
leaf: UserInputLeaf,
) -> (Option<LogicalAst>, Vec<QueryParserError>) {
) -> Result<LogicalAst, QueryParserError> {
match leaf {
UserInputLeaf::Literal(literal) => {
let term_phrases: Vec<(Field, &str, &str)> =
try_tuple!(self.compute_path_triplets_for_literal(&literal));
self.compute_path_triplets_for_literal(&literal)?;
let mut asts: Vec<LogicalAst> = Vec::new();
let mut errors: Vec<QueryParserError> = Vec::new();
for (field, json_path, phrase) in term_phrases {
let unboosted_asts = match self.compute_logical_ast_for_leaf(
for ast in self.compute_logical_ast_for_leaf(
field,
json_path,
phrase,
literal.slop,
literal.prefix,
) {
Ok(asts) => asts,
Err(e) => {
errors.push(e);
continue;
}
};
for ast in unboosted_asts {
)? {
// Apply some field specific boost defined at the query parser level.
let boost = self.field_boost(field);
asts.push(LogicalAst::Leaf(Box::new(ast)).boost(boost));
@@ -770,82 +685,56 @@ impl QueryParser {
} else {
LogicalAst::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect())
};
(Some(result_ast), errors)
Ok(result_ast)
}
UserInputLeaf::All => (
Some(LogicalAst::Leaf(Box::new(LogicalLiteral::All))),
Vec::new(),
),
UserInputLeaf::All => Ok(LogicalAst::Leaf(Box::new(LogicalLiteral::All))),
UserInputLeaf::Range {
field: full_field_opt,
lower,
upper,
} => {
let Some(full_path) = full_field_opt else {
return (
None,
vec![QueryParserError::UnsupportedQuery(
"Range query need to target a specific field.".to_string(),
)],
);
};
let (field, json_path) = try_tuple!(self
let full_path = full_field_opt.ok_or_else(|| {
QueryParserError::UnsupportedQuery(
"Range query need to target a specific field.".to_string(),
)
})?;
let (field, json_path) = self
.split_full_path(&full_path)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone()))?;
let field_entry = self.schema.get_field_entry(field);
let value_type = field_entry.field_type().value_type();
let mut errors = Vec::new();
let lower = match self.resolve_bound(field, json_path, &lower) {
Ok(bound) => bound,
Err(error) => {
errors.push(error);
Bound::Unbounded
}
};
let upper = match self.resolve_bound(field, json_path, &upper) {
Ok(bound) => bound,
Err(error) => {
errors.push(error);
Bound::Unbounded
}
};
if lower == Bound::Unbounded && upper == Bound::Unbounded {
// this range is useless, either because a user requested [* TO *], or because
// we failed to parse something. Either way, there is no point emiting it
return (None, errors);
}
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Range {
field: self.schema.get_field_name(field).to_string(),
value_type,
lower,
upper,
lower: self.resolve_bound(field, json_path, &lower)?,
upper: self.resolve_bound(field, json_path, &upper)?,
}));
(Some(logical_ast), errors)
Ok(logical_ast)
}
UserInputLeaf::Set {
field: full_field_opt,
elements,
} => {
let full_path = try_tuple!(full_field_opt.ok_or_else(|| {
let full_path = full_field_opt.ok_or_else(|| {
QueryParserError::UnsupportedQuery(
"Range query need to target a specific field.".to_string(),
"Set query need to target a specific field.".to_string(),
)
}));
let (field, json_path) = try_tuple!(self
})?;
let (field, json_path) = self
.split_full_path(&full_path)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone()))?;
let field_entry = self.schema.get_field_entry(field);
let value_type = field_entry.field_type().value_type();
let (elements, errors) = elements
.into_iter()
.map(|element| self.compute_boundary_term(field, json_path, &element))
.partition_result();
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set {
elements,
elements: elements
.into_iter()
.map(|element| self.compute_boundary_term(field, json_path, &element))
.collect::<Result<Vec<_>, _>>()?,
field,
value_type,
}));
(Some(logical_ast), errors)
Ok(logical_ast)
}
}
}
@@ -1067,7 +956,7 @@ mod test {
.iter()
.flat_map(|field_name| schema.get_field(field_name))
.collect();
let tokenizer_manager = TokenizerManager::default();
let tokenizer_manager = TokenizerManager::default_for_indexing();
tokenizer_manager.register(
"en_with_stop_words",
TextAnalyzer::builder(SimpleTokenizer::default())
@@ -1305,7 +1194,7 @@ mod test {
"k8s\u{1}node\u{1}name\0shello"
);
assert_eq!(
extract_query_term_json_path(r"json.k8s\.node\.name:hello"),
extract_query_term_json_path(r#"json.k8s\.node\.name:hello"#),
"k8s.node.name\0shello"
);
}
@@ -1558,7 +1447,7 @@ mod test {
let title = schema_builder.add_text_field("title", text_options);
let schema = schema_builder.build();
let default_fields = vec![title];
let tokenizer_manager = TokenizerManager::default();
let tokenizer_manager = TokenizerManager::default_for_indexing();
let query_parser = QueryParser::new(schema, default_fields, tokenizer_manager);
assert_matches!(
@@ -1731,10 +1620,11 @@ mod test {
#[test]
fn test_escaped_field() {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field(r"a\.b", STRING);
schema_builder.add_text_field(r#"a\.b"#, STRING);
let schema = schema_builder.build();
let query_parser = QueryParser::new(schema, Vec::new(), TokenizerManager::default());
let query = query_parser.parse_query(r"a\.b:hello").unwrap();
let query_parser =
QueryParser::new(schema, Vec::new(), TokenizerManager::default_for_indexing());
let query = query_parser.parse_query(r#"a\.b:hello"#).unwrap();
assert_eq!(
format!("{query:?}"),
"TermQuery(Term(field=0, type=Str, \"hello\"))"
@@ -1750,8 +1640,11 @@ mod test {
schema_builder.add_text_field("first.toto.titi", STRING);
schema_builder.add_text_field("third.a.b.c", STRING);
let schema = schema_builder.build();
let query_parser =
QueryParser::new(schema.clone(), Vec::new(), TokenizerManager::default());
let query_parser = QueryParser::new(
schema.clone(),
Vec::new(),
TokenizerManager::default_for_indexing(),
);
assert_eq!(
query_parser.split_full_path("first.toto"),
Some((schema.get_field("first.toto").unwrap(), ""))

View File

@@ -31,8 +31,8 @@ impl VecCursor {
self.current_pos = 0;
&mut self.docs
}
fn last_doc(&self) -> Option<u32> {
self.docs.last().cloned()
fn last_value(&self) -> Option<u32> {
self.docs.iter().last().cloned()
}
fn is_empty(&self) -> bool {
self.current().is_none()
@@ -112,15 +112,15 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> RangeDocSet<T> {
finished_to_end = true;
}
let last_doc = self.loaded_docs.last_doc();
let last_value = self.loaded_docs.last_value();
let doc_buffer: &mut Vec<DocId> = self.loaded_docs.get_cleared_data();
self.column.get_docids_for_value_range(
self.value_range.clone(),
self.next_fetch_start..end,
doc_buffer,
);
if let Some(last_doc) = last_doc {
while self.loaded_docs.current() == Some(last_doc) {
if let Some(last_value) = last_value {
while self.loaded_docs.current() == Some(last_value) {
self.loaded_docs.next();
}
}
@@ -136,7 +136,7 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
if let Some(docid) = self.loaded_docs.next() {
return docid;
}
if self.next_fetch_start >= self.column.num_docs() {
if self.next_fetch_start >= self.column.values.num_vals() {
return TERMINATED;
}
self.fetch_block();
@@ -177,54 +177,3 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
0 // heuristic possible by checking number of hits when fetching a block
}
}
#[cfg(test)]
mod tests {
use crate::collector::Count;
use crate::directory::RamDirectory;
use crate::query::RangeQuery;
use crate::{schema, Document, IndexBuilder};
#[test]
fn range_query_fast_optional_field_minimum() {
let mut schema_builder = schema::SchemaBuilder::new();
let id_field = schema_builder.add_text_field("id", schema::STRING);
let score_field = schema_builder.add_u64_field("score", schema::FAST | schema::INDEXED);
let dir = RamDirectory::default();
let index = IndexBuilder::new()
.schema(schema_builder.build())
.open_or_create(dir)
.unwrap();
{
let mut writer = index.writer(15_000_000).unwrap();
let count = 1000;
for i in 0..count {
let mut doc = Document::new();
doc.add_text(id_field, format!("doc{i}"));
let nb_scores = i % 2; // 0 or 1 scores
for _ in 0..nb_scores {
doc.add_u64(score_field, 80);
}
writer.add_document(doc).unwrap();
}
writer.commit().unwrap();
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let query = RangeQuery::new_u64_bounds(
"score".to_string(),
std::ops::Bound::Included(70),
std::ops::Bound::Unbounded,
);
let count = searcher.search(&query, &Count).unwrap();
assert_eq!(count, 500);
}
}

View File

@@ -48,7 +48,7 @@ use crate::{DateTime, DocId, Score};
/// let schema = schema_builder.build();
///
/// let index = Index::create_in_ram(schema);
/// let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// for year in 1950u64..2017u64 {
/// let num_docs_within_year = 10 + (year - 1950) * (year - 1950);
/// for _ in 0..num_docs_within_year {

View File

@@ -26,7 +26,7 @@ use crate::schema::Field;
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(15_000_000)?;
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ))?;

View File

@@ -27,7 +27,7 @@ use crate::Term;
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(15_000_000)?;
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ))?;
@@ -151,7 +151,7 @@ mod tests {
let ip_addr_2 = Ipv6Addr::from_u128(10);
{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer(3_000_000).unwrap();
index_writer
.add_document(doc!(
ip_field => ip_addr_1

View File

@@ -191,7 +191,7 @@ impl InnerIndexReader {
}
/// Opens the freshest segments [`SegmentReader`].
///
/// This function acquires a lock to prevent GC from removing files
/// This function acquires a lot to prevent GC from removing files
/// as we are opening our index.
fn open_segment_readers(index: &Index) -> crate::Result<Vec<SegmentReader>> {
// Prevents segment files from getting deleted while we are in the process of opening them

View File

@@ -179,7 +179,6 @@ mod tests {
use super::Warmer;
use crate::core::searcher::SearcherGeneration;
use crate::directory::RamDirectory;
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::schema::{Schema, INDEXED};
use crate::{Index, IndexSettings, ReloadPolicy, Searcher, SegmentId};
@@ -256,10 +255,7 @@ mod tests {
let num_writer_threads = 4;
let mut writer = index
.writer_with_num_threads(
num_writer_threads,
MEMORY_BUDGET_NUM_BYTES_MIN * num_writer_threads,
)
.writer_with_num_threads(num_writer_threads, 25_000_000)
.unwrap();
for i in 0u64..1000u64 {

View File

@@ -79,9 +79,12 @@ impl BytesOptions {
self
}
/// Set the field as a fast field.
/// Set the field as a single-valued fast field.
///
/// Fast fields are designed for random access.
/// Access time are similar to a random lookup in an array.
/// If more than one value is associated with a fast field, only the last one is
/// kept.
#[must_use]
pub fn set_fast(mut self) -> BytesOptions {
self.fast = true;

View File

@@ -78,9 +78,12 @@ impl DateOptions {
self
}
/// Set the field as a fast field.
/// Set the field as a single-valued fast field.
///
/// Fast fields are designed for random access.
/// Access time are similar to a random lookup in an array.
/// If more than one value is associated with a fast field, only the last one is
/// kept.
#[must_use]
pub fn set_fast(mut self) -> DateOptions {
self.fast = true;

View File

@@ -83,6 +83,9 @@ impl IpAddrOptions {
/// Set the field as a fast field.
///
/// Fast fields are designed for random access.
/// Access time are similar to a random lookup in an array.
/// If more than one value is associated with a fast field, only the last one is
/// kept.
#[must_use]
pub fn set_fast(mut self) -> Self {
self.fast = true;

View File

@@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize};
use super::text_options::{FastFieldTextOptions, TokenizerName};
use crate::schema::flags::{FastFlag, SchemaFlagList, StoredFlag};
use crate::schema::{TextFieldIndexing, TextOptions};
use crate::schema::{TextFieldIndexing, TextOptions, DEFAULT_FAST_FIELD_TOKENIZER};
/// The `JsonObjectOptions` make it possible to
/// configure how a json object field should be indexed and stored.
@@ -58,20 +58,19 @@ impl JsonObjectOptions {
/// Returns true if and only if the json object fields are
/// to be treated as fast fields.
pub fn is_fast(&self) -> bool {
matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
|| matches!(
&self.fast,
FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
)
match self.fast {
FastFieldTextOptions::Disabled => false,
FastFieldTextOptions::Enabled { .. } => true,
}
}
/// Returns true if and only if the value is a fast field.
pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
match &self.fast {
FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
} => Some(tokenizer.name()),
FastFieldTextOptions::Disabled => None,
FastFieldTextOptions::Enabled {
tokenizer: with_tokenizer,
} => Some(with_tokenizer.name()),
}
}
@@ -80,12 +79,12 @@ impl JsonObjectOptions {
/// When expand_dots is enabled, json object like
/// `{"k8s.node.id": 5}` is processed as if it was
/// `{"k8s": {"node": {"id": 5}}}`.
/// This option has the merit of allowing users to
/// It option has the merit of allowing users to
/// write queries like `k8s.node.id:5`.
/// On the other, enabling that feature can lead to
/// ambiguity.
///
/// If disabled, the "." needs to be escaped:
/// If disabled, the "." need to be escaped:
/// `k8s\.node\.id:5`.
pub fn is_expand_dots_enabled(&self) -> bool {
self.expand_dots_enabled
@@ -130,15 +129,11 @@ impl JsonObjectOptions {
/// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
/// from the dictionary.
#[must_use]
pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> Self {
if let Some(tokenizer) = tokenizer_name {
let tokenizer = TokenizerName::from_name(tokenizer);
self.fast = FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
}
} else {
self.fast = FastFieldTextOptions::IsEnabled(true);
}
pub fn set_fast(mut self, tokenizer_name: &str) -> Self {
let with_tokenizer = TokenizerName::from_name(tokenizer_name);
self.fast = FastFieldTextOptions::Enabled {
tokenizer: with_tokenizer,
};
self
}
@@ -166,7 +161,9 @@ impl From<FastFlag> for JsonObjectOptions {
JsonObjectOptions {
stored: false,
indexing: None,
fast: FastFieldTextOptions::IsEnabled(true),
fast: FastFieldTextOptions::Enabled {
tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER),
},
expand_dots_enabled: false,
}
}

View File

@@ -1,14 +1,12 @@
//! Schema definition for tantivy's indices.
//!
//! # Setting your schema in Tantivy
//!
//!
//! Tantivy has a very strict schema.
//! The schema defines information about the fields your index contains, that is, for each field:
//!
//! - the field name (may contain any characted, can't start with a `-` and can't be empty. Some
//! characters may require escaping when using the query parser).
//! - the type of the field (currently `text`, `u64`, `i64`, `f64`, `bool`, `date`, `IpAddr`,
//! facets, bytes and json are supported)
//! - the field name (may only contain letters `[a-zA-Z]`, number `[0-9]`, and `_`)
//! - the type of the field (currently only `text` and `u64` are supported)
//! - how the field should be indexed / stored.
//!
//! This very last point is critical as it will enable / disable some of the functionality
@@ -155,6 +153,8 @@ pub use self::term::{Term, ValueBytes, JSON_END_OF_PATH};
pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};
pub use self::value::Value;
pub(crate) const DEFAULT_FAST_FIELD_TOKENIZER: &str = "default";
/// Validator for a potential `field_name`.
/// Returns true if the name can be use for a field name.
///

View File

@@ -120,9 +120,12 @@ impl NumericOptions {
self
}
/// Set the field as a fast field.
/// Set the field as a single-valued fast field.
///
/// Fast fields are designed for random access.
/// Access time are similar to a random lookup in an array.
/// If more than one value is associated with a fast field, only the last one is
/// kept.
#[must_use]
pub fn set_fast(mut self) -> NumericOptions {
self.fast = true;

View File

@@ -514,8 +514,8 @@ mod tests {
#[test]
fn test_locate_splitting_dots() {
assert_eq!(&super::locate_splitting_dots("a.b.c"), &[1, 3]);
assert_eq!(&super::locate_splitting_dots(r"a\.b.c"), &[4]);
assert_eq!(&super::locate_splitting_dots(r"a\..b.c"), &[3, 5]);
assert_eq!(&super::locate_splitting_dots(r#"a\.b.c"#), &[4]);
assert_eq!(&super::locate_splitting_dots(r#"a\..b.c"#), &[3, 5]);
}
#[test]

View File

@@ -24,19 +24,68 @@ pub struct TextOptions {
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
#[serde(
into = "FastFieldTextOptionsForSerialization",
from = "FastFieldTextOptionsForSerialization"
)]
/// Enum to control how the fast field setting of a text field.
#[derive(Default)]
pub(crate) enum FastFieldTextOptions {
/// Flag to enable/disable
IsEnabled(bool),
/// Fastfield disabled
#[default]
Disabled,
/// Enable with tokenizer. The tokenizer must be available on the fast field tokenizer manager.
/// `Index::fast_field_tokenizer`.
EnabledWithTokenizer { with_tokenizer: TokenizerName },
Enabled { tokenizer: TokenizerName },
}
impl Default for FastFieldTextOptions {
fn default() -> Self {
FastFieldTextOptions::IsEnabled(false)
/// Enum used to control the way we serialize fast field text options.
///
/// For backward compatiblity reasons, we folow the format introduce in tantivy 0.19.
/// `false` -> Disabled
/// `true` -> Enabled with default tokenizer
/// `{ tokenizer: "something" }` -> Enabled with a specific tokenizer.
#[derive(Serialize, Deserialize)]
#[serde(untagged)]
enum FastFieldTextOptionsForSerialization {
IsEnabled(bool),
EnabledWithTokenizer {
#[serde(alias = "with_tokenizer")]
tokenizer: TokenizerName,
},
}
impl From<FastFieldTextOptionsForSerialization> for FastFieldTextOptions {
fn from(value: FastFieldTextOptionsForSerialization) -> Self {
match value {
FastFieldTextOptionsForSerialization::IsEnabled(enabled) => {
if enabled {
FastFieldTextOptions::Enabled {
tokenizer: TokenizerName::from_static(
crate::schema::DEFAULT_FAST_FIELD_TOKENIZER,
),
}
} else {
FastFieldTextOptions::Disabled
}
}
FastFieldTextOptionsForSerialization::EnabledWithTokenizer { tokenizer } => {
FastFieldTextOptions::Enabled { tokenizer }
}
}
}
}
impl From<FastFieldTextOptions> for FastFieldTextOptionsForSerialization {
fn from(value: FastFieldTextOptions) -> Self {
match value {
FastFieldTextOptions::Disabled => {
FastFieldTextOptionsForSerialization::IsEnabled(false)
}
FastFieldTextOptions::Enabled { tokenizer } => {
FastFieldTextOptionsForSerialization::EnabledWithTokenizer { tokenizer }
}
}
}
}
@@ -45,23 +94,13 @@ impl BitOr<FastFieldTextOptions> for FastFieldTextOptions {
fn bitor(self, other: FastFieldTextOptions) -> FastFieldTextOptions {
match (self, other) {
(
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
},
_,
)
| (
_,
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
},
) => FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
},
(FastFieldTextOptions::IsEnabled(true), _)
| (_, FastFieldTextOptions::IsEnabled(true)) => FastFieldTextOptions::IsEnabled(true),
(_, FastFieldTextOptions::IsEnabled(false)) => FastFieldTextOptions::IsEnabled(false),
(FastFieldTextOptions::Enabled { tokenizer }, _)
| (_, FastFieldTextOptions::Enabled { tokenizer }) => {
FastFieldTextOptions::Enabled { tokenizer }
}
(FastFieldTextOptions::Disabled, FastFieldTextOptions::Disabled) => {
FastFieldTextOptions::Disabled
}
}
}
}
@@ -83,20 +122,17 @@ impl TextOptions {
/// Returns true if and only if the value is a fast field.
pub fn is_fast(&self) -> bool {
matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
|| matches!(
&self.fast,
FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
)
match &self.fast {
FastFieldTextOptions::Disabled => false,
FastFieldTextOptions::Enabled { .. } => true,
}
}
/// Returns true if and only if the value is a fast field.
pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
match &self.fast {
FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
} => Some(tokenizer.name()),
FastFieldTextOptions::Disabled => None,
FastFieldTextOptions::Enabled { tokenizer } => Some(tokenizer.name()),
}
}
@@ -121,15 +157,9 @@ impl TextOptions {
/// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
/// from the dictionary.
#[must_use]
pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> TextOptions {
if let Some(tokenizer) = tokenizer_name {
let tokenizer = TokenizerName::from_name(tokenizer);
self.fast = FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
}
} else {
self.fast = FastFieldTextOptions::IsEnabled(true);
}
pub fn set_fast(mut self, tokenizer_name: &str) -> TextOptions {
let tokenizer = TokenizerName::from_name(tokenizer_name);
self.fast = FastFieldTextOptions::Enabled { tokenizer };
self
}
@@ -263,7 +293,7 @@ pub const STRING: TextOptions = TextOptions {
record: IndexRecordOption::Basic,
}),
stored: false,
fast: FastFieldTextOptions::IsEnabled(false),
fast: FastFieldTextOptions::Disabled,
coerce: false,
};
@@ -276,7 +306,7 @@ pub const TEXT: TextOptions = TextOptions {
}),
stored: false,
coerce: false,
fast: FastFieldTextOptions::IsEnabled(false),
fast: FastFieldTextOptions::Disabled,
};
impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
@@ -326,7 +356,9 @@ impl From<FastFlag> for TextOptions {
TextOptions {
indexing: None,
stored: false,
fast: FastFieldTextOptions::IsEnabled(true),
fast: FastFieldTextOptions::Enabled {
tokenizer: TokenizerName::from_static(crate::schema::DEFAULT_FAST_FIELD_TOKENIZER),
},
coerce: false,
}
}
@@ -392,21 +424,21 @@ mod tests {
#[test]
fn serde_fast_field_tokenizer() {
let json = r#" {
"fast": { "with_tokenizer": "default" }
"fast": { "tokenizer": "default" }
} "#;
let options: TextOptions = serde_json::from_str(json).unwrap();
assert_eq!(
options.fast,
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: TokenizerName::from_static("default")
FastFieldTextOptions::Enabled {
tokenizer: TokenizerName::from_static("default")
}
);
let options: TextOptions =
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
assert_eq!(
options.fast,
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: TokenizerName::from_static("default")
FastFieldTextOptions::Enabled {
tokenizer: TokenizerName::from_static("default")
}
);
@@ -414,18 +446,28 @@ mod tests {
"fast": true
} "#;
let options: TextOptions = serde_json::from_str(json).unwrap();
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
assert_eq!(
options.fast,
FastFieldTextOptions::Enabled {
tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER)
}
);
let options: TextOptions =
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
assert_eq!(
options.fast,
FastFieldTextOptions::Enabled {
tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER)
}
);
let json = r#" {
"fast": false
} "#;
let options: TextOptions = serde_json::from_str(json).unwrap();
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
assert_eq!(options.fast, FastFieldTextOptions::Disabled);
let options: TextOptions =
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
assert_eq!(options.fast, FastFieldTextOptions::Disabled);
}
}

View File

@@ -262,7 +262,7 @@ fn is_sorted(mut it: impl Iterator<Item = usize>) -> bool {
/// # let text_field = schema_builder.add_text_field("text", TEXT);
/// # let schema = schema_builder.build();
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # let doc = doc!(text_field => r#"Comme je descendais des Fleuves impassibles,
/// # Je ne me sentis plus guidé par les haleurs :
/// # Des Peaux-Rouges criards les avaient pris pour cibles,

View File

@@ -89,7 +89,6 @@ pub struct ZstdCompressor {
pub compression_level: Option<i32>,
}
#[cfg(feature = "zstd-compression")]
impl ZstdCompressor {
fn deser_from_str(val: &str) -> Result<ZstdCompressor, String> {
if !val.starts_with("zstd") {
@@ -174,7 +173,7 @@ impl Compressor {
}
}
#[cfg(all(feature = "zstd-compression", test))]
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -189,7 +189,7 @@ pub mod tests {
#[test]
fn test_raw_tokenizer2() {
let tokenizer_manager = TokenizerManager::default();
let tokenizer_manager = TokenizerManager::default_for_indexing();
let mut en_tokenizer = tokenizer_manager.get("raw").unwrap();
let mut tokens: Vec<Token> = vec![];
{
@@ -206,7 +206,7 @@ pub mod tests {
#[test]
fn test_en_tokenizer() {
let tokenizer_manager = TokenizerManager::default();
let tokenizer_manager = TokenizerManager::default_for_indexing();
assert!(tokenizer_manager.get("en_doesnotexist").is_none());
let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
let mut tokens: Vec<Token> = vec![];
@@ -228,7 +228,7 @@ pub mod tests {
#[test]
fn test_non_en_tokenizer() {
let tokenizer_manager = TokenizerManager::default();
let tokenizer_manager = TokenizerManager::default_for_indexing();
tokenizer_manager.register(
"el_stem",
TextAnalyzer::builder(SimpleTokenizer::default())
@@ -256,7 +256,7 @@ pub mod tests {
#[test]
fn test_tokenizer_empty() {
let tokenizer_manager = TokenizerManager::default();
let tokenizer_manager = TokenizerManager::default_for_indexing();
let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
{
let mut tokens: Vec<Token> = vec![];
@@ -282,7 +282,7 @@ pub mod tests {
#[test]
fn test_whitespace_tokenizer() {
let tokenizer_manager = TokenizerManager::default();
let tokenizer_manager = TokenizerManager::default_for_indexing();
let mut ws_tokenizer = tokenizer_manager.get("whitespace").unwrap();
let mut tokens: Vec<Token> = vec![];
{

View File

@@ -27,6 +27,7 @@ pub struct TokenizerManager {
impl TokenizerManager {
/// Creates an empty tokenizer manager.
#[allow(clippy::new_without_default)]
pub fn new() -> Self {
Self {
tokenizers: Arc::new(RwLock::new(HashMap::new())),
@@ -51,12 +52,10 @@ impl TokenizerManager {
.get(tokenizer_name)
.cloned()
}
}
impl Default for TokenizerManager {
/// Creates an `TokenizerManager` prepopulated with
/// the default pre-configured tokenizers of `tantivy`.
fn default() -> TokenizerManager {
pub fn default_for_indexing() -> TokenizerManager {
let manager = TokenizerManager::new();
manager.register("raw", RawTokenizer::default());
manager.register(
@@ -77,4 +76,28 @@ impl Default for TokenizerManager {
manager.register("whitespace", WhitespaceTokenizer::default());
manager
}
/// Creates an `TokenizerManager` prepopulated with
/// the default pre-configured tokenizers of `tantivy`
/// for fast fields.
///
/// Fast fields usually do not really tokenize the text.
/// It is however very useful to filter / normalize the text.
pub fn default_for_fast_fields() -> TokenizerManager {
let manager = TokenizerManager::new();
let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
.filter(RemoveLongFilter::limit(255))
.build();
let lower_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
.filter(RemoveLongFilter::limit(255))
.filter(LowerCaser)
.build();
manager.register(
crate::schema::DEFAULT_FAST_FIELD_TOKENIZER,
lower_tokenizer.clone(),
);
manager.register("raw", raw_tokenizer);
manager.register("lower", lower_tokenizer);
manager
}
}

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-sstable"
version = "0.2.0"
version = "0.1.0"
edition = "2021"
license = "MIT"
homepage = "https://github.com/quickwit-oss/tantivy"
@@ -10,7 +10,7 @@ categories = ["database-implementations", "data-structures", "compression"]
description = "sstables for tantivy"
[dependencies]
common = {version= "0.6", path="../common", package="tantivy-common"}
common = {version= "0.5", path="../common", package="tantivy-common"}
tantivy-fst = "0.4"
# experimental gives us access to Decompressor::upper_bound
zstd = { version = "0.12", features = ["experimental"] }

View File

@@ -15,7 +15,7 @@ impl<B: AsRef<[u8]>> Ord for HeapItem<B> {
}
impl<B: AsRef<[u8]>> PartialOrd for HeapItem<B> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
Some(other.0.as_ref().cmp(self.0.as_ref()))
}
}

Some files were not shown because too many files have changed in this diff Show More