Compare commits

..

1 Commit

Author: Raphaël Marinier
SHA1: 0890503fc2
Date: 2024-03-12 17:50:23 +01:00
Message: Speed up searches by removing repeated memsets coming from vec.resize()

    Also, reserve exactly the size needed, which is surprisingly needed to
    get the full speedup of ~5% on a good fraction of the queries.
101 changed files with 524 additions and 1123 deletions

View File

@@ -11,12 +11,12 @@ repository = "https://github.com/quickwit-oss/tantivy"
 readme = "README.md"
 keywords = ["search", "information", "retrieval"]
 edition = "2021"
-rust-version = "1.63"
+rust-version = "1.62"
 exclude = ["benches/*.json", "benches/*.txt"]
 [dependencies]
 oneshot = "0.1.5"
-base64 = "0.22.0"
+base64 = "0.21.0"
 byteorder = "1.4.3"
 crc32fast = "1.3.2"
 once_cell = "1.10.0"
@@ -78,9 +78,6 @@ paste = "1.0.11"
 more-asserts = "0.3.1"
 rand_distr = "0.4.3"
 time = { version = "0.3.10", features = ["serde-well-known", "macros"] }
-postcard = { version = "1.0.4", features = [
-    "use-std",
-], default-features = false }
 [target.'cfg(not(windows))'.dev-dependencies]
 criterion = { version = "0.5", default-features = false }

View File

@@ -1,3 +1,4 @@
+use std::convert::TryInto;
 use std::io;
 use std::ops::{Range, RangeInclusive};
@@ -124,6 +125,8 @@ impl BitUnpacker {
     // Decodes the range of bitpacked `u32` values with idx
     // in [start_idx, start_idx + output.len()).
+    // It is guaranteed to completely fill `output` and not read from it, so passing a vector with
+    // un-initialized values is safe.
     //
     // #Panics
     //
@@ -236,7 +239,19 @@ impl BitUnpacker {
         data: &[u8],
         positions: &mut Vec<u32>,
     ) {
-        positions.resize(id_range.len(), 0u32);
+        // We use the code below instead of positions.resize(id_range.len(), 0u32) for performance
+        // reasons: on some queries, the CPU cost of memsetting the array and of using a bigger
+        // vector than necessary is noticeable (~5%).
+        // In particular, searches are a few percent faster when using reserve_exact() as below
+        // instead of reserve().
+        // The un-initialized values are safe as get_batch_u32s() completely fills `positions`
+        // and does not read from it.
+        positions.clear();
+        positions.reserve_exact(id_range.len());
+        #[allow(clippy::uninit_vec)]
+        unsafe {
+            positions.set_len(id_range.len());
+        }
         self.get_batch_u32s(id_range.start, data, positions);
         crate::filter_vec::filter_vec_in_place(value_range, id_range.start, positions)
     }
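
For context, the hunk above swaps a zero-filling Vec::resize for clear() + reserve_exact() + an unsafe set_len(), trading the memset for a contract that the buffer is fully overwritten before it is read. Below is a minimal standalone sketch of that pattern, not taken from the patch; the names fill_without_memset and fill_all are illustrative only, and the soundness caveat is the same one the patch comment states.

fn fill_without_memset(out: &mut Vec<u32>, len: usize, fill_all: impl Fn(&mut [u32])) {
    out.clear();
    // reserve_exact() asks for exactly `len` slots; reserve() may round the capacity up.
    out.reserve_exact(len);
    #[allow(clippy::uninit_vec)]
    unsafe {
        // Sound only because `fill_all` writes every one of the `len` slots below and
        // nothing reads the buffer before that happens.
        out.set_len(len);
    }
    fill_all(&mut out[..]);
}

// Example: fill_without_memset(&mut positions, 1024, |buf| {
//     for (i, slot) in buf.iter_mut().enumerate() { *slot = i as u32; }
// });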

View File

@@ -17,7 +17,6 @@ sstable = { version= "0.2", path = "../sstable", package = "tantivy-sstable" }
common = { version= "0.6", path = "../common", package = "tantivy-common" } common = { version= "0.6", path = "../common", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.5", path = "../bitpacker/" } tantivy-bitpacker = { version= "0.5", path = "../bitpacker/" }
serde = "1.0.152" serde = "1.0.152"
downcast-rs = "1.2.0"
[dev-dependencies] [dev-dependencies]
proptest = "1" proptest = "1"

View File

@@ -1,155 +0,0 @@
#![feature(test)]
extern crate test;
use std::sync::Arc;
use rand::prelude::*;
use tantivy_columnar::column_values::{serialize_and_load_u64_based_column_values, CodecType};
use tantivy_columnar::*;
use test::{black_box, Bencher};
struct Columns {
pub optional: Column,
pub full: Column,
pub multi: Column,
}
fn get_test_columns() -> Columns {
let data = generate_permutation();
let mut dataframe_writer = ColumnarWriter::default();
for (idx, val) in data.iter().enumerate() {
dataframe_writer.record_numerical(idx as u32, "full_values", NumericalValue::U64(*val));
if idx % 2 == 0 {
dataframe_writer.record_numerical(
idx as u32,
"optional_values",
NumericalValue::U64(*val),
);
}
dataframe_writer.record_numerical(idx as u32, "multi_values", NumericalValue::U64(*val));
dataframe_writer.record_numerical(idx as u32, "multi_values", NumericalValue::U64(*val));
}
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer
.serialize(data.len() as u32, None, &mut buffer)
.unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("optional_values").unwrap();
assert_eq!(cols.len(), 1);
let optional = cols[0].open_u64_lenient().unwrap().unwrap();
assert_eq!(optional.index.get_cardinality(), Cardinality::Optional);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("full_values").unwrap();
assert_eq!(cols.len(), 1);
let column_full = cols[0].open_u64_lenient().unwrap().unwrap();
assert_eq!(column_full.index.get_cardinality(), Cardinality::Full);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("multi_values").unwrap();
assert_eq!(cols.len(), 1);
let multi = cols[0].open_u64_lenient().unwrap().unwrap();
assert_eq!(multi.index.get_cardinality(), Cardinality::Multivalued);
Columns {
optional,
full: column_full,
multi,
}
}
const NUM_VALUES: u64 = 100_000;
fn generate_permutation() -> Vec<u64> {
let mut permutation: Vec<u64> = (0u64..NUM_VALUES).collect();
permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
permutation
}
pub fn serialize_and_load(column: &[u64], codec_type: CodecType) -> Arc<dyn ColumnValues<u64>> {
serialize_and_load_u64_based_column_values(&column, &[codec_type])
}
fn run_bench_on_column_full_scan(b: &mut Bencher, column: Column) {
let num_iter = black_box(NUM_VALUES);
b.iter(|| {
let mut sum = 0u64;
for i in 0..num_iter as u32 {
let val = column.first(i);
sum += val.unwrap_or(0);
}
sum
});
}
fn run_bench_on_column_block_fetch(b: &mut Bencher, column: Column) {
let mut block: Vec<Option<u64>> = vec![None; 64];
let fetch_docids = (0..64).collect::<Vec<_>>();
b.iter(move || {
column.first_vals(&fetch_docids, &mut block);
block[0]
});
}
fn run_bench_on_column_block_single_calls(b: &mut Bencher, column: Column) {
let mut block: Vec<Option<u64>> = vec![None; 64];
let fetch_docids = (0..64).collect::<Vec<_>>();
b.iter(move || {
for i in 0..fetch_docids.len() {
block[i] = column.first(fetch_docids[i]);
}
block[0]
});
}
/// Column first method
#[bench]
fn bench_get_first_on_full_column_full_scan(b: &mut Bencher) {
let column = get_test_columns().full;
run_bench_on_column_full_scan(b, column);
}
#[bench]
fn bench_get_first_on_optional_column_full_scan(b: &mut Bencher) {
let column = get_test_columns().optional;
run_bench_on_column_full_scan(b, column);
}
#[bench]
fn bench_get_first_on_multi_column_full_scan(b: &mut Bencher) {
let column = get_test_columns().multi;
run_bench_on_column_full_scan(b, column);
}
/// Block fetch column accessor
#[bench]
fn bench_get_block_first_on_optional_column(b: &mut Bencher) {
let column = get_test_columns().optional;
run_bench_on_column_block_fetch(b, column);
}
#[bench]
fn bench_get_block_first_on_multi_column(b: &mut Bencher) {
let column = get_test_columns().multi;
run_bench_on_column_block_fetch(b, column);
}
#[bench]
fn bench_get_block_first_on_full_column(b: &mut Bencher) {
let column = get_test_columns().full;
run_bench_on_column_block_fetch(b, column);
}
#[bench]
fn bench_get_block_first_on_optional_column_single_calls(b: &mut Bencher) {
let column = get_test_columns().optional;
run_bench_on_column_block_single_calls(b, column);
}
#[bench]
fn bench_get_block_first_on_multi_column_single_calls(b: &mut Bencher) {
let column = get_test_columns().multi;
run_bench_on_column_block_single_calls(b, column);
}
#[bench]
fn bench_get_block_first_on_full_column_single_calls(b: &mut Bencher) {
let column = get_test_columns().full;
run_bench_on_column_block_single_calls(b, column);
}

View File

@@ -16,6 +16,14 @@ fn generate_permutation() -> Vec<u64> {
permutation permutation
} }
fn generate_random() -> Vec<u64> {
let mut permutation: Vec<u64> = (0u64..100_000u64)
.map(|el| el + random::<u16>() as u64)
.collect();
permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
permutation
}
// Warning: this generates the same permutation at each call // Warning: this generates the same permutation at each call
fn generate_permutation_gcd() -> Vec<u64> { fn generate_permutation_gcd() -> Vec<u64> {
let mut permutation: Vec<u64> = (1u64..100_000u64).map(|el| el * 1000).collect(); let mut permutation: Vec<u64> = (1u64..100_000u64).map(|el| el * 1000).collect();

View File

@@ -14,32 +14,20 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
     ColumnBlockAccessor<T>
 {
     #[inline]
-    pub fn fetch_block<'a>(&'a mut self, docs: &'a [u32], accessor: &Column<T>) {
-        if accessor.index.get_cardinality().is_full() {
-            self.val_cache.resize(docs.len(), T::default());
-            accessor.values.get_vals(docs, &mut self.val_cache);
-        } else {
-            self.docid_cache.clear();
-            self.row_id_cache.clear();
-            accessor.row_ids_for_docs(docs, &mut self.docid_cache, &mut self.row_id_cache);
-            self.val_cache.resize(self.row_id_cache.len(), T::default());
-            accessor
-                .values
-                .get_vals(&self.row_id_cache, &mut self.val_cache);
-        }
+    pub fn fetch_block(&mut self, docs: &[u32], accessor: &Column<T>) {
+        self.docid_cache.clear();
+        self.row_id_cache.clear();
+        accessor.row_ids_for_docs(docs, &mut self.docid_cache, &mut self.row_id_cache);
+        self.val_cache.resize(self.row_id_cache.len(), T::default());
+        accessor
+            .values
+            .get_vals(&self.row_id_cache, &mut self.val_cache);
     }

     #[inline]
     pub fn fetch_block_with_missing(&mut self, docs: &[u32], accessor: &Column<T>, missing: T) {
         self.fetch_block(docs, accessor);
-        // no missing values
-        if accessor.index.get_cardinality().is_full() {
-            return;
-        }
-        // We can compare docid_cache length with docs to find missing docs
-        // For multi value columns we can't rely on the length and always need to scan
-        if accessor.index.get_cardinality().is_multivalue() || docs.len() != self.docid_cache.len()
-        {
+        // We can compare docid_cache with docs to find missing docs
+        if docs.len() != self.docid_cache.len() || accessor.index.is_multivalue() {
             self.missing_docids_cache.clear();
             find_missing_docs(docs, &self.docid_cache, |doc| {
                 self.missing_docids_cache.push(doc);
@@ -56,25 +44,11 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
     }

     #[inline]
-    /// Returns an iterator over the docids and values
-    /// The passed in `docs` slice needs to be the same slice that was passed to `fetch_block` or
-    /// `fetch_block_with_missing`.
-    ///
-    /// The docs is used if the column is full (each docs has exactly one value), otherwise the
-    /// internal docid vec is used for the iterator, which e.g. may contain duplicate docs.
-    pub fn iter_docid_vals<'a>(
-        &'a self,
-        docs: &'a [u32],
-        accessor: &Column<T>,
-    ) -> impl Iterator<Item = (DocId, T)> + '_ {
-        if accessor.index.get_cardinality().is_full() {
-            docs.iter().cloned().zip(self.val_cache.iter().cloned())
-        } else {
-            self.docid_cache
-                .iter()
-                .cloned()
-                .zip(self.val_cache.iter().cloned())
-        }
+    pub fn iter_docid_vals(&self) -> impl Iterator<Item = (DocId, T)> + '_ {
+        self.docid_cache
+            .iter()
+            .cloned()
+            .zip(self.val_cache.iter().cloned())
     }
 }
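
A hedged caller-side sketch of the accessor API touched above (types and method names are read off the hunk, not verified against either branch): fetch a block of values for a batch of docs, then walk the (doc, value) pairs.

fn sum_block(accessor: &Column<u64>, block: &mut ColumnBlockAccessor<u64>, docs: &[u32]) -> u64 {
    block.fetch_block(docs, accessor);
    // Argument-free variant ('+' side of the diff): zips the internal docid cache.
    block.iter_docid_vals().map(|(_doc, val)| val).sum()
    // With the variant that special-cases full columns ('-' side), the call becomes
    // `block.iter_docid_vals(docs, accessor)` and `docs` must be the same slice that was
    // passed to `fetch_block`.
}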

View File

@@ -3,17 +3,17 @@ mod serialize;
use std::fmt::{self, Debug}; use std::fmt::{self, Debug};
use std::io::Write; use std::io::Write;
use std::ops::{Range, RangeInclusive}; use std::ops::{Deref, Range, RangeInclusive};
use std::sync::Arc; use std::sync::Arc;
use common::BinarySerializable; use common::BinarySerializable;
pub use dictionary_encoded::{BytesColumn, StrColumn}; pub use dictionary_encoded::{BytesColumn, StrColumn};
pub use serialize::{ pub use serialize::{
open_column_bytes, open_column_str, open_column_u128, open_column_u128_as_compact_u64, open_column_bytes, open_column_str, open_column_u128, open_column_u64,
open_column_u64, serialize_column_mappable_to_u128, serialize_column_mappable_to_u64, serialize_column_mappable_to_u128, serialize_column_mappable_to_u64,
}; };
use crate::column_index::{ColumnIndex, Set}; use crate::column_index::ColumnIndex;
use crate::column_values::monotonic_mapping::StrictlyMonotonicMappingToInternal; use crate::column_values::monotonic_mapping::StrictlyMonotonicMappingToInternal;
use crate::column_values::{monotonic_map_column, ColumnValues}; use crate::column_values::{monotonic_map_column, ColumnValues};
use crate::{Cardinality, DocId, EmptyColumnValues, MonotonicallyMappableToU64, RowId}; use crate::{Cardinality, DocId, EmptyColumnValues, MonotonicallyMappableToU64, RowId};
@@ -83,36 +83,10 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
self.values.max_value() self.values.max_value()
} }
#[inline]
pub fn first(&self, row_id: RowId) -> Option<T> { pub fn first(&self, row_id: RowId) -> Option<T> {
self.values_for_doc(row_id).next() self.values_for_doc(row_id).next()
} }
/// Load the first value for each docid in the provided slice.
#[inline]
pub fn first_vals(&self, docids: &[DocId], output: &mut [Option<T>]) {
match &self.index {
ColumnIndex::Empty { .. } => {}
ColumnIndex::Full => self.values.get_vals_opt(docids, output),
ColumnIndex::Optional(optional_index) => {
for (i, docid) in docids.iter().enumerate() {
output[i] = optional_index
.rank_if_exists(*docid)
.map(|rowid| self.values.get_val(rowid));
}
}
ColumnIndex::Multivalued(multivalued_index) => {
for (i, docid) in docids.iter().enumerate() {
let range = multivalued_index.range(*docid);
let is_empty = range.start == range.end;
if !is_empty {
output[i] = Some(self.values.get_val(range.start));
}
}
}
}
}
/// Translates a block of docis to row_ids. /// Translates a block of docis to row_ids.
/// ///
/// returns the row_ids and the matching docids on the same index /// returns the row_ids and the matching docids on the same index
@@ -131,8 +105,7 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
} }
pub fn values_for_doc(&self, doc_id: DocId) -> impl Iterator<Item = T> + '_ { pub fn values_for_doc(&self, doc_id: DocId) -> impl Iterator<Item = T> + '_ {
self.index self.value_row_ids(doc_id)
.value_row_ids(doc_id)
.map(|value_row_id: RowId| self.values.get_val(value_row_id)) .map(|value_row_id: RowId| self.values.get_val(value_row_id))
} }
@@ -174,6 +147,14 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
} }
} }
impl<T> Deref for Column<T> {
type Target = ColumnIndex;
fn deref(&self) -> &Self::Target {
&self.index
}
}
impl BinarySerializable for Cardinality { impl BinarySerializable for Cardinality {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> std::io::Result<()> { fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> std::io::Result<()> {
self.to_code().serialize(writer) self.to_code().serialize(writer)
@@ -195,7 +176,6 @@ struct FirstValueWithDefault<T: Copy> {
impl<T: PartialOrd + Debug + Send + Sync + Copy + 'static> ColumnValues<T> impl<T: PartialOrd + Debug + Send + Sync + Copy + 'static> ColumnValues<T>
for FirstValueWithDefault<T> for FirstValueWithDefault<T>
{ {
#[inline(always)]
fn get_val(&self, idx: u32) -> T { fn get_val(&self, idx: u32) -> T {
self.column.first(idx).unwrap_or(self.default_value) self.column.first(idx).unwrap_or(self.default_value)
} }
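
The first_vals helper removed above batch-loads the first value per document, dispatching on the column's cardinality. A hedged usage sketch, valid only on the side of the diff that keeps the method:

fn load_first_values(column: &Column<u64>, docids: &[DocId]) -> Vec<Option<u64>> {
    // One Option slot per doc; docs without a value stay None.
    let mut out: Vec<Option<u64>> = vec![None; docids.len()];
    column.first_vals(docids, &mut out);
    out
}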

View File

@@ -76,26 +76,6 @@ pub fn open_column_u128<T: MonotonicallyMappableToU128>(
}) })
} }
/// Open the column as u64.
///
/// See [`open_u128_as_compact_u64`] for more details.
pub fn open_column_u128_as_compact_u64(bytes: OwnedBytes) -> io::Result<Column<u64>> {
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
let column_index_num_bytes = u32::from_le_bytes(
column_index_num_bytes_payload
.as_slice()
.try_into()
.unwrap(),
);
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
let column_index = crate::column_index::open_column_index(column_index_data)?;
let column_values = crate::column_values::open_u128_as_compact_u64(column_values_data)?;
Ok(Column {
index: column_index,
values: column_values,
})
}
pub fn open_column_bytes(data: OwnedBytes) -> io::Result<BytesColumn> { pub fn open_column_bytes(data: OwnedBytes) -> io::Result<BytesColumn> {
let (body, dictionary_len_bytes) = data.rsplit(4); let (body, dictionary_len_bytes) = data.rsplit(4);
let dictionary_len = u32::from_le_bytes(dictionary_len_bytes.as_slice().try_into().unwrap()); let dictionary_len = u32::from_le_bytes(dictionary_len_bytes.as_slice().try_into().unwrap());

View File

@@ -140,7 +140,7 @@ mod tests {
#[test] #[test]
fn test_merge_column_index_optional_shuffle() { fn test_merge_column_index_optional_shuffle() {
let optional_index: ColumnIndex = OptionalIndex::for_test(2, &[0]).into(); let optional_index: ColumnIndex = OptionalIndex::for_test(2, &[0]).into();
let column_indexes = [optional_index, ColumnIndex::Full]; let column_indexes = vec![optional_index, ColumnIndex::Full];
let row_addrs = vec![ let row_addrs = vec![
RowAddr { RowAddr {
segment_ord: 0u32, segment_ord: 0u32,

View File

@@ -42,6 +42,10 @@ impl From<MultiValueIndex> for ColumnIndex {
} }
impl ColumnIndex { impl ColumnIndex {
#[inline]
pub fn is_multivalue(&self) -> bool {
matches!(self, ColumnIndex::Multivalued(_))
}
/// Returns the cardinality of the column index. /// Returns the cardinality of the column index.
/// ///
/// By convention, if the column contains no docs, we consider that it is /// By convention, if the column contains no docs, we consider that it is

View File

@@ -1,3 +1,4 @@
use std::convert::TryInto;
use std::io::{self, Write}; use std::io::{self, Write};
use common::BinarySerializable; use common::BinarySerializable;

View File

@@ -1,4 +1,5 @@
use proptest::prelude::*; use proptest::prelude::{any, prop, *};
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest}; use proptest::{prop_oneof, proptest};
use super::*; use super::*;

View File

@@ -10,7 +10,7 @@ pub(crate) struct MergedColumnValues<'a, T> {
pub(crate) merge_row_order: &'a MergeRowOrder, pub(crate) merge_row_order: &'a MergeRowOrder,
} }
impl<'a, T: Copy + PartialOrd + Debug + 'static> Iterable<T> for MergedColumnValues<'a, T> { impl<'a, T: Copy + PartialOrd + Debug> Iterable<T> for MergedColumnValues<'a, T> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> { fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
match self.merge_row_order { match self.merge_row_order {
MergeRowOrder::Stack(_) => Box::new( MergeRowOrder::Stack(_) => Box::new(

View File

@@ -10,7 +10,6 @@ use std::fmt::Debug;
use std::ops::{Range, RangeInclusive}; use std::ops::{Range, RangeInclusive};
use std::sync::Arc; use std::sync::Arc;
use downcast_rs::DowncastSync;
pub use monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn}; pub use monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
pub use monotonic_mapping_u128::MonotonicallyMappableToU128; pub use monotonic_mapping_u128::MonotonicallyMappableToU128;
@@ -26,10 +25,7 @@ mod monotonic_column;
pub(crate) use merge::MergedColumnValues; pub(crate) use merge::MergedColumnValues;
pub use stats::ColumnStats; pub use stats::ColumnStats;
pub use u128_based::{ pub use u128_based::{open_u128_mapped, serialize_column_values_u128};
open_u128_as_compact_u64, open_u128_mapped, serialize_column_values_u128,
CompactSpaceU64Accessor,
};
pub use u64_based::{ pub use u64_based::{
load_u64_based_column_values, serialize_and_load_u64_based_column_values, load_u64_based_column_values, serialize_and_load_u64_based_column_values,
serialize_u64_based_column_values, CodecType, ALL_U64_CODEC_TYPES, serialize_u64_based_column_values, CodecType, ALL_U64_CODEC_TYPES,
@@ -45,7 +41,7 @@ use crate::RowId;
/// ///
/// Any methods with a default and specialized implementation need to be called in the /// Any methods with a default and specialized implementation need to be called in the
/// wrappers that implement the trait: Arc and MonotonicMappingColumn /// wrappers that implement the trait: Arc and MonotonicMappingColumn
pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync { pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync {
/// Return the value associated with the given idx. /// Return the value associated with the given idx.
/// ///
/// This accessor should return as fast as possible. /// This accessor should return as fast as possible.
@@ -72,40 +68,11 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
             out_x4[3] = self.get_val(idx_x4[3]);
         }
-        let out_and_idx_chunks = output
-            .chunks_exact_mut(4)
-            .into_remainder()
-            .iter_mut()
-            .zip(indexes.chunks_exact(4).remainder());
-        for (out, idx) in out_and_idx_chunks {
-            *out = self.get_val(*idx);
-        }
-    }
-
-    /// Allows to push down multiple fetch calls, to avoid dynamic dispatch overhead.
-    /// The slightly weird `Option<T>` in output allows pushdown to full columns.
-    ///
-    /// idx and output should have the same length
-    ///
-    /// # Panics
-    ///
-    /// May panic if `idx` is greater than the column length.
-    fn get_vals_opt(&self, indexes: &[u32], output: &mut [Option<T>]) {
-        assert!(indexes.len() == output.len());
-        let out_and_idx_chunks = output.chunks_exact_mut(4).zip(indexes.chunks_exact(4));
-        for (out_x4, idx_x4) in out_and_idx_chunks {
-            out_x4[0] = Some(self.get_val(idx_x4[0]));
-            out_x4[1] = Some(self.get_val(idx_x4[1]));
-            out_x4[2] = Some(self.get_val(idx_x4[2]));
-            out_x4[3] = Some(self.get_val(idx_x4[3]));
-        }
-        let out_and_idx_chunks = output
-            .chunks_exact_mut(4)
-            .into_remainder()
-            .iter_mut()
-            .zip(indexes.chunks_exact(4).remainder());
-        for (out, idx) in out_and_idx_chunks {
-            *out = Some(self.get_val(*idx));
+        let step_size = 4;
+        let cutoff = indexes.len() - indexes.len() % step_size;
+
+        for idx in cutoff..indexes.len() {
+            output[idx] = self.get_val(indexes[idx]);
         }
     }
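
Both sides of the hunk above fetch values in groups of four and then sweep the tail; they differ only in how the remainder is expressed (chunks_exact remainder vs. an explicit cutoff index). A minimal sketch of the same batching over a plain slice, not taken from the patch:

fn gather(values: &[u64], indexes: &[u32], output: &mut [u64]) {
    assert_eq!(indexes.len(), output.len());
    // Full groups of four, unrolled so the lookups can pipeline.
    for (out_x4, idx_x4) in output.chunks_exact_mut(4).zip(indexes.chunks_exact(4)) {
        out_x4[0] = values[idx_x4[0] as usize];
        out_x4[1] = values[idx_x4[1] as usize];
        out_x4[2] = values[idx_x4[2] as usize];
        out_x4[3] = values[idx_x4[3] as usize];
    }
    // Tail of fewer than four entries, equivalent to the `cutoff` loop in the hunk.
    let cutoff = indexes.len() - indexes.len() % 4;
    for i in cutoff..indexes.len() {
        output[i] = values[indexes[i] as usize];
    }
}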
@@ -172,7 +139,6 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
Box::new((0..self.num_vals()).map(|idx| self.get_val(idx))) Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
} }
} }
downcast_rs::impl_downcast!(sync ColumnValues<T> where T: PartialOrd);
/// Empty column of values. /// Empty column of values.
pub struct EmptyColumnValues; pub struct EmptyColumnValues;
@@ -195,17 +161,12 @@ impl<T: PartialOrd + Default> ColumnValues<T> for EmptyColumnValues {
} }
} }
impl<T: Copy + PartialOrd + Debug + 'static> ColumnValues<T> for Arc<dyn ColumnValues<T>> { impl<T: Copy + PartialOrd + Debug> ColumnValues<T> for Arc<dyn ColumnValues<T>> {
#[inline(always)] #[inline(always)]
fn get_val(&self, idx: u32) -> T { fn get_val(&self, idx: u32) -> T {
self.as_ref().get_val(idx) self.as_ref().get_val(idx)
} }
#[inline(always)]
fn get_vals_opt(&self, indexes: &[u32], output: &mut [Option<T>]) {
self.as_ref().get_vals_opt(indexes, output)
}
#[inline(always)] #[inline(always)]
fn min_value(&self) -> T { fn min_value(&self) -> T {
self.as_ref().min_value() self.as_ref().min_value()

View File

@@ -31,10 +31,10 @@ pub fn monotonic_map_column<C, T, Input, Output>(
monotonic_mapping: T, monotonic_mapping: T,
) -> impl ColumnValues<Output> ) -> impl ColumnValues<Output>
where where
C: ColumnValues<Input> + 'static, C: ColumnValues<Input>,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync + 'static, T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Debug + Send + Sync + Clone + 'static, Input: PartialOrd + Debug + Send + Sync + Clone,
Output: PartialOrd + Debug + Send + Sync + Clone + 'static, Output: PartialOrd + Debug + Send + Sync + Clone,
{ {
MonotonicMappingColumn { MonotonicMappingColumn {
from_column, from_column,
@@ -45,10 +45,10 @@ where
impl<C, T, Input, Output> ColumnValues<Output> for MonotonicMappingColumn<C, T, Input> impl<C, T, Input, Output> ColumnValues<Output> for MonotonicMappingColumn<C, T, Input>
where where
C: ColumnValues<Input> + 'static, C: ColumnValues<Input>,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync + 'static, T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Send + Debug + Sync + Clone + 'static, Input: PartialOrd + Send + Debug + Sync + Clone,
Output: PartialOrd + Send + Debug + Sync + Clone + 'static, Output: PartialOrd + Send + Debug + Sync + Clone,
{ {
#[inline(always)] #[inline(always)]
fn get_val(&self, idx: u32) -> Output { fn get_val(&self, idx: u32) -> Output {
@@ -107,7 +107,7 @@ mod tests {
#[test] #[test]
fn test_monotonic_mapping_iter() { fn test_monotonic_mapping_iter() {
let vals: Vec<u64> = (0..100u64).map(|el| el * 10).collect(); let vals: Vec<u64> = (0..100u64).map(|el| el * 10).collect();
let col = VecColumn::from(vals); let col = VecColumn::from(&vals);
let mapped = monotonic_map_column( let mapped = monotonic_map_column(
col, col,
StrictlyMonotonicMappingInverter::from(StrictlyMonotonicMappingToInternal::<i64>::new()), StrictlyMonotonicMappingInverter::from(StrictlyMonotonicMappingToInternal::<i64>::new()),

View File

@@ -22,7 +22,7 @@ mod build_compact_space;
use build_compact_space::get_compact_space; use build_compact_space::get_compact_space;
use common::{BinarySerializable, CountingWriter, OwnedBytes, VInt, VIntU128}; use common::{BinarySerializable, CountingWriter, OwnedBytes, VInt, VIntU128};
use tantivy_bitpacker::{BitPacker, BitUnpacker}; use tantivy_bitpacker::{self, BitPacker, BitUnpacker};
use crate::column_values::ColumnValues; use crate::column_values::ColumnValues;
use crate::RowId; use crate::RowId;
@@ -148,7 +148,7 @@ impl CompactSpace {
.binary_search_by_key(&compact, |range_mapping| range_mapping.compact_start) .binary_search_by_key(&compact, |range_mapping| range_mapping.compact_start)
// Correctness: Overflow. The first range starts at compact space 0, the error from // Correctness: Overflow. The first range starts at compact space 0, the error from
// binary search can never be 0 // binary search can never be 0
.unwrap_or_else(|e| e - 1); .map_or_else(|e| e - 1, |v| v);
let range_mapping = &self.ranges_mapping[pos]; let range_mapping = &self.ranges_mapping[pos];
let diff = compact - range_mapping.compact_start; let diff = compact - range_mapping.compact_start;
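
Both spellings above (`unwrap_or_else(|e| e - 1)` and `map_or_else(|e| e - 1, |v| v)`) resolve a failed binary search to the range whose start lies just below the key: Err(e) is the insertion point, so e - 1 indexes the containing range, and e can never be 0 because the first range starts at compact value 0. A hedged sketch with simplified types, not the crate's:

fn containing_range(starts: &[u32], compact: u32) -> usize {
    starts
        .binary_search(&compact)
        // Exact hit: that range. Miss: the insertion point minus one, which is safe as
        // long as starts[0] == 0, so the Err value can never be 0.
        .unwrap_or_else(|insertion_point| insertion_point - 1)
}

// containing_range(&[0, 10, 50], 7) == 0, containing_range(&[0, 10, 50], 10) == 1.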
@@ -292,63 +292,6 @@ impl BinarySerializable for IPCodecParams {
} }
} }
/// Exposes the compact space compressed values as u64.
///
/// This allows faster access to the values, as u64 is faster to work with than u128.
/// It also allows to handle u128 values like u64, via the `open_u64_lenient` as a uniform
/// access interface.
///
/// When converting from the internal u64 to u128 `compact_to_u128` can be used.
pub struct CompactSpaceU64Accessor(CompactSpaceDecompressor);
impl CompactSpaceU64Accessor {
pub(crate) fn open(data: OwnedBytes) -> io::Result<CompactSpaceU64Accessor> {
let decompressor = CompactSpaceU64Accessor(CompactSpaceDecompressor::open(data)?);
Ok(decompressor)
}
/// Convert a compact space value to u128
pub fn compact_to_u128(&self, compact: u32) -> u128 {
self.0.compact_to_u128(compact)
}
}
impl ColumnValues<u64> for CompactSpaceU64Accessor {
#[inline]
fn get_val(&self, doc: u32) -> u64 {
let compact = self.0.get_compact(doc);
compact as u64
}
fn min_value(&self) -> u64 {
self.0.u128_to_compact(self.0.min_value()).unwrap() as u64
}
fn max_value(&self) -> u64 {
self.0.u128_to_compact(self.0.max_value()).unwrap() as u64
}
fn num_vals(&self) -> u32 {
self.0.params.num_vals
}
#[inline]
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
Box::new(self.0.iter_compact().map(|el| el as u64))
}
#[inline]
fn get_row_ids_for_value_range(
&self,
value_range: RangeInclusive<u64>,
position_range: Range<u32>,
positions: &mut Vec<u32>,
) {
let value_range = self.0.compact_to_u128(*value_range.start() as u32)
..=self.0.compact_to_u128(*value_range.end() as u32);
self.0
.get_row_ids_for_value_range(value_range, position_range, positions)
}
}
impl ColumnValues<u128> for CompactSpaceDecompressor { impl ColumnValues<u128> for CompactSpaceDecompressor {
#[inline] #[inline]
fn get_val(&self, doc: u32) -> u128 { fn get_val(&self, doc: u32) -> u128 {
@@ -459,14 +402,9 @@ impl CompactSpaceDecompressor {
.map(|compact| self.compact_to_u128(compact)) .map(|compact| self.compact_to_u128(compact))
} }
#[inline]
pub fn get_compact(&self, idx: u32) -> u32 {
self.params.bit_unpacker.get(idx, &self.data) as u32
}
#[inline] #[inline]
pub fn get(&self, idx: u32) -> u128 { pub fn get(&self, idx: u32) -> u128 {
let compact = self.get_compact(idx); let compact = self.params.bit_unpacker.get(idx, &self.data) as u32;
self.compact_to_u128(compact) self.compact_to_u128(compact)
} }

View File

@@ -6,9 +6,7 @@ use std::sync::Arc;
mod compact_space; mod compact_space;
use common::{BinarySerializable, OwnedBytes, VInt}; use common::{BinarySerializable, OwnedBytes, VInt};
pub use compact_space::{ use compact_space::{CompactSpaceCompressor, CompactSpaceDecompressor};
CompactSpaceCompressor, CompactSpaceDecompressor, CompactSpaceU64Accessor,
};
use crate::column_values::monotonic_map_column; use crate::column_values::monotonic_map_column;
use crate::column_values::monotonic_mapping::{ use crate::column_values::monotonic_mapping::{
@@ -110,23 +108,6 @@ pub fn open_u128_mapped<T: MonotonicallyMappableToU128 + Debug>(
StrictlyMonotonicMappingToInternal::<T>::new().into(); StrictlyMonotonicMappingToInternal::<T>::new().into();
Ok(Arc::new(monotonic_map_column(reader, inverted))) Ok(Arc::new(monotonic_map_column(reader, inverted)))
} }
/// Returns the u64 representation of the u128 data.
/// The internal representation of the data as u64 is useful for faster processing.
///
/// In order to convert to u128 back cast to `CompactSpaceU64Accessor` and call
/// `compact_to_u128`.
///
/// # Notice
/// In case there are new codecs added, check for usages of `CompactSpaceDecompressorU64` and
/// also handle the new codecs.
pub fn open_u128_as_compact_u64(mut bytes: OwnedBytes) -> io::Result<Arc<dyn ColumnValues<u64>>> {
let header = U128Header::deserialize(&mut bytes)?;
assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace);
let reader = CompactSpaceU64Accessor::open(bytes)?;
Ok(Arc::new(reader))
}
#[cfg(test)] #[cfg(test)]
pub mod tests { pub mod tests {
use super::*; use super::*;

View File

@@ -63,6 +63,7 @@ impl ColumnValues for BitpackedReader {
fn get_val(&self, doc: u32) -> u64 { fn get_val(&self, doc: u32) -> u64 {
self.stats.min_value + self.stats.gcd.get() * self.bit_unpacker.get(doc, &self.data) self.stats.min_value + self.stats.gcd.get() * self.bit_unpacker.get(doc, &self.data)
} }
#[inline] #[inline]
fn min_value(&self) -> u64 { fn min_value(&self) -> u64 {
self.stats.min_value self.stats.min_value

View File

@@ -63,10 +63,7 @@ impl BlockwiseLinearEstimator {
if self.block.is_empty() { if self.block.is_empty() {
return; return;
} }
let column = VecColumn::from(std::mem::take(&mut self.block)); let line = Line::train(&VecColumn::from(&self.block));
let line = Line::train(&column);
self.block = column.into();
let mut max_value = 0u64; let mut max_value = 0u64;
for (i, buffer_val) in self.block.iter().enumerate() { for (i, buffer_val) in self.block.iter().enumerate() {
let interpolated_val = line.eval(i as u32); let interpolated_val = line.eval(i as u32);
@@ -128,7 +125,7 @@ impl ColumnCodecEstimator for BlockwiseLinearEstimator {
*buffer_val = gcd_divider.divide(*buffer_val - stats.min_value); *buffer_val = gcd_divider.divide(*buffer_val - stats.min_value);
} }
let line = Line::train(&VecColumn::from(buffer.to_vec())); let line = Line::train(&VecColumn::from(&buffer));
assert!(!buffer.is_empty()); assert!(!buffer.is_empty());

View File

@@ -184,7 +184,7 @@ mod tests {
} }
fn test_eval_max_err(ys: &[u64]) -> Option<u64> { fn test_eval_max_err(ys: &[u64]) -> Option<u64> {
let line = Line::train(&VecColumn::from(ys.to_vec())); let line = Line::train(&VecColumn::from(&ys));
ys.iter() ys.iter()
.enumerate() .enumerate()
.map(|(x, y)| y.wrapping_sub(line.eval(x as u32))) .map(|(x, y)| y.wrapping_sub(line.eval(x as u32)))

View File

@@ -173,9 +173,7 @@ impl LinearCodecEstimator {
fn collect_before_line_estimation(&mut self, value: u64) { fn collect_before_line_estimation(&mut self, value: u64) {
self.block.push(value); self.block.push(value);
if self.block.len() == LINE_ESTIMATION_BLOCK_LEN { if self.block.len() == LINE_ESTIMATION_BLOCK_LEN {
let column = VecColumn::from(std::mem::take(&mut self.block)); let line = Line::train(&VecColumn::from(&self.block));
let line = Line::train(&column);
self.block = column.into();
let block = std::mem::take(&mut self.block); let block = std::mem::take(&mut self.block);
for val in block { for val in block {
self.collect_after_line_estimation(&line, val); self.collect_after_line_estimation(&line, val);

View File

@@ -1,4 +1,5 @@
use proptest::prelude::*; use proptest::prelude::*;
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest}; use proptest::{prop_oneof, proptest};
#[test] #[test]

View File

@@ -4,14 +4,14 @@ use tantivy_bitpacker::minmax;
 use crate::ColumnValues;

-/// VecColumn provides `Column` over a `Vec<T>`.
-pub struct VecColumn<T = u64> {
-    pub(crate) values: Vec<T>,
+/// VecColumn provides `Column` over a slice.
+pub struct VecColumn<'a, T = u64> {
+    pub(crate) values: &'a [T],
     pub(crate) min_value: T,
     pub(crate) max_value: T,
 }

-impl<T: Copy + PartialOrd + Send + Sync + Debug + 'static> ColumnValues<T> for VecColumn<T> {
+impl<'a, T: Copy + PartialOrd + Send + Sync + Debug> ColumnValues<T> for VecColumn<'a, T> {
     fn get_val(&self, position: u32) -> T {
         self.values[position as usize]
     }
@@ -37,8 +37,11 @@ impl<T: Copy + PartialOrd + Send + Sync + Debug + 'static> ColumnValues<T> for V
     }
 }

-impl<T: Copy + PartialOrd + Default> From<Vec<T>> for VecColumn<T> {
-    fn from(values: Vec<T>) -> Self {
+impl<'a, T: Copy + PartialOrd + Default, V> From<&'a V> for VecColumn<'a, T>
+where V: AsRef<[T]> + ?Sized
+{
+    fn from(values: &'a V) -> Self {
+        let values = values.as_ref();
         let (min_value, max_value) = minmax(values.iter().copied()).unwrap_or_default();
         Self {
             values,
@@ -47,8 +50,3 @@ impl<T: Copy + PartialOrd + Default> From<Vec<T>> for VecColumn<T> {
         }
     }
 }
-
-impl From<VecColumn> for Vec<u64> {
-    fn from(column: VecColumn) -> Self {
-        column.values
-    }
-}
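
Under the borrowed definition on the '+' side, a VecColumn no longer owns its data; the From<&V> impl accepts anything AsRef<[T]>. A hedged usage sketch (trait imports elided, behaviour assumed from the impl shown above):

fn vec_column_demo() {
    let vals: Vec<u64> = (0..100u64).map(|v| v * 10).collect();
    let col = VecColumn::from(&vals); // borrows `vals`, no copy
    assert_eq!(col.min_value(), 0);
    assert_eq!(col.max_value(), 990);

    // Arrays work through the same AsRef<[T]> bound.
    let col_from_array = VecColumn::from(&[3u64, 1, 2]);
    assert_eq!(col_from_array.min_value(), 1);
}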

View File

@@ -1,3 +1,7 @@
use std::collections::BTreeMap;
use itertools::Itertools;
use super::*; use super::*;
use crate::{Cardinality, ColumnarWriter, HasAssociatedColumnType, RowId}; use crate::{Cardinality, ColumnarWriter, HasAssociatedColumnType, RowId};

View File

@@ -13,7 +13,9 @@ pub(crate) use serializer::ColumnarSerializer;
use stacker::{Addr, ArenaHashMap, MemoryArena}; use stacker::{Addr, ArenaHashMap, MemoryArena};
use crate::column_index::SerializableColumnIndex; use crate::column_index::SerializableColumnIndex;
use crate::column_values::{MonotonicallyMappableToU128, MonotonicallyMappableToU64}; use crate::column_values::{
ColumnValues, MonotonicallyMappableToU128, MonotonicallyMappableToU64, VecColumn,
};
use crate::columnar::column_type::ColumnType; use crate::columnar::column_type::ColumnType;
use crate::columnar::writer::column_writers::{ use crate::columnar::writer::column_writers::{
ColumnWriter, NumericalColumnWriter, StrOrBytesColumnWriter, ColumnWriter, NumericalColumnWriter, StrOrBytesColumnWriter,
@@ -643,7 +645,10 @@ fn send_to_serialize_column_mappable_to_u128<
value_index_builders: &mut PreallocatedIndexBuilders, value_index_builders: &mut PreallocatedIndexBuilders,
values: &mut Vec<T>, values: &mut Vec<T>,
mut wrt: impl io::Write, mut wrt: impl io::Write,
) -> io::Result<()> { ) -> io::Result<()>
where
for<'a> VecColumn<'a, T>: ColumnValues<T>,
{
values.clear(); values.clear();
// TODO: split index and values // TODO: split index and values
let serializable_column_index = match cardinality { let serializable_column_index = match cardinality {
@@ -696,7 +701,10 @@ fn send_to_serialize_column_mappable_to_u64(
value_index_builders: &mut PreallocatedIndexBuilders, value_index_builders: &mut PreallocatedIndexBuilders,
values: &mut Vec<u64>, values: &mut Vec<u64>,
mut wrt: impl io::Write, mut wrt: impl io::Write,
) -> io::Result<()> { ) -> io::Result<()>
where
for<'a> VecColumn<'a, u64>: ColumnValues<u64>,
{
values.clear(); values.clear();
let serializable_column_index = match cardinality { let serializable_column_index = match cardinality {
Cardinality::Full => { Cardinality::Full => {

View File

@@ -18,12 +18,7 @@ pub struct ColumnarSerializer<W: io::Write> {
 /// code.
 fn prepare_key(key: &[u8], column_type: ColumnType, buffer: &mut Vec<u8>) {
     buffer.clear();
-    // Convert 0 bytes to '0' string, as 0 bytes are reserved for the end of the path.
-    if key.contains(&0u8) {
-        buffer.extend(key.iter().map(|&b| if b == 0 { b'0' } else { b }));
-    } else {
-        buffer.extend_from_slice(key);
-    }
+    buffer.extend_from_slice(key);
     buffer.push(0u8);
     buffer.push(column_type.to_code());
 }
@@ -101,13 +96,14 @@ impl<'a, W: io::Write> io::Write for ColumnSerializer<'a, W> {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::columnar::column_type::ColumnType;

     #[test]
     fn test_prepare_key_bytes() {
         let mut buffer: Vec<u8> = b"somegarbage".to_vec();
         prepare_key(b"root\0child", ColumnType::Str, &mut buffer);
         assert_eq!(buffer.len(), 12);
-        assert_eq!(&buffer[..10], b"root0child");
+        assert_eq!(&buffer[..10], b"root\0child");
         assert_eq!(buffer[10], 0u8);
         assert_eq!(buffer[11], ColumnType::Str.to_code());
     }
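
A worked example of the two behaviours of prepare_key on a key containing a 0 byte, plus a hedged standalone sketch of the escaping variant ('-' side); the trailing 0 byte terminates the key path and the final byte is the column-type code.

// Escaping variant ('-' side), as a standalone sketch:
fn prepare_key_escaped(key: &[u8], type_code: u8, buffer: &mut Vec<u8>) {
    buffer.clear();
    // 0 bytes inside the key are rewritten to b'0' because 0 is reserved as the terminator.
    buffer.extend(key.iter().map(|&b| if b == 0 { b'0' } else { b }));
    buffer.push(0u8);       // end-of-path marker
    buffer.push(type_code); // column type code
}

// For key b"root\0child":
//   '-' side (escaped):  buffer == b"root0child\0<code>"
//   '+' side (verbatim): buffer == b"root\0child\0<code>"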

View File

@@ -8,7 +8,7 @@ use common::{ByteCount, DateTime, HasLen, OwnedBytes};
use crate::column::{BytesColumn, Column, StrColumn}; use crate::column::{BytesColumn, Column, StrColumn};
use crate::column_values::{monotonic_map_column, StrictlyMonotonicFn}; use crate::column_values::{monotonic_map_column, StrictlyMonotonicFn};
use crate::columnar::ColumnType; use crate::columnar::ColumnType;
use crate::{Cardinality, ColumnIndex, ColumnValues, NumericalType}; use crate::{Cardinality, ColumnIndex, NumericalType};
#[derive(Clone)] #[derive(Clone)]
pub enum DynamicColumn { pub enum DynamicColumn {
@@ -247,12 +247,7 @@ impl DynamicColumnHandle {
} }
/// Returns the `u64` fast field reader reader associated with `fields` of types /// Returns the `u64` fast field reader reader associated with `fields` of types
/// Str, u64, i64, f64, bool, ip, or datetime. /// Str, u64, i64, f64, bool, or datetime.
///
/// Notice that for IpAddr, the fastfield reader will return the u64 representation of the
/// IpAddr.
/// In order to convert to u128 back cast to `CompactSpaceU64Accessor` and call
/// `compact_to_u128`.
/// ///
/// If not, the fastfield reader will returns the u64-value associated with the original /// If not, the fastfield reader will returns the u64-value associated with the original
/// FastValue. /// FastValue.
@@ -263,10 +258,7 @@ impl DynamicColumnHandle {
let column: BytesColumn = crate::column::open_column_bytes(column_bytes)?; let column: BytesColumn = crate::column::open_column_bytes(column_bytes)?;
Ok(Some(column.term_ord_column)) Ok(Some(column.term_ord_column))
} }
ColumnType::IpAddr => { ColumnType::IpAddr => Ok(None),
let column = crate::column::open_column_u128_as_compact_u64(column_bytes)?;
Ok(Some(column))
}
ColumnType::Bool ColumnType::Bool
| ColumnType::I64 | ColumnType::I64
| ColumnType::U64 | ColumnType::U64

View File

@@ -113,9 +113,6 @@ impl Cardinality {
pub fn is_multivalue(&self) -> bool { pub fn is_multivalue(&self) -> bool {
matches!(self, Cardinality::Multivalued) matches!(self, Cardinality::Multivalued)
} }
pub fn is_full(&self) -> bool {
matches!(self, Cardinality::Full)
}
pub(crate) fn to_code(self) -> u8 { pub(crate) fn to_code(self) -> u8 {
self as u8 self as u8
} }

View File

@@ -1,3 +1,4 @@
use std::convert::TryInto;
use std::io::Write; use std::io::Write;
use std::{fmt, io, u64}; use std::{fmt, io, u64};

View File

@@ -40,7 +40,7 @@ pub type DatePrecision = DateTimePrecision;
/// All constructors and conversions are provided as explicit /// All constructors and conversions are provided as explicit
/// functions and not by implementing any `From`/`Into` traits /// functions and not by implementing any `From`/`Into` traits
/// to prevent unintended usage. /// to prevent unintended usage.
#[derive(Clone, Default, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] #[derive(Clone, Default, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct DateTime { pub struct DateTime {
// Timestamp in nanoseconds. // Timestamp in nanoseconds.
pub(crate) timestamp_nanos: i64, pub(crate) timestamp_nanos: i64,

View File

@@ -290,7 +290,8 @@ impl<'a> BinarySerializable for Cow<'a, [u8]> {
#[cfg(test)] #[cfg(test)]
pub mod test { pub mod test {
use super::*; use super::{VInt, *};
use crate::serialize::BinarySerializable;
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() { pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new(); let mut buffer = Vec::new();
O::default().serialize(&mut buffer).unwrap(); O::default().serialize(&mut buffer).unwrap();

View File

@@ -1,3 +1,4 @@
use std::convert::TryInto;
use std::ops::{Deref, Range}; use std::ops::{Deref, Range};
use std::sync::Arc; use std::sync::Arc;
use std::{fmt, io}; use std::{fmt, io};

View File

@@ -170,8 +170,8 @@ impl AggregationWithAccessor {
ColumnType::Str, ColumnType::Str,
ColumnType::DateTime, ColumnType::DateTime,
ColumnType::Bool, ColumnType::Bool,
ColumnType::IpAddr,
// ColumnType::Bytes Unsupported // ColumnType::Bytes Unsupported
// ColumnType::IpAddr Unsupported
]; ];
// In case the column is empty we want the shim column to match the missing type // In case the column is empty we want the shim column to match the missing type
@@ -292,7 +292,7 @@ impl AggregationWithAccessor {
add_agg_with_accessor(&agg, accessor, column_type, &mut res)?; add_agg_with_accessor(&agg, accessor, column_type, &mut res)?;
} }
TopHits(ref mut top_hits) => { TopHits(ref mut top_hits) => {
top_hits.validate_and_resolve_field_names(reader.fast_fields().columnar())?; top_hits.validate_and_resolve(reader.fast_fields().columnar())?;
let accessors: Vec<(Column<u64>, ColumnType)> = top_hits let accessors: Vec<(Column<u64>, ColumnType)> = top_hits
.field_names() .field_names()
.iter() .iter()

View File

@@ -4,7 +4,6 @@ use crate::aggregation::agg_req::{Aggregation, Aggregations};
use crate::aggregation::agg_result::AggregationResults; use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::buf_collector::DOC_BLOCK_SIZE; use crate::aggregation::buf_collector::DOC_BLOCK_SIZE;
use crate::aggregation::collector::AggregationCollector; use crate::aggregation::collector::AggregationCollector;
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
use crate::aggregation::segment_agg_result::AggregationLimits; use crate::aggregation::segment_agg_result::AggregationLimits;
use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values_and_terms}; use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values_and_terms};
use crate::aggregation::DistributedAggregationCollector; use crate::aggregation::DistributedAggregationCollector;
@@ -67,22 +66,6 @@ fn test_aggregation_flushing(
} }
} }
}, },
"top_hits_test":{
"terms": {
"field": "string_id"
},
"aggs": {
"bucketsL2": {
"top_hits": {
"size": 2,
"sort": [
{ "score": "asc" }
],
"docvalue_fields": ["score"]
}
}
}
},
"histogram_test":{ "histogram_test":{
"histogram": { "histogram": {
"field": "score", "field": "score",
@@ -125,16 +108,6 @@ fn test_aggregation_flushing(
let searcher = reader.searcher(); let searcher = reader.searcher();
let intermediate_agg_result = searcher.search(&AllQuery, &collector).unwrap(); let intermediate_agg_result = searcher.search(&AllQuery, &collector).unwrap();
// Test postcard roundtrip serialization
let intermediate_agg_result_bytes = postcard::to_allocvec(&intermediate_agg_result).expect(
"Postcard Serialization failed, flatten etc. is not supported in the intermediate \
result",
);
let intermediate_agg_result: IntermediateAggregationResults =
postcard::from_bytes(&intermediate_agg_result_bytes)
.expect("Post deserialization failed");
intermediate_agg_result intermediate_agg_result
.into_final_result(agg_req, &Default::default()) .into_final_result(agg_req, &Default::default())
.unwrap() .unwrap()
@@ -843,38 +816,38 @@ fn test_aggregation_on_json_object_mixed_types() {
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
// => Segment with all values numeric // => Segment with all values numeric
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0, "mixed_price": 10.0}))) .add_document(doc!(json => json!({"mixed_type": 10.0})))
.unwrap(); .unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();
// => Segment with all values text // => Segment with all values text
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": "blue", "mixed_price": 5.0}))) .add_document(doc!(json => json!({"mixed_type": "blue"})))
.unwrap(); .unwrap();
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": "blue", "mixed_price": 5.0}))) .add_document(doc!(json => json!({"mixed_type": "blue"})))
.unwrap(); .unwrap();
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": "blue", "mixed_price": 5.0}))) .add_document(doc!(json => json!({"mixed_type": "blue"})))
.unwrap(); .unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();
// => Segment with all boolen // => Segment with all boolen
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": true, "mixed_price": "no_price"}))) .add_document(doc!(json => json!({"mixed_type": true})))
.unwrap(); .unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();
// => Segment with mixed values // => Segment with mixed values
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": "red", "mixed_price": 1.0}))) .add_document(doc!(json => json!({"mixed_type": "red"})))
.unwrap(); .unwrap();
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": "red", "mixed_price": 1.0}))) .add_document(doc!(json => json!({"mixed_type": "red"})))
.unwrap(); .unwrap();
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": -20.5, "mixed_price": -20.5}))) .add_document(doc!(json => json!({"mixed_type": -20.5})))
.unwrap(); .unwrap();
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": true, "mixed_price": "no_price"}))) .add_document(doc!(json => json!({"mixed_type": true})))
.unwrap(); .unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();
@@ -888,7 +861,7 @@ fn test_aggregation_on_json_object_mixed_types() {
"order": { "min_price": "desc" } "order": { "min_price": "desc" }
}, },
"aggs": { "aggs": {
"min_price": { "min": { "field": "json.mixed_price" } } "min_price": { "min": { "field": "json.mixed_type" } }
} }
}, },
"rangeagg": { "rangeagg": {
@@ -912,6 +885,7 @@ fn test_aggregation_on_json_object_mixed_types() {
let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap(); let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap();
let aggregation_res_json = serde_json::to_value(aggregation_results).unwrap(); let aggregation_res_json = serde_json::to_value(aggregation_results).unwrap();
// pretty print as json
use pretty_assertions::assert_eq; use pretty_assertions::assert_eq;
assert_eq!( assert_eq!(
&aggregation_res_json, &aggregation_res_json,
@@ -927,10 +901,10 @@ fn test_aggregation_on_json_object_mixed_types() {
"termagg": { "termagg": {
"buckets": [ "buckets": [
{ "doc_count": 1, "key": 10.0, "min_price": { "value": 10.0 } }, { "doc_count": 1, "key": 10.0, "min_price": { "value": 10.0 } },
{ "doc_count": 3, "key": "blue", "min_price": { "value": 5.0 } },
{ "doc_count": 2, "key": "red", "min_price": { "value": 1.0 } },
{ "doc_count": 1, "key": -20.5, "min_price": { "value": -20.5 } }, { "doc_count": 1, "key": -20.5, "min_price": { "value": -20.5 } },
{ "doc_count": 2, "key": "red", "min_price": { "value": null } },
{ "doc_count": 2, "key": 1.0, "key_as_string": "true", "min_price": { "value": null } }, { "doc_count": 2, "key": 1.0, "key_as_string": "true", "min_price": { "value": null } },
{ "doc_count": 3, "key": "blue", "min_price": { "value": null } },
], ],
"sum_other_doc_count": 0 "sum_other_doc_count": 0
} }

View File

@@ -1,5 +1,8 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use std::fmt::Display;
use columnar::ColumnType;
use itertools::Itertools;
use rustc_hash::FxHashMap; use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use tantivy_bitpacker::minmax; use tantivy_bitpacker::minmax;
@@ -15,7 +18,7 @@ use crate::aggregation::intermediate_agg_result::{
IntermediateHistogramBucketEntry, IntermediateHistogramBucketEntry,
}; };
use crate::aggregation::segment_agg_result::{ use crate::aggregation::segment_agg_result::{
build_segment_agg_collector, SegmentAggregationCollector, build_segment_agg_collector, AggregationLimits, SegmentAggregationCollector,
}; };
use crate::aggregation::*; use crate::aggregation::*;
use crate::TantivyError; use crate::TantivyError;
@@ -307,10 +310,7 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
.column_block_accessor .column_block_accessor
.fetch_block(docs, &bucket_agg_accessor.accessor); .fetch_block(docs, &bucket_agg_accessor.accessor);
for (doc, val) in bucket_agg_accessor for (doc, val) in bucket_agg_accessor.column_block_accessor.iter_docid_vals() {
.column_block_accessor
.iter_docid_vals(docs, &bucket_agg_accessor.accessor)
{
let val = self.f64_from_fastfield_u64(val); let val = self.f64_from_fastfield_u64(val);
let bucket_pos = get_bucket_pos(val); let bucket_pos = get_bucket_pos(val);
@@ -597,11 +597,13 @@ mod tests {
use serde_json::Value; use serde_json::Value;
use super::*; use super::*;
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::agg_result::AggregationResults; use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::tests::{ use crate::aggregation::tests::{
exec_request, exec_request_with_query, exec_request_with_query_and_memory_limit, exec_request, exec_request_with_query, exec_request_with_query_and_memory_limit,
get_test_index_2_segments, get_test_index_from_values, get_test_index_with_num_docs, get_test_index_2_segments, get_test_index_from_values, get_test_index_with_num_docs,
}; };
use crate::aggregation::AggregationCollector;
use crate::query::AllQuery; use crate::query::AllQuery;
#[test] #[test]

View File

@@ -1,6 +1,7 @@
use std::fmt::Debug; use std::fmt::Debug;
use std::ops::Range; use std::ops::Range;
use columnar::{ColumnType, MonotonicallyMappableToU64};
use rustc_hash::FxHashMap; use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
@@ -235,10 +236,7 @@ impl SegmentAggregationCollector for SegmentRangeCollector {
.column_block_accessor .column_block_accessor
.fetch_block(docs, &bucket_agg_accessor.accessor); .fetch_block(docs, &bucket_agg_accessor.accessor);
for (doc, val) in bucket_agg_accessor for (doc, val) in bucket_agg_accessor.column_block_accessor.iter_docid_vals() {
.column_block_accessor
.iter_docid_vals(docs, &bucket_agg_accessor.accessor)
{
let bucket_pos = self.get_bucket_pos(val); let bucket_pos = self.get_bucket_pos(val);
let bucket = &mut self.buckets[bucket_pos]; let bucket = &mut self.buckets[bucket_pos];
@@ -449,6 +447,7 @@ pub(crate) fn range_to_key(range: &Range<u64>, field_type: &ColumnType) -> crate
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use columnar::MonotonicallyMappableToU64;
use serde_json::Value; use serde_json::Value;
use super::*; use super::*;
@@ -457,6 +456,7 @@ mod tests {
exec_request, exec_request_with_query, get_test_index_2_segments, exec_request, exec_request_with_query, get_test_index_2_segments,
get_test_index_with_num_docs, get_test_index_with_num_docs,
}; };
use crate::aggregation::AggregationLimits;
pub fn get_collector_from_ranges( pub fn get_collector_from_ranges(
ranges: Vec<RangeAggregationRange>, ranges: Vec<RangeAggregationRange>,

View File

@@ -1,10 +1,6 @@
use std::fmt::Debug; use std::fmt::Debug;
use std::net::Ipv6Addr;
use columnar::column_values::CompactSpaceU64Accessor; use columnar::{BytesColumn, ColumnType, MonotonicallyMappableToU64, StrColumn};
use columnar::{
BytesColumn, ColumnType, MonotonicallyMappableToU128, MonotonicallyMappableToU64, StrColumn,
};
use rustc_hash::FxHashMap; use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
@@ -109,9 +105,9 @@ pub struct TermsAggregation {
     ///
     /// Defaults to 10 * size.
     #[serde(skip_serializing_if = "Option::is_none", default)]
-    #[serde(alias = "shard_size")]
+    #[serde(alias = "segment_size")]
     #[serde(alias = "split_size")]
-    pub segment_size: Option<u32>,
+    pub shard_size: Option<u32>,

     /// If you set the `show_term_doc_count_error` parameter to true, the terms aggregation will
     /// include doc_count_error_upper_bound, which is an upper bound to the error on the
@@ -200,7 +196,7 @@ impl TermsAggregationInternal {
     pub(crate) fn from_req(req: &TermsAggregation) -> Self {
         let size = req.size.unwrap_or(10);
-        let mut segment_size = req.segment_size.unwrap_or(size * 10);
+        let mut segment_size = req.shard_size.unwrap_or(size * 10);
         let order = req.order.clone().unwrap_or_default();
         segment_size = segment_size.max(size);
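
Because of the serde aliases shown above, the `shard_size` spelling deserializes into the renamed field on both sides of the diff; a hedged sketch in the same json! style as the tests in this compare (the alias set is read off the hunk):

fn parse_terms_request() -> serde_json::Result<Aggregations> {
    serde_json::from_value(serde_json::json!({
        "my_terms": {
            "terms": { "field": "string_id", "shard_size": 100 }
        }
    }))
}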
@@ -310,10 +306,7 @@ impl SegmentAggregationCollector for SegmentTermCollector {
} }
// has subagg // has subagg
if let Some(blueprint) = self.blueprint.as_ref() { if let Some(blueprint) = self.blueprint.as_ref() {
for (doc, term_id) in bucket_agg_accessor for (doc, term_id) in bucket_agg_accessor.column_block_accessor.iter_docid_vals() {
.column_block_accessor
.iter_docid_vals(docs, &bucket_agg_accessor.accessor)
{
let sub_aggregations = self let sub_aggregations = self
.term_buckets .term_buckets
.sub_aggs .sub_aggs
@@ -542,27 +535,6 @@ impl SegmentTermCollector {
let val = bool::from_u64(val); let val = bool::from_u64(val);
dict.insert(IntermediateKey::Bool(val), intermediate_entry); dict.insert(IntermediateKey::Bool(val), intermediate_entry);
} }
} else if self.column_type == ColumnType::IpAddr {
let compact_space_accessor = agg_with_accessor
.accessor
.values
.clone()
.downcast_arc::<CompactSpaceU64Accessor>()
.map_err(|_| {
TantivyError::AggregationError(
crate::aggregation::AggregationError::InternalError(
"Type mismatch: Could not downcast to CompactSpaceU64Accessor"
.to_string(),
),
)
})?;
for (val, doc_count) in entries {
let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?;
let val: u128 = compact_space_accessor.compact_to_u128(val as u32);
let val = Ipv6Addr::from_u128(val);
dict.insert(IntermediateKey::IpAddr(val), intermediate_entry);
}
} else { } else {
for (val, doc_count) in entries { for (val, doc_count) in entries {
let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?; let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?;
@@ -615,9 +587,6 @@ pub(crate) fn cut_off_buckets<T: GetDocCount + Debug>(
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::net::IpAddr;
use std::str::FromStr;
use common::DateTime; use common::DateTime;
use time::{Date, Month}; use time::{Date, Month};
@@ -628,7 +597,7 @@ mod tests {
}; };
use crate::aggregation::AggregationLimits; use crate::aggregation::AggregationLimits;
use crate::indexer::NoMergePolicy; use crate::indexer::NoMergePolicy;
use crate::schema::{IntoIpv6Addr, Schema, FAST, STRING}; use crate::schema::{Schema, FAST, STRING};
use crate::{Index, IndexWriter}; use crate::{Index, IndexWriter};
#[test] #[test]
@@ -1210,9 +1179,9 @@ mod tests {
assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma"); assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4); assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4);
assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb"); assert_eq!(res["my_texts"]["buckets"][1]["key"], "termc");
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 0); assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 0);
assert_eq!(res["my_texts"]["buckets"][2]["key"], "termc"); assert_eq!(res["my_texts"]["buckets"][2]["key"], "termb");
assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 0); assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 0);
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0); assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0); assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);
@@ -1958,44 +1927,4 @@ mod tests {
Ok(()) Ok(())
} }
#[test]
fn terms_aggregation_ip_addr() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_ip_addr_field("ip_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
// IpV6 loopback
writer.add_document(doc!(field=>IpAddr::from_str("::1").unwrap().into_ipv6_addr()))?;
writer.add_document(doc!(field=>IpAddr::from_str("::1").unwrap().into_ipv6_addr()))?;
// IpV4
writer.add_document(
doc!(field=>IpAddr::from_str("127.0.0.1").unwrap().into_ipv6_addr()),
)?;
writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"my_bool": {
"terms": {
"field": "ip_field"
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// print as json
// println!("{}", serde_json::to_string_pretty(&res).unwrap());
assert_eq!(res["my_bool"]["buckets"][0]["key"], "::1");
assert_eq!(res["my_bool"]["buckets"][0]["doc_count"], 2);
assert_eq!(res["my_bool"]["buckets"][1]["key"], "127.0.0.1");
assert_eq!(res["my_bool"]["buckets"][1]["doc_count"], 1);
assert_eq!(res["my_bool"]["buckets"][2]["key"], serde_json::Value::Null);
Ok(())
}
} }

View File

@@ -5,7 +5,6 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use std::collections::hash_map::Entry; use std::collections::hash_map::Entry;
use std::hash::Hash; use std::hash::Hash;
use std::net::Ipv6Addr;
use columnar::ColumnType; use columnar::ColumnType;
use itertools::Itertools; use itertools::Itertools;
@@ -20,7 +19,7 @@ use super::bucket::{
}; };
use super::metric::{ use super::metric::{
IntermediateAverage, IntermediateCount, IntermediateMax, IntermediateMin, IntermediateStats, IntermediateAverage, IntermediateCount, IntermediateMax, IntermediateMin, IntermediateStats,
IntermediateSum, PercentilesCollector, TopHitsTopNComputer, IntermediateSum, PercentilesCollector, TopHitsCollector,
}; };
use super::segment_agg_result::AggregationLimits; use super::segment_agg_result::AggregationLimits;
use super::{format_date, AggregationError, Key, SerializedKey}; use super::{format_date, AggregationError, Key, SerializedKey};
@@ -42,8 +41,6 @@ pub struct IntermediateAggregationResults {
/// This might seem redundant with `Key`, but the point is to have a different /// This might seem redundant with `Key`, but the point is to have a different
/// Serialize implementation. /// Serialize implementation.
pub enum IntermediateKey { pub enum IntermediateKey {
/// Ip Addr key
IpAddr(Ipv6Addr),
/// Bool key /// Bool key
Bool(bool), Bool(bool),
/// String key /// String key
@@ -63,14 +60,6 @@ impl From<IntermediateKey> for Key {
fn from(value: IntermediateKey) -> Self { fn from(value: IntermediateKey) -> Self {
match value { match value {
IntermediateKey::Str(s) => Self::Str(s), IntermediateKey::Str(s) => Self::Str(s),
IntermediateKey::IpAddr(s) => {
// Prefer to use the IPv4 representation if possible
if let Some(ip) = s.to_ipv4_mapped() {
Self::Str(ip.to_string())
} else {
Self::Str(s.to_string())
}
}
IntermediateKey::F64(f) => Self::F64(f), IntermediateKey::F64(f) => Self::F64(f),
IntermediateKey::Bool(f) => Self::F64(f as u64 as f64), IntermediateKey::Bool(f) => Self::F64(f as u64 as f64),
} }
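
The "prefer IPv4" branch above relies on std's `Ipv6Addr::to_ipv4_mapped`; a minimal standalone illustration of the preference it implements (addresses are arbitrary examples):

use std::net::Ipv6Addr;
use std::str::FromStr;

fn main() {
    // An IPv4-mapped address is rendered with its IPv4 representation...
    let mapped = Ipv6Addr::from_str("::ffff:127.0.0.1").unwrap();
    assert_eq!(mapped.to_ipv4_mapped().unwrap().to_string(), "127.0.0.1");
    // ...while a plain IPv6 address keeps its IPv6 string form ("::1").
    let loopback = Ipv6Addr::from_str("::1").unwrap();
    assert!(loopback.to_ipv4_mapped().is_none());
}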
@@ -86,7 +75,6 @@ impl std::hash::Hash for IntermediateKey {
IntermediateKey::Str(text) => text.hash(state), IntermediateKey::Str(text) => text.hash(state),
IntermediateKey::F64(val) => val.to_bits().hash(state), IntermediateKey::F64(val) => val.to_bits().hash(state),
IntermediateKey::Bool(val) => val.hash(state), IntermediateKey::Bool(val) => val.hash(state),
IntermediateKey::IpAddr(val) => val.hash(state),
} }
} }
} }
@@ -221,9 +209,9 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
Percentiles(_) => IntermediateAggregationResult::Metric( Percentiles(_) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::Percentiles(PercentilesCollector::default()), IntermediateMetricResult::Percentiles(PercentilesCollector::default()),
), ),
TopHits(ref req) => IntermediateAggregationResult::Metric( TopHits(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::TopHits(
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req.clone())), TopHitsCollector::default(),
), )),
} }
} }
@@ -285,7 +273,7 @@ pub enum IntermediateMetricResult {
/// Intermediate sum result. /// Intermediate sum result.
Sum(IntermediateSum), Sum(IntermediateSum),
/// Intermediate top_hits result /// Intermediate top_hits result
TopHits(TopHitsTopNComputer), TopHits(TopHitsCollector),
} }
impl IntermediateMetricResult { impl IntermediateMetricResult {
@@ -314,7 +302,7 @@ impl IntermediateMetricResult {
.into_final_result(req.agg.as_percentile().expect("unexpected metric type")), .into_final_result(req.agg.as_percentile().expect("unexpected metric type")),
), ),
IntermediateMetricResult::TopHits(top_hits) => { IntermediateMetricResult::TopHits(top_hits) => {
MetricResult::TopHits(top_hits.into_final_result()) MetricResult::TopHits(top_hits.finalize())
} }
} }
} }

View File

@@ -25,8 +25,6 @@ mod stats;
mod sum; mod sum;
mod top_hits; mod top_hits;
use std::collections::HashMap;
pub use average::*; pub use average::*;
pub use count::*; pub use count::*;
pub use max::*; pub use max::*;
@@ -38,8 +36,6 @@ pub use stats::*;
pub use sum::*; pub use sum::*;
pub use top_hits::*; pub use top_hits::*;
use crate::schema::OwnedValue;
/// Single-metric aggregations use this common result structure. /// Single-metric aggregations use this common result structure.
/// ///
/// Main reason to wrap it in value is to match elasticsearch output structure. /// Main reason to wrap it in value is to match elasticsearch output structure.
@@ -96,9 +92,8 @@ pub struct TopHitsVecEntry {
/// Search results, for queries that include field retrieval requests /// Search results, for queries that include field retrieval requests
/// (`docvalue_fields`). /// (`docvalue_fields`).
#[serde(rename = "docvalue_fields")] #[serde(flatten)]
#[serde(skip_serializing_if = "HashMap::is_empty")] pub search_results: FieldRetrivalResult,
pub doc_value_fields: HashMap<String, OwnedValue>,
} }
/// The top_hits metric aggregation results a list of top hits by sort criteria. /// The top_hits metric aggregation results a list of top hits by sort criteria.

View File

@@ -1,5 +1,6 @@
use std::fmt::Debug; use std::fmt::Debug;
use columnar::ColumnType;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::*; use super::*;

View File

@@ -1,3 +1,4 @@
use columnar::ColumnType;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::*; use super::*;

View File

@@ -1,8 +1,7 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::net::Ipv6Addr; use std::fmt::Formatter;
use columnar::{ColumnarReader, DynamicColumn}; use columnar::{ColumnarReader, DynamicColumn};
use common::DateTime;
use regex::Regex; use regex::Regex;
use serde::ser::SerializeMap; use serde::ser::SerializeMap;
use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde::{Deserialize, Deserializer, Serialize, Serializer};
@@ -93,61 +92,53 @@ pub struct TopHitsAggregation {
size: usize, size: usize,
from: Option<usize>, from: Option<usize>,
#[serde(flatten)]
retrieval: RetrievalFields,
}
const fn default_doc_value_fields() -> Vec<String> {
Vec::new()
}
/// Search query spec for each matched document
/// TODO: move this to a common module
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
pub struct RetrievalFields {
/// The fast fields to return for each hit.
/// This is the only variant supported for now.
/// TODO: support the {field, format} variant for custom formatting.
#[serde(rename = "docvalue_fields")] #[serde(rename = "docvalue_fields")]
#[serde(default)] #[serde(default = "default_doc_value_fields")]
doc_value_fields: Vec<String>, pub doc_value_fields: Vec<String>,
} }
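
For orientation, a top_hits request exercising these fields could look roughly like the sketch below; the aggregation name and field names are invented, and the shape is inferred from the struct definitions above rather than copied from the test suite.

// `Aggregations` here is crate::aggregation::agg_req::Aggregations.
let agg_req: Aggregations = serde_json::from_value(serde_json::json!({
    "top_ranked": {
        "top_hits": {
            "size": 2,
            // each sort entry is a single-key map, matching the KeyOrder (de)serialization below
            "sort": [ { "rating": "desc" } ],
            // fast fields to return per hit; `*` globs are resolved against the columnar reader
            "docvalue_fields": ["rating", "attributes.*"]
        }
    }
}))
.unwrap();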
#[derive(Debug, Clone, PartialEq, Default)] /// Search query result for each matched document
struct KeyOrder { /// TODO: move this to a common module
field: String, #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
order: Order, pub struct FieldRetrivalResult {
/// The fast fields returned for each hit.
#[serde(rename = "docvalue_fields")]
#[serde(skip_serializing_if = "HashMap::is_empty")]
pub doc_value_fields: HashMap<String, OwnedValue>,
} }
impl Serialize for KeyOrder { impl RetrievalFields {
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> { fn get_field_names(&self) -> Vec<&str> {
let KeyOrder { field, order } = self; self.doc_value_fields.iter().map(|s| s.as_str()).collect()
let mut map = serializer.serialize_map(Some(1))?;
map.serialize_entry(field, order)?;
map.end()
} }
}
impl<'de> Deserialize<'de> for KeyOrder { fn resolve_field_names(&mut self, reader: &ColumnarReader) -> crate::Result<()> {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> // Transform a glob (`pattern*`, for example) into a regex::Regex (`^pattern.*$`)
where D: Deserializer<'de> { let globbed_string_to_regex = |glob: &str| {
let mut key_order = <HashMap<String, Order>>::deserialize(deserializer)?.into_iter(); // Replace `*` glob with `.*` regex
let (field, order) = key_order.next().ok_or(serde::de::Error::custom( let sanitized = format!("^{}$", regex::escape(glob).replace(r"\*", ".*"));
"Expected exactly one key-value pair in sort parameter of top_hits, found none", Regex::new(&sanitized.replace('*', ".*")).map_err(|e| {
))?; crate::TantivyError::SchemaError(format!(
if key_order.next().is_some() { "Invalid regex '{}' in docvalue_fields: {}",
return Err(serde::de::Error::custom(format!( glob, e
"Expected exactly one key-value pair in sort parameter of top_hits, found {:?}", ))
key_order })
))); };
}
Ok(Self { field, order })
}
}
// Transform a glob (`pattern*`, for example) into a regex::Regex (`^pattern.*$`)
fn globbed_string_to_regex(glob: &str) -> Result<Regex, crate::TantivyError> {
// Replace `*` glob with `.*` regex
let sanitized = format!("^{}$", regex::escape(glob).replace(r"\*", ".*"));
Regex::new(&sanitized.replace('*', ".*")).map_err(|e| {
crate::TantivyError::SchemaError(format!(
"Invalid regex '{}' in docvalue_fields: {}",
glob, e
))
})
}
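
A standalone sketch of the same glob-to-regex idea, simplified from the helper above (hypothetical function; only `*` is treated as a wildcard):

use regex::Regex;

fn glob_to_regex(glob: &str) -> Result<Regex, regex::Error> {
    // Escape regex metacharacters, then turn the escaped `*` back into `.*`, anchored on both ends.
    Regex::new(&format!("^{}$", regex::escape(glob).replace(r"\*", ".*")))
}

fn main() {
    let re = glob_to_regex("attributes.*").unwrap();
    assert!(re.is_match("attributes.color"));
    assert!(!re.is_match("other.color"));
}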
impl TopHitsAggregation {
/// Validate and resolve field retrieval parameters
pub fn validate_and_resolve_field_names(
&mut self,
reader: &ColumnarReader,
) -> crate::Result<()> {
self.doc_value_fields = self self.doc_value_fields = self
.doc_value_fields .doc_value_fields
.iter() .iter()
@@ -184,25 +175,12 @@ impl TopHitsAggregation {
Ok(()) Ok(())
} }
/// Return fields accessed by the aggregator, in order.
pub fn field_names(&self) -> Vec<&str> {
self.sort
.iter()
.map(|KeyOrder { field, .. }| field.as_str())
.collect()
}
/// Return fields accessed by the aggregator's value retrieval.
pub fn value_field_names(&self) -> Vec<&str> {
self.doc_value_fields.iter().map(|s| s.as_str()).collect()
}
fn get_document_field_data( fn get_document_field_data(
&self, &self,
accessors: &HashMap<String, Vec<DynamicColumn>>, accessors: &HashMap<String, Vec<DynamicColumn>>,
doc_id: DocId, doc_id: DocId,
) -> HashMap<String, FastFieldValue> { ) -> FieldRetrivalResult {
let doc_value_fields = self let dvf = self
.doc_value_fields .doc_value_fields
.iter() .iter()
.map(|field| { .map(|field| {
@@ -210,20 +188,20 @@ impl TopHitsAggregation {
.get(field) .get(field)
.unwrap_or_else(|| panic!("field '{}' not found in accessors", field)); .unwrap_or_else(|| panic!("field '{}' not found in accessors", field));
let values: Vec<FastFieldValue> = accessors let values: Vec<OwnedValue> = accessors
.iter() .iter()
.flat_map(|accessor| match accessor { .flat_map(|accessor| match accessor {
DynamicColumn::U64(accessor) => accessor DynamicColumn::U64(accessor) => accessor
.values_for_doc(doc_id) .values_for_doc(doc_id)
.map(FastFieldValue::U64) .map(OwnedValue::U64)
.collect::<Vec<_>>(), .collect::<Vec<_>>(),
DynamicColumn::I64(accessor) => accessor DynamicColumn::I64(accessor) => accessor
.values_for_doc(doc_id) .values_for_doc(doc_id)
.map(FastFieldValue::I64) .map(OwnedValue::I64)
.collect::<Vec<_>>(), .collect::<Vec<_>>(),
DynamicColumn::F64(accessor) => accessor DynamicColumn::F64(accessor) => accessor
.values_for_doc(doc_id) .values_for_doc(doc_id)
.map(FastFieldValue::F64) .map(OwnedValue::F64)
.collect::<Vec<_>>(), .collect::<Vec<_>>(),
DynamicColumn::Bytes(accessor) => accessor DynamicColumn::Bytes(accessor) => accessor
.term_ords(doc_id) .term_ords(doc_id)
@@ -235,7 +213,7 @@ impl TopHitsAggregation {
.expect("could not read term dictionary"), .expect("could not read term dictionary"),
"term corresponding to term_ord does not exist" "term corresponding to term_ord does not exist"
); );
FastFieldValue::Bytes(buffer) OwnedValue::Bytes(buffer)
}) })
.collect::<Vec<_>>(), .collect::<Vec<_>>(),
DynamicColumn::Str(accessor) => accessor DynamicColumn::Str(accessor) => accessor
@@ -248,82 +226,94 @@ impl TopHitsAggregation {
.expect("could not read term dictionary"), .expect("could not read term dictionary"),
"term corresponding to term_ord does not exist" "term corresponding to term_ord does not exist"
); );
FastFieldValue::Str(String::from_utf8(buffer).unwrap()) OwnedValue::Str(String::from_utf8(buffer).unwrap())
}) })
.collect::<Vec<_>>(), .collect::<Vec<_>>(),
DynamicColumn::Bool(accessor) => accessor DynamicColumn::Bool(accessor) => accessor
.values_for_doc(doc_id) .values_for_doc(doc_id)
.map(FastFieldValue::Bool) .map(OwnedValue::Bool)
.collect::<Vec<_>>(), .collect::<Vec<_>>(),
DynamicColumn::IpAddr(accessor) => accessor DynamicColumn::IpAddr(accessor) => accessor
.values_for_doc(doc_id) .values_for_doc(doc_id)
.map(FastFieldValue::IpAddr) .map(OwnedValue::IpAddr)
.collect::<Vec<_>>(), .collect::<Vec<_>>(),
DynamicColumn::DateTime(accessor) => accessor DynamicColumn::DateTime(accessor) => accessor
.values_for_doc(doc_id) .values_for_doc(doc_id)
.map(FastFieldValue::Date) .map(OwnedValue::Date)
.collect::<Vec<_>>(), .collect::<Vec<_>>(),
}) })
.collect(); .collect();
(field.to_owned(), FastFieldValue::Array(values)) (field.to_owned(), OwnedValue::Array(values))
}) })
.collect(); .collect();
doc_value_fields FieldRetrivalResult {
} doc_value_fields: dvf,
}
/// A retrieved value from a fast field.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum FastFieldValue {
/// The str type is used for any text information.
Str(String),
/// Unsigned 64-bits Integer `u64`
U64(u64),
/// Signed 64-bits Integer `i64`
I64(i64),
/// 64-bits Float `f64`
F64(f64),
/// Bool value
Bool(bool),
/// Date/time with nanoseconds precision
Date(DateTime),
/// Arbitrarily sized byte array
Bytes(Vec<u8>),
/// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`.
IpAddr(Ipv6Addr),
/// A list of values.
Array(Vec<Self>),
}
impl From<FastFieldValue> for OwnedValue {
fn from(value: FastFieldValue) -> Self {
match value {
FastFieldValue::Str(s) => OwnedValue::Str(s),
FastFieldValue::U64(u) => OwnedValue::U64(u),
FastFieldValue::I64(i) => OwnedValue::I64(i),
FastFieldValue::F64(f) => OwnedValue::F64(f),
FastFieldValue::Bool(b) => OwnedValue::Bool(b),
FastFieldValue::Date(d) => OwnedValue::Date(d),
FastFieldValue::Bytes(b) => OwnedValue::Bytes(b),
FastFieldValue::IpAddr(ip) => OwnedValue::IpAddr(ip),
FastFieldValue::Array(a) => {
OwnedValue::Array(a.into_iter().map(OwnedValue::from).collect())
}
} }
} }
} }
/// Holds a fast field value in its u64 representation, and the order in which it should be sorted. #[derive(Debug, Clone, PartialEq, Default)]
#[derive(Clone, Serialize, Deserialize, Debug)] struct KeyOrder {
struct DocValueAndOrder { field: String,
/// A fast field value in its u64 representation.
value: Option<u64>,
/// Sort order for the value
order: Order, order: Order,
} }
impl Ord for DocValueAndOrder { impl Serialize for KeyOrder {
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
let KeyOrder { field, order } = self;
let mut map = serializer.serialize_map(Some(1))?;
map.serialize_entry(field, order)?;
map.end()
}
}
impl<'de> Deserialize<'de> for KeyOrder {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de> {
let mut k_o = <HashMap<String, Order>>::deserialize(deserializer)?.into_iter();
let (k, v) = k_o.next().ok_or(serde::de::Error::custom(
"Expected exactly one key-value pair in KeyOrder, found none",
))?;
if k_o.next().is_some() {
return Err(serde::de::Error::custom(
"Expected exactly one key-value pair in KeyOrder, found more",
));
}
Ok(Self { field: k, order: v })
}
}
impl TopHitsAggregation {
/// Validate and resolve field retrieval parameters
pub fn validate_and_resolve(&mut self, reader: &ColumnarReader) -> crate::Result<()> {
self.retrieval.resolve_field_names(reader)
}
/// Return fields accessed by the aggregator, in order.
pub fn field_names(&self) -> Vec<&str> {
self.sort
.iter()
.map(|KeyOrder { field, .. }| field.as_str())
.collect()
}
/// Return fields accessed by the aggregator's value retrieval.
pub fn value_field_names(&self) -> Vec<&str> {
self.retrieval.get_field_names()
}
}
/// Holds a single comparable doc feature, and the order in which it should be sorted.
#[derive(Clone, Serialize, Deserialize, Debug)]
struct ComparableDocFeature {
/// Stores any u64-mappable feature.
value: Option<u64>,
/// Sort order for the doc feature
order: Order,
}
impl Ord for ComparableDocFeature {
fn cmp(&self, other: &Self) -> std::cmp::Ordering { fn cmp(&self, other: &Self) -> std::cmp::Ordering {
let invert = |cmp: std::cmp::Ordering| match self.order { let invert = |cmp: std::cmp::Ordering| match self.order {
Order::Asc => cmp, Order::Asc => cmp,
@@ -339,32 +329,26 @@ impl Ord for DocValueAndOrder {
} }
} }
impl PartialOrd for DocValueAndOrder { impl PartialOrd for ComparableDocFeature {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other)) Some(self.cmp(other))
} }
} }
impl PartialEq for DocValueAndOrder { impl PartialEq for ComparableDocFeature {
fn eq(&self, other: &Self) -> bool { fn eq(&self, other: &Self) -> bool {
self.value.cmp(&other.value) == std::cmp::Ordering::Equal self.value.cmp(&other.value) == std::cmp::Ordering::Equal
} }
} }
impl Eq for DocValueAndOrder {} impl Eq for ComparableDocFeature {}
#[derive(Clone, Serialize, Deserialize, Debug)] #[derive(Clone, Serialize, Deserialize, Debug)]
struct DocSortValuesAndFields { struct ComparableDocFeatures(Vec<ComparableDocFeature>, FieldRetrivalResult);
sorts: Vec<DocValueAndOrder>,
#[serde(rename = "docvalue_fields")] impl Ord for ComparableDocFeatures {
#[serde(skip_serializing_if = "HashMap::is_empty")]
doc_value_fields: HashMap<String, FastFieldValue>,
}
impl Ord for DocSortValuesAndFields {
fn cmp(&self, other: &Self) -> std::cmp::Ordering { fn cmp(&self, other: &Self) -> std::cmp::Ordering {
for (self_feature, other_feature) in self.sorts.iter().zip(other.sorts.iter()) { for (self_feature, other_feature) in self.0.iter().zip(other.0.iter()) {
let cmp = self_feature.cmp(other_feature); let cmp = self_feature.cmp(other_feature);
if cmp != std::cmp::Ordering::Equal { if cmp != std::cmp::Ordering::Equal {
return cmp; return cmp;
@@ -374,43 +358,53 @@ impl Ord for DocSortValuesAndFields {
} }
} }
impl PartialOrd for DocSortValuesAndFields { impl PartialOrd for ComparableDocFeatures {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other)) Some(self.cmp(other))
} }
} }
impl PartialEq for DocSortValuesAndFields { impl PartialEq for ComparableDocFeatures {
fn eq(&self, other: &Self) -> bool { fn eq(&self, other: &Self) -> bool {
self.cmp(other) == std::cmp::Ordering::Equal self.cmp(other) == std::cmp::Ordering::Equal
} }
} }
impl Eq for DocSortValuesAndFields {} impl Eq for ComparableDocFeatures {}
/// The TopHitsCollector used for collecting over segments and merging results. /// The TopHitsCollector used for collecting over segments and merging results.
#[derive(Clone, Serialize, Deserialize, Debug)] #[derive(Clone, Serialize, Deserialize)]
pub struct TopHitsTopNComputer { pub struct TopHitsCollector {
req: TopHitsAggregation, req: TopHitsAggregation,
top_n: TopNComputer<DocSortValuesAndFields, DocAddress, false>, top_n: TopNComputer<ComparableDocFeatures, DocAddress, false>,
} }
impl std::cmp::PartialEq for TopHitsTopNComputer { impl Default for TopHitsCollector {
fn default() -> Self {
Self {
req: TopHitsAggregation::default(),
top_n: TopNComputer::new(1),
}
}
}
impl std::fmt::Debug for TopHitsCollector {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
f.debug_struct("TopHitsCollector")
.field("req", &self.req)
.field("top_n_threshold", &self.top_n.threshold)
.finish()
}
}
impl std::cmp::PartialEq for TopHitsCollector {
fn eq(&self, _other: &Self) -> bool { fn eq(&self, _other: &Self) -> bool {
false false
} }
} }
impl TopHitsTopNComputer { impl TopHitsCollector {
/// Create a new TopHitsCollector fn collect(&mut self, features: ComparableDocFeatures, doc: DocAddress) {
pub fn new(req: TopHitsAggregation) -> Self {
Self {
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
req,
}
}
fn collect(&mut self, features: DocSortValuesAndFields, doc: DocAddress) {
self.top_n.push(features, doc); self.top_n.push(features, doc);
} }
@@ -422,19 +416,14 @@ impl TopHitsTopNComputer {
} }
/// Finalize by converting self into the final result form /// Finalize by converting self into the final result form
pub fn into_final_result(self) -> TopHitsMetricResult { pub fn finalize(self) -> TopHitsMetricResult {
let mut hits: Vec<TopHitsVecEntry> = self let mut hits: Vec<TopHitsVecEntry> = self
.top_n .top_n
.into_sorted_vec() .into_sorted_vec()
.into_iter() .into_iter()
.map(|doc| TopHitsVecEntry { .map(|doc| TopHitsVecEntry {
sort: doc.feature.sorts.iter().map(|f| f.value).collect(), sort: doc.feature.0.iter().map(|f| f.value).collect(),
doc_value_fields: doc search_results: doc.feature.1,
.feature
.doc_value_fields
.into_iter()
.map(|(k, v)| (k, v.into()))
.collect(),
}) })
.collect(); .collect();
@@ -447,63 +436,48 @@ impl TopHitsTopNComputer {
} }
} }
#[derive(Clone, Debug)] #[derive(Clone)]
pub(crate) struct TopHitsSegmentCollector { pub(crate) struct SegmentTopHitsCollector {
segment_ordinal: SegmentOrdinal, segment_ordinal: SegmentOrdinal,
accessor_idx: usize, accessor_idx: usize,
req: TopHitsAggregation, inner_collector: TopHitsCollector,
top_n: TopNComputer<Vec<DocValueAndOrder>, DocAddress, false>,
} }
impl TopHitsSegmentCollector { impl SegmentTopHitsCollector {
pub fn from_req( pub fn from_req(
req: &TopHitsAggregation, req: &TopHitsAggregation,
accessor_idx: usize, accessor_idx: usize,
segment_ordinal: SegmentOrdinal, segment_ordinal: SegmentOrdinal,
) -> Self { ) -> Self {
Self { Self {
req: req.clone(), inner_collector: TopHitsCollector {
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)), req: req.clone(),
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
},
segment_ordinal, segment_ordinal,
accessor_idx, accessor_idx,
} }
} }
fn into_top_hits_collector( }
self,
value_accessors: &HashMap<String, Vec<DynamicColumn>>,
) -> TopHitsTopNComputer {
let mut top_hits_computer = TopHitsTopNComputer::new(self.req.clone());
let top_results = self.top_n.into_vec();
for res in top_results { impl std::fmt::Debug for SegmentTopHitsCollector {
let doc_value_fields = self fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
.req f.debug_struct("SegmentTopHitsCollector")
.get_document_field_data(value_accessors, res.doc.doc_id); .field("segment_id", &self.segment_ordinal)
top_hits_computer.collect( .field("accessor_idx", &self.accessor_idx)
DocSortValuesAndFields { .field("inner_collector", &self.inner_collector)
sorts: res.feature, .finish()
doc_value_fields,
},
res.doc,
);
}
top_hits_computer
} }
} }
impl SegmentAggregationCollector for TopHitsSegmentCollector { impl SegmentAggregationCollector for SegmentTopHitsCollector {
fn add_intermediate_aggregation_result( fn add_intermediate_aggregation_result(
self: Box<Self>, self: Box<Self>,
agg_with_accessor: &crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor, agg_with_accessor: &crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults, results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
) -> crate::Result<()> { ) -> crate::Result<()> {
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string(); let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let intermediate_result = IntermediateMetricResult::TopHits(self.inner_collector);
let value_accessors = &agg_with_accessor.aggs.values[self.accessor_idx].value_accessors;
let intermediate_result =
IntermediateMetricResult::TopHits(self.into_top_hits_collector(value_accessors));
results.push( results.push(
name, name,
IntermediateAggregationResult::Metric(intermediate_result), IntermediateAggregationResult::Metric(intermediate_result),
@@ -516,7 +490,9 @@ impl SegmentAggregationCollector for TopHitsSegmentCollector {
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor, agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
) -> crate::Result<()> { ) -> crate::Result<()> {
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors; let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
let sorts: Vec<DocValueAndOrder> = self let value_accessors = &agg_with_accessor.aggs.values[self.accessor_idx].value_accessors;
let features: Vec<ComparableDocFeature> = self
.inner_collector
.req .req
.sort .sort
.iter() .iter()
@@ -529,12 +505,18 @@ impl SegmentAggregationCollector for TopHitsSegmentCollector {
.0 .0
.values_for_doc(doc_id) .values_for_doc(doc_id)
.next(); .next();
DocValueAndOrder { value, order } ComparableDocFeature { value, order }
}) })
.collect(); .collect();
self.top_n.push( let retrieval_result = self
sorts, .inner_collector
.req
.retrieval
.get_document_field_data(value_accessors, doc_id);
self.inner_collector.collect(
ComparableDocFeatures(features, retrieval_result),
DocAddress { DocAddress {
segment_ord: self.segment_ordinal, segment_ord: self.segment_ordinal,
doc_id, doc_id,
@@ -548,7 +530,11 @@ impl SegmentAggregationCollector for TopHitsSegmentCollector {
docs: &[crate::DocId], docs: &[crate::DocId],
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor, agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
) -> crate::Result<()> { ) -> crate::Result<()> {
// TODO: Consider getting fields with the column block accessor. // TODO: Consider getting fields with the column block accessor and refactor this.
// ---
// Would the additional complexity of getting fields with the column_block_accessor
// make sense here? Probably yes, but I want to get a first-pass review first
// before proceeding.
for doc in docs { for doc in docs {
self.collect(*doc, agg_with_accessor)?; self.collect(*doc, agg_with_accessor)?;
} }
@@ -563,7 +549,7 @@ mod tests {
use serde_json::Value; use serde_json::Value;
use time::macros::datetime; use time::macros::datetime;
use super::{DocSortValuesAndFields, DocValueAndOrder, Order}; use super::{ComparableDocFeature, ComparableDocFeatures, Order};
use crate::aggregation::agg_req::Aggregations; use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::agg_result::AggregationResults; use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::bucket::tests::get_test_index_from_docs; use crate::aggregation::bucket::tests::get_test_index_from_docs;
@@ -571,44 +557,44 @@ mod tests {
use crate::aggregation::AggregationCollector; use crate::aggregation::AggregationCollector;
use crate::collector::ComparableDoc; use crate::collector::ComparableDoc;
use crate::query::AllQuery; use crate::query::AllQuery;
use crate::schema::OwnedValue; use crate::schema::OwnedValue as SchemaValue;
fn invert_order(cmp_feature: DocValueAndOrder) -> DocValueAndOrder { fn invert_order(cmp_feature: ComparableDocFeature) -> ComparableDocFeature {
let DocValueAndOrder { value, order } = cmp_feature; let ComparableDocFeature { value, order } = cmp_feature;
let order = match order { let order = match order {
Order::Asc => Order::Desc, Order::Asc => Order::Desc,
Order::Desc => Order::Asc, Order::Desc => Order::Asc,
}; };
DocValueAndOrder { value, order } ComparableDocFeature { value, order }
} }
fn collector_with_capacity(capacity: usize) -> super::TopHitsTopNComputer { fn collector_with_capacity(capacity: usize) -> super::TopHitsCollector {
super::TopHitsTopNComputer { super::TopHitsCollector {
top_n: super::TopNComputer::new(capacity), top_n: super::TopNComputer::new(capacity),
req: Default::default(), ..Default::default()
} }
} }
fn invert_order_features(mut cmp_features: DocSortValuesAndFields) -> DocSortValuesAndFields { fn invert_order_features(cmp_features: ComparableDocFeatures) -> ComparableDocFeatures {
cmp_features.sorts = cmp_features let ComparableDocFeatures(cmp_features, search_results) = cmp_features;
.sorts let cmp_features = cmp_features
.into_iter() .into_iter()
.map(invert_order) .map(invert_order)
.collect::<Vec<_>>(); .collect::<Vec<_>>();
cmp_features ComparableDocFeatures(cmp_features, search_results)
} }
#[test] #[test]
fn test_comparable_doc_feature() -> crate::Result<()> { fn test_comparable_doc_feature() -> crate::Result<()> {
let small = DocValueAndOrder { let small = ComparableDocFeature {
value: Some(1), value: Some(1),
order: Order::Asc, order: Order::Asc,
}; };
let big = DocValueAndOrder { let big = ComparableDocFeature {
value: Some(2), value: Some(2),
order: Order::Asc, order: Order::Asc,
}; };
let none = DocValueAndOrder { let none = ComparableDocFeature {
value: None, value: None,
order: Order::Asc, order: Order::Asc,
}; };
@@ -630,21 +616,21 @@ mod tests {
#[test] #[test]
fn test_comparable_doc_features() -> crate::Result<()> { fn test_comparable_doc_features() -> crate::Result<()> {
let features_1 = DocSortValuesAndFields { let features_1 = ComparableDocFeatures(
sorts: vec![DocValueAndOrder { vec![ComparableDocFeature {
value: Some(1), value: Some(1),
order: Order::Asc, order: Order::Asc,
}], }],
doc_value_fields: Default::default(), Default::default(),
}; );
let features_2 = DocSortValuesAndFields { let features_2 = ComparableDocFeatures(
sorts: vec![DocValueAndOrder { vec![ComparableDocFeature {
value: Some(2), value: Some(2),
order: Order::Asc, order: Order::Asc,
}], }],
doc_value_fields: Default::default(), Default::default(),
}; );
assert!(features_1 < features_2); assert!(features_1 < features_2);
@@ -703,39 +689,39 @@ mod tests {
segment_ord: 0, segment_ord: 0,
doc_id: 0, doc_id: 0,
}, },
feature: DocSortValuesAndFields { feature: ComparableDocFeatures(
sorts: vec![DocValueAndOrder { vec![ComparableDocFeature {
value: Some(1), value: Some(1),
order: Order::Asc, order: Order::Asc,
}], }],
doc_value_fields: Default::default(), Default::default(),
}, ),
}, },
ComparableDoc { ComparableDoc {
doc: crate::DocAddress { doc: crate::DocAddress {
segment_ord: 0, segment_ord: 0,
doc_id: 2, doc_id: 2,
}, },
feature: DocSortValuesAndFields { feature: ComparableDocFeatures(
sorts: vec![DocValueAndOrder { vec![ComparableDocFeature {
value: Some(3), value: Some(3),
order: Order::Asc, order: Order::Asc,
}], }],
doc_value_fields: Default::default(), Default::default(),
}, ),
}, },
ComparableDoc { ComparableDoc {
doc: crate::DocAddress { doc: crate::DocAddress {
segment_ord: 0, segment_ord: 0,
doc_id: 1, doc_id: 1,
}, },
feature: DocSortValuesAndFields { feature: ComparableDocFeatures(
sorts: vec![DocValueAndOrder { vec![ComparableDocFeature {
value: Some(5), value: Some(5),
order: Order::Asc, order: Order::Asc,
}], }],
doc_value_fields: Default::default(), Default::default(),
}, ),
}, },
]; ];
@@ -744,23 +730,23 @@ mod tests {
collector.collect(doc.feature, doc.doc); collector.collect(doc.feature, doc.doc);
} }
let res = collector.into_final_result(); let res = collector.finalize();
assert_eq!( assert_eq!(
res, res,
super::TopHitsMetricResult { super::TopHitsMetricResult {
hits: vec![ hits: vec![
super::TopHitsVecEntry { super::TopHitsVecEntry {
sort: vec![docs[0].feature.sorts[0].value], sort: vec![docs[0].feature.0[0].value],
doc_value_fields: Default::default(), search_results: Default::default(),
}, },
super::TopHitsVecEntry { super::TopHitsVecEntry {
sort: vec![docs[1].feature.sorts[0].value], sort: vec![docs[1].feature.0[0].value],
doc_value_fields: Default::default(), search_results: Default::default(),
}, },
super::TopHitsVecEntry { super::TopHitsVecEntry {
sort: vec![docs[2].feature.sorts[0].value], sort: vec![docs[2].feature.0[0].value],
doc_value_fields: Default::default(), search_results: Default::default(),
}, },
] ]
} }
@@ -817,7 +803,7 @@ mod tests {
{ {
"sort": [common::i64_to_u64(date_2017.unix_timestamp_nanos() as i64)], "sort": [common::i64_to_u64(date_2017.unix_timestamp_nanos() as i64)],
"docvalue_fields": { "docvalue_fields": {
"date": [ OwnedValue::Date(DateTime::from_utc(date_2017)) ], "date": [ SchemaValue::Date(DateTime::from_utc(date_2017)) ],
"text": [ "ccc" ], "text": [ "ccc" ],
"text2": [ "ddd" ], "text2": [ "ddd" ],
"mixed.dyn_arr": [ 3, "4" ], "mixed.dyn_arr": [ 3, "4" ],
@@ -826,7 +812,7 @@ mod tests {
{ {
"sort": [common::i64_to_u64(date_2016.unix_timestamp_nanos() as i64)], "sort": [common::i64_to_u64(date_2016.unix_timestamp_nanos() as i64)],
"docvalue_fields": { "docvalue_fields": {
"date": [ OwnedValue::Date(DateTime::from_utc(date_2016)) ], "date": [ SchemaValue::Date(DateTime::from_utc(date_2016)) ],
"text": [ "aaa" ], "text": [ "aaa" ],
"text2": [ "bbb" ], "text2": [ "bbb" ],
"mixed.dyn_arr": [ 6, "7" ], "mixed.dyn_arr": [ 6, "7" ],

View File

@@ -417,6 +417,7 @@ mod tests {
use time::OffsetDateTime; use time::OffsetDateTime;
use super::agg_req::Aggregations; use super::agg_req::Aggregations;
use super::segment_agg_result::AggregationLimits;
use super::*; use super::*;
use crate::indexer::NoMergePolicy; use crate::indexer::NoMergePolicy;
use crate::query::{AllQuery, TermQuery}; use crate::query::{AllQuery, TermQuery};

View File

@@ -16,7 +16,7 @@ use super::metric::{
SumAggregation, SumAggregation,
}; };
use crate::aggregation::bucket::TermMissingAgg; use crate::aggregation::bucket::TermMissingAgg;
use crate::aggregation::metric::TopHitsSegmentCollector; use crate::aggregation::metric::SegmentTopHitsCollector;
pub(crate) trait SegmentAggregationCollector: CollectorClone + Debug { pub(crate) trait SegmentAggregationCollector: CollectorClone + Debug {
fn add_intermediate_aggregation_result( fn add_intermediate_aggregation_result(
@@ -161,7 +161,7 @@ pub(crate) fn build_single_agg_segment_collector(
accessor_idx, accessor_idx,
)?, )?,
)), )),
TopHits(top_hits_req) => Ok(Box::new(TopHitsSegmentCollector::from_req( TopHits(top_hits_req) => Ok(Box::new(SegmentTopHitsCollector::from_req(
top_hits_req, top_hits_req,
accessor_idx, accessor_idx,
req.segment_ordinal, req.segment_ordinal,

View File

@@ -160,7 +160,7 @@ mod tests {
use super::{add_vecs, HistogramCollector, HistogramComputer}; use super::{add_vecs, HistogramCollector, HistogramComputer};
use crate::schema::{Schema, FAST}; use crate::schema::{Schema, FAST};
use crate::time::{Date, Month}; use crate::time::{Date, Month};
use crate::{query, DateTime, Index}; use crate::{doc, query, DateTime, Index};
#[test] #[test]
fn test_add_histograms_simple() { fn test_add_histograms_simple() {

View File

@@ -274,10 +274,6 @@ pub trait SegmentCollector: 'static {
fn collect(&mut self, doc: DocId, score: Score); fn collect(&mut self, doc: DocId, score: Score);
/// The query pushes the scored document to the collector via this method. /// The query pushes the scored document to the collector via this method.
/// This method is used when the collector does not require scoring.
///
/// See [`COLLECT_BLOCK_BUFFER_LEN`](crate::COLLECT_BLOCK_BUFFER_LEN) for the
/// buffer size passed to the collector.
fn collect_block(&mut self, docs: &[DocId]) { fn collect_block(&mut self, docs: &[DocId]) {
for doc in docs { for doc in docs {
self.collect(*doc, 0.0); self.collect(*doc, 0.0);

View File

@@ -52,16 +52,10 @@ impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
impl SegmentCollector for Box<dyn BoxableSegmentCollector> { impl SegmentCollector for Box<dyn BoxableSegmentCollector> {
type Fruit = Box<dyn Fruit>; type Fruit = Box<dyn Fruit>;
#[inline]
fn collect(&mut self, doc: u32, score: Score) { fn collect(&mut self, doc: u32, score: Score) {
self.as_mut().collect(doc, score); self.as_mut().collect(doc, score);
} }
#[inline]
fn collect_block(&mut self, docs: &[DocId]) {
self.as_mut().collect_block(docs);
}
fn harvest(self) -> Box<dyn Fruit> { fn harvest(self) -> Box<dyn Fruit> {
BoxableSegmentCollector::harvest_from_box(self) BoxableSegmentCollector::harvest_from_box(self)
} }
@@ -69,11 +63,6 @@ impl SegmentCollector for Box<dyn BoxableSegmentCollector> {
pub trait BoxableSegmentCollector { pub trait BoxableSegmentCollector {
fn collect(&mut self, doc: u32, score: Score); fn collect(&mut self, doc: u32, score: Score);
fn collect_block(&mut self, docs: &[DocId]) {
for &doc in docs {
self.collect(doc, 0.0);
}
}
fn harvest_from_box(self: Box<Self>) -> Box<dyn Fruit>; fn harvest_from_box(self: Box<Self>) -> Box<dyn Fruit>;
} }
@@ -82,14 +71,9 @@ pub struct SegmentCollectorWrapper<TSegmentCollector: SegmentCollector>(TSegment
impl<TSegmentCollector: SegmentCollector> BoxableSegmentCollector impl<TSegmentCollector: SegmentCollector> BoxableSegmentCollector
for SegmentCollectorWrapper<TSegmentCollector> for SegmentCollectorWrapper<TSegmentCollector>
{ {
#[inline]
fn collect(&mut self, doc: u32, score: Score) { fn collect(&mut self, doc: u32, score: Score) {
self.0.collect(doc, score); self.0.collect(doc, score);
} }
#[inline]
fn collect_block(&mut self, docs: &[DocId]) {
self.0.collect_block(docs);
}
fn harvest_from_box(self: Box<Self>) -> Box<dyn Fruit> { fn harvest_from_box(self: Box<Self>) -> Box<dyn Fruit> {
Box::new(self.0.harvest()) Box::new(self.0.harvest())

View File

@@ -1,11 +1,15 @@
use columnar::{BytesColumn, Column}; use columnar::{BytesColumn, Column};
use super::*; use super::*;
use crate::collector::{Count, FilterCollector, TopDocs};
use crate::index::SegmentReader;
use crate::query::{AllQuery, QueryParser}; use crate::query::{AllQuery, QueryParser};
use crate::schema::{Schema, FAST, TEXT}; use crate::schema::{Schema, FAST, TEXT};
use crate::time::format_description::well_known::Rfc3339; use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime; use crate::time::OffsetDateTime;
use crate::{DateTime, DocAddress, Index, Searcher, TantivyDocument}; use crate::{
doc, DateTime, DocAddress, DocId, Index, Score, Searcher, SegmentOrdinal, TantivyDocument,
};
pub const TEST_COLLECTOR_WITH_SCORE: TestCollector = TestCollector { pub const TEST_COLLECTOR_WITH_SCORE: TestCollector = TestCollector {
compute_score: true, compute_score: true,

View File

@@ -732,19 +732,6 @@ pub struct TopNComputer<Score, D, const REVERSE_ORDER: bool = true> {
top_n: usize, top_n: usize,
pub(crate) threshold: Option<Score>, pub(crate) threshold: Option<Score>,
} }
impl<Score: std::fmt::Debug, D, const REVERSE_ORDER: bool> std::fmt::Debug
for TopNComputer<Score, D, REVERSE_ORDER>
{
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("TopNComputer")
.field("buffer_len", &self.buffer.len())
.field("top_n", &self.top_n)
.field("current_threshold", &self.threshold)
.finish()
}
}
// Intermediate struct for TopNComputer for deserialization, to keep vec capacity // Intermediate struct for TopNComputer for deserialization, to keep vec capacity
#[derive(Deserialize)] #[derive(Deserialize)]
struct TopNComputerDeser<Score, D, const REVERSE_ORDER: bool> { struct TopNComputerDeser<Score, D, const REVERSE_ORDER: bool> {

View File

@@ -137,6 +137,7 @@ mod mmap_specific {
use tempfile::TempDir; use tempfile::TempDir;
use super::*; use super::*;
use crate::Directory;
#[test] #[test]
fn test_index_on_commit_reload_policy_mmap() -> crate::Result<()> { fn test_index_on_commit_reload_policy_mmap() -> crate::Result<()> {

View File

@@ -1,5 +1,6 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::io::{self, Read, Write}; use std::io::{self, Read, Write};
use std::iter::ExactSizeIterator;
use std::ops::Range; use std::ops::Range;
use common::{BinarySerializable, CountingWriter, HasLen, VInt}; use common::{BinarySerializable, CountingWriter, HasLen, VInt};

View File

@@ -1,4 +1,5 @@
use std::io::Write; use std::io::Write;
use std::marker::{Send, Sync};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::sync::Arc; use std::sync::Arc;
use std::time::Duration; use std::time::Duration;
@@ -39,7 +40,6 @@ impl RetryPolicy {
/// The `DirectoryLock` is an object that represents a file lock. /// The `DirectoryLock` is an object that represents a file lock.
/// ///
/// It is associated with a lock file, that gets deleted on `Drop.` /// It is associated with a lock file, that gets deleted on `Drop.`
#[allow(dead_code)]
pub struct DirectoryLock(Box<dyn Send + Sync + 'static>); pub struct DirectoryLock(Box<dyn Send + Sync + 'static>);
struct DirectoryLockGuard { struct DirectoryLockGuard {

View File

@@ -1,6 +1,6 @@
use std::io::Write; use std::io::Write;
use std::mem; use std::mem;
use std::path::Path; use std::path::{Path, PathBuf};
use std::sync::atomic::Ordering::SeqCst; use std::sync::atomic::Ordering::SeqCst;
use std::sync::atomic::{AtomicBool, AtomicUsize}; use std::sync::atomic::{AtomicBool, AtomicUsize};
use std::sync::Arc; use std::sync::Arc;

View File

@@ -32,7 +32,6 @@ pub struct WatchCallbackList {
/// file change is detected. /// file change is detected.
#[must_use = "This `WatchHandle` controls the lifetime of the watch and should therefore be used."] #[must_use = "This `WatchHandle` controls the lifetime of the watch and should therefore be used."]
#[derive(Clone)] #[derive(Clone)]
#[allow(dead_code)]
pub struct WatchHandle(Arc<WatchCallback>); pub struct WatchHandle(Arc<WatchCallback>);
impl WatchHandle { impl WatchHandle {

View File

@@ -9,10 +9,7 @@ use crate::DocId;
/// to compare `[u32; 4]`. /// to compare `[u32; 4]`.
pub const TERMINATED: DocId = i32::MAX as u32; pub const TERMINATED: DocId = i32::MAX as u32;
/// The collect_block method on `SegmentCollector` uses a buffer of this size. pub const BUFFER_LEN: usize = 64;
/// Passed results to `collect_block` will not exceed this size and will be
/// exactly this size as long as we can fill the buffer.
pub const COLLECT_BLOCK_BUFFER_LEN: usize = 64;
/// Represents an iterable set of sorted doc ids. /// Represents an iterable set of sorted doc ids.
pub trait DocSet: Send { pub trait DocSet: Send {
@@ -64,7 +61,7 @@ pub trait DocSet: Send {
/// This method is only here for the specific high-performance /// This method is only here for the specific high-performance
/// use case where batching is needed. The normal way to /// use case where batching is needed. The normal way to
/// go through the `DocId`s is to call `.advance()`. /// go through the `DocId`s is to call `.advance()`.
fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize { fn fill_buffer(&mut self, buffer: &mut [DocId; BUFFER_LEN]) -> usize {
if self.doc() == TERMINATED { if self.doc() == TERMINATED {
return 0; return 0;
} }
@@ -154,7 +151,7 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
unboxed.seek(target) unboxed.seek(target)
} }
fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize { fn fill_buffer(&mut self, buffer: &mut [DocId; BUFFER_LEN]) -> usize {
let unboxed: &mut TDocSet = self.borrow_mut(); let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.fill_buffer(buffer) unboxed.fill_buffer(buffer)
} }
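
A usage sketch of the batching path above (hypothetical caller; the buffer-size constant is named `COLLECT_BLOCK_BUFFER_LEN` on one side of this diff and `BUFFER_LEN` on the other): a partially filled buffer signals that the `DocSet` is exhausted.

fn drain_in_blocks<D: DocSet>(docset: &mut D, mut on_doc: impl FnMut(DocId)) {
    let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];
    loop {
        let filled = docset.fill_buffer(&mut buffer);
        for &doc in &buffer[..filled] {
            on_doc(doc);
        }
        if filled < COLLECT_BLOCK_BUFFER_LEN {
            break;
        }
    }
}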

View File

@@ -79,7 +79,7 @@ mod tests {
use std::ops::{Range, RangeInclusive}; use std::ops::{Range, RangeInclusive};
use std::path::Path; use std::path::Path;
use columnar::StrColumn; use columnar::{Column, MonotonicallyMappableToU64, StrColumn};
use common::{ByteCount, HasLen, TerminatingWrite}; use common::{ByteCount, HasLen, TerminatingWrite};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use rand::prelude::SliceRandom; use rand::prelude::SliceRandom;

View File

@@ -1,4 +1,4 @@
use std::cmp::Ordering; use std::cmp::{Ord, Ordering};
use std::error::Error; use std::error::Error;
use std::fmt; use std::fmt;
use std::str::FromStr; use std::str::FromStr;

View File

@@ -516,8 +516,8 @@ impl fmt::Debug for SegmentReader {
mod test { mod test {
use super::*; use super::*;
use crate::index::Index; use crate::index::Index;
use crate::schema::{SchemaBuilder, Term, STORED, TEXT}; use crate::schema::{Schema, SchemaBuilder, Term, STORED, TEXT};
use crate::IndexWriter; use crate::{DocId, IndexWriter};
#[test] #[test]
fn test_merge_field_meta_data_same() { fn test_merge_field_meta_data_same() {

View File

@@ -158,7 +158,8 @@ mod tests_indexsorting {
use crate::indexer::doc_id_mapping::DocIdMapping; use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::indexer::NoMergePolicy; use crate::indexer::NoMergePolicy;
use crate::query::QueryParser; use crate::query::QueryParser;
use crate::schema::*; use crate::schema::document::Value;
use crate::schema::{Schema, *};
use crate::{DocAddress, Index, IndexSettings, IndexSortByField, Order}; use crate::{DocAddress, Index, IndexSettings, IndexSortByField, Order};
fn create_test_index( fn create_test_index(

View File

@@ -22,7 +22,6 @@ where
} }
} }
#[allow(dead_code)]
pub trait FlatMapWithBufferIter: Iterator { pub trait FlatMapWithBufferIter: Iterator {
/// Function similar to `flat_map`, but allows reusing a shared `Vec`. /// Function similar to `flat_map`, but allows reusing a shared `Vec`.
fn flat_map_with_buffer<F, T>(self, fill_buffer: F) -> FlatMapWithBuffer<T, F, Self> fn flat_map_with_buffer<F, T>(self, fill_buffer: F) -> FlatMapWithBuffer<T, F, Self>

View File

@@ -806,6 +806,7 @@ mod tests {
use columnar::{Cardinality, Column, MonotonicallyMappableToU128}; use columnar::{Cardinality, Column, MonotonicallyMappableToU128};
use itertools::Itertools; use itertools::Itertools;
use proptest::prop_oneof; use proptest::prop_oneof;
use proptest::strategy::Strategy;
use super::super::operation::UserOperation; use super::super::operation::UserOperation;
use crate::collector::TopDocs; use crate::collector::TopDocs;

View File

@@ -144,9 +144,10 @@ mod tests {
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use super::*; use super::*;
use crate::index::SegmentMetaInventory; use crate::index::{SegmentId, SegmentMeta, SegmentMetaInventory};
use crate::indexer::merge_policy::MergePolicy;
use crate::schema;
use crate::schema::INDEXED; use crate::schema::INDEXED;
use crate::{schema, SegmentId};
static INVENTORY: Lazy<SegmentMetaInventory> = Lazy::new(SegmentMetaInventory::default); static INVENTORY: Lazy<SegmentMetaInventory> = Lazy::new(SegmentMetaInventory::default);

View File

@@ -39,6 +39,7 @@ impl MergePolicy for NoMergePolicy {
pub mod tests { pub mod tests {
use super::*; use super::*;
use crate::index::{SegmentId, SegmentMeta};
/// `MergePolicy` useful for test purposes. /// `MergePolicy` useful for test purposes.
/// ///

View File

@@ -576,7 +576,7 @@ impl IndexMerger {
// //
// Overall the reliable way to know if we have actual frequencies loaded or not // Overall the reliable way to know if we have actual frequencies loaded or not
// is to check whether the actual decoded array is empty or not. // is to check whether the actual decoded array is empty or not.
if has_term_freq == postings.block_cursor.freqs().is_empty() { if has_term_freq != !postings.block_cursor.freqs().is_empty() {
return Err(DataCorruption::comment_only( return Err(DataCorruption::comment_only(
"Term freqs are inconsistent across segments", "Term freqs are inconsistent across segments",
) )
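
Both sides of this hunk express the same predicate: for booleans, `a != !b` is equivalent to `a == b`, so the corruption error fires exactly when the has-term-freq flag disagrees with whether any frequencies were actually decoded. A tiny standalone check (not tantivy code):

fn main() {
    for has_term_freq in [false, true] {
        for freqs_empty in [false, true] {
            // inconsistent when the flag promises freqs but none were decoded, or vice versa
            assert_eq!(has_term_freq == freqs_empty, has_term_freq != !freqs_empty);
        }
    }
}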

View File

@@ -144,115 +144,6 @@ mod tests_mmap {
assert_eq!(num_docs, 256); assert_eq!(num_docs, 256);
} }
} }
#[test]
fn test_json_field_null_byte() {
// Test when field name contains a zero byte, which has special meaning in tantivy.
// As a workaround, we convert the zero byte to the ASCII character '0'.
// https://github.com/quickwit-oss/tantivy/issues/2340
// https://github.com/quickwit-oss/tantivy/issues/2193
let field_name_in = "\u{0000}";
let field_name_out = "0";
test_json_field_name(field_name_in, field_name_out);
}
#[test]
fn test_json_field_1byte() {
// Test when field name contains a 0x01 byte, which has special meaning in tantivy.
let field_name_in = "\u{0001}";
let field_name_out = "\u{0001}";
test_json_field_name(field_name_in, field_name_out);
// Test when field name contains a 0x01 byte, which has special meaning in tantivy.
let field_name_in = "\u{0001}";
let field_name_out = ".";
test_json_field_name(field_name_in, field_name_out);
}
fn test_json_field_name(field_name_in: &str, field_name_out: &str) {
let mut schema_builder = Schema::builder();
let options = JsonObjectOptions::from(TEXT | FAST).set_expand_dots_enabled();
let field = schema_builder.add_json_field("json", options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(field=>json!({format!("{field_name_in}"): "test1"})))
.unwrap();
index_writer
.add_document(doc!(field=>json!({format!("a{field_name_in}"): "test2"})))
.unwrap();
index_writer
.add_document(doc!(field=>json!({format!("a{field_name_in}a"): "test3"})))
.unwrap();
index_writer
.add_document(
doc!(field=>json!({format!("a{field_name_in}a{field_name_in}"): "test4"})),
)
.unwrap();
index_writer
.add_document(
doc!(field=>json!({format!("a{field_name_in}.ab{field_name_in}"): "test5"})),
)
.unwrap();
index_writer
.add_document(
doc!(field=>json!({format!("a{field_name_in}"): json!({format!("a{field_name_in}"): "test6"}) })),
)
.unwrap();
index_writer
.add_document(doc!(field=>json!({format!("{field_name_in}a" ): "test7"})))
.unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let parse_query = QueryParser::for_index(&index, Vec::new());
let test_query = |field_name: &str| {
let query = parse_query.parse_query(field_name).unwrap();
let num_docs = searcher.search(&query, &Count).unwrap();
assert_eq!(num_docs, 1);
};
test_query(format!("json.{field_name_out}:test1").as_str());
test_query(format!("json.a{field_name_out}:test2").as_str());
test_query(format!("json.a{field_name_out}a:test3").as_str());
test_query(format!("json.a{field_name_out}a{field_name_out}:test4").as_str());
test_query(format!("json.a{field_name_out}.ab{field_name_out}:test5").as_str());
test_query(format!("json.a{field_name_out}.a{field_name_out}:test6").as_str());
test_query(format!("json.{field_name_out}a:test7").as_str());
let test_agg = |field_name: &str, expected: &str| {
let agg_req_str = json!(
{
"termagg": {
"terms": {
"field": field_name,
}
}
});
let agg_req: Aggregations = serde_json::from_value(agg_req_str).unwrap();
let collector = AggregationCollector::from_aggs(agg_req, Default::default());
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
let res = serde_json::to_value(agg_res).unwrap();
assert_eq!(res["termagg"]["buckets"][0]["doc_count"], 1);
assert_eq!(res["termagg"]["buckets"][0]["key"], expected);
};
test_agg(format!("json.{field_name_out}").as_str(), "test1");
test_agg(format!("json.a{field_name_out}").as_str(), "test2");
test_agg(format!("json.a{field_name_out}a").as_str(), "test3");
test_agg(
format!("json.a{field_name_out}a{field_name_out}").as_str(),
"test4",
);
test_agg(
format!("json.a{field_name_out}.ab{field_name_out}").as_str(),
"test5",
);
test_agg(
format!("json.a{field_name_out}.a{field_name_out}").as_str(),
"test6",
);
test_agg(format!("json.{field_name_out}a").as_str(), "test7");
}
#[test] #[test]
fn test_json_field_expand_dots_enabled_dot_escape_not_required() { fn test_json_field_expand_dots_enabled_dot_escape_not_required() {

View File

@@ -103,7 +103,7 @@ impl SegmentRegister {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::index::SegmentMetaInventory; use crate::index::{SegmentId, SegmentMetaInventory};
use crate::indexer::delete_queue::*; use crate::indexer::delete_queue::*;
fn segment_ids(segment_register: &SegmentRegister) -> Vec<SegmentId> { fn segment_ids(segment_register: &SegmentRegister) -> Vec<SegmentId> {

View File

@@ -213,7 +213,7 @@ pub use common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64, HasLen};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
pub use self::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED}; pub use self::docset::{DocSet, TERMINATED};
#[deprecated( #[deprecated(
since = "0.22.0", since = "0.22.0",
note = "Will be removed in tantivy 0.23. Use export from snippet module instead" note = "Will be removed in tantivy 0.23. Use export from snippet module instead"
@@ -391,6 +391,7 @@ pub mod tests {
use crate::index::SegmentReader; use crate::index::SegmentReader;
use crate::merge_policy::NoMergePolicy; use crate::merge_policy::NoMergePolicy;
use crate::query::BooleanQuery; use crate::query::BooleanQuery;
use crate::schema::document::Value;
use crate::schema::*; use crate::schema::*;
use crate::{DateTime, DocAddress, Index, IndexWriter, Postings, ReloadPolicy}; use crate::{DateTime, DocAddress, Index, IndexWriter, Postings, ReloadPolicy};

View File

@@ -14,6 +14,7 @@ pub fn compressed_block_size(num_bits: u8) -> usize {
pub struct BlockEncoder { pub struct BlockEncoder {
bitpacker: BitPacker4x, bitpacker: BitPacker4x,
pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE], pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE],
pub output_len: usize,
} }
impl Default for BlockEncoder { impl Default for BlockEncoder {
@@ -27,6 +28,7 @@ impl BlockEncoder {
BlockEncoder { BlockEncoder {
bitpacker: BitPacker4x::new(), bitpacker: BitPacker4x::new(),
output: [0u8; COMPRESSED_BLOCK_MAX_SIZE], output: [0u8; COMPRESSED_BLOCK_MAX_SIZE],
output_len: 0,
} }
} }

View File

@@ -67,18 +67,10 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
    ) -> io::Result<()> {
        let mut term_buffer = Term::with_capacity(48);
        let mut buffer_lender = BufferLender::default();
-        term_buffer.clear_with_field_and_type(Type::Json, Field::from_field_id(0));
-        let mut prev_term_id = u32::MAX;
-        let mut term_path_len = 0; // this will be set in the first iteration
        for (_field, path_id, term, addr) in term_addrs {
-            if prev_term_id != path_id.path_id() {
-                term_buffer.truncate_value_bytes(0);
-                term_buffer.append_path(ordered_id_to_path[path_id.path_id() as usize].as_bytes());
-                term_buffer.append_bytes(&[JSON_END_OF_PATH]);
-                term_path_len = term_buffer.len_bytes();
-                prev_term_id = path_id.path_id();
-            }
-            term_buffer.truncate_value_bytes(term_path_len);
+            term_buffer.clear_with_field_and_type(Type::Json, Field::from_field_id(0));
+            term_buffer.append_bytes(ordered_id_to_path[path_id.path_id() as usize].as_bytes());
+            term_buffer.append_bytes(&[JSON_END_OF_PATH]);
            term_buffer.append_bytes(term);
            if let Some(json_value) = term_buffer.value().as_json_value_bytes() {
                let typ = json_value.typ();

View File

@@ -1,3 +1,5 @@
+use std::convert::TryInto;
use crate::directory::OwnedBytes;
use crate::postings::compression::{compressed_block_size, COMPRESSION_BLOCK_SIZE};
use crate::query::Bm25Weight;

View File

@@ -1,4 +1,5 @@
use std::io;
+use std::iter::ExactSizeIterator;
use std::ops::Range;
use common::{BinarySerializable, FixedSize};

View File

@@ -1,4 +1,4 @@
-use crate::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED};
+use crate::docset::{DocSet, BUFFER_LEN, TERMINATED};
use crate::index::SegmentReader;
use crate::query::boost_query::BoostScorer;
use crate::query::explanation::does_not_match;
@@ -54,7 +54,7 @@ impl DocSet for AllScorer {
        self.doc
    }
-    fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
+    fn fill_buffer(&mut self, buffer: &mut [DocId; BUFFER_LEN]) -> usize {
        if self.doc() == TERMINATED {
            return 0;
        }
@@ -96,7 +96,7 @@ impl Scorer for AllScorer {
#[cfg(test)]
mod tests {
    use super::AllQuery;
-    use crate::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED};
+    use crate::docset::{DocSet, BUFFER_LEN, TERMINATED};
    use crate::query::{AllScorer, EnableScoring, Query};
    use crate::schema::{Schema, TEXT};
    use crate::{Index, IndexWriter};
@@ -162,16 +162,16 @@ mod tests {
    pub fn test_fill_buffer() {
        let mut postings = AllScorer {
            doc: 0u32,
-            max_doc: COLLECT_BLOCK_BUFFER_LEN as u32 * 2 + 9,
+            max_doc: BUFFER_LEN as u32 * 2 + 9,
        };
-        let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];
-        assert_eq!(postings.fill_buffer(&mut buffer), COLLECT_BLOCK_BUFFER_LEN);
-        for i in 0u32..COLLECT_BLOCK_BUFFER_LEN as u32 {
+        let mut buffer = [0u32; BUFFER_LEN];
+        assert_eq!(postings.fill_buffer(&mut buffer), BUFFER_LEN);
+        for i in 0u32..BUFFER_LEN as u32 {
            assert_eq!(buffer[i as usize], i);
        }
-        assert_eq!(postings.fill_buffer(&mut buffer), COLLECT_BLOCK_BUFFER_LEN);
-        for i in 0u32..COLLECT_BLOCK_BUFFER_LEN as u32 {
-            assert_eq!(buffer[i as usize], i + COLLECT_BLOCK_BUFFER_LEN as u32);
+        assert_eq!(postings.fill_buffer(&mut buffer), BUFFER_LEN);
+        for i in 0u32..BUFFER_LEN as u32 {
+            assert_eq!(buffer[i as usize], i + BUFFER_LEN as u32);
        }
        assert_eq!(postings.fill_buffer(&mut buffer), 9);
    }

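The renames in this file and in the several query files below are mechanical: the block-size constant is spelled `COLLECT_BLOCK_BUFFER_LEN` on one side of the comparison and `BUFFER_LEN` on the other, while the `fill_buffer` contract is unchanged — it writes doc ids into a fixed-size array and returns how many slots it filled, with a short count only once the docset is exhausted. A minimal sketch of draining any `DocSet` block by block under that contract, assuming the crate-root re-exports shown in the lib.rs hunk earlier (substitute `BUFFER_LEN` for the other side); `drain_blocks` is a hypothetical helper, not part of tantivy:

```rust
use tantivy::{DocId, DocSet, COLLECT_BLOCK_BUFFER_LEN};

// Sketch only: hands every filled block of doc ids to `callback` until the docset runs dry.
// It mirrors the `for_each_docset_buffered` helper that appears in the weight.rs hunk below.
fn drain_blocks<D: DocSet>(docset: &mut D, mut callback: impl FnMut(&[DocId])) {
    let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];
    loop {
        let filled = docset.fill_buffer(&mut buffer);
        if filled == 0 {
            return;
        }
        callback(&buffer[..filled]);
        // A partially filled buffer means there is nothing left to read.
        if filled < COLLECT_BLOCK_BUFFER_LEN {
            return;
        }
    }
}
```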
View File

@@ -1,6 +1,6 @@
use std::collections::HashMap;
-use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
+use crate::docset::BUFFER_LEN;
use crate::index::SegmentReader;
use crate::postings::FreqReadingOption;
use crate::query::explanation::does_not_match;
@@ -228,7 +228,7 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
        callback: &mut dyn FnMut(&[DocId]),
    ) -> crate::Result<()> {
        let scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?;
-        let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];
+        let mut buffer = [0u32; BUFFER_LEN];
        match scorer {
            SpecializedScorer::TermUnion(term_scorers) => {

View File

@@ -1,6 +1,6 @@
use std::fmt;
-use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
+use crate::docset::BUFFER_LEN;
use crate::fastfield::AliveBitSet;
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Score, SegmentReader, Term};
@@ -105,7 +105,7 @@ impl<S: Scorer> DocSet for BoostScorer<S> {
        self.underlying.seek(target)
    }
-    fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
+    fn fill_buffer(&mut self, buffer: &mut [DocId; BUFFER_LEN]) -> usize {
        self.underlying.fill_buffer(buffer)
    }

View File

@@ -1,6 +1,6 @@
use std::fmt;
-use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
+use crate::docset::BUFFER_LEN;
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, Term};
@@ -119,7 +119,7 @@ impl<TDocSet: DocSet> DocSet for ConstScorer<TDocSet> {
        self.docset.seek(target)
    }
-    fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
+    fn fill_buffer(&mut self, buffer: &mut [DocId; BUFFER_LEN]) -> usize {
        self.docset.fill_buffer(buffer)
    }

View File

@@ -149,7 +149,7 @@ mod tests {
    use crate::query::exist_query::ExistsQuery;
    use crate::query::{BooleanQuery, RangeQuery};
    use crate::schema::{Facet, FacetOptions, Schema, FAST, INDEXED, STRING, TEXT};
-    use crate::{Index, Searcher};
+    use crate::{doc, Index, Searcher};
    #[test]
    fn test_exists_query_simple() -> crate::Result<()> {

View File

@@ -84,7 +84,7 @@ pub struct FuzzyTermQuery {
    distance: u8,
    /// Should a transposition cost 1 or 2?
    transposition_cost_one: bool,
-    /// is a starts with query
+    ///
    prefix: bool,
}

View File

@@ -477,7 +477,7 @@ mod tests {
    use crate::schema::{
        Field, IntoIpv6Addr, Schema, TantivyDocument, FAST, INDEXED, STORED, TEXT,
    };
-    use crate::{Index, IndexWriter};
+    use crate::{doc, Index, IndexWriter};
    #[test]
    fn test_range_query_simple() -> crate::Result<()> {

View File

@@ -139,7 +139,7 @@ mod tests {
    use crate::collector::{Count, TopDocs};
    use crate::query::{Query, QueryParser, TermQuery};
    use crate::schema::{IndexRecordOption, IntoIpv6Addr, Schema, INDEXED, STORED};
-    use crate::{Index, IndexWriter, Term};
+    use crate::{doc, Index, IndexWriter, Term};
    #[test]
    fn search_ip_test() {

View File

@@ -1,5 +1,5 @@
use super::term_scorer::TermScorer;
-use crate::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN};
+use crate::docset::{DocSet, BUFFER_LEN};
use crate::fieldnorm::FieldNormReader;
use crate::index::SegmentReader;
use crate::postings::SegmentPostings;
@@ -64,7 +64,7 @@ impl Weight for TermWeight {
        callback: &mut dyn FnMut(&[DocId]),
    ) -> crate::Result<()> {
        let mut scorer = self.specialized_scorer(reader, 1.0)?;
-        let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];
+        let mut buffer = [0u32; BUFFER_LEN];
        for_each_docset_buffered(&mut scorer, &mut buffer, callback);
        Ok(())
    }

View File

@@ -53,7 +53,8 @@ impl HasLen for VecDocSet {
pub mod tests {
    use super::*;
-    use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
+    use crate::docset::{DocSet, BUFFER_LEN};
+    use crate::DocId;
    #[test]
    pub fn test_vec_postings() {
@@ -71,16 +72,16 @@ pub mod tests {
    #[test]
    pub fn test_fill_buffer() {
-        let doc_ids: Vec<DocId> = (1u32..=(COLLECT_BLOCK_BUFFER_LEN as u32 * 2 + 9)).collect();
+        let doc_ids: Vec<DocId> = (1u32..=(BUFFER_LEN as u32 * 2 + 9)).collect();
        let mut postings = VecDocSet::from(doc_ids);
-        let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];
-        assert_eq!(postings.fill_buffer(&mut buffer), COLLECT_BLOCK_BUFFER_LEN);
-        for i in 0u32..COLLECT_BLOCK_BUFFER_LEN as u32 {
+        let mut buffer = [0u32; BUFFER_LEN];
+        assert_eq!(postings.fill_buffer(&mut buffer), BUFFER_LEN);
+        for i in 0u32..BUFFER_LEN as u32 {
            assert_eq!(buffer[i as usize], i + 1);
        }
-        assert_eq!(postings.fill_buffer(&mut buffer), COLLECT_BLOCK_BUFFER_LEN);
-        for i in 0u32..COLLECT_BLOCK_BUFFER_LEN as u32 {
-            assert_eq!(buffer[i as usize], i + 1 + COLLECT_BLOCK_BUFFER_LEN as u32);
+        assert_eq!(postings.fill_buffer(&mut buffer), BUFFER_LEN);
+        for i in 0u32..BUFFER_LEN as u32 {
+            assert_eq!(buffer[i as usize], i + 1 + BUFFER_LEN as u32);
        }
        assert_eq!(postings.fill_buffer(&mut buffer), 9);
    }

View File

@@ -1,5 +1,5 @@
use super::Scorer;
-use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
+use crate::docset::BUFFER_LEN;
use crate::index::SegmentReader;
use crate::query::Explanation;
use crate::{DocId, DocSet, Score, TERMINATED};
@@ -22,7 +22,7 @@ pub(crate) fn for_each_scorer<TScorer: Scorer + ?Sized>(
#[inline]
pub(crate) fn for_each_docset_buffered<T: DocSet + ?Sized>(
    docset: &mut T,
-    buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN],
+    buffer: &mut [DocId; BUFFER_LEN],
    mut callback: impl FnMut(&[DocId]),
) {
    loop {
@@ -105,7 +105,7 @@ pub trait Weight: Send + Sync + 'static {
    ) -> crate::Result<()> {
        let mut docset = self.scorer(reader, 1.0)?;
-        let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];
+        let mut buffer = [0u32; BUFFER_LEN];
        for_each_docset_buffered(&mut docset, &mut buffer, callback);
        Ok(())
    }

View File

@@ -1,5 +1,6 @@
mod warming;
+use std::convert::TryInto;
use std::sync::atomic::AtomicU64;
use std::sync::{atomic, Arc, Weak};

View File

@@ -819,6 +819,7 @@ mod tests {
    use crate::schema::document::existing_type_impls::JsonObjectIter;
    use crate::schema::document::se::BinaryValueSerializer;
    use crate::schema::document::{ReferenceValue, ReferenceValueLeaf};
+    use crate::schema::OwnedValue;
    fn serialize_value<'a>(value: ReferenceValue<'a, &'a serde_json::Value>) -> Vec<u8> {
        let mut writer = Vec::new();

View File

@@ -1,16 +1,15 @@
use std::collections::{BTreeMap, HashMap, HashSet};
use std::net::Ipv6Addr;
-use common::{BinarySerializable, DateTime, VInt};
+use common::DateTime;
use serde_json::Map;
-use crate::schema::document::se::BinaryValueSerializer;
use crate::schema::document::{
-    BinaryDocumentDeserializer, BinaryDocumentSerializer, DeserializeError, Document, DocumentDeserialize, DocumentDeserializer, ReferenceValue, ReferenceValueLeaf
+    DeserializeError, Document, DocumentDeserialize, DocumentDeserializer,
};
use crate::schema::field_type::ValueParsingError;
use crate::schema::field_value::FieldValueIter;
-use crate::schema::{Facet, Field, FieldValue, NamedFieldDocument, OwnedValue, Schema, Value};
+use crate::schema::{Facet, Field, FieldValue, NamedFieldDocument, OwnedValue, Schema};
use crate::tokenizer::PreTokenizedString;
/// TantivyDocument provides a default implementation of the `Document` trait.
@@ -86,36 +85,6 @@ impl IntoIterator for TantivyDocument {
}
impl TantivyDocument {
-    pub fn to_bytes(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> {
-        buffer.clear();
-        let num_field_values = self.field_values.len();
-        VInt(num_field_values as u64).serialize(buffer)?;
-        for (field, value_access) in self.iter_fields_and_values() {
-            field.serialize(buffer)?;
-            let mut serializer = BinaryValueSerializer::new(buffer);
-            match value_access.as_value() {
-                ReferenceValue::Leaf(ReferenceValueLeaf::PreTokStr(pre_tokenized_text)) => {
-                    serializer.serialize_value(ReferenceValue::Leaf::<&'_ OwnedValue>(
-                        ReferenceValueLeaf::Str(&pre_tokenized_text.text),
-                    ))?;
-                }
-                _ => {
-                    serializer.serialize_value(value_access.as_value())?;
-                }
-            }
-        }
-        Ok(())
-    }
-    pub fn from_bytes(mut payload: &[u8]) -> Self {
-        let deserializer = BinaryDocumentDeserializer::from_reader(&mut payload).unwrap();
-        Self::deserialize(deserializer).unwrap()
-    }
    /// Creates a new, empty document object
    pub fn new() -> TantivyDocument {
        TantivyDocument::default()
@@ -287,6 +256,7 @@ impl DocParsingError {
#[cfg(test)]
mod tests {
+    use crate::schema::document::default_document::TantivyDocument;
    use crate::schema::*;
    #[test]

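The block removed in the hunk above is the `to_bytes`/`from_bytes` convenience pair on `TantivyDocument`. On the side of the comparison that still carries these helpers, a round trip would look roughly like the sketch below; the schema, field name, and error handling are illustrative assumptions rather than anything taken from the diff, and the two helpers only exist on that side:

```rust
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, TantivyDocument};

fn main() -> std::io::Result<()> {
    // Hypothetical one-field schema, just to have something to serialize.
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let _schema = schema_builder.build();

    let original: TantivyDocument = doc!(title => "hello round trip");

    // `to_bytes` clears the buffer before writing, so the buffer can be reused across documents.
    let mut buffer = Vec::new();
    original.to_bytes(&mut buffer)?;

    // `from_bytes` decodes the same binary layout back into a document.
    let _decoded = TantivyDocument::from_bytes(&buffer);
    Ok(())
}
```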
View File

@@ -1,4 +1,4 @@
-use std::collections::BTreeMap;
+use std::collections::{btree_map, BTreeMap};
use std::fmt;
use std::net::Ipv6Addr;
@@ -45,7 +45,7 @@ pub enum OwnedValue {
    /// A set of values.
    Array(Vec<Self>),
    /// Dynamic object value.
-    Object(Vec<(String, Self)>),
+    Object(BTreeMap<String, Self>),
    /// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`.
    IpAddr(Ipv6Addr),
}
@@ -148,10 +148,10 @@ impl ValueDeserialize for OwnedValue {
    fn visit_object<'de, A>(&self, mut access: A) -> Result<Self::Value, DeserializeError>
    where A: ObjectAccess<'de> {
-        let mut elements = Vec::new();
+        let mut elements = BTreeMap::new();
        while let Some((key, value)) = access.next_entry()? {
-            elements.push((key, value));
+            elements.insert(key, value);
        }
        Ok(OwnedValue::Object(elements))
@@ -248,13 +248,12 @@ impl<'de> serde::Deserialize<'de> for OwnedValue {
    fn visit_map<A>(self, mut map: A) -> Result<Self::Value, A::Error>
    where A: MapAccess<'de> {
-        let mut object =
-            map.size_hint()
-                .map(Vec::with_capacity)
-                .unwrap_or_default();
+        let mut object = BTreeMap::new();
        while let Some((key, value)) = map.next_entry()? {
-            object.push((key, value));
+            object.insert(key, value);
        }
        Ok(OwnedValue::Object(object))
    }
}
@@ -364,8 +363,7 @@ impl From<PreTokenizedString> for OwnedValue {
impl From<BTreeMap<String, OwnedValue>> for OwnedValue {
    fn from(object: BTreeMap<String, OwnedValue>) -> OwnedValue {
-        let key_values = object.into_iter().collect();
-        OwnedValue::Object(key_values)
+        OwnedValue::Object(object)
    }
}
@@ -419,15 +417,18 @@ impl From<serde_json::Value> for OwnedValue {
impl From<serde_json::Map<String, serde_json::Value>> for OwnedValue {
    fn from(map: serde_json::Map<String, serde_json::Value>) -> Self {
-        let object: Vec<(String, OwnedValue)> = map.into_iter()
-            .map(|(key, value)| (key, OwnedValue::from(value)))
-            .collect();
+        let mut object = BTreeMap::new();
+        for (key, value) in map {
+            object.insert(key, OwnedValue::from(value));
+        }
        OwnedValue::Object(object)
    }
}
/// A wrapper type for iterating over a serde_json object producing reference values.
-pub struct ObjectMapIter<'a>(std::slice::Iter<'a, (String, OwnedValue)>);
+pub struct ObjectMapIter<'a>(btree_map::Iter<'a, String, OwnedValue>);
impl<'a> Iterator for ObjectMapIter<'a> {
    type Item = (&'a str, &'a OwnedValue);
@@ -442,7 +443,9 @@ impl<'a> Iterator for ObjectMapIter<'a> {
mod tests {
    use super::*;
    use crate::schema::{BytesOptions, Schema};
-    use crate::{Document, TantivyDocument};
+    use crate::time::format_description::well_known::Rfc3339;
+    use crate::time::OffsetDateTime;
+    use crate::{DateTime, Document, TantivyDocument};
    #[test]
    fn test_parse_bytes_doc() {

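The `OwnedValue` hunks above all follow from one representational difference: one side of this comparison stores `Object` as `Vec<(String, OwnedValue)>`, which preserves insertion order and allows repeated keys, while the other uses `BTreeMap<String, OwnedValue>`, which sorts keys and keeps only the last value per key. A self-contained sketch of that behavioural difference (plain `i32` values stand in for `OwnedValue`; nothing below comes from tantivy itself):

```rust
use std::collections::BTreeMap;

fn main() {
    let entries = vec![("b", 1), ("a", 2), ("b", 3)];

    // Vec-of-pairs representation: keeps insertion order and both "b" entries.
    let as_vec: Vec<(String, i32)> = entries
        .iter()
        .map(|&(k, v)| (k.to_string(), v))
        .collect();
    assert_eq!(as_vec.len(), 3);
    assert_eq!(as_vec[0].0, "b");

    // BTreeMap representation: keys come back sorted and the later "b" value wins.
    let as_map: BTreeMap<String, i32> = entries
        .iter()
        .map(|&(k, v)| (k.to_string(), v))
        .collect();
    assert_eq!(as_map.len(), 2);
    assert_eq!(as_map["b"], 3);
    assert_eq!(as_map.keys().next().map(String::as_str), Some("a"));
}
```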
View File

@@ -136,6 +136,7 @@ impl FieldEntry {
#[cfg(test)]
mod tests {
+    use serde_json;
    use super::*;
    use crate::schema::{Schema, TextFieldIndexing, TEXT};

View File

@@ -6,8 +6,10 @@ use serde::de::{SeqAccess, Visitor};
use serde::ser::SerializeSeq;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
+use super::ip_options::IpAddrOptions;
use super::*;
use crate::json_utils::split_json_path;
+use crate::schema::bytes_options::BytesOptions;
use crate::TantivyError;
/// Tantivy has a very strict schema.
@@ -419,7 +421,9 @@ mod tests {
    use matches::{assert_matches, matches};
    use pretty_assertions::assert_eq;
+    use serde_json;
+    use crate::schema::document::Value;
    use crate::schema::field_type::ValueParsingError;
    use crate::schema::schema::DocParsingError::InvalidJson;
    use crate::schema::*;

View File

@@ -1,3 +1,4 @@
+use std::convert::TryInto;
use std::hash::{Hash, Hasher};
use std::net::Ipv6Addr;
use std::{fmt, str};
@@ -217,23 +218,6 @@ impl Term {
        &mut self.0[len_before..]
    }
-    /// Appends json path bytes to the Term.
-    /// If the path contains 0 bytes, they are replaced by a "0" string.
-    /// The 0 byte is used to mark the end of the path.
-    ///
-    /// This function returns the segment that has just been added.
-    #[inline]
-    pub fn append_path(&mut self, bytes: &[u8]) -> &mut [u8] {
-        let len_before = self.0.len();
-        if bytes.contains(&0u8) {
-            self.0
-                .extend(bytes.iter().map(|&b| if b == 0 { b'0' } else { b }));
-        } else {
-            self.0.extend_from_slice(bytes);
-        }
-        &mut self.0[len_before..]
-    }
    /// Appends a JSON_PATH_SEGMENT_SEP to the term.
    /// Only used for JSON type.
    #[inline]

View File

@@ -1,3 +1,4 @@
+use core::convert::TryInto;
use std::io::{self};
use std::mem;

View File

@@ -2,6 +2,12 @@ use std::io;
use serde::{Deserialize, Deserializer, Serialize};
+pub trait StoreCompressor {
+    fn compress(&self, uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()>;
+    fn decompress(&self, compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()>;
+    fn get_compressor_id() -> u8;
+}
/// Compressor can be used on `IndexSettings` to choose
/// the compressor used to compress the doc store.
///

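One side of this comparison introduces the `StoreCompressor` trait shown above (and again in the decompressor file below). As an illustration of the intended shape of an implementation, here is a hypothetical identity compressor; the trait is re-declared so the sketch stands alone, and `NoopCompressor` is not a type that tantivy ships:

```rust
use std::io;

// Trait as it appears in the hunk above (re-declared here so the sketch compiles on its own).
pub trait StoreCompressor {
    fn compress(&self, uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()>;
    fn decompress(&self, compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()>;
    fn get_compressor_id() -> u8;
}

/// Hypothetical identity "compressor": copies bytes through unchanged.
pub struct NoopCompressor;

impl StoreCompressor for NoopCompressor {
    fn compress(&self, uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
        compressed.clear();
        compressed.extend_from_slice(uncompressed);
        Ok(())
    }

    fn decompress(&self, compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
        decompressed.clear();
        decompressed.extend_from_slice(compressed);
        Ok(())
    }

    fn get_compressor_id() -> u8 {
        // Arbitrary placeholder id; a real implementation must pick a unique, stable value.
        0
    }
}
```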
View File

@@ -4,6 +4,12 @@ use serde::{Deserialize, Serialize};
use super::Compressor;
+pub trait StoreCompressor {
+    fn compress(&self, uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()>;
+    fn decompress(&self, compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()>;
+    fn get_compressor_id() -> u8;
+}
/// Decompressor is deserialized from the doc store footer, when opening an index.
#[derive(Clone, Debug, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Decompressor {
@@ -80,6 +86,7 @@ impl Decompressor {
#[cfg(test)]
mod tests {
    use super::*;
+    use crate::store::Compressor;
    #[test]
    fn compressor_decompressor_id_test() {

View File

@@ -41,7 +41,7 @@ mod tests {
    use std::io;
-    use proptest::prelude::*;
+    use proptest::strategy::{BoxedStrategy, Strategy};
    use super::{SkipIndex, SkipIndexBuilder};
    use crate::directory::OwnedBytes;
@@ -227,6 +227,8 @@ mod tests {
        }
    }
+    use proptest::prelude::*;
    proptest! {
        #![proptest_config(ProptestConfig::with_cases(20))]
        #[test]

View File

@@ -288,6 +288,7 @@ impl TermInfoStoreWriter {
#[cfg(test)]
mod tests {
+    use common;
    use common::BinarySerializable;
    use tantivy_bitpacker::{compute_num_bits, BitPacker};

View File

@@ -1,7 +1,7 @@
use std::borrow::Cow;
use std::mem;
-use rust_stemmers::Algorithm;
+use rust_stemmers::{self, Algorithm};
use serde::{Deserialize, Serialize};
use super::{Token, TokenFilter, TokenStream, Tokenizer};

View File

@@ -95,6 +95,7 @@ impl TokenStream for PreTokenizedStream {
mod tests {
    use super::*;
+    use crate::tokenizer::Token;
    #[test]
    fn test_tokenized_stream() {

View File

@@ -3,7 +3,7 @@ use std::sync::Arc;
use common::file_slice::FileSlice;
use common::OwnedBytes;
use criterion::{criterion_group, criterion_main, Criterion};
-use tantivy_sstable::{Dictionary, MonotonicU64SSTable};
+use tantivy_sstable::{self, Dictionary, MonotonicU64SSTable};
fn make_test_sstable(suffix: &str) -> FileSlice {
    let mut builder = Dictionary::<MonotonicU64SSTable>::builder(Vec::new()).unwrap();

View File

@@ -5,7 +5,7 @@ use common::file_slice::FileSlice;
use criterion::{criterion_group, criterion_main, Criterion};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
-use tantivy_sstable::{Dictionary, MonotonicU64SSTable};
+use tantivy_sstable::{self, Dictionary, MonotonicU64SSTable};
const CHARSET: &'static [u8] = b"abcdefghij";

View File

@@ -10,7 +10,7 @@ description = "term hashmap used for indexing"
[dependencies]
murmurhash32 = "0.3"
common = { version = "0.6", path = "../common/", package = "tantivy-common" }
-ahash = { version = "0.8.11", default-features = false, optional = true }
+ahash = { version = "0.8.3", default-features = false, optional = true }
rand_distr = "0.4.3"
[[bench]]

Some files were not shown because too many files have changed in this diff.