mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-18 17:20:41 +00:00
* use optional index in multivalued index
For mostly empty multivalued indices there was a large overhead during
creation when iterating all docids. This is alleviated by placing an
optional index in the multivalued index to mark documents that have values.
There is some performance overhead when accessing values in a multivalued
index. The access cost is now that of the optional index plus that of the
multivalue index. The sparse codec performs relatively poorly with its
binary_search when accessing data. This is reflected in the benchmarks below.
This changes the format of columnar to v2, but code is added to handle the v1
formats.
```
Running benches/bench_access.rs (/home/pascal/Development/tantivy/optional_multivalues/target/release/deps/bench_access-ea323c028db88db4)
multi sparse 1/13
access_values_for_doc Avg: 42.8946ms (+241.80%) Median: 42.8869ms (+244.10%) [42.7484ms .. 43.1074ms]
access_first_vals Avg: 42.8022ms (+421.93%) Median: 42.7553ms (+439.84%) [42.6794ms .. 43.7404ms]
multi 2x
access_values_for_doc Avg: 31.1244ms (+24.17%) Median: 30.8339ms (+23.46%) [30.7192ms .. 33.6059ms]
access_first_vals Avg: 24.3070ms (+70.92%) Median: 24.0966ms (+70.18%) [23.9328ms .. 26.4851ms]
sparse 1/13
access_values_for_doc Avg: 42.2490ms (+0.61%) Median: 42.2346ms (+2.28%) [41.8988ms .. 43.7821ms]
access_first_vals Avg: 43.6272ms (+0.23%) Median: 43.6197ms (+1.78%) [43.4920ms .. 43.9009ms]
dense 1/12
access_values_for_doc Avg: 8.6184ms (+23.18%) Median: 8.6126ms (+23.78%) [8.5843ms .. 8.7527ms]
access_first_vals Avg: 6.8112ms (+4.47%) Median: 6.8002ms (+4.55%) [6.7887ms .. 6.8991ms]
full
access_values_for_doc Avg: 9.4073ms (-5.09%) Median: 9.4023ms (-2.23%) [9.3694ms .. 9.4568ms]
access_first_vals Avg: 4.9531ms (+6.24%) Median: 4.9502ms (+7.85%) [4.9423ms .. 4.9718ms]
```
```
Running benches/bench_merge.rs (/home/pascal/Development/tantivy/optional_multivalues/target/release/deps/bench_merge-475697dfceb3639f)
merge_multi 2x_and_multi 2x Avg: 20.2280ms (+34.33%) Median: 20.1829ms (+35.33%) [19.9933ms .. 20.8806ms]
merge_multi sparse 1/13_and_multi sparse 1/13 Avg: 0.8961ms (-78.04%) Median: 0.8943ms (-77.61%) [0.8899ms .. 0.9272ms]
merge_dense 1/12_and_dense 1/12 Avg: 0.6619ms (-1.26%) Median: 0.6616ms (+2.20%) [0.6473ms .. 0.6837ms]
merge_sparse 1/13_and_sparse 1/13 Avg: 0.5508ms (-0.85%) Median: 0.5508ms (+2.80%) [0.5420ms .. 0.5634ms]
merge_sparse 1/13_and_dense 1/12 Avg: 0.6046ms (-4.64%) Median: 0.6038ms (+2.80%) [0.5939ms .. 0.6296ms]
merge_multi sparse 1/13_and_dense 1/12 Avg: 0.9111ms (-83.48%) Median: 0.9063ms (-83.50%) [0.9047ms .. 0.9663ms]
merge_multi sparse 1/13_and_sparse 1/13 Avg: 0.8451ms (-89.49%) Median: 0.8428ms (-89.43%) [0.8411ms .. 0.8563ms]
merge_multi 2x_and_dense 1/12 Avg: 10.6624ms (-4.82%) Median: 10.6568ms (-4.49%) [10.5738ms .. 10.8353ms]
merge_multi 2x_and_sparse 1/13 Avg: 10.6336ms (-22.95%) Median: 10.5925ms (-22.33%) [10.5149ms .. 11.5657ms]
```
* Update columnar/src/columnar/format_version.rs
Co-authored-by: Paul Masurel <paul@quickwit.io>
* Update columnar/src/column_index/mod.rs
Co-authored-by: Paul Masurel <paul@quickwit.io>
---------
Co-authored-by: Paul Masurel <paul@quickwit.io>
327 lines
11 KiB
Rust
327 lines
11 KiB
Rust
use std::net::Ipv6Addr;
|
|
use std::sync::Arc;
|
|
use std::{fmt, io};
|
|
|
|
use common::file_slice::FileSlice;
|
|
use common::{ByteCount, DateTime, HasLen, OwnedBytes};
|
|
|
|
use crate::column::{BytesColumn, Column, StrColumn};
|
|
use crate::column_values::{monotonic_map_column, StrictlyMonotonicFn};
|
|
use crate::columnar::ColumnType;
|
|
use crate::{Cardinality, ColumnIndex, ColumnValues, NumericalType, Version};
|
|
|
|
#[derive(Clone)]
|
|
pub enum DynamicColumn {
|
|
Bool(Column<bool>),
|
|
I64(Column<i64>),
|
|
U64(Column<u64>),
|
|
F64(Column<f64>),
|
|
IpAddr(Column<Ipv6Addr>),
|
|
DateTime(Column<DateTime>),
|
|
Bytes(BytesColumn),
|
|
Str(StrColumn),
|
|
}
|
|
|
|
impl fmt::Debug for DynamicColumn {
|
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
write!(f, "[{} {} |", self.get_cardinality(), self.column_type())?;
|
|
match self {
|
|
DynamicColumn::Bool(col) => write!(f, " {col:?}")?,
|
|
DynamicColumn::I64(col) => write!(f, " {col:?}")?,
|
|
DynamicColumn::U64(col) => write!(f, " {col:?}")?,
|
|
DynamicColumn::F64(col) => write!(f, "{col:?}")?,
|
|
DynamicColumn::IpAddr(col) => write!(f, "{col:?}")?,
|
|
DynamicColumn::DateTime(col) => write!(f, "{col:?}")?,
|
|
DynamicColumn::Bytes(col) => write!(f, "{col:?}")?,
|
|
DynamicColumn::Str(col) => write!(f, "{col:?}")?,
|
|
}
|
|
write!(f, "]")
|
|
}
|
|
}
|
|
|
|
impl DynamicColumn {
|
|
pub fn column_index(&self) -> &ColumnIndex {
|
|
match self {
|
|
DynamicColumn::Bool(c) => &c.index,
|
|
DynamicColumn::I64(c) => &c.index,
|
|
DynamicColumn::U64(c) => &c.index,
|
|
DynamicColumn::F64(c) => &c.index,
|
|
DynamicColumn::IpAddr(c) => &c.index,
|
|
DynamicColumn::DateTime(c) => &c.index,
|
|
DynamicColumn::Bytes(c) => &c.ords().index,
|
|
DynamicColumn::Str(c) => &c.ords().index,
|
|
}
|
|
}
|
|
|
|
pub fn get_cardinality(&self) -> Cardinality {
|
|
self.column_index().get_cardinality()
|
|
}
|
|
|
|
pub fn num_values(&self) -> u32 {
|
|
match self {
|
|
DynamicColumn::Bool(c) => c.values.num_vals(),
|
|
DynamicColumn::I64(c) => c.values.num_vals(),
|
|
DynamicColumn::U64(c) => c.values.num_vals(),
|
|
DynamicColumn::F64(c) => c.values.num_vals(),
|
|
DynamicColumn::IpAddr(c) => c.values.num_vals(),
|
|
DynamicColumn::DateTime(c) => c.values.num_vals(),
|
|
DynamicColumn::Bytes(c) => c.ords().values.num_vals(),
|
|
DynamicColumn::Str(c) => c.ords().values.num_vals(),
|
|
}
|
|
}
|
|
|
|
pub fn column_type(&self) -> ColumnType {
|
|
match self {
|
|
DynamicColumn::Bool(_) => ColumnType::Bool,
|
|
DynamicColumn::I64(_) => ColumnType::I64,
|
|
DynamicColumn::U64(_) => ColumnType::U64,
|
|
DynamicColumn::F64(_) => ColumnType::F64,
|
|
DynamicColumn::IpAddr(_) => ColumnType::IpAddr,
|
|
DynamicColumn::DateTime(_) => ColumnType::DateTime,
|
|
DynamicColumn::Bytes(_) => ColumnType::Bytes,
|
|
DynamicColumn::Str(_) => ColumnType::Str,
|
|
}
|
|
}
|
|
|
|
pub fn coerce_numerical(self, target_numerical_type: NumericalType) -> Option<Self> {
|
|
match target_numerical_type {
|
|
NumericalType::I64 => self.coerce_to_i64(),
|
|
NumericalType::U64 => self.coerce_to_u64(),
|
|
NumericalType::F64 => self.coerce_to_f64(),
|
|
}
|
|
}
|
|
|
|
pub fn is_numerical(&self) -> bool {
|
|
self.column_type().numerical_type().is_some()
|
|
}
|
|
|
|
pub fn is_f64(&self) -> bool {
|
|
self.column_type().numerical_type() == Some(NumericalType::F64)
|
|
}
|
|
pub fn is_i64(&self) -> bool {
|
|
self.column_type().numerical_type() == Some(NumericalType::I64)
|
|
}
|
|
pub fn is_u64(&self) -> bool {
|
|
self.column_type().numerical_type() == Some(NumericalType::U64)
|
|
}
|
|
|
|
fn coerce_to_f64(self) -> Option<DynamicColumn> {
|
|
match self {
|
|
DynamicColumn::I64(column) => Some(DynamicColumn::F64(Column {
|
|
index: column.index,
|
|
values: Arc::new(monotonic_map_column(column.values, MapI64ToF64)),
|
|
})),
|
|
DynamicColumn::U64(column) => Some(DynamicColumn::F64(Column {
|
|
index: column.index,
|
|
values: Arc::new(monotonic_map_column(column.values, MapU64ToF64)),
|
|
})),
|
|
DynamicColumn::F64(_) => Some(self),
|
|
_ => None,
|
|
}
|
|
}
|
|
fn coerce_to_i64(self) -> Option<DynamicColumn> {
|
|
match self {
|
|
DynamicColumn::U64(column) => {
|
|
if column.max_value() > i64::MAX as u64 {
|
|
return None;
|
|
}
|
|
Some(DynamicColumn::I64(Column {
|
|
index: column.index,
|
|
values: Arc::new(monotonic_map_column(column.values, MapU64ToI64)),
|
|
}))
|
|
}
|
|
DynamicColumn::I64(_) => Some(self),
|
|
_ => None,
|
|
}
|
|
}
|
|
fn coerce_to_u64(self) -> Option<DynamicColumn> {
|
|
match self {
|
|
DynamicColumn::I64(column) => {
|
|
if column.min_value() < 0 {
|
|
return None;
|
|
}
|
|
Some(DynamicColumn::U64(Column {
|
|
index: column.index,
|
|
values: Arc::new(monotonic_map_column(column.values, MapI64ToU64)),
|
|
}))
|
|
}
|
|
DynamicColumn::U64(_) => Some(self),
|
|
_ => None,
|
|
}
|
|
}
|
|
}
|
|
|
|
struct MapI64ToF64;
|
|
impl StrictlyMonotonicFn<i64, f64> for MapI64ToF64 {
|
|
#[inline(always)]
|
|
fn mapping(&self, inp: i64) -> f64 {
|
|
inp as f64
|
|
}
|
|
#[inline(always)]
|
|
fn inverse(&self, out: f64) -> i64 {
|
|
out as i64
|
|
}
|
|
}
|
|
|
|
struct MapU64ToF64;
|
|
impl StrictlyMonotonicFn<u64, f64> for MapU64ToF64 {
|
|
#[inline(always)]
|
|
fn mapping(&self, inp: u64) -> f64 {
|
|
inp as f64
|
|
}
|
|
#[inline(always)]
|
|
fn inverse(&self, out: f64) -> u64 {
|
|
out as u64
|
|
}
|
|
}
|
|
|
|
struct MapU64ToI64;
|
|
impl StrictlyMonotonicFn<u64, i64> for MapU64ToI64 {
|
|
#[inline(always)]
|
|
fn mapping(&self, inp: u64) -> i64 {
|
|
inp as i64
|
|
}
|
|
#[inline(always)]
|
|
fn inverse(&self, out: i64) -> u64 {
|
|
out as u64
|
|
}
|
|
}
|
|
|
|
struct MapI64ToU64;
|
|
impl StrictlyMonotonicFn<i64, u64> for MapI64ToU64 {
|
|
#[inline(always)]
|
|
fn mapping(&self, inp: i64) -> u64 {
|
|
inp as u64
|
|
}
|
|
#[inline(always)]
|
|
fn inverse(&self, out: u64) -> i64 {
|
|
out as i64
|
|
}
|
|
}
|
|
|
|
/// Generates the conversions between `DynamicColumn` and one typed column.
///
/// `From<$typ> for DynamicColumn` wraps the typed column in the `$enum_name` variant, and
/// `From<DynamicColumn> for Option<$typ>` unwraps it, yielding `None` for every other
/// variant.
macro_rules! static_dynamic_conversions {
    ($typ:ty, $enum_name:ident) => {
        impl From<$typ> for DynamicColumn {
            fn from(typed_column: $typ) -> Self {
                DynamicColumn::$enum_name(typed_column)
            }
        }

        impl From<DynamicColumn> for Option<$typ> {
            fn from(dynamic_column: DynamicColumn) -> Option<$typ> {
                match dynamic_column {
                    DynamicColumn::$enum_name(typed_column) => Some(typed_column),
                    _ => None,
                }
            }
        }
    };
}
|
|
|
|
static_dynamic_conversions!(Column<bool>, Bool);
|
|
static_dynamic_conversions!(Column<u64>, U64);
|
|
static_dynamic_conversions!(Column<i64>, I64);
|
|
static_dynamic_conversions!(Column<f64>, F64);
|
|
static_dynamic_conversions!(Column<DateTime>, DateTime);
|
|
static_dynamic_conversions!(StrColumn, Str);
|
|
static_dynamic_conversions!(BytesColumn, Bytes);
|
|
static_dynamic_conversions!(Column<Ipv6Addr>, IpAddr);
|
|
|
|
#[derive(Clone, Debug)]
|
|
pub struct DynamicColumnHandle {
|
|
pub(crate) file_slice: FileSlice,
|
|
pub(crate) column_type: ColumnType,
|
|
pub(crate) format_version: Version,
|
|
}
|
|
|
|
impl DynamicColumnHandle {
|
|
// TODO rename load
|
|
pub fn open(&self) -> io::Result<DynamicColumn> {
|
|
let column_bytes: OwnedBytes = self.file_slice.read_bytes()?;
|
|
self.open_internal(column_bytes)
|
|
}
|
|
|
|
#[doc(hidden)]
|
|
pub fn file_slice(&self) -> &FileSlice {
|
|
&self.file_slice
|
|
}
|
|
|
|
/// Returns the `u64` fast field reader reader associated with `fields` of types
|
|
/// Str, u64, i64, f64, bool, ip, or datetime.
|
|
///
|
|
/// Notice that for IpAddr, the fastfield reader will return the u64 representation of the
|
|
/// IpAddr.
|
|
/// In order to convert to u128 back cast to `CompactSpaceU64Accessor` and call
|
|
/// `compact_to_u128`.
|
|
///
|
|
/// If not, the fastfield reader will returns the u64-value associated with the original
|
|
/// FastValue.
|
|
pub fn open_u64_lenient(&self) -> io::Result<Option<Column<u64>>> {
|
|
let column_bytes = self.file_slice.read_bytes()?;
|
|
match self.column_type {
|
|
ColumnType::Str | ColumnType::Bytes => {
|
|
let column: BytesColumn =
|
|
crate::column::open_column_bytes(column_bytes, self.format_version)?;
|
|
Ok(Some(column.term_ord_column))
|
|
}
|
|
ColumnType::IpAddr => {
|
|
let column = crate::column::open_column_u128_as_compact_u64(
|
|
column_bytes,
|
|
self.format_version,
|
|
)?;
|
|
Ok(Some(column))
|
|
}
|
|
ColumnType::Bool
|
|
| ColumnType::I64
|
|
| ColumnType::U64
|
|
| ColumnType::F64
|
|
| ColumnType::DateTime => {
|
|
let column =
|
|
crate::column::open_column_u64::<u64>(column_bytes, self.format_version)?;
|
|
Ok(Some(column))
|
|
}
|
|
}
|
|
}
|
|
|
|
fn open_internal(&self, column_bytes: OwnedBytes) -> io::Result<DynamicColumn> {
|
|
let dynamic_column: DynamicColumn = match self.column_type {
|
|
ColumnType::Bytes => {
|
|
crate::column::open_column_bytes(column_bytes, self.format_version)?.into()
|
|
}
|
|
ColumnType::Str => {
|
|
crate::column::open_column_str(column_bytes, self.format_version)?.into()
|
|
}
|
|
ColumnType::I64 => {
|
|
crate::column::open_column_u64::<i64>(column_bytes, self.format_version)?.into()
|
|
}
|
|
ColumnType::U64 => {
|
|
crate::column::open_column_u64::<u64>(column_bytes, self.format_version)?.into()
|
|
}
|
|
ColumnType::F64 => {
|
|
crate::column::open_column_u64::<f64>(column_bytes, self.format_version)?.into()
|
|
}
|
|
ColumnType::Bool => {
|
|
crate::column::open_column_u64::<bool>(column_bytes, self.format_version)?.into()
|
|
}
|
|
ColumnType::IpAddr => {
|
|
crate::column::open_column_u128::<Ipv6Addr>(column_bytes, self.format_version)?
|
|
.into()
|
|
}
|
|
ColumnType::DateTime => {
|
|
crate::column::open_column_u64::<DateTime>(column_bytes, self.format_version)?
|
|
.into()
|
|
}
|
|
};
|
|
Ok(dynamic_column)
|
|
}
|
|
|
|
pub fn num_bytes(&self) -> ByteCount {
|
|
self.file_slice.len().into()
|
|
}
|
|
|
|
pub fn column_type(&self) -> ColumnType {
|
|
self.column_type
|
|
}
|
|
}
|