mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-06 01:02:55 +00:00
Compare commits
9 Commits
0.24.2
...
trinity.po
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dedd1aa83a | ||
|
|
080fa4d1f4 | ||
|
|
988c2b35e7 | ||
|
|
bf3cc12610 | ||
|
|
a2400f4e73 | ||
|
|
436ec6caea | ||
|
|
4a6123d3ff | ||
|
|
5a2fe42c24 | ||
|
|
5379c99ea2 |
@@ -1,6 +1,6 @@
|
||||
Tantivy 0.23 - Unreleased
|
||||
Tantivy 0.24
|
||||
================================
|
||||
Tantivy 0.23 will be backwards compatible with indices created with v0.22 and v0.21. The new minimum rust version will be 1.75.
|
||||
Tantivy 0.24 will be backwards compatible with indices created with v0.22 and v0.21. The new minimum rust version will be 1.75. Tantivy 0.23 will be skipped.
|
||||
|
||||
#### Bugfixes
|
||||
- fix potential endless loop in merge [#2457](https://github.com/quickwit-oss/tantivy/pull/2457)(@PSeitz)
|
||||
|
||||
23
Cargo.toml
23
Cargo.toml
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy"
|
||||
version = "0.22.0"
|
||||
version = "0.24.0"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
@@ -11,7 +11,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
|
||||
readme = "README.md"
|
||||
keywords = ["search", "information", "retrieval"]
|
||||
edition = "2021"
|
||||
rust-version = "1.75"
|
||||
rust-version = "1.85"
|
||||
exclude = ["benches/*.json", "benches/*.txt"]
|
||||
|
||||
[dependencies]
|
||||
@@ -57,13 +57,13 @@ measure_time = "0.9.0"
|
||||
arc-swap = "1.5.0"
|
||||
bon = "3.3.1"
|
||||
|
||||
columnar = { version = "0.3", path = "./columnar", package = "tantivy-columnar" }
|
||||
sstable = { version = "0.3", path = "./sstable", package = "tantivy-sstable", optional = true }
|
||||
stacker = { version = "0.3", path = "./stacker", package = "tantivy-stacker" }
|
||||
query-grammar = { version = "0.22.0", path = "./query-grammar", package = "tantivy-query-grammar" }
|
||||
tantivy-bitpacker = { version = "0.6", path = "./bitpacker" }
|
||||
common = { version = "0.7", path = "./common/", package = "tantivy-common" }
|
||||
tokenizer-api = { version = "0.3", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
|
||||
columnar = { version = "0.5", path = "./columnar", package = "tantivy-columnar" }
|
||||
sstable = { version = "0.5", path = "./sstable", package = "tantivy-sstable", optional = true }
|
||||
stacker = { version = "0.5", path = "./stacker", package = "tantivy-stacker" }
|
||||
query-grammar = { version = "0.24.0", path = "./query-grammar", package = "tantivy-query-grammar" }
|
||||
tantivy-bitpacker = { version = "0.8", path = "./bitpacker" }
|
||||
common = { version = "0.9", path = "./common/", package = "tantivy-common" }
|
||||
tokenizer-api = { version = "0.5", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
|
||||
sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
|
||||
hyperloglogplus = { version = "0.4.1", features = ["const-loop"] }
|
||||
futures-util = { version = "0.3.28", optional = true }
|
||||
@@ -112,13 +112,16 @@ debug-assertions = true
|
||||
overflow-checks = true
|
||||
|
||||
[features]
|
||||
default = ["mmap", "stopwords", "lz4-compression"]
|
||||
default = ["mmap", "stopwords", "lz4-compression", "columnar-zstd-compression"]
|
||||
mmap = ["fs4", "tempfile", "memmap2"]
|
||||
stopwords = []
|
||||
|
||||
lz4-compression = ["lz4_flex"]
|
||||
zstd-compression = ["zstd"]
|
||||
|
||||
# enable zstd-compression in columnar (and sstable)
|
||||
columnar-zstd-compression = ["columnar/zstd-compression"]
|
||||
|
||||
failpoints = ["fail", "fail/failpoints"]
|
||||
unstable = [] # useful for benches.
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
name = "tantivy-bitpacker"
|
||||
version = "0.6.0"
|
||||
edition = "2021"
|
||||
version = "0.8.0"
|
||||
edition = "2024"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = []
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use super::bitpacker::BitPacker;
|
||||
use super::compute_num_bits;
|
||||
use crate::{minmax, BitUnpacker};
|
||||
use crate::{BitUnpacker, minmax};
|
||||
|
||||
const BLOCK_SIZE: usize = 128;
|
||||
|
||||
|
||||
@@ -33,11 +33,7 @@ pub use crate::blocked_bitpacker::BlockedBitpacker;
|
||||
/// number of bits.
|
||||
pub fn compute_num_bits(n: u64) -> u8 {
|
||||
let amplitude = (64u32 - n.leading_zeros()) as u8;
|
||||
if amplitude <= 64 - 8 {
|
||||
amplitude
|
||||
} else {
|
||||
64
|
||||
}
|
||||
if amplitude <= 64 - 8 { amplitude } else { 64 }
|
||||
}
|
||||
|
||||
/// Computes the (min, max) of an iterator of `PartialOrd` values.
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
name = "tantivy-columnar"
|
||||
version = "0.3.0"
|
||||
edition = "2021"
|
||||
version = "0.5.0"
|
||||
edition = "2024"
|
||||
license = "MIT"
|
||||
homepage = "https://github.com/quickwit-oss/tantivy"
|
||||
repository = "https://github.com/quickwit-oss/tantivy"
|
||||
@@ -12,10 +12,10 @@ categories = ["database-implementations", "data-structures", "compression"]
|
||||
itertools = "0.14.0"
|
||||
fastdivide = "0.4.0"
|
||||
|
||||
stacker = { version= "0.3", path = "../stacker", package="tantivy-stacker"}
|
||||
sstable = { version= "0.3", path = "../sstable", package = "tantivy-sstable" }
|
||||
common = { version= "0.7", path = "../common", package = "tantivy-common" }
|
||||
tantivy-bitpacker = { version= "0.6", path = "../bitpacker/" }
|
||||
stacker = { version= "0.5", path = "../stacker", package="tantivy-stacker"}
|
||||
sstable = { version= "0.5", path = "../sstable", package = "tantivy-sstable" }
|
||||
common = { version= "0.9", path = "../common", package = "tantivy-common" }
|
||||
tantivy-bitpacker = { version= "0.8", path = "../bitpacker/" }
|
||||
serde = "1.0.152"
|
||||
downcast-rs = "2.0.1"
|
||||
|
||||
@@ -33,6 +33,6 @@ harness = false
|
||||
name = "bench_access"
|
||||
harness = false
|
||||
|
||||
|
||||
[features]
|
||||
unstable = []
|
||||
zstd-compression = ["sstable/zstd-compression"]
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use binggan::{black_box, InputGroup};
|
||||
use binggan::{InputGroup, black_box};
|
||||
use common::*;
|
||||
use tantivy_columnar::Column;
|
||||
|
||||
|
||||
@@ -4,9 +4,9 @@ extern crate test;
|
||||
use std::sync::Arc;
|
||||
|
||||
use rand::prelude::*;
|
||||
use tantivy_columnar::column_values::{serialize_and_load_u64_based_column_values, CodecType};
|
||||
use tantivy_columnar::column_values::{CodecType, serialize_and_load_u64_based_column_values};
|
||||
use tantivy_columnar::*;
|
||||
use test::{black_box, Bencher};
|
||||
use test::{Bencher, black_box};
|
||||
|
||||
struct Columns {
|
||||
pub optional: Column,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
pub mod common;
|
||||
|
||||
use binggan::BenchRunner;
|
||||
use common::{generate_columnar_with_name, Card};
|
||||
use common::{Card, generate_columnar_with_name};
|
||||
use tantivy_columnar::*;
|
||||
|
||||
const NUM_DOCS: u32 = 100_000;
|
||||
|
||||
@@ -6,7 +6,7 @@ use std::sync::Arc;
|
||||
use common::OwnedBytes;
|
||||
use rand::rngs::StdRng;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::{random, Rng, SeedableRng};
|
||||
use rand::{Rng, SeedableRng, random};
|
||||
use tantivy_columnar::ColumnValues;
|
||||
use test::Bencher;
|
||||
extern crate test;
|
||||
|
||||
@@ -5,7 +5,7 @@ use std::ops::RangeInclusive;
|
||||
use std::sync::Arc;
|
||||
|
||||
use rand::prelude::*;
|
||||
use tantivy_columnar::column_values::{serialize_and_load_u64_based_column_values, CodecType};
|
||||
use tantivy_columnar::column_values::{CodecType, serialize_and_load_u64_based_column_values};
|
||||
use tantivy_columnar::*;
|
||||
use test::Bencher;
|
||||
|
||||
|
||||
@@ -66,7 +66,7 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
|
||||
&'a self,
|
||||
docs: &'a [u32],
|
||||
accessor: &Column<T>,
|
||||
) -> impl Iterator<Item = (DocId, T)> + 'a {
|
||||
) -> impl Iterator<Item = (DocId, T)> + 'a + use<'a, T> {
|
||||
if accessor.index.get_cardinality().is_full() {
|
||||
docs.iter().cloned().zip(self.val_cache.iter().cloned())
|
||||
} else {
|
||||
|
||||
@@ -4,8 +4,8 @@ use std::{fmt, io};
|
||||
|
||||
use sstable::{Dictionary, VoidSSTable};
|
||||
|
||||
use crate::column::Column;
|
||||
use crate::RowId;
|
||||
use crate::column::Column;
|
||||
|
||||
/// Dictionary encoded column.
|
||||
///
|
||||
|
||||
@@ -9,13 +9,14 @@ use std::sync::Arc;
|
||||
use common::BinarySerializable;
|
||||
pub use dictionary_encoded::{BytesColumn, StrColumn};
|
||||
pub use serialize::{
|
||||
open_column_bytes, open_column_str, open_column_u128, open_column_u128_as_compact_u64,
|
||||
open_column_u64, serialize_column_mappable_to_u128, serialize_column_mappable_to_u64,
|
||||
open_column_bytes, open_column_str, open_column_u64, open_column_u128,
|
||||
open_column_u128_as_compact_u64, serialize_column_mappable_to_u64,
|
||||
serialize_column_mappable_to_u128,
|
||||
};
|
||||
|
||||
use crate::column_index::{ColumnIndex, Set};
|
||||
use crate::column_values::monotonic_mapping::StrictlyMonotonicMappingToInternal;
|
||||
use crate::column_values::{monotonic_map_column, ColumnValues};
|
||||
use crate::column_values::{ColumnValues, monotonic_map_column};
|
||||
use crate::{Cardinality, DocId, EmptyColumnValues, MonotonicallyMappableToU64, RowId};
|
||||
|
||||
#[derive(Clone)]
|
||||
|
||||
@@ -6,10 +6,10 @@ use common::OwnedBytes;
|
||||
use sstable::Dictionary;
|
||||
|
||||
use crate::column::{BytesColumn, Column};
|
||||
use crate::column_index::{serialize_column_index, SerializableColumnIndex};
|
||||
use crate::column_index::{SerializableColumnIndex, serialize_column_index};
|
||||
use crate::column_values::{
|
||||
CodecType, MonotonicallyMappableToU64, MonotonicallyMappableToU128,
|
||||
load_u64_based_column_values, serialize_column_values_u128, serialize_u64_based_column_values,
|
||||
CodecType, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
|
||||
};
|
||||
use crate::iterable::Iterable;
|
||||
use crate::{StrColumn, Version};
|
||||
|
||||
@@ -99,9 +99,9 @@ mod tests {
|
||||
|
||||
use crate::column_index::merge::detect_cardinality;
|
||||
use crate::column_index::multivalued_index::{
|
||||
open_multivalued_index, serialize_multivalued_index, MultiValueIndex,
|
||||
MultiValueIndex, open_multivalued_index, serialize_multivalued_index,
|
||||
};
|
||||
use crate::column_index::{merge_column_index, OptionalIndex, SerializableColumnIndex};
|
||||
use crate::column_index::{OptionalIndex, SerializableColumnIndex, merge_column_index};
|
||||
use crate::{
|
||||
Cardinality, ColumnIndex, MergeRowOrder, RowAddr, RowId, ShuffleMergeOrder, StackMergeOrder,
|
||||
};
|
||||
|
||||
@@ -137,8 +137,8 @@ impl Iterable<u32> for ShuffledMultivaluedIndex<'_> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::column_index::OptionalIndex;
|
||||
use crate::RowAddr;
|
||||
use crate::column_index::OptionalIndex;
|
||||
|
||||
#[test]
|
||||
fn test_integrate_num_vals_empty() {
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::column_index::SerializableColumnIndex;
|
||||
use crate::column_index::multivalued_index::{MultiValueIndex, SerializableMultivalueIndex};
|
||||
use crate::column_index::serialize::SerializableOptionalIndex;
|
||||
use crate::column_index::SerializableColumnIndex;
|
||||
use crate::iterable::Iterable;
|
||||
use crate::{Cardinality, ColumnIndex, RowId, StackMergeOrder};
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ pub use merge::merge_column_index;
|
||||
pub(crate) use multivalued_index::SerializableMultivalueIndex;
|
||||
pub use optional_index::{OptionalIndex, Set};
|
||||
pub use serialize::{
|
||||
open_column_index, serialize_column_index, SerializableColumnIndex, SerializableOptionalIndex,
|
||||
SerializableColumnIndex, SerializableOptionalIndex, open_column_index, serialize_column_index,
|
||||
};
|
||||
|
||||
use crate::column_index::multivalued_index::MultiValueIndex;
|
||||
|
||||
@@ -8,7 +8,7 @@ use common::{CountingWriter, OwnedBytes};
|
||||
use super::optional_index::{open_optional_index, serialize_optional_index};
|
||||
use super::{OptionalIndex, SerializableOptionalIndex, Set};
|
||||
use crate::column_values::{
|
||||
load_u64_based_column_values, serialize_u64_based_column_values, CodecType, ColumnValues,
|
||||
CodecType, ColumnValues, load_u64_based_column_values, serialize_u64_based_column_values,
|
||||
};
|
||||
use crate::iterable::Iterable;
|
||||
use crate::{DocId, RowId, Version};
|
||||
|
||||
@@ -7,7 +7,7 @@ mod set_block;
|
||||
use common::{BinarySerializable, OwnedBytes, VInt};
|
||||
pub use set::{SelectCursor, Set, SetCodec};
|
||||
use set_block::{
|
||||
DenseBlock, DenseBlockCodec, SparseBlock, SparseBlockCodec, DENSE_BLOCK_NUM_BYTES,
|
||||
DENSE_BLOCK_NUM_BYTES, DenseBlock, DenseBlockCodec, SparseBlock, SparseBlockCodec,
|
||||
};
|
||||
|
||||
use crate::iterable::Iterable;
|
||||
@@ -259,11 +259,13 @@ impl Set<RowId> for OptionalIndex {
|
||||
|
||||
impl OptionalIndex {
|
||||
pub fn for_test(num_rows: RowId, row_ids: &[RowId]) -> OptionalIndex {
|
||||
assert!(row_ids
|
||||
.last()
|
||||
.copied()
|
||||
.map(|last_row_id| last_row_id < num_rows)
|
||||
.unwrap_or(true));
|
||||
assert!(
|
||||
row_ids
|
||||
.last()
|
||||
.copied()
|
||||
.map(|last_row_id| last_row_id < num_rows)
|
||||
.unwrap_or(true)
|
||||
);
|
||||
let mut buffer = Vec::new();
|
||||
serialize_optional_index(&row_ids, num_rows, &mut buffer).unwrap();
|
||||
let bytes = OwnedBytes::new(buffer);
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::io::{self, Write};
|
||||
|
||||
use common::BinarySerializable;
|
||||
|
||||
use crate::column_index::optional_index::{SelectCursor, Set, SetCodec, ELEMENTS_PER_BLOCK};
|
||||
use crate::column_index::optional_index::{ELEMENTS_PER_BLOCK, SelectCursor, Set, SetCodec};
|
||||
|
||||
#[inline(always)]
|
||||
fn get_bit_at(input: u64, n: u16) -> bool {
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
mod dense;
|
||||
mod sparse;
|
||||
|
||||
pub use dense::{DenseBlock, DenseBlockCodec, DENSE_BLOCK_NUM_BYTES};
|
||||
pub use dense::{DENSE_BLOCK_NUM_BYTES, DenseBlock, DenseBlockCodec};
|
||||
pub use sparse::{SparseBlock, SparseBlockCodec};
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -254,11 +254,7 @@ mod bench {
|
||||
let mut current = start;
|
||||
std::iter::from_fn(move || {
|
||||
current += rng.gen_range(avg_step_size - avg_deviation..=avg_step_size + avg_deviation);
|
||||
if current >= end {
|
||||
None
|
||||
} else {
|
||||
Some(current)
|
||||
}
|
||||
if current >= end { None } else { Some(current) }
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -3,11 +3,11 @@ use std::io::Write;
|
||||
|
||||
use common::{CountingWriter, OwnedBytes};
|
||||
|
||||
use super::multivalued_index::SerializableMultivalueIndex;
|
||||
use super::OptionalIndex;
|
||||
use super::multivalued_index::SerializableMultivalueIndex;
|
||||
use crate::column_index::ColumnIndex;
|
||||
use crate::column_index::multivalued_index::serialize_multivalued_index;
|
||||
use crate::column_index::optional_index::serialize_optional_index;
|
||||
use crate::column_index::ColumnIndex;
|
||||
use crate::iterable::Iterable;
|
||||
use crate::{Cardinality, RowId, Version};
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ use crate::column_values::u64_based::*;
|
||||
fn get_data() -> Vec<u64> {
|
||||
let mut rng = StdRng::seed_from_u64(2u64);
|
||||
let mut data: Vec<_> = (100..55000_u64)
|
||||
.map(|num| num + rng.gen::<u8>() as u64)
|
||||
.map(|num| num + rng.r#gen::<u8>() as u64)
|
||||
.collect();
|
||||
data.push(99_000);
|
||||
data.insert(1000, 2000);
|
||||
|
||||
@@ -26,13 +26,13 @@ mod monotonic_column;
|
||||
|
||||
pub(crate) use merge::MergedColumnValues;
|
||||
pub use stats::ColumnStats;
|
||||
pub use u128_based::{
|
||||
open_u128_as_compact_u64, open_u128_mapped, serialize_column_values_u128,
|
||||
CompactSpaceU64Accessor,
|
||||
};
|
||||
pub use u64_based::{
|
||||
load_u64_based_column_values, serialize_and_load_u64_based_column_values,
|
||||
serialize_u64_based_column_values, CodecType, ALL_U64_CODEC_TYPES,
|
||||
ALL_U64_CODEC_TYPES, CodecType, load_u64_based_column_values,
|
||||
serialize_and_load_u64_based_column_values, serialize_u64_based_column_values,
|
||||
};
|
||||
pub use u128_based::{
|
||||
CompactSpaceU64Accessor, open_u128_as_compact_u64, open_u128_mapped,
|
||||
serialize_column_values_u128,
|
||||
};
|
||||
pub use vec_column::VecColumn;
|
||||
|
||||
|
||||
@@ -2,8 +2,8 @@ use std::fmt::Debug;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::{Range, RangeInclusive};
|
||||
|
||||
use crate::column_values::monotonic_mapping::StrictlyMonotonicFn;
|
||||
use crate::ColumnValues;
|
||||
use crate::column_values::monotonic_mapping::StrictlyMonotonicFn;
|
||||
|
||||
struct MonotonicMappingColumn<C, T, Input> {
|
||||
from_column: C,
|
||||
@@ -99,10 +99,10 @@ where
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::column_values::VecColumn;
|
||||
use crate::column_values::monotonic_mapping::{
|
||||
StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
|
||||
};
|
||||
use crate::column_values::VecColumn;
|
||||
|
||||
#[test]
|
||||
fn test_monotonic_mapping_iter() {
|
||||
|
||||
@@ -24,8 +24,8 @@ use build_compact_space::get_compact_space;
|
||||
use common::{BinarySerializable, CountingWriter, OwnedBytes, VInt, VIntU128};
|
||||
use tantivy_bitpacker::{BitPacker, BitUnpacker};
|
||||
|
||||
use crate::column_values::ColumnValues;
|
||||
use crate::RowId;
|
||||
use crate::column_values::ColumnValues;
|
||||
|
||||
/// The cost per blank is quite hard actually, since blanks are delta encoded, the actual cost of
|
||||
/// blanks depends on the number of blanks.
|
||||
@@ -653,12 +653,14 @@ mod tests {
|
||||
),
|
||||
&[3]
|
||||
);
|
||||
assert!(get_positions_for_value_range_helper(
|
||||
&decomp,
|
||||
99998u128..=99998u128,
|
||||
complete_range.clone()
|
||||
)
|
||||
.is_empty());
|
||||
assert!(
|
||||
get_positions_for_value_range_helper(
|
||||
&decomp,
|
||||
99998u128..=99998u128,
|
||||
complete_range.clone()
|
||||
)
|
||||
.is_empty()
|
||||
);
|
||||
assert_eq!(
|
||||
&get_positions_for_value_range_helper(
|
||||
&decomp,
|
||||
|
||||
@@ -130,11 +130,11 @@ pub fn open_u128_as_compact_u64(mut bytes: OwnedBytes) -> io::Result<Arc<dyn Col
|
||||
#[cfg(test)]
|
||||
pub(crate) mod tests {
|
||||
use super::*;
|
||||
use crate::column_values::u64_based::{
|
||||
serialize_and_load_u64_based_column_values, serialize_u64_based_column_values,
|
||||
ALL_U64_CODEC_TYPES,
|
||||
};
|
||||
use crate::column_values::CodecType;
|
||||
use crate::column_values::u64_based::{
|
||||
ALL_U64_CODEC_TYPES, serialize_and_load_u64_based_column_values,
|
||||
serialize_u64_based_column_values,
|
||||
};
|
||||
|
||||
#[test]
|
||||
fn test_serialize_deserialize_u128_header() {
|
||||
|
||||
@@ -4,7 +4,7 @@ use std::ops::{Range, RangeInclusive};
|
||||
|
||||
use common::{BinarySerializable, OwnedBytes};
|
||||
use fastdivide::DividerU64;
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
use tantivy_bitpacker::{BitPacker, BitUnpacker, compute_num_bits};
|
||||
|
||||
use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
|
||||
use crate::{ColumnValues, RowId};
|
||||
@@ -23,11 +23,7 @@ const fn div_ceil(n: u64, q: NonZeroU64) -> u64 {
|
||||
// copied from unstable rust standard library.
|
||||
let d = n / q.get();
|
||||
let r = n % q.get();
|
||||
if r > 0 {
|
||||
d + 1
|
||||
} else {
|
||||
d
|
||||
}
|
||||
if r > 0 { d + 1 } else { d }
|
||||
}
|
||||
|
||||
// The bitpacked codec applies a linear transformation `f` over data that are bitpacked.
|
||||
|
||||
@@ -4,12 +4,12 @@ use std::{io, iter};
|
||||
|
||||
use common::{BinarySerializable, CountingWriter, DeserializeFrom, OwnedBytes};
|
||||
use fastdivide::DividerU64;
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
use tantivy_bitpacker::{BitPacker, BitUnpacker, compute_num_bits};
|
||||
|
||||
use crate::MonotonicallyMappableToU64;
|
||||
use crate::column_values::u64_based::line::Line;
|
||||
use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
|
||||
use crate::column_values::{ColumnValues, VecColumn};
|
||||
use crate::MonotonicallyMappableToU64;
|
||||
|
||||
const BLOCK_SIZE: u32 = 512u32;
|
||||
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
use std::io;
|
||||
|
||||
use common::{BinarySerializable, OwnedBytes};
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
use tantivy_bitpacker::{BitPacker, BitUnpacker, compute_num_bits};
|
||||
|
||||
use super::line::Line;
|
||||
use super::ColumnValues;
|
||||
use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
|
||||
use crate::column_values::VecColumn;
|
||||
use super::line::Line;
|
||||
use crate::RowId;
|
||||
use crate::column_values::VecColumn;
|
||||
use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
|
||||
|
||||
const HALF_SPACE: u64 = u64::MAX / 2;
|
||||
const LINE_ESTIMATION_BLOCK_LEN: usize = 512;
|
||||
|
||||
@@ -17,7 +17,7 @@ pub use crate::column_values::u64_based::bitpacked::BitpackedCodec;
|
||||
pub use crate::column_values::u64_based::blockwise_linear::BlockwiseLinearCodec;
|
||||
pub use crate::column_values::u64_based::linear::LinearCodec;
|
||||
pub use crate::column_values::u64_based::stats_collector::StatsCollector;
|
||||
use crate::column_values::{monotonic_map_column, ColumnStats};
|
||||
use crate::column_values::{ColumnStats, monotonic_map_column};
|
||||
use crate::iterable::Iterable;
|
||||
use crate::{ColumnValues, MonotonicallyMappableToU64};
|
||||
|
||||
|
||||
@@ -2,8 +2,8 @@ use std::num::NonZeroU64;
|
||||
|
||||
use fastdivide::DividerU64;
|
||||
|
||||
use crate::column_values::ColumnStats;
|
||||
use crate::RowId;
|
||||
use crate::column_values::ColumnStats;
|
||||
|
||||
/// Compute the gcd of two non null numbers.
|
||||
///
|
||||
@@ -96,8 +96,8 @@ impl StatsCollector {
|
||||
mod tests {
|
||||
use std::num::NonZeroU64;
|
||||
|
||||
use crate::column_values::u64_based::stats_collector::{compute_gcd, StatsCollector};
|
||||
use crate::column_values::u64_based::ColumnStats;
|
||||
use crate::column_values::u64_based::stats_collector::{StatsCollector, compute_gcd};
|
||||
|
||||
fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats {
|
||||
let mut stats_collector = StatsCollector::default();
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use proptest::prelude::*;
|
||||
use proptest::{prop_oneof, proptest};
|
||||
use rand::Rng;
|
||||
|
||||
#[test]
|
||||
fn test_serialize_and_load_simple() {
|
||||
|
||||
@@ -4,8 +4,8 @@ use std::net::Ipv6Addr;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::value::NumericalType;
|
||||
use crate::InvalidData;
|
||||
use crate::value::NumericalType;
|
||||
|
||||
/// The column type represents the column type.
|
||||
/// Any changes need to be propagated to `COLUMN_TYPES`.
|
||||
|
||||
@@ -10,11 +10,11 @@ use std::sync::Arc;
|
||||
pub use merge_mapping::{MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};
|
||||
|
||||
use super::writer::ColumnarSerializer;
|
||||
use crate::column::{serialize_column_mappable_to_u128, serialize_column_mappable_to_u64};
|
||||
use crate::column::{serialize_column_mappable_to_u64, serialize_column_mappable_to_u128};
|
||||
use crate::column_values::MergedColumnValues;
|
||||
use crate::columnar::ColumnarReader;
|
||||
use crate::columnar::merge::merge_dict_column::merge_bytes_or_str_column;
|
||||
use crate::columnar::writer::CompatibleNumericalTypes;
|
||||
use crate::columnar::ColumnarReader;
|
||||
use crate::dynamic_column::DynamicColumn;
|
||||
use crate::{
|
||||
BytesColumn, Column, ColumnIndex, ColumnType, ColumnValues, DynamicColumnHandle, NumericalType,
|
||||
@@ -144,16 +144,17 @@ fn merge_column(
|
||||
let mut column_values: Vec<Option<Arc<dyn ColumnValues>>> =
|
||||
Vec::with_capacity(columns_to_merge.len());
|
||||
for (i, dynamic_column_opt) in columns_to_merge.into_iter().enumerate() {
|
||||
if let Some(Column { index: idx, values }) =
|
||||
dynamic_column_opt.and_then(dynamic_column_to_u64_monotonic)
|
||||
{
|
||||
column_indexes.push(idx);
|
||||
column_values.push(Some(values));
|
||||
} else {
|
||||
column_indexes.push(ColumnIndex::Empty {
|
||||
num_docs: num_docs_per_column[i],
|
||||
});
|
||||
column_values.push(None);
|
||||
match dynamic_column_opt.and_then(dynamic_column_to_u64_monotonic) {
|
||||
Some(Column { index: idx, values }) => {
|
||||
column_indexes.push(idx);
|
||||
column_values.push(Some(values));
|
||||
}
|
||||
None => {
|
||||
column_indexes.push(ColumnIndex::Empty {
|
||||
num_docs: num_docs_per_column[i],
|
||||
});
|
||||
column_values.push(None);
|
||||
}
|
||||
}
|
||||
}
|
||||
let merged_column_index =
|
||||
@@ -253,11 +254,13 @@ impl GroupedColumns {
|
||||
}
|
||||
// At the moment, only the numerical column type category has more than one possible
|
||||
// column type.
|
||||
assert!(self
|
||||
.columns
|
||||
.iter()
|
||||
.flatten()
|
||||
.all(|el| ColumnTypeCategory::from(el.column_type()) == ColumnTypeCategory::Numerical));
|
||||
assert!(
|
||||
self.columns
|
||||
.iter()
|
||||
.flatten()
|
||||
.all(|el| ColumnTypeCategory::from(el.column_type())
|
||||
== ColumnTypeCategory::Numerical)
|
||||
);
|
||||
merged_numerical_columns_type(self.columns.iter().flatten()).into()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -74,18 +74,19 @@ impl<'a> TermMerger<'a> {
|
||||
/// False if there is none.
|
||||
pub fn advance(&mut self) -> bool {
|
||||
self.advance_segments();
|
||||
if let Some(head) = self.heap.pop() {
|
||||
self.term_streams_with_segment.push(head);
|
||||
while let Some(next_streamer) = self.heap.peek() {
|
||||
if self.term_streams_with_segment[0].terms.key() != next_streamer.terms.key() {
|
||||
break;
|
||||
match self.heap.pop() {
|
||||
Some(head) => {
|
||||
self.term_streams_with_segment.push(head);
|
||||
while let Some(next_streamer) = self.heap.peek() {
|
||||
if self.term_streams_with_segment[0].terms.key() != next_streamer.terms.key() {
|
||||
break;
|
||||
}
|
||||
let next_heap_it = self.heap.pop().unwrap(); // safe : we peeked beforehand
|
||||
self.term_streams_with_segment.push(next_heap_it);
|
||||
}
|
||||
let next_heap_it = self.heap.pop().unwrap(); // safe : we peeked beforehand
|
||||
self.term_streams_with_segment.push(next_heap_it);
|
||||
true
|
||||
}
|
||||
true
|
||||
} else {
|
||||
false
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ use proptest::collection::vec;
|
||||
use proptest::prelude::*;
|
||||
|
||||
use super::*;
|
||||
use crate::columnar::{merge_columnar, ColumnarReader, MergeRowOrder, StackMergeOrder};
|
||||
use crate::columnar::{ColumnarReader, MergeRowOrder, StackMergeOrder, merge_columnar};
|
||||
use crate::{Cardinality, ColumnarWriter, DynamicColumn, HasAssociatedColumnType, RowId};
|
||||
|
||||
fn make_columnar<T: Into<NumericalValue> + HasAssociatedColumnType + Copy>(
|
||||
|
||||
@@ -5,9 +5,9 @@ mod reader;
|
||||
mod writer;
|
||||
|
||||
pub use column_type::{ColumnType, HasAssociatedColumnType};
|
||||
pub use format_version::{Version, CURRENT_VERSION};
|
||||
pub use format_version::{CURRENT_VERSION, Version};
|
||||
#[cfg(test)]
|
||||
pub(crate) use merge::ColumnTypeCategory;
|
||||
pub use merge::{merge_columnar, MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};
|
||||
pub use merge::{MergeRowOrder, ShuffleMergeOrder, StackMergeOrder, merge_columnar};
|
||||
pub use reader::ColumnarReader;
|
||||
pub use writer::ColumnarWriter;
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
use std::{fmt, io, mem};
|
||||
|
||||
use common::BinarySerializable;
|
||||
use common::file_slice::FileSlice;
|
||||
use common::json_path_writer::JSON_PATH_SEGMENT_SEP;
|
||||
use common::BinarySerializable;
|
||||
use sstable::{Dictionary, RangeSSTable};
|
||||
|
||||
use crate::columnar::{format_version, ColumnType};
|
||||
use crate::columnar::{ColumnType, format_version};
|
||||
use crate::dynamic_column::DynamicColumnHandle;
|
||||
use crate::{RowId, Version};
|
||||
|
||||
|
||||
@@ -42,7 +42,7 @@ impl ColumnWriter {
|
||||
&self,
|
||||
arena: &MemoryArena,
|
||||
buffer: &'a mut Vec<u8>,
|
||||
) -> impl Iterator<Item = ColumnOperation<V>> + 'a {
|
||||
) -> impl Iterator<Item = ColumnOperation<V>> + 'a + use<'a, V> {
|
||||
buffer.clear();
|
||||
self.values.read_to_end(arena, buffer);
|
||||
let mut cursor: &[u8] = &buffer[..];
|
||||
@@ -104,9 +104,10 @@ pub(crate) struct NumericalColumnWriter {
|
||||
|
||||
impl NumericalColumnWriter {
|
||||
pub fn force_numerical_type(&mut self, numerical_type: NumericalType) {
|
||||
assert!(self
|
||||
.compatible_numerical_types
|
||||
.is_type_accepted(numerical_type));
|
||||
assert!(
|
||||
self.compatible_numerical_types
|
||||
.is_type_accepted(numerical_type)
|
||||
);
|
||||
self.compatible_numerical_types = CompatibleNumericalTypes::StaticType(numerical_type);
|
||||
}
|
||||
}
|
||||
@@ -211,7 +212,7 @@ impl NumericalColumnWriter {
|
||||
self,
|
||||
arena: &MemoryArena,
|
||||
buffer: &'a mut Vec<u8>,
|
||||
) -> impl Iterator<Item = ColumnOperation<NumericalValue>> + 'a {
|
||||
) -> impl Iterator<Item = ColumnOperation<NumericalValue>> + 'a + use<'a> {
|
||||
self.column_writer.operation_iterator(arena, buffer)
|
||||
}
|
||||
}
|
||||
@@ -255,7 +256,7 @@ impl StrOrBytesColumnWriter {
|
||||
&self,
|
||||
arena: &MemoryArena,
|
||||
byte_buffer: &'a mut Vec<u8>,
|
||||
) -> impl Iterator<Item = ColumnOperation<UnorderedId>> + 'a {
|
||||
) -> impl Iterator<Item = ColumnOperation<UnorderedId>> + 'a + use<'a> {
|
||||
self.column_writer.operation_iterator(arena, byte_buffer)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,13 +8,13 @@ use std::net::Ipv6Addr;
|
||||
|
||||
use column_operation::ColumnOperation;
|
||||
pub(crate) use column_writers::CompatibleNumericalTypes;
|
||||
use common::json_path_writer::JSON_END_OF_PATH;
|
||||
use common::CountingWriter;
|
||||
use common::json_path_writer::JSON_END_OF_PATH;
|
||||
pub(crate) use serializer::ColumnarSerializer;
|
||||
use stacker::{Addr, ArenaHashMap, MemoryArena};
|
||||
|
||||
use crate::column_index::{SerializableColumnIndex, SerializableOptionalIndex};
|
||||
use crate::column_values::{MonotonicallyMappableToU128, MonotonicallyMappableToU64};
|
||||
use crate::column_values::{MonotonicallyMappableToU64, MonotonicallyMappableToU128};
|
||||
use crate::columnar::column_type::ColumnType;
|
||||
use crate::columnar::writer::column_writers::{
|
||||
ColumnWriter, NumericalColumnWriter, StrOrBytesColumnWriter,
|
||||
|
||||
@@ -3,11 +3,11 @@ use std::io::Write;
|
||||
|
||||
use common::json_path_writer::JSON_END_OF_PATH;
|
||||
use common::{BinarySerializable, CountingWriter};
|
||||
use sstable::value::RangeValueWriter;
|
||||
use sstable::RangeSSTable;
|
||||
use sstable::value::RangeValueWriter;
|
||||
|
||||
use crate::columnar::ColumnType;
|
||||
use crate::RowId;
|
||||
use crate::columnar::ColumnType;
|
||||
|
||||
pub struct ColumnarSerializer<W: io::Write> {
|
||||
wrt: CountingWriter<W>,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::RowId;
|
||||
use crate::column_index::{SerializableMultivalueIndex, SerializableOptionalIndex};
|
||||
use crate::iterable::Iterable;
|
||||
use crate::RowId;
|
||||
|
||||
/// The `IndexBuilder` interprets a sequence of
|
||||
/// calls of the form:
|
||||
@@ -31,12 +31,13 @@ pub struct OptionalIndexBuilder {
|
||||
|
||||
impl OptionalIndexBuilder {
|
||||
pub fn finish(&mut self, num_rows: RowId) -> impl Iterable<RowId> + '_ {
|
||||
debug_assert!(self
|
||||
.docs
|
||||
.last()
|
||||
.copied()
|
||||
.map(|last_doc| last_doc < num_rows)
|
||||
.unwrap_or(true));
|
||||
debug_assert!(
|
||||
self.docs
|
||||
.last()
|
||||
.copied()
|
||||
.map(|last_doc| last_doc < num_rows)
|
||||
.unwrap_or(true)
|
||||
);
|
||||
&self.docs[..]
|
||||
}
|
||||
|
||||
@@ -48,12 +49,13 @@ impl OptionalIndexBuilder {
|
||||
impl IndexBuilder for OptionalIndexBuilder {
|
||||
#[inline(always)]
|
||||
fn record_row(&mut self, doc: RowId) {
|
||||
debug_assert!(self
|
||||
.docs
|
||||
.last()
|
||||
.copied()
|
||||
.map(|prev_doc| doc > prev_doc)
|
||||
.unwrap_or(true));
|
||||
debug_assert!(
|
||||
self.docs
|
||||
.last()
|
||||
.copied()
|
||||
.map(|prev_doc| doc > prev_doc)
|
||||
.unwrap_or(true)
|
||||
);
|
||||
self.docs.push(doc);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,8 +3,8 @@ use std::path::PathBuf;
|
||||
use itertools::Itertools;
|
||||
|
||||
use crate::{
|
||||
merge_columnar, Cardinality, Column, ColumnarReader, DynamicColumn, StackMergeOrder,
|
||||
CURRENT_VERSION,
|
||||
CURRENT_VERSION, Cardinality, Column, ColumnarReader, DynamicColumn, StackMergeOrder,
|
||||
merge_columnar,
|
||||
};
|
||||
|
||||
const NUM_DOCS: u32 = u16::MAX as u32;
|
||||
|
||||
@@ -6,7 +6,7 @@ use common::file_slice::FileSlice;
|
||||
use common::{ByteCount, DateTime, HasLen, OwnedBytes};
|
||||
|
||||
use crate::column::{BytesColumn, Column, StrColumn};
|
||||
use crate::column_values::{monotonic_map_column, StrictlyMonotonicFn};
|
||||
use crate::column_values::{StrictlyMonotonicFn, monotonic_map_column};
|
||||
use crate::columnar::ColumnType;
|
||||
use crate::{Cardinality, ColumnIndex, ColumnValues, NumericalType, Version};
|
||||
|
||||
|
||||
@@ -44,11 +44,11 @@ pub use block_accessor::ColumnBlockAccessor;
|
||||
pub use column::{BytesColumn, Column, StrColumn};
|
||||
pub use column_index::ColumnIndex;
|
||||
pub use column_values::{
|
||||
ColumnValues, EmptyColumnValues, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
|
||||
ColumnValues, EmptyColumnValues, MonotonicallyMappableToU64, MonotonicallyMappableToU128,
|
||||
};
|
||||
pub use columnar::{
|
||||
merge_columnar, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType,
|
||||
MergeRowOrder, ShuffleMergeOrder, StackMergeOrder, Version, CURRENT_VERSION,
|
||||
CURRENT_VERSION, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType,
|
||||
MergeRowOrder, ShuffleMergeOrder, StackMergeOrder, Version, merge_columnar,
|
||||
};
|
||||
use sstable::VoidSSTable;
|
||||
pub use value::{NumericalType, NumericalValue};
|
||||
|
||||
@@ -716,8 +716,8 @@ fn test_columnar_merging_number_columns() {
|
||||
// TODO document edge case: required_columns incompatible with values.
|
||||
|
||||
#[allow(clippy::type_complexity)]
|
||||
fn columnar_docs_and_remap(
|
||||
) -> impl Strategy<Value = (Vec<Vec<Vec<(&'static str, ColumnValue)>>>, Vec<RowAddr>)> {
|
||||
fn columnar_docs_and_remap()
|
||||
-> impl Strategy<Value = (Vec<Vec<Vec<(&'static str, ColumnValue)>>>, Vec<RowAddr>)> {
|
||||
proptest::collection::vec(columnar_docs_strategy(), 2..=3).prop_flat_map(
|
||||
|columnars_docs: Vec<Vec<Vec<(&str, ColumnValue)>>>| {
|
||||
let row_addrs: Vec<RowAddr> = columnars_docs
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
[package]
|
||||
name = "tantivy-common"
|
||||
version = "0.7.0"
|
||||
version = "0.9.0"
|
||||
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
|
||||
license = "MIT"
|
||||
edition = "2021"
|
||||
edition = "2024"
|
||||
description = "common traits and utility functions used by multiple tantivy subcrates"
|
||||
documentation = "https://docs.rs/tantivy_common/"
|
||||
homepage = "https://github.com/quickwit-oss/tantivy"
|
||||
@@ -13,7 +13,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
|
||||
|
||||
[dependencies]
|
||||
byteorder = "1.4.3"
|
||||
ownedbytes = { version= "0.7", path="../ownedbytes" }
|
||||
ownedbytes = { version= "0.9", path="../ownedbytes" }
|
||||
async-trait = "0.1"
|
||||
time = { version = "0.3.10", features = ["serde-well-known"] }
|
||||
serde = { version = "1.0.136", features = ["derive"] }
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use binggan::{black_box, BenchRunner};
|
||||
use binggan::{BenchRunner, black_box};
|
||||
use rand::seq::IteratorRandom;
|
||||
use rand::thread_rng;
|
||||
use tantivy_common::{serialize_vint_u32, BitSet, TinySet};
|
||||
use tantivy_common::{BitSet, TinySet, serialize_vint_u32};
|
||||
|
||||
fn bench_vint() {
|
||||
let mut runner = BenchRunner::new();
|
||||
|
||||
@@ -65,11 +65,11 @@ pub fn transform_bound_inner_res<TFrom, TTo>(
|
||||
) -> io::Result<Bound<TTo>> {
|
||||
use self::Bound::*;
|
||||
Ok(match bound {
|
||||
Excluded(ref from_val) => match transform(from_val)? {
|
||||
Excluded(from_val) => match transform(from_val)? {
|
||||
TransformBound::NewBound(new_val) => new_val,
|
||||
TransformBound::Existing(new_val) => Excluded(new_val),
|
||||
},
|
||||
Included(ref from_val) => match transform(from_val)? {
|
||||
Included(from_val) => match transform(from_val)? {
|
||||
TransformBound::NewBound(new_val) => new_val,
|
||||
TransformBound::Existing(new_val) => Included(new_val),
|
||||
},
|
||||
@@ -85,11 +85,11 @@ pub fn transform_bound_inner<TFrom, TTo>(
|
||||
) -> Bound<TTo> {
|
||||
use self::Bound::*;
|
||||
match bound {
|
||||
Excluded(ref from_val) => match transform(from_val) {
|
||||
Excluded(from_val) => match transform(from_val) {
|
||||
TransformBound::NewBound(new_val) => new_val,
|
||||
TransformBound::Existing(new_val) => Excluded(new_val),
|
||||
},
|
||||
Included(ref from_val) => match transform(from_val) {
|
||||
Included(from_val) => match transform(from_val) {
|
||||
TransformBound::NewBound(new_val) => new_val,
|
||||
TransformBound::Existing(new_val) => Included(new_val),
|
||||
},
|
||||
@@ -111,8 +111,8 @@ pub fn map_bound<TFrom, TTo>(
|
||||
) -> Bound<TTo> {
|
||||
use self::Bound::*;
|
||||
match bound {
|
||||
Excluded(ref from_val) => Bound::Excluded(transform(from_val)),
|
||||
Included(ref from_val) => Bound::Included(transform(from_val)),
|
||||
Excluded(from_val) => Bound::Excluded(transform(from_val)),
|
||||
Included(from_val) => Bound::Included(transform(from_val)),
|
||||
Unbounded => Unbounded,
|
||||
}
|
||||
}
|
||||
@@ -123,8 +123,8 @@ pub fn map_bound_res<TFrom, TTo, Err>(
|
||||
) -> Result<Bound<TTo>, Err> {
|
||||
use self::Bound::*;
|
||||
Ok(match bound {
|
||||
Excluded(ref from_val) => Excluded(transform(from_val)?),
|
||||
Included(ref from_val) => Included(transform(from_val)?),
|
||||
Excluded(from_val) => Excluded(transform(from_val)?),
|
||||
Included(from_val) => Included(transform(from_val)?),
|
||||
Unbounded => Unbounded,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -74,7 +74,7 @@ impl FileHandle for WrapFile {
|
||||
{
|
||||
use std::io::{Read, Seek};
|
||||
let mut file = self.file.try_clone()?; // Clone the file to read from it separately
|
||||
// Seek to the start position in the file
|
||||
// Seek to the start position in the file
|
||||
file.seek(io::SeekFrom::Start(start as u64))?;
|
||||
// Read the data into the buffer
|
||||
file.read_exact(&mut buffer)?;
|
||||
@@ -346,8 +346,8 @@ mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::{FileHandle, FileSlice};
|
||||
use crate::file_slice::combine_ranges;
|
||||
use crate::HasLen;
|
||||
use crate::file_slice::combine_ranges;
|
||||
|
||||
#[test]
|
||||
fn test_file_slice() -> io::Result<()> {
|
||||
|
||||
@@ -22,7 +22,7 @@ pub use json_path_writer::JsonPathWriter;
|
||||
pub use ownedbytes::{OwnedBytes, StableDeref};
|
||||
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
|
||||
pub use vint::{
|
||||
read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt, VIntU128,
|
||||
VInt, VIntU128, read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint,
|
||||
};
|
||||
pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite};
|
||||
|
||||
@@ -177,8 +177,10 @@ pub(crate) mod test {
|
||||
|
||||
#[test]
|
||||
fn test_f64_order() {
|
||||
assert!(!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY))
|
||||
.contains(&f64_to_u64(f64::NAN))); // nan is not a number
|
||||
assert!(
|
||||
!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY))
|
||||
.contains(&f64_to_u64(f64::NAN))
|
||||
); // nan is not a number
|
||||
assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); // same exponent, different mantissa
|
||||
assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); // same mantissa, different exponent
|
||||
assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); // different exponent and mantissa
|
||||
|
||||
@@ -222,7 +222,7 @@ impl BinarySerializable for VInt {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::{serialize_vint_u32, BinarySerializable, VInt};
|
||||
use super::{BinarySerializable, VInt, serialize_vint_u32};
|
||||
|
||||
fn aux_test_vint(val: u64) {
|
||||
let mut v = [14u8; 10];
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
|
||||
name = "ownedbytes"
|
||||
version = "0.7.0"
|
||||
version = "0.9.0"
|
||||
edition = "2021"
|
||||
description = "Expose data as static slice"
|
||||
license = "MIT"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy-query-grammar"
|
||||
version = "0.22.0"
|
||||
version = "0.24.0"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
@@ -9,7 +9,7 @@ homepage = "https://github.com/quickwit-oss/tantivy"
|
||||
repository = "https://github.com/quickwit-oss/tantivy"
|
||||
readme = "README.md"
|
||||
keywords = ["search", "information", "retrieval"]
|
||||
edition = "2021"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
nom = "7"
|
||||
|
||||
@@ -186,19 +186,19 @@ macro_rules! tuple_trait_impl(
|
||||
);
|
||||
|
||||
macro_rules! tuple_trait_inner(
|
||||
($it:tt, $self:expr, $input:expr, (), $error_list:expr, $head:ident $($id:ident)+) => ({
|
||||
($it:tt, $self:expr_2021, $input:expr_2021, (), $error_list:expr_2021, $head:ident $($id:ident)+) => ({
|
||||
let (i, (o, mut err)) = $self.$it.parse($input.clone())?;
|
||||
$error_list.append(&mut err);
|
||||
|
||||
succ!($it, tuple_trait_inner!($self, i, ( o ), $error_list, $($id)+))
|
||||
});
|
||||
($it:tt, $self:expr, $input:expr, ($($parsed:tt)*), $error_list:expr, $head:ident $($id:ident)+) => ({
|
||||
($it:tt, $self:expr_2021, $input:expr_2021, ($($parsed:tt)*), $error_list:expr_2021, $head:ident $($id:ident)+) => ({
|
||||
let (i, (o, mut err)) = $self.$it.parse($input.clone())?;
|
||||
$error_list.append(&mut err);
|
||||
|
||||
succ!($it, tuple_trait_inner!($self, i, ($($parsed)* , o), $error_list, $($id)+))
|
||||
});
|
||||
($it:tt, $self:expr, $input:expr, ($($parsed:tt)*), $error_list:expr, $head:ident) => ({
|
||||
($it:tt, $self:expr_2021, $input:expr_2021, ($($parsed:tt)*), $error_list:expr_2021, $head:ident) => ({
|
||||
let (i, (o, mut err)) = $self.$it.parse($input.clone())?;
|
||||
$error_list.append(&mut err);
|
||||
|
||||
@@ -328,13 +328,13 @@ macro_rules! alt_trait_impl(
|
||||
);
|
||||
|
||||
macro_rules! alt_trait_inner(
|
||||
($it:tt, $self:expr, $input:expr, $head_cond:ident $head:ident, $($id_cond:ident $id:ident),+) => (
|
||||
($it:tt, $self:expr_2021, $input:expr_2021, $head_cond:ident $head:ident, $($id_cond:ident $id:ident),+) => (
|
||||
match $self.$it.0.parse($input.clone()) {
|
||||
Err(_) => succ!($it, alt_trait_inner!($self, $input, $($id_cond $id),+)),
|
||||
Ok((input_left, _)) => Some($self.$it.1.parse(input_left)),
|
||||
}
|
||||
);
|
||||
($it:tt, $self:expr, $input:expr, $head_cond:ident $head:ident) => (
|
||||
($it:tt, $self:expr_2021, $input:expr_2021, $head_cond:ident $head:ident) => (
|
||||
None
|
||||
);
|
||||
);
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::borrow::Cow;
|
||||
use std::iter::once;
|
||||
|
||||
use nom::IResult;
|
||||
use nom::branch::alt;
|
||||
use nom::bytes::complete::tag;
|
||||
use nom::character::complete::{
|
||||
@@ -10,12 +11,11 @@ use nom::combinator::{eof, map, map_res, opt, peek, recognize, value, verify};
|
||||
use nom::error::{Error, ErrorKind};
|
||||
use nom::multi::{many0, many1, separated_list0};
|
||||
use nom::sequence::{delimited, preceded, separated_pair, terminated, tuple};
|
||||
use nom::IResult;
|
||||
|
||||
use super::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
|
||||
use crate::Occur;
|
||||
use crate::infallible::*;
|
||||
use crate::user_input_ast::Delimiter;
|
||||
use crate::Occur;
|
||||
|
||||
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to
|
||||
// special characters.
|
||||
@@ -1030,7 +1030,7 @@ fn rewrite_ast(mut input: UserInputAst) -> UserInputAst {
|
||||
|
||||
fn rewrite_ast_clause(input: &mut (Option<Occur>, UserInputAst)) {
|
||||
match input {
|
||||
(None, UserInputAst::Clause(ref mut clauses)) if clauses.len() == 1 => {
|
||||
(None, UserInputAst::Clause(clauses)) if clauses.len() == 1 => {
|
||||
*input = clauses.pop().unwrap(); // safe because clauses.len() == 1
|
||||
}
|
||||
_ => {}
|
||||
@@ -1376,7 +1376,7 @@ mod test {
|
||||
|
||||
#[test]
|
||||
fn test_range_parser_lenient() {
|
||||
let literal = |query| literal_infallible(query).unwrap().1 .0.unwrap();
|
||||
let literal = |query| literal_infallible(query).unwrap().1.0.unwrap();
|
||||
|
||||
// same tests as non-lenient
|
||||
let res = literal("title: <hello");
|
||||
|
||||
@@ -51,7 +51,7 @@ impl UserInputLeaf {
|
||||
|
||||
pub(crate) fn set_default_field(&mut self, default_field: String) {
|
||||
match self {
|
||||
UserInputLeaf::Literal(ref mut literal) if literal.field_name.is_none() => {
|
||||
UserInputLeaf::Literal(literal) if literal.field_name.is_none() => {
|
||||
literal.field_name = Some(default_field)
|
||||
}
|
||||
UserInputLeaf::All => {
|
||||
@@ -59,12 +59,8 @@ impl UserInputLeaf {
|
||||
field: default_field,
|
||||
}
|
||||
}
|
||||
UserInputLeaf::Range { ref mut field, .. } if field.is_none() => {
|
||||
*field = Some(default_field)
|
||||
}
|
||||
UserInputLeaf::Set { ref mut field, .. } if field.is_none() => {
|
||||
*field = Some(default_field)
|
||||
}
|
||||
UserInputLeaf::Range { field, .. } if field.is_none() => *field = Some(default_field),
|
||||
UserInputLeaf::Set { field, .. } if field.is_none() => *field = Some(default_field),
|
||||
_ => (), // field was already set, do nothing
|
||||
}
|
||||
}
|
||||
@@ -75,11 +71,11 @@ impl Debug for UserInputLeaf {
|
||||
match self {
|
||||
UserInputLeaf::Literal(literal) => literal.fmt(formatter),
|
||||
UserInputLeaf::Range {
|
||||
ref field,
|
||||
ref lower,
|
||||
ref upper,
|
||||
field,
|
||||
lower,
|
||||
upper,
|
||||
} => {
|
||||
if let Some(ref field) = field {
|
||||
if let Some(field) = field {
|
||||
// TODO properly escape field (in case of \")
|
||||
write!(formatter, "\"{field}\":")?;
|
||||
}
|
||||
@@ -89,7 +85,7 @@ impl Debug for UserInputLeaf {
|
||||
Ok(())
|
||||
}
|
||||
UserInputLeaf::Set { field, elements } => {
|
||||
if let Some(ref field) = field {
|
||||
if let Some(field) = field {
|
||||
// TODO properly escape field (in case of \")
|
||||
write!(formatter, "\"{field}\": ")?;
|
||||
}
|
||||
@@ -267,7 +263,7 @@ impl UserInputAst {
|
||||
.iter_mut()
|
||||
.for_each(|(_, ast)| ast.set_default_field(field.clone())),
|
||||
UserInputAst::Leaf(leaf) => leaf.set_default_field(field),
|
||||
UserInputAst::Boost(ref mut ast, _) => ast.set_default_field(field),
|
||||
UserInputAst::Boost(ast, _) => ast.set_default_field(field),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,11 +2,13 @@ use std::fmt;
|
||||
use std::marker::PhantomData;
|
||||
use std::sync::Arc;
|
||||
|
||||
use columnar::ColumnValues;
|
||||
use columnar::{ColumnValues, StrColumn};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::Collector;
|
||||
use crate::collector::custom_score_top_collector::CustomScoreTopCollector;
|
||||
use crate::collector::custom_score_top_collector::{
|
||||
CustomScoreTopCollector, CustomScoreTopSegmentCollector,
|
||||
};
|
||||
use crate::collector::top_collector::{ComparableDoc, TopCollector, TopSegmentCollector};
|
||||
use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
|
||||
use crate::collector::{
|
||||
@@ -14,6 +16,7 @@ use crate::collector::{
|
||||
};
|
||||
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
|
||||
use crate::query::Weight;
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::{DocAddress, DocId, Order, Score, SegmentOrdinal, SegmentReader, TantivyError};
|
||||
|
||||
struct FastFieldConvertCollector<
|
||||
@@ -83,6 +86,163 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
struct StringConvertCollector {
|
||||
pub collector: CustomScoreTopCollector<ScorerByField, u64>,
|
||||
pub field: String,
|
||||
order: Order,
|
||||
limit: usize,
|
||||
offset: usize,
|
||||
}
|
||||
|
||||
impl Collector for StringConvertCollector {
|
||||
type Fruit = Vec<(String, DocAddress)>;
|
||||
|
||||
type Child = StringConvertSegmentCollector;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: crate::SegmentOrdinal,
|
||||
segment: &SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let schema = segment.schema();
|
||||
let field = schema.get_field(&self.field)?;
|
||||
let field_entry = schema.get_field_entry(field);
|
||||
if !field_entry.is_fast() {
|
||||
return Err(TantivyError::SchemaError(format!(
|
||||
"Field {:?} is not a fast field.",
|
||||
field_entry.name()
|
||||
)));
|
||||
}
|
||||
let requested_type = crate::schema::Type::Str;
|
||||
let schema_type = field_entry.field_type().value_type();
|
||||
if schema_type != requested_type {
|
||||
return Err(TantivyError::SchemaError(format!(
|
||||
"Field {:?} is of type {schema_type:?}!={requested_type:?}",
|
||||
field_entry.name()
|
||||
)));
|
||||
}
|
||||
let ff = segment
|
||||
.fast_fields()
|
||||
.str(&self.field)?
|
||||
.expect("ff should be a str field");
|
||||
Ok(StringConvertSegmentCollector {
|
||||
collector: self.collector.for_segment(segment_local_id, segment)?,
|
||||
ff,
|
||||
order: self.order.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.collector.requires_scoring()
|
||||
}
|
||||
|
||||
fn merge_fruits(
|
||||
&self,
|
||||
child_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
|
||||
) -> crate::Result<Self::Fruit> {
|
||||
if self.limit == 0 {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
if self.order.is_desc() {
|
||||
let mut top_collector: TopNComputer<_, _, true> =
|
||||
TopNComputer::new(self.limit + self.offset);
|
||||
for child_fruit in child_fruits {
|
||||
for (feature, doc) in child_fruit {
|
||||
top_collector.push(feature, doc);
|
||||
}
|
||||
}
|
||||
Ok(top_collector
|
||||
.into_sorted_vec()
|
||||
.into_iter()
|
||||
.skip(self.offset)
|
||||
.map(|cdoc| (cdoc.feature, cdoc.doc))
|
||||
.collect())
|
||||
} else {
|
||||
let mut top_collector: TopNComputer<_, _, false> =
|
||||
TopNComputer::new(self.limit + self.offset);
|
||||
for child_fruit in child_fruits {
|
||||
for (feature, doc) in child_fruit {
|
||||
top_collector.push(feature, doc);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(top_collector
|
||||
.into_sorted_vec()
|
||||
.into_iter()
|
||||
.skip(self.offset)
|
||||
.map(|cdoc| (cdoc.feature, cdoc.doc))
|
||||
.collect())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct StringConvertSegmentCollector {
|
||||
pub collector: CustomScoreTopSegmentCollector<ScorerByFastFieldReader, u64>,
|
||||
ff: StrColumn,
|
||||
order: Order,
|
||||
}
|
||||
|
||||
impl SegmentCollector for StringConvertSegmentCollector {
|
||||
type Fruit = Vec<(String, DocAddress)>;
|
||||
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
self.collector.collect(doc, score);
|
||||
}
|
||||
|
||||
fn harvest(self) -> Vec<(String, DocAddress)> {
|
||||
let top_ordinals: Vec<(TermOrdinal, DocAddress)> = self.collector.harvest();
|
||||
|
||||
// Collect terms.
|
||||
let mut terms: Vec<String> = Vec::with_capacity(top_ordinals.len());
|
||||
let result = if self.order.is_asc() {
|
||||
self.ff.dictionary().sorted_ords_to_term_cb(
|
||||
top_ordinals.iter().map(|(term_ord, _)| u64::MAX - term_ord),
|
||||
|term| {
|
||||
terms.push(
|
||||
std::str::from_utf8(term)
|
||||
.expect("Failed to decode term as unicode")
|
||||
.to_owned(),
|
||||
);
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
} else {
|
||||
self.ff.dictionary().sorted_ords_to_term_cb(
|
||||
top_ordinals.iter().rev().map(|(term_ord, _)| *term_ord),
|
||||
|term| {
|
||||
terms.push(
|
||||
std::str::from_utf8(term)
|
||||
.expect("Failed to decode term as unicode")
|
||||
.to_owned(),
|
||||
);
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
};
|
||||
|
||||
assert!(
|
||||
result.expect("Failed to read terms from term dictionary"),
|
||||
"Not all terms were matched in segment."
|
||||
);
|
||||
|
||||
// Zip them back with their docs.
|
||||
if self.order.is_asc() {
|
||||
terms
|
||||
.into_iter()
|
||||
.zip(top_ordinals)
|
||||
.map(|(term, (_, doc))| (term, doc))
|
||||
.collect()
|
||||
} else {
|
||||
terms
|
||||
.into_iter()
|
||||
.rev()
|
||||
.zip(top_ordinals)
|
||||
.map(|(term, (_, doc))| (term, doc))
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The `TopDocs` collector keeps track of the top `K` documents
|
||||
/// sorted by their score.
|
||||
///
|
||||
@@ -410,6 +570,30 @@ impl TopDocs {
|
||||
}
|
||||
}
|
||||
|
||||
/// Like `order_by_fast_field`, but for a `String` fast field.
|
||||
pub fn order_by_string_fast_field(
|
||||
self,
|
||||
fast_field: impl ToString,
|
||||
order: Order,
|
||||
) -> impl Collector<Fruit = Vec<(String, DocAddress)>> {
|
||||
let limit = self.0.limit;
|
||||
let offset = self.0.offset;
|
||||
let u64_collector = CustomScoreTopCollector::new(
|
||||
ScorerByField {
|
||||
field: fast_field.to_string(),
|
||||
order: order.clone(),
|
||||
},
|
||||
self.0.into_tscore(),
|
||||
);
|
||||
StringConvertCollector {
|
||||
collector: u64_collector,
|
||||
field: fast_field.to_string(),
|
||||
order,
|
||||
limit,
|
||||
offset,
|
||||
}
|
||||
}
|
||||
|
||||
/// Ranks the documents using a custom score.
|
||||
///
|
||||
/// This method offers a convenient way to tweak or replace
|
||||
@@ -1214,6 +1398,94 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_top_field_collector_string() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let city = schema_builder.add_text_field("city", TEXT | FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(
|
||||
city => "austin",
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
city => "greenville",
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
city => "tokyo",
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
|
||||
fn query(
|
||||
index: &Index,
|
||||
order: Order,
|
||||
limit: usize,
|
||||
offset: usize,
|
||||
) -> crate::Result<Vec<(String, DocAddress)>> {
|
||||
let searcher = index.reader()?.searcher();
|
||||
let top_collector = TopDocs::with_limit(limit)
|
||||
.and_offset(offset)
|
||||
.order_by_string_fast_field("city", order);
|
||||
searcher.search(&AllQuery, &top_collector)
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
&query(&index, Order::Desc, 3, 0)?,
|
||||
&[
|
||||
("tokyo".to_owned(), DocAddress::new(0, 2)),
|
||||
("greenville".to_owned(), DocAddress::new(0, 1)),
|
||||
("austin".to_owned(), DocAddress::new(0, 0)),
|
||||
]
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
&query(&index, Order::Desc, 2, 0)?,
|
||||
&[
|
||||
("tokyo".to_owned(), DocAddress::new(0, 2)),
|
||||
("greenville".to_owned(), DocAddress::new(0, 1)),
|
||||
]
|
||||
);
|
||||
|
||||
assert_eq!(&query(&index, Order::Desc, 3, 3)?, &[]);
|
||||
|
||||
assert_eq!(
|
||||
&query(&index, Order::Desc, 2, 1)?,
|
||||
&[
|
||||
("greenville".to_owned(), DocAddress::new(0, 1)),
|
||||
("austin".to_owned(), DocAddress::new(0, 0)),
|
||||
]
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
&query(&index, Order::Asc, 3, 0)?,
|
||||
&[
|
||||
("austin".to_owned(), DocAddress::new(0, 0)),
|
||||
("greenville".to_owned(), DocAddress::new(0, 1)),
|
||||
("tokyo".to_owned(), DocAddress::new(0, 2)),
|
||||
]
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
&query(&index, Order::Asc, 2, 1)?,
|
||||
&[
|
||||
("greenville".to_owned(), DocAddress::new(0, 1)),
|
||||
("tokyo".to_owned(), DocAddress::new(0, 2)),
|
||||
]
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
&query(&index, Order::Asc, 2, 0)?,
|
||||
&[
|
||||
("austin".to_owned(), DocAddress::new(0, 0)),
|
||||
("greenville".to_owned(), DocAddress::new(0, 1)),
|
||||
]
|
||||
);
|
||||
|
||||
assert_eq!(&query(&index, Order::Asc, 3, 3)?, &[]);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn test_field_does_not_exist() {
|
||||
|
||||
@@ -214,7 +214,7 @@ impl Searcher {
|
||||
/// It is powerless at making search faster if your index consists in
|
||||
/// one large segment.
|
||||
///
|
||||
/// Also, keep in my multithreading a single query on several
|
||||
/// Also, keep in mind multithreading a single query on several
|
||||
/// threads will not improve your throughput. It can actually
|
||||
/// hurt it. It will however, decrease the average response time.
|
||||
pub fn search_with_executor<C: Collector>(
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
name = "tantivy-sstable"
|
||||
version = "0.3.0"
|
||||
edition = "2021"
|
||||
version = "0.5.0"
|
||||
edition = "2024"
|
||||
license = "MIT"
|
||||
homepage = "https://github.com/quickwit-oss/tantivy"
|
||||
repository = "https://github.com/quickwit-oss/tantivy"
|
||||
@@ -10,13 +10,16 @@ categories = ["database-implementations", "data-structures", "compression"]
|
||||
description = "sstables for tantivy"
|
||||
|
||||
[dependencies]
|
||||
common = {version= "0.7", path="../common", package="tantivy-common"}
|
||||
common = {version= "0.9", path="../common", package="tantivy-common"}
|
||||
futures-util = "0.3.30"
|
||||
itertools = "0.14.0"
|
||||
tantivy-bitpacker = { version= "0.6", path="../bitpacker" }
|
||||
tantivy-bitpacker = { version= "0.8", path="../bitpacker" }
|
||||
tantivy-fst = "0.5"
|
||||
# experimental gives us access to Decompressor::upper_bound
|
||||
zstd = { version = "0.13", features = ["experimental"] }
|
||||
zstd = { version = "0.13", optional = true, features = ["experimental"] }
|
||||
|
||||
[features]
|
||||
zstd-compression = ["zstd"]
|
||||
|
||||
[dev-dependencies]
|
||||
proptest = "1"
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::file_slice::FileSlice;
|
||||
use common::OwnedBytes;
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use common::file_slice::FileSlice;
|
||||
use criterion::{Criterion, criterion_group, criterion_main};
|
||||
use tantivy_sstable::{Dictionary, MonotonicU64SSTable};
|
||||
|
||||
fn make_test_sstable(suffix: &str) -> FileSlice {
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::collections::BTreeSet;
|
||||
use std::io;
|
||||
|
||||
use common::file_slice::FileSlice;
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use criterion::{Criterion, criterion_group, criterion_main};
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
use tantivy_sstable::{Dictionary, MonotonicU64SSTable};
|
||||
|
||||
@@ -2,6 +2,7 @@ use std::io::{self, Read};
|
||||
use std::ops::Range;
|
||||
|
||||
use common::OwnedBytes;
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
use zstd::bulk::Decompressor;
|
||||
|
||||
pub struct BlockReader {
|
||||
@@ -51,18 +52,21 @@ impl BlockReader {
|
||||
let block_len = match self.reader.len() {
|
||||
0 => {
|
||||
// we are out of data for this block. Check if we have another block after
|
||||
if let Some(new_reader) = self.next_readers.next() {
|
||||
self.reader = new_reader;
|
||||
continue;
|
||||
} else {
|
||||
return Ok(false);
|
||||
match self.next_readers.next() {
|
||||
Some(new_reader) => {
|
||||
self.reader = new_reader;
|
||||
continue;
|
||||
}
|
||||
_ => {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
1..=3 => {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::UnexpectedEof,
|
||||
"failed to read block_len",
|
||||
))
|
||||
));
|
||||
}
|
||||
_ => self.reader.read_u32() as usize,
|
||||
};
|
||||
@@ -79,13 +83,23 @@ impl BlockReader {
|
||||
));
|
||||
}
|
||||
if compress == 1 {
|
||||
let required_capacity =
|
||||
Decompressor::upper_bound(&self.reader[..block_len]).unwrap_or(1024 * 1024);
|
||||
self.buffer.reserve(required_capacity);
|
||||
Decompressor::new()?
|
||||
.decompress_to_buffer(&self.reader[..block_len], &mut self.buffer)?;
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
{
|
||||
let required_capacity =
|
||||
Decompressor::upper_bound(&self.reader[..block_len]).unwrap_or(1024 * 1024);
|
||||
self.buffer.reserve(required_capacity);
|
||||
Decompressor::new()?
|
||||
.decompress_to_buffer(&self.reader[..block_len], &mut self.buffer)?;
|
||||
|
||||
self.reader.advance(block_len);
|
||||
self.reader.advance(block_len);
|
||||
}
|
||||
|
||||
if cfg!(not(feature = "zstd-compression")) {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::Unsupported,
|
||||
"zstd-compression feature is not enabled",
|
||||
));
|
||||
}
|
||||
} else {
|
||||
self.buffer.resize(block_len, 0u8);
|
||||
self.reader.read_exact(&mut self.buffer[..])?;
|
||||
|
||||
@@ -2,10 +2,11 @@ use std::io::{self, BufWriter, Write};
|
||||
use std::ops::Range;
|
||||
|
||||
use common::{CountingWriter, OwnedBytes};
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
use zstd::bulk::Compressor;
|
||||
|
||||
use super::value::ValueWriter;
|
||||
use super::{value, vint, BlockReader};
|
||||
use super::{BlockReader, value, vint};
|
||||
|
||||
const FOUR_BIT_LIMITS: usize = 1 << 4;
|
||||
const VINT_MODE: u8 = 1u8;
|
||||
@@ -53,25 +54,28 @@ where
|
||||
|
||||
let block_len = buffer.len() + self.block.len();
|
||||
|
||||
if block_len > 2048 {
|
||||
buffer.extend_from_slice(&self.block);
|
||||
self.block.clear();
|
||||
if cfg!(feature = "zstd-compression") && block_len > 2048 {
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
{
|
||||
buffer.extend_from_slice(&self.block);
|
||||
self.block.clear();
|
||||
|
||||
let max_len = zstd::zstd_safe::compress_bound(buffer.len());
|
||||
self.block.reserve(max_len);
|
||||
Compressor::new(3)?.compress_to_buffer(buffer, &mut self.block)?;
|
||||
let max_len = zstd::zstd_safe::compress_bound(buffer.len());
|
||||
self.block.reserve(max_len);
|
||||
Compressor::new(3)?.compress_to_buffer(buffer, &mut self.block)?;
|
||||
|
||||
// verify compression had a positive impact
|
||||
if self.block.len() < buffer.len() {
|
||||
self.write
|
||||
.write_all(&(self.block.len() as u32 + 1).to_le_bytes())?;
|
||||
self.write.write_all(&[1])?;
|
||||
self.write.write_all(&self.block[..])?;
|
||||
} else {
|
||||
self.write
|
||||
.write_all(&(block_len as u32 + 1).to_le_bytes())?;
|
||||
self.write.write_all(&[0])?;
|
||||
self.write.write_all(&buffer[..])?;
|
||||
// verify compression had a positive impact
|
||||
if self.block.len() < buffer.len() {
|
||||
self.write
|
||||
.write_all(&(self.block.len() as u32 + 1).to_le_bytes())?;
|
||||
self.write.write_all(&[1])?;
|
||||
self.write.write_all(&self.block[..])?;
|
||||
} else {
|
||||
self.write
|
||||
.write_all(&(block_len as u32 + 1).to_le_bytes())?;
|
||||
self.write.write_all(&[0])?;
|
||||
self.write.write_all(&buffer[..])?;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
self.write
|
||||
|
||||
@@ -6,13 +6,13 @@ use std::marker::PhantomData;
|
||||
use std::ops::{Bound, RangeBounds};
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::bounds::{transform_bound_inner_res, TransformBound};
|
||||
use common::bounds::{TransformBound, transform_bound_inner_res};
|
||||
use common::file_slice::FileSlice;
|
||||
use common::{BinarySerializable, OwnedBytes};
|
||||
use futures_util::{stream, StreamExt, TryStreamExt};
|
||||
use futures_util::{StreamExt, TryStreamExt, stream};
|
||||
use itertools::Itertools;
|
||||
use tantivy_fst::automaton::AlwaysMatch;
|
||||
use tantivy_fst::Automaton;
|
||||
use tantivy_fst::automaton::AlwaysMatch;
|
||||
|
||||
use crate::sstable_index_v3::SSTableIndexV3Empty;
|
||||
use crate::streamer::{Streamer, StreamerBuilder};
|
||||
@@ -311,7 +311,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
format!("Unsupported sstable version, expected one of [2, 3], found {version}"),
|
||||
))
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -487,7 +487,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
|
||||
/// the buffer may be modified.
|
||||
pub fn ord_to_term(&self, ord: TermOrdinal, bytes: &mut Vec<u8>) -> io::Result<bool> {
|
||||
// find block in which the term would be
|
||||
let block_addr = self.sstable_index.get_block_with_ord(ord);
|
||||
let block_addr = self.sstable_index.get_block_with_ord::<false>(ord).0;
|
||||
let first_ordinal = block_addr.first_ordinal;
|
||||
|
||||
// then search inside that block only
|
||||
@@ -511,16 +511,17 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
|
||||
mut cb: F,
|
||||
) -> io::Result<bool> {
|
||||
let mut bytes = Vec::new();
|
||||
let mut current_block_addr = self.sstable_index.get_block_with_ord(0);
|
||||
let (mut current_block_addr, mut next_block_ord) =
|
||||
self.sstable_index.get_block_with_ord::<true>(0);
|
||||
let mut current_sstable_delta_reader =
|
||||
self.sstable_delta_reader_block(current_block_addr.clone())?;
|
||||
let mut current_ordinal = 0;
|
||||
for ord in ord {
|
||||
assert!(ord >= current_ordinal);
|
||||
// check if block changed for new term_ord
|
||||
let new_block_addr = self.sstable_index.get_block_with_ord(ord);
|
||||
if new_block_addr != current_block_addr {
|
||||
current_block_addr = new_block_addr;
|
||||
if ord >= next_block_ord {
|
||||
(current_block_addr, next_block_ord) =
|
||||
self.sstable_index.get_block_with_ord::<true>(ord);
|
||||
current_ordinal = current_block_addr.first_ordinal;
|
||||
current_sstable_delta_reader =
|
||||
self.sstable_delta_reader_block(current_block_addr.clone())?;
|
||||
@@ -544,7 +545,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
|
||||
/// Returns the number of terms in the dictionary.
|
||||
pub fn term_info_from_ord(&self, term_ord: TermOrdinal) -> io::Result<Option<TSSTable::Value>> {
|
||||
// find block in which the term would be
|
||||
let block_addr = self.sstable_index.get_block_with_ord(term_ord);
|
||||
let block_addr = self.sstable_index.get_block_with_ord::<false>(term_ord).0;
|
||||
let first_ordinal = block_addr.first_ordinal;
|
||||
|
||||
// then search inside that block only
|
||||
@@ -644,8 +645,8 @@ mod tests {
|
||||
use common::OwnedBytes;
|
||||
|
||||
use super::Dictionary;
|
||||
use crate::dictionary::TermOrdHit;
|
||||
use crate::MonotonicU64SSTable;
|
||||
use crate::dictionary::TermOrdHit;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PermissionedHandle {
|
||||
@@ -846,7 +847,7 @@ mod tests {
|
||||
fn test_ord_term_conversion() {
|
||||
let (dic, slice) = make_test_sstable();
|
||||
|
||||
let block = dic.sstable_index.get_block_with_ord(100_000);
|
||||
let block = dic.sstable_index.get_block_with_ord::<false>(100_000).0;
|
||||
slice.restrict(block.byte_range);
|
||||
|
||||
let mut res = Vec::new();
|
||||
@@ -872,7 +873,11 @@ mod tests {
|
||||
|
||||
// end of a block
|
||||
let ordinal = block.first_ordinal - 1;
|
||||
let new_range = dic.sstable_index.get_block_with_ord(ordinal).byte_range;
|
||||
let new_range = dic
|
||||
.sstable_index
|
||||
.get_block_with_ord::<false>(ordinal)
|
||||
.0
|
||||
.byte_range;
|
||||
slice.restrict(new_range);
|
||||
assert!(dic.ord_to_term(ordinal, &mut res).unwrap());
|
||||
assert_eq!(res, format!("{ordinal:05X}").into_bytes());
|
||||
@@ -882,7 +887,7 @@ mod tests {
|
||||
|
||||
// before first block
|
||||
// 1st block must be loaded for key-related operations
|
||||
let block = dic.sstable_index.get_block_with_ord(0);
|
||||
let block = dic.sstable_index.get_block_with_ord::<false>(0).0;
|
||||
slice.restrict(block.byte_range);
|
||||
|
||||
assert!(dic.get(b"$$$").unwrap().is_none());
|
||||
@@ -891,7 +896,11 @@ mod tests {
|
||||
// after last block
|
||||
// last block must be loaded for ord related operations
|
||||
let ordinal = 0x40000 + 10;
|
||||
let new_range = dic.sstable_index.get_block_with_ord(ordinal).byte_range;
|
||||
let new_range = dic
|
||||
.sstable_index
|
||||
.get_block_with_ord::<false>(ordinal)
|
||||
.0
|
||||
.byte_range;
|
||||
slice.restrict(new_range);
|
||||
assert!(!dic.ord_to_term(ordinal, &mut res).unwrap());
|
||||
assert!(dic.term_info_from_ord(ordinal).unwrap().is_none());
|
||||
@@ -914,30 +923,33 @@ mod tests {
|
||||
|
||||
// Single term
|
||||
let mut terms = Vec::new();
|
||||
assert!(dic
|
||||
.sorted_ords_to_term_cb(100_000..100_001, |term| {
|
||||
assert!(
|
||||
dic.sorted_ords_to_term_cb(100_000..100_001, |term| {
|
||||
terms.push(term.to_vec());
|
||||
Ok(())
|
||||
})
|
||||
.unwrap());
|
||||
.unwrap()
|
||||
);
|
||||
assert_eq!(terms, vec![format!("{:05X}", 100_000).into_bytes(),]);
|
||||
// Single term
|
||||
let mut terms = Vec::new();
|
||||
assert!(dic
|
||||
.sorted_ords_to_term_cb(100_001..100_002, |term| {
|
||||
assert!(
|
||||
dic.sorted_ords_to_term_cb(100_001..100_002, |term| {
|
||||
terms.push(term.to_vec());
|
||||
Ok(())
|
||||
})
|
||||
.unwrap());
|
||||
.unwrap()
|
||||
);
|
||||
assert_eq!(terms, vec![format!("{:05X}", 100_001).into_bytes(),]);
|
||||
// both terms
|
||||
let mut terms = Vec::new();
|
||||
assert!(dic
|
||||
.sorted_ords_to_term_cb(100_000..100_002, |term| {
|
||||
assert!(
|
||||
dic.sorted_ords_to_term_cb(100_000..100_002, |term| {
|
||||
terms.push(term.to_vec());
|
||||
Ok(())
|
||||
})
|
||||
.unwrap());
|
||||
.unwrap()
|
||||
);
|
||||
assert_eq!(
|
||||
terms,
|
||||
vec![
|
||||
@@ -947,12 +959,13 @@ mod tests {
|
||||
);
|
||||
// Test cross block
|
||||
let mut terms = Vec::new();
|
||||
assert!(dic
|
||||
.sorted_ords_to_term_cb(98653..=98655, |term| {
|
||||
assert!(
|
||||
dic.sorted_ords_to_term_cb(98653..=98655, |term| {
|
||||
terms.push(term.to_vec());
|
||||
Ok(())
|
||||
})
|
||||
.unwrap());
|
||||
.unwrap()
|
||||
);
|
||||
assert_eq!(
|
||||
terms,
|
||||
vec![
|
||||
|
||||
@@ -1,3 +1,40 @@
|
||||
//! `tantivy_sstable` is a crate that provides a sorted string table data structure.
|
||||
//!
|
||||
//! It is used in `tantivy` to store the term dictionary.
|
||||
//!
|
||||
//! A `sstable` is a map of sorted `&[u8]` keys to values.
|
||||
//! The keys are encoded using incremental encoding.
|
||||
//!
|
||||
//! Values and keys are compressed using zstd with the default feature flag `zstd-compression`.
|
||||
//!
|
||||
//! # Example
|
||||
//!
|
||||
//! Here is an example of how to create and search an `sstable`:
|
||||
//!
|
||||
//! ```rust
|
||||
//! use common::OwnedBytes;
|
||||
//! use tantivy_sstable::{Dictionary, MonotonicU64SSTable};
|
||||
//!
|
||||
//! // Create a new sstable in memory.
|
||||
//! let mut builder = Dictionary::<MonotonicU64SSTable>::builder(Vec::new()).unwrap();
|
||||
//! builder.insert(b"apple", &1).unwrap();
|
||||
//! builder.insert(b"banana", &2).unwrap();
|
||||
//! builder.insert(b"orange", &3).unwrap();
|
||||
//! let sstable_bytes = builder.finish().unwrap();
|
||||
//!
|
||||
//! // Open the sstable.
|
||||
//! let sstable =
|
||||
//! Dictionary::<MonotonicU64SSTable>::from_bytes(OwnedBytes::new(sstable_bytes)).unwrap();
|
||||
//!
|
||||
//! // Search for a key.
|
||||
//! let value = sstable.get(b"banana").unwrap();
|
||||
//! assert_eq!(value, Some(2));
|
||||
//!
|
||||
//! // Search for a non-existent key.
|
||||
//! let value = sstable.get(b"grape").unwrap();
|
||||
//! assert_eq!(value, None);
|
||||
//! ```
|
||||
|
||||
use std::io::{self, Write};
|
||||
use std::ops::Range;
|
||||
|
||||
@@ -19,6 +56,7 @@ pub use streamer::{Streamer, StreamerBuilder};
|
||||
|
||||
mod block_reader;
|
||||
use common::{BinarySerializable, OwnedBytes};
|
||||
use value::{VecU32ValueReader, VecU32ValueWriter};
|
||||
|
||||
pub use self::block_reader::BlockReader;
|
||||
pub use self::delta::{DeltaReader, DeltaWriter};
|
||||
@@ -130,6 +168,15 @@ impl SSTable for RangeSSTable {
|
||||
type ValueWriter = RangeValueWriter;
|
||||
}
|
||||
|
||||
/// SSTable associating keys to Vec<u32>.
|
||||
pub struct VecU32ValueSSTable;
|
||||
|
||||
impl SSTable for VecU32ValueSSTable {
|
||||
type Value = Vec<u32>;
|
||||
type ValueReader = VecU32ValueReader;
|
||||
type ValueWriter = VecU32ValueWriter;
|
||||
}
|
||||
|
||||
/// SSTable reader.
|
||||
pub struct Reader<TValueReader> {
|
||||
key: Vec<u8>,
|
||||
@@ -322,7 +369,7 @@ mod test {
|
||||
|
||||
use common::OwnedBytes;
|
||||
|
||||
use super::{common_prefix_len, MonotonicU64SSTable, SSTable, VoidMerge, VoidSSTable};
|
||||
use super::{MonotonicU64SSTable, SSTable, VoidMerge, VoidSSTable, common_prefix_len};
|
||||
|
||||
fn aux_test_common_prefix_len(left: &str, right: &str, expect_len: usize) {
|
||||
assert_eq!(
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::binary_heap::PeekMut;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::collections::binary_heap::PeekMut;
|
||||
use std::io;
|
||||
|
||||
use super::{SingleValueMerger, ValueMerger};
|
||||
@@ -41,14 +41,17 @@ pub fn merge_sstable<SST: SSTable, W: io::Write, M: ValueMerger<SST::Value>>(
|
||||
loop {
|
||||
let len = heap.len();
|
||||
let mut value_merger;
|
||||
if let Some(mut head) = heap.peek_mut() {
|
||||
writer.insert_key(head.0.key()).unwrap();
|
||||
value_merger = merger.new_value(head.0.value());
|
||||
if !head.0.advance()? {
|
||||
PeekMut::pop(head);
|
||||
match heap.peek_mut() {
|
||||
Some(mut head) => {
|
||||
writer.insert_key(head.0.key()).unwrap();
|
||||
value_merger = merger.new_value(head.0.value());
|
||||
if !head.0.advance()? {
|
||||
PeekMut::pop(head);
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
for _ in 0..len - 1 {
|
||||
if let Some(mut head) = heap.peek_mut() {
|
||||
|
||||
@@ -72,9 +72,15 @@ impl SSTableIndex {
|
||||
}
|
||||
|
||||
/// Get the [`BlockAddr`] of the block containing the `ord`-th term.
|
||||
pub(crate) fn get_block_with_ord(&self, ord: TermOrdinal) -> BlockAddr {
|
||||
pub(crate) fn get_block_with_ord(&self, ord: TermOrdinal) -> (BlockAddr, u64) {
|
||||
// locate_with_ord always returns an index within range
|
||||
self.get_block(self.locate_with_ord(ord)).unwrap()
|
||||
let block_pos = self.locate_with_ord(ord);
|
||||
(
|
||||
self.get_block(block_pos).unwrap(),
|
||||
self.get_block(block_pos + 1)
|
||||
.map(|b| b.first_ordinal)
|
||||
.unwrap_or(u64::MAX),
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn get_block_for_automaton<'a>(
|
||||
|
||||
@@ -3,12 +3,12 @@ use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::{BinarySerializable, FixedSize, OwnedBytes};
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker};
|
||||
use tantivy_bitpacker::{BitPacker, compute_num_bits};
|
||||
use tantivy_fst::raw::Fst;
|
||||
use tantivy_fst::{Automaton, IntoStreamer, Map, MapBuilder, Streamer};
|
||||
|
||||
use crate::block_match_automaton::can_block_match_automaton;
|
||||
use crate::{common_prefix_len, SSTableDataCorruption, TermOrdinal};
|
||||
use crate::{SSTableDataCorruption, TermOrdinal, common_prefix_len};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum SSTableIndex {
|
||||
@@ -58,10 +58,13 @@ impl SSTableIndex {
|
||||
}
|
||||
|
||||
/// Get the [`BlockAddr`] of the block containing the `ord`-th term.
|
||||
pub(crate) fn get_block_with_ord(&self, ord: TermOrdinal) -> BlockAddr {
|
||||
pub(crate) fn get_block_with_ord<const FETCH_NEXT_ORD: bool>(
|
||||
&self,
|
||||
ord: TermOrdinal,
|
||||
) -> (BlockAddr, u64) {
|
||||
match self {
|
||||
SSTableIndex::V2(v2_index) => v2_index.get_block_with_ord(ord),
|
||||
SSTableIndex::V3(v3_index) => v3_index.get_block_with_ord(ord),
|
||||
SSTableIndex::V3(v3_index) => v3_index.get_block_with_ord::<FETCH_NEXT_ORD>(ord),
|
||||
SSTableIndex::V3Empty(v3_empty) => v3_empty.get_block_with_ord(ord),
|
||||
}
|
||||
}
|
||||
@@ -152,12 +155,18 @@ impl SSTableIndexV3 {
|
||||
}
|
||||
|
||||
pub(crate) fn locate_with_ord(&self, ord: TermOrdinal) -> u64 {
|
||||
self.block_addr_store.binary_search_ord(ord).0
|
||||
self.block_addr_store.binary_search_ord::<false>(ord).0
|
||||
}
|
||||
|
||||
/// Get the [`BlockAddr`] of the block containing the `ord`-th term.
|
||||
pub(crate) fn get_block_with_ord(&self, ord: TermOrdinal) -> BlockAddr {
|
||||
self.block_addr_store.binary_search_ord(ord).1
|
||||
pub(crate) fn get_block_with_ord<const FETCH_NEXT_ORD: bool>(
|
||||
&self,
|
||||
ord: TermOrdinal,
|
||||
) -> (BlockAddr, u64) {
|
||||
let (_block_id, block_addr, next_ord) = self
|
||||
.block_addr_store
|
||||
.binary_search_ord::<FETCH_NEXT_ORD>(ord);
|
||||
(block_addr, next_ord)
|
||||
}
|
||||
|
||||
pub(crate) fn get_block_for_automaton<'a>(
|
||||
@@ -253,8 +262,8 @@ impl SSTableIndexV3Empty {
|
||||
}
|
||||
|
||||
/// Get the [`BlockAddr`] of the block containing the `ord`-th term.
|
||||
pub(crate) fn get_block_with_ord(&self, _ord: TermOrdinal) -> BlockAddr {
|
||||
self.block_addr.clone()
|
||||
pub(crate) fn get_block_with_ord(&self, _ord: TermOrdinal) -> (BlockAddr, u64) {
|
||||
(self.block_addr.clone(), u64::MAX)
|
||||
}
|
||||
}
|
||||
#[derive(Clone, Eq, PartialEq, Debug)]
|
||||
@@ -461,7 +470,11 @@ impl BlockAddrBlockMetadata {
|
||||
})
|
||||
}
|
||||
|
||||
fn bisect_for_ord(&self, data: &[u8], target_ord: TermOrdinal) -> (u64, BlockAddr) {
|
||||
fn bisect_for_ord<const FETCH_NEXT_ORD: bool>(
|
||||
&self,
|
||||
data: &[u8],
|
||||
target_ord: TermOrdinal,
|
||||
) -> (u64, BlockAddr, u64) {
|
||||
let inner_target_ord = target_ord - self.ref_block_addr.first_ordinal;
|
||||
let num_bits = self.num_bits() as usize;
|
||||
let range_start_nbits = self.range_start_nbits as usize;
|
||||
@@ -481,11 +494,17 @@ impl BlockAddrBlockMetadata {
|
||||
Err(inner_offset) => inner_offset,
|
||||
};
|
||||
// we can unwrap because inner_offset <= self.block_len
|
||||
(
|
||||
inner_offset,
|
||||
self.deserialize_block_addr(data, inner_offset as usize)
|
||||
.unwrap(),
|
||||
)
|
||||
let block = self
|
||||
.deserialize_block_addr(data, inner_offset as usize)
|
||||
.unwrap();
|
||||
let next_ord = if FETCH_NEXT_ORD {
|
||||
self.deserialize_block_addr(data, inner_offset as usize + 1)
|
||||
.map(|b| b.first_ordinal)
|
||||
.unwrap_or(u64::MAX)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
(inner_offset, block, next_ord)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -591,7 +610,10 @@ impl BlockAddrStore {
|
||||
)
|
||||
}
|
||||
|
||||
fn binary_search_ord(&self, ord: TermOrdinal) -> (u64, BlockAddr) {
|
||||
fn binary_search_ord<const FETCH_NEXT_ORD: bool>(
|
||||
&self,
|
||||
ord: TermOrdinal,
|
||||
) -> (u64, BlockAddr, u64) {
|
||||
let max_block =
|
||||
(self.block_meta_bytes.len() / BlockAddrBlockMetadata::SIZE_IN_BYTES) as u64;
|
||||
let get_first_ordinal = |block_id| {
|
||||
@@ -606,20 +628,29 @@ impl BlockAddrStore {
|
||||
Ok(store_block_id) => {
|
||||
let block_id = store_block_id * STORE_BLOCK_LEN as u64;
|
||||
// we can unwrap because store_block_id < max_block
|
||||
return (block_id, self.get(block_id).unwrap());
|
||||
let next_ord = if FETCH_NEXT_ORD {
|
||||
self.get(block_id + 1)
|
||||
.map(|b| b.first_ordinal)
|
||||
.unwrap_or(u64::MAX)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
return (block_id, self.get(block_id).unwrap(), next_ord);
|
||||
}
|
||||
Err(store_block_id) => store_block_id - 1,
|
||||
};
|
||||
|
||||
// we can unwrap because store_block_id < max_block
|
||||
let block_addr_block_data = self.get_block_meta(store_block_id as usize).unwrap();
|
||||
let (inner_offset, block_addr) = block_addr_block_data.bisect_for_ord(
|
||||
&self.addr_bytes[block_addr_block_data.offset as usize..],
|
||||
ord,
|
||||
);
|
||||
let (inner_offset, block_addr, next_block_ord) = block_addr_block_data
|
||||
.bisect_for_ord::<FETCH_NEXT_ORD>(
|
||||
&self.addr_bytes[block_addr_block_data.offset as usize..],
|
||||
ord,
|
||||
);
|
||||
(
|
||||
store_block_id * STORE_BLOCK_LEN as u64 + inner_offset,
|
||||
block_addr,
|
||||
next_block_ord,
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -824,8 +855,8 @@ mod tests {
|
||||
use common::OwnedBytes;
|
||||
|
||||
use super::*;
|
||||
use crate::block_match_automaton::tests::EqBuffer;
|
||||
use crate::SSTableDataCorruption;
|
||||
use crate::block_match_automaton::tests::EqBuffer;
|
||||
|
||||
#[test]
|
||||
fn test_sstable_index() {
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use std::io;
|
||||
use std::ops::Bound;
|
||||
|
||||
use tantivy_fst::automaton::AlwaysMatch;
|
||||
use tantivy_fst::Automaton;
|
||||
use tantivy_fst::automaton::AlwaysMatch;
|
||||
|
||||
use crate::dictionary::Dictionary;
|
||||
use crate::{DeltaReader, SSTable, TermOrdinal};
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::io;
|
||||
|
||||
use crate::value::{deserialize_vint_u64, ValueReader, ValueWriter};
|
||||
use crate::{vint, BlockAddr};
|
||||
use crate::value::{ValueReader, ValueWriter, deserialize_vint_u64};
|
||||
use crate::{BlockAddr, vint};
|
||||
|
||||
#[derive(Default)]
|
||||
pub(crate) struct IndexValueReader {
|
||||
|
||||
@@ -1,10 +1,16 @@
|
||||
pub(crate) mod index;
|
||||
mod range;
|
||||
mod u64_monotonic;
|
||||
mod vec_u32;
|
||||
mod void;
|
||||
|
||||
use std::io;
|
||||
|
||||
pub use range::{RangeValueReader, RangeValueWriter};
|
||||
pub use u64_monotonic::{U64MonotonicValueReader, U64MonotonicValueWriter};
|
||||
pub use vec_u32::{VecU32ValueReader, VecU32ValueWriter};
|
||||
pub use void::{VoidValueReader, VoidValueWriter};
|
||||
|
||||
/// `ValueReader` is a trait describing the contract of something
|
||||
/// reading blocks of value, and offering random access within this values.
|
||||
pub trait ValueReader: Default {
|
||||
@@ -40,10 +46,6 @@ pub trait ValueWriter: Default {
|
||||
fn clear(&mut self);
|
||||
}
|
||||
|
||||
pub use range::{RangeValueReader, RangeValueWriter};
|
||||
pub use u64_monotonic::{U64MonotonicValueReader, U64MonotonicValueWriter};
|
||||
pub use void::{VoidValueReader, VoidValueWriter};
|
||||
|
||||
fn deserialize_vint_u64(data: &mut &[u8]) -> u64 {
|
||||
let (num_bytes, val) = super::vint::deserialize_read(data);
|
||||
*data = &data[num_bytes..];
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::io;
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::value::{deserialize_vint_u64, ValueReader, ValueWriter};
|
||||
use crate::value::{ValueReader, ValueWriter, deserialize_vint_u64};
|
||||
|
||||
/// See module comment.
|
||||
#[derive(Default)]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::io;
|
||||
|
||||
use crate::value::{deserialize_vint_u64, ValueReader, ValueWriter};
|
||||
use crate::value::{ValueReader, ValueWriter, deserialize_vint_u64};
|
||||
use crate::vint;
|
||||
|
||||
#[derive(Default)]
|
||||
|
||||
73
sstable/src/value/vec_u32.rs
Normal file
73
sstable/src/value/vec_u32.rs
Normal file
@@ -0,0 +1,73 @@
|
||||
use std::io;
|
||||
|
||||
use super::{ValueReader, ValueWriter};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct VecU32ValueReader {
|
||||
vals: Vec<Vec<u32>>,
|
||||
}
|
||||
|
||||
impl ValueReader for VecU32ValueReader {
|
||||
type Value = Vec<u32>;
|
||||
|
||||
#[inline(always)]
|
||||
fn value(&self, idx: usize) -> &Self::Value {
|
||||
&self.vals[idx]
|
||||
}
|
||||
|
||||
fn load(&mut self, mut data: &[u8]) -> io::Result<usize> {
|
||||
let original_num_bytes = data.len();
|
||||
self.vals.clear();
|
||||
|
||||
// The first 4 bytes are the number of blocks
|
||||
let num_blocks = u32::from_le_bytes(data[..4].try_into().unwrap()) as usize;
|
||||
data = &data[4..];
|
||||
|
||||
for _ in 0..num_blocks {
|
||||
// Each block starts with a 4-byte length
|
||||
let segment_len = u32::from_le_bytes(data[..4].try_into().unwrap()) as usize;
|
||||
data = &data[4..];
|
||||
|
||||
// Read the segment IDs for this block
|
||||
let mut segment_ids = Vec::with_capacity(segment_len);
|
||||
for _ in 0..segment_len {
|
||||
let segment_id = u32::from_le_bytes(data[..4].try_into().unwrap());
|
||||
segment_ids.push(segment_id);
|
||||
data = &data[4..];
|
||||
}
|
||||
self.vals.push(segment_ids);
|
||||
}
|
||||
|
||||
// Return the number of bytes consumed
|
||||
Ok(original_num_bytes - data.len())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct VecU32ValueWriter {
|
||||
vals: Vec<Vec<u32>>,
|
||||
}
|
||||
|
||||
impl ValueWriter for VecU32ValueWriter {
|
||||
type Value = Vec<u32>;
|
||||
|
||||
fn write(&mut self, val: &Self::Value) {
|
||||
self.vals.push(val.to_vec());
|
||||
}
|
||||
|
||||
fn serialize_block(&self, output: &mut Vec<u8>) {
|
||||
let num_blocks = self.vals.len() as u32;
|
||||
output.extend_from_slice(&num_blocks.to_le_bytes());
|
||||
for vals in &self.vals {
|
||||
let len = vals.len() as u32;
|
||||
output.extend_from_slice(&len.to_le_bytes());
|
||||
for &segment_id in vals.iter() {
|
||||
output.extend_from_slice(&segment_id.to_le_bytes());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn clear(&mut self) {
|
||||
self.vals.clear();
|
||||
}
|
||||
}
|
||||
49
sstable/tests/sstable_test.rs
Normal file
49
sstable/tests/sstable_test.rs
Normal file
@@ -0,0 +1,49 @@
|
||||
use common::OwnedBytes;
|
||||
use tantivy_sstable::{Dictionary, MonotonicU64SSTable, VecU32ValueSSTable};
|
||||
|
||||
#[test]
|
||||
fn test_create_and_search_sstable() {
|
||||
// Create a new sstable in memory.
|
||||
let mut builder = Dictionary::<MonotonicU64SSTable>::builder(Vec::new()).unwrap();
|
||||
builder.insert(b"apple", &1).unwrap();
|
||||
builder.insert(b"banana", &2).unwrap();
|
||||
builder.insert(b"orange", &3).unwrap();
|
||||
let sstable_bytes = builder.finish().unwrap();
|
||||
|
||||
// Open the sstable.
|
||||
let sstable =
|
||||
Dictionary::<MonotonicU64SSTable>::from_bytes(OwnedBytes::new(sstable_bytes)).unwrap();
|
||||
|
||||
// Search for a key.
|
||||
let value = sstable.get(b"banana").unwrap();
|
||||
assert_eq!(value, Some(2));
|
||||
|
||||
// Search for a non-existent key.
|
||||
let value = sstable.get(b"blub").unwrap();
|
||||
assert_eq!(value, None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_custom_value_sstable() {
|
||||
// Create a new sstable with custom values.
|
||||
let mut builder = Dictionary::<VecU32ValueSSTable>::builder(Vec::new()).unwrap();
|
||||
builder.set_block_len(4096); // Ensure both values are in the same block
|
||||
builder.insert(b"first", &vec![1, 2, 3]).unwrap();
|
||||
builder.insert(b"second", &vec![4, 5]).unwrap();
|
||||
let sstable_bytes = builder.finish().unwrap();
|
||||
|
||||
// Open the sstable.
|
||||
let sstable =
|
||||
Dictionary::<VecU32ValueSSTable>::from_bytes(OwnedBytes::new(sstable_bytes)).unwrap();
|
||||
|
||||
let mut stream = sstable.stream().unwrap();
|
||||
assert!(stream.advance());
|
||||
assert_eq!(stream.key(), b"first");
|
||||
assert_eq!(stream.value(), &vec![1, 2, 3]);
|
||||
|
||||
assert!(stream.advance());
|
||||
assert_eq!(stream.key(), b"second");
|
||||
assert_eq!(stream.value(), &vec![4, 5]);
|
||||
|
||||
assert!(!stream.advance());
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
name = "tantivy-stacker"
|
||||
version = "0.3.0"
|
||||
edition = "2021"
|
||||
version = "0.5.0"
|
||||
edition = "2024"
|
||||
license = "MIT"
|
||||
homepage = "https://github.com/quickwit-oss/tantivy"
|
||||
repository = "https://github.com/quickwit-oss/tantivy"
|
||||
@@ -9,7 +9,7 @@ description = "term hashmap used for indexing"
|
||||
|
||||
[dependencies]
|
||||
murmurhash32 = "0.3"
|
||||
common = { version = "0.7", path = "../common/", package = "tantivy-common" }
|
||||
common = { version = "0.9", path = "../common/", package = "tantivy-common" }
|
||||
ahash = { version = "0.8.11", default-features = false, optional = true }
|
||||
rand_distr = "0.4.3"
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use binggan::plugins::PeakMemAllocPlugin;
|
||||
use binggan::{black_box, BenchRunner, PeakMemAlloc, INSTRUMENTED_SYSTEM};
|
||||
use binggan::{BenchRunner, INSTRUMENTED_SYSTEM, PeakMemAlloc, black_box};
|
||||
use rand::SeedableRng;
|
||||
use rustc_hash::FxHashMap;
|
||||
use tantivy_stacker::{ArenaHashMap, ExpUnrolledLinkedList, MemoryArena};
|
||||
|
||||
@@ -13,7 +13,7 @@ mod shared_arena_hashmap;
|
||||
pub use self::arena_hashmap::ArenaHashMap;
|
||||
pub use self::expull::ExpUnrolledLinkedList;
|
||||
pub use self::memory_arena::{Addr, MemoryArena};
|
||||
pub use self::shared_arena_hashmap::{compute_table_memory_size, SharedArenaHashMap};
|
||||
pub use self::shared_arena_hashmap::{SharedArenaHashMap, compute_table_memory_size};
|
||||
|
||||
/// When adding an element in a `ArenaHashMap`, we get a unique id associated to the given key.
|
||||
pub type UnorderedId = u32;
|
||||
|
||||
@@ -356,7 +356,7 @@ mod tests {
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use super::{compute_previous_power_of_two, SharedArenaHashMap};
|
||||
use super::{SharedArenaHashMap, compute_previous_power_of_two};
|
||||
use crate::MemoryArena;
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy-tokenizer-api"
|
||||
version = "0.3.0"
|
||||
version = "0.5.0"
|
||||
license = "MIT"
|
||||
edition = "2021"
|
||||
description = "Tokenizer API of tantivy"
|
||||
|
||||
Reference in New Issue
Block a user