Compare commits

...

6 Commits
0.24 ... 0.19.2

Author SHA1 Message Date
Pascal Seitz
6761237ec7 chore: Release 0.19.2 2023-02-10 12:20:20 +08:00
Pascal Seitz
3da08e92c7 fix: doc store for files larger 4GB
Fixes an issue in the skip list deserialization, which incorrectly deserialized the byte start offset as u32.
`get_doc` will fail for any docs that live in a block with a start offset larger than u32::MAX (~4GB).
Causes index corruption if a segment with a doc store larger than 4GB is merged.

tantivy version 0.19 is affected
2023-02-10 12:12:47 +08:00
Pascal Seitz
6c4b8d97ed chore: Release 2023-01-13 13:46:28 +08:00
Pascal Seitz
dc5f503c9a use fastfield_codecs 0.3.1 2023-01-13 13:34:42 +08:00
Pascal Seitz
4ffcf3a933 chore: Release 2023-01-13 13:31:20 +08:00
Pascal Seitz
079f542f97 handle user input on get_docid_for_value_range 2023-01-13 12:24:34 +08:00
11 changed files with 253 additions and 37 deletions

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.19.0"
version = "0.19.2"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -60,7 +60,7 @@ arc-swap = "1.5.0"
tantivy-query-grammar = { version= "0.19.0", path="./query-grammar" }
tantivy-bitpacker = { version= "0.3", path="./bitpacker" }
common = { version= "0.4", path = "./common/", package = "tantivy-common" }
fastfield_codecs = { version= "0.3", path="./fastfield_codecs", default-features = false }
fastfield_codecs = { version= "0.3.1", path="./fastfield_codecs", default-features = false }
ownedbytes = { version= "0.4", path="./ownedbytes" }
[target.'cfg(windows)'.dependencies]

View File

@@ -1,6 +1,6 @@
[package]
name = "fastfield_codecs"
version = "0.3.0"
version = "0.3.1"
authors = ["Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2021"

View File

@@ -1,3 +1,4 @@
use std::fmt::{self, Debug};
use std::marker::PhantomData;
use std::ops::{Range, RangeInclusive};
@@ -6,7 +7,7 @@ use tantivy_bitpacker::minmax;
use crate::monotonic_mapping::StrictlyMonotonicFn;
/// `Column` provides columnar access on a field.
pub trait Column<T: PartialOrd = u64>: Send + Sync {
pub trait Column<T: PartialOrd + Debug = u64>: Send + Sync {
/// Return the value associated with the given idx.
///
/// This accessor should return as fast as possible.
@@ -83,7 +84,7 @@ pub struct VecColumn<'a, T = u64> {
max_value: T,
}
impl<'a, C: Column<T>, T: Copy + PartialOrd> Column<T> for &'a C {
impl<'a, C: Column<T>, T: Copy + PartialOrd + fmt::Debug> Column<T> for &'a C {
fn get_val(&self, idx: u32) -> T {
(*self).get_val(idx)
}
@@ -109,7 +110,7 @@ impl<'a, C: Column<T>, T: Copy + PartialOrd> Column<T> for &'a C {
}
}
impl<'a, T: Copy + PartialOrd + Send + Sync> Column<T> for VecColumn<'a, T> {
impl<'a, T: Copy + PartialOrd + Send + Sync + Debug> Column<T> for VecColumn<'a, T> {
fn get_val(&self, position: u32) -> T {
    // Plain indexed read into the backing values; panics if `position` is out of bounds.
    let index = position as usize;
    self.values[index]
}
@@ -177,8 +178,8 @@ pub fn monotonic_map_column<C, T, Input, Output>(
where
C: Column<Input>,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Send + Sync + Clone,
Output: PartialOrd + Send + Sync + Clone,
Input: PartialOrd + Send + Sync + Copy + Debug,
Output: PartialOrd + Send + Sync + Copy + Debug,
{
MonotonicMappingColumn {
from_column,
@@ -191,8 +192,8 @@ impl<C, T, Input, Output> Column<Output> for MonotonicMappingColumn<C, T, Input>
where
C: Column<Input>,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Send + Sync + Clone,
Output: PartialOrd + Send + Sync + Clone,
Input: PartialOrd + Send + Sync + Copy + Debug,
Output: PartialOrd + Send + Sync + Copy + Debug,
{
#[inline]
fn get_val(&self, idx: u32) -> Output {
@@ -228,12 +229,15 @@ where
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
self.from_column.get_docids_for_value_range(
self.monotonic_mapping.inverse(range.start().clone())
..=self.monotonic_mapping.inverse(range.end().clone()),
doc_id_range,
positions,
)
if range.start() > &self.max_value() || range.end() < &self.min_value() {
return;
}
let range = self.monotonic_mapping.inverse_coerce(range);
if range.start() > range.end() {
return;
}
self.from_column
.get_docids_for_value_range(range, doc_id_range, positions)
}
// We voluntarily do not implement get_range as it yields a regression,
@@ -254,7 +258,7 @@ where T: Iterator + Clone + ExactSizeIterator
impl<T> Column<T::Item> for IterColumn<T>
where
T: Iterator + Clone + ExactSizeIterator + Send + Sync,
T::Item: PartialOrd,
T::Item: PartialOrd + fmt::Debug,
{
fn get_val(&self, idx: u32) -> T::Item {
self.0.clone().nth(idx as usize).unwrap()

View File

@@ -455,6 +455,8 @@ impl CompactSpaceDecompressor {
#[cfg(test)]
mod tests {
use std::fmt;
use super::*;
use crate::format_version::read_format_version;
use crate::null_index_footer::read_null_index_footer;
@@ -708,7 +710,7 @@ mod tests {
);
}
fn get_positions_for_value_range_helper<C: Column<T> + ?Sized, T: PartialOrd>(
fn get_positions_for_value_range_helper<C: Column<T> + ?Sized, T: PartialOrd + fmt::Debug>(
column: &C,
value_range: RangeInclusive<T>,
doc_id_range: Range<u32>,

View File

@@ -14,9 +14,9 @@ extern crate more_asserts;
#[cfg(all(test, feature = "unstable"))]
extern crate test;
use std::io;
use std::io::Write;
use std::sync::Arc;
use std::{fmt, io};
use common::BinarySerializable;
use compact_space::CompactSpaceDecompressor;
@@ -132,7 +132,7 @@ impl U128FastFieldCodecType {
}
/// Returns the correct codec reader wrapped in the `Arc` for the data.
pub fn open_u128<Item: MonotonicallyMappableToU128>(
pub fn open_u128<Item: MonotonicallyMappableToU128 + fmt::Debug>(
bytes: OwnedBytes,
) -> io::Result<Arc<dyn Column<Item>>> {
let (bytes, _format_version) = read_format_version(bytes)?;
@@ -146,7 +146,9 @@ pub fn open_u128<Item: MonotonicallyMappableToU128>(
}
/// Returns the correct codec reader wrapped in the `Arc` for the data.
pub fn open<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::Result<Arc<dyn Column<T>>> {
pub fn open<T: MonotonicallyMappableToU64 + fmt::Debug>(
bytes: OwnedBytes,
) -> io::Result<Arc<dyn Column<T>>> {
let (bytes, _format_version) = read_format_version(bytes)?;
let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
let header = Header::deserialize(&mut bytes)?;
@@ -159,7 +161,7 @@ pub fn open<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::Result<Arc<
}
}
fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64>(
fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64 + fmt::Debug>(
bytes: OwnedBytes,
header: &Header,
) -> io::Result<Arc<dyn Column<Item>>> {
@@ -320,6 +322,9 @@ mod tests {
pub fn get_codec_test_datasets() -> Vec<(Vec<u64>, &'static str)> {
let mut data_and_names = vec![];
let data = vec![10];
data_and_names.push((data, "minimal test"));
let data = (10..=10_000_u64).collect::<Vec<_>>();
data_and_names.push((data, "simple monotonically increasing"));
@@ -327,6 +332,9 @@ mod tests {
vec![5, 6, 7, 8, 9, 10, 99, 100],
"offset in linear interpol",
));
data_and_names.push((vec![3, 18446744073709551613, 5], "docid range regression"));
data_and_names.push((vec![5, 50, 3, 13, 1, 1000, 35], "rand small"));
data_and_names.push((vec![10], "single value"));

View File

@@ -1,4 +1,6 @@
use std::fmt;
use std::marker::PhantomData;
use std::ops::RangeInclusive;
use fastdivide::DividerU64;
@@ -6,7 +8,9 @@ use crate::MonotonicallyMappableToU128;
/// Monotonic maps a value to u64 value space.
/// Monotonic mapping enables `PartialOrd` on u64 space without conversion to original space.
pub trait MonotonicallyMappableToU64: 'static + PartialOrd + Copy + Send + Sync {
pub trait MonotonicallyMappableToU64:
'static + PartialOrd + Copy + Send + Sync + fmt::Debug
{
/// Converts a value to u64.
///
/// Internally all fast field values are encoded as u64.
@@ -29,11 +33,29 @@ pub trait MonotonicallyMappableToU64: 'static + PartialOrd + Copy + Send + Sync
/// mapping from their range to their domain. The `inverse` method is required when opening a codec,
/// so a value can be converted back to its original domain (e.g. ip address or f64) from its
/// internal representation.
pub trait StrictlyMonotonicFn<External, Internal> {
pub trait StrictlyMonotonicFn<External: Copy, Internal: Copy> {
/// Strictly monotonically maps the value from External to Internal.
fn mapping(&self, inp: External) -> Internal;
/// Inverse of `mapping`. Maps the value from Internal to External.
fn inverse(&self, out: Internal) -> External;
/// Maps a user-provided range from External to Internal.
///
/// User input is not guaranteed to lie inside the value space of the mapping, so
/// implementations may need to coerce the bounds of the range to values that can
/// actually be mapped.
///
/// This default implementation performs no coercion: it applies `mapping` to the
/// start and to the end of the range and returns the resulting inclusive range.
/// Implementors whose `mapping` cannot handle arbitrary input should override it.
fn mapping_coerce(&self, inp: RangeInclusive<External>) -> RangeInclusive<Internal> {
    self.mapping(*inp.start())..=self.mapping(*inp.end())
}
/// Inverse of `mapping_coerce`: maps a range from Internal back to External.
///
/// This default implementation applies `inverse` to the start and to the end of the
/// range, without any coercion.
fn inverse_coerce(&self, out: RangeInclusive<Internal>) -> RangeInclusive<External> {
    let (first, last) = (*out.start(), *out.end());
    self.inverse(first)..=self.inverse(last)
}
}
/// Inverts a strictly monotonic mapping from `StrictlyMonotonicFn<A, B>` to
@@ -54,7 +76,10 @@ impl<T> From<T> for StrictlyMonotonicMappingInverter<T> {
}
impl<From, To, T> StrictlyMonotonicFn<To, From> for StrictlyMonotonicMappingInverter<T>
where T: StrictlyMonotonicFn<From, To>
where
T: StrictlyMonotonicFn<From, To>,
From: Copy,
To: Copy,
{
fn mapping(&self, val: To) -> From {
self.orig_mapping.inverse(val)
@@ -63,6 +88,15 @@ where T: StrictlyMonotonicFn<From, To>
fn inverse(&self, val: From) -> To {
    // The inverter swaps directions: its `inverse` is the wrapped mapping's `mapping`.
    let wrapped = &self.orig_mapping;
    wrapped.mapping(val)
}
/// Coerces a range through the inverted mapping by delegating to the wrapped
/// mapping's `inverse_coerce`.
#[inline]
fn mapping_coerce(&self, inp: RangeInclusive<To>) -> RangeInclusive<From> {
    let wrapped = &self.orig_mapping;
    wrapped.inverse_coerce(inp)
}
/// Counterpart of `mapping_coerce` for the inverted mapping: delegates to the wrapped
/// mapping's `mapping_coerce`.
#[inline]
fn inverse_coerce(&self, out: RangeInclusive<From>) -> RangeInclusive<To> {
    let wrapped = &self.orig_mapping;
    wrapped.mapping_coerce(out)
}
}
/// Applies the strictly monotonic mapping from `T` without any additional changes.
@@ -134,6 +168,31 @@ impl<External: MonotonicallyMappableToU64> StrictlyMonotonicFn<External, u64>
fn inverse(&self, out: u64) -> External {
    // Undo the gcd compression first, then re-apply the base value, and finally
    // convert the resulting u64 back into the external type.
    let scaled = out * self.gcd;
    External::from_u64(self.min_value + scaled)
}
/// Maps a user-provided range of external values onto the internal value space,
/// coercing bounds that cannot be mapped as-is.
///
/// Returns the empty range `1..=0` when the end of the range lies below `min_value`
/// or when the input range is itself reversed. Otherwise each bound is first raised to
/// at least `self.inverse(0)`. The start bound is additionally bumped by one internal
/// step when its offset from `min_value` is not a multiple of `gcd`, so the resulting
/// range never starts before the requested start.
#[inline]
#[allow(clippy::reversed_empty_ranges)]
fn mapping_coerce(&self, inp: RangeInclusive<External>) -> RangeInclusive<u64> {
    let end_u64 = External::to_u64(*inp.end());
    if end_u64 < self.min_value || inp.end() < inp.start() {
        // Deliberately reversed: `1..=0` serves as the "matches nothing" range.
        return 1..=0;
    }
    // Smallest external value this mapping can represent (internal value 0).
    let lower_bound: External = self.inverse(0);
    let coerce = |bound: External, round_up: bool| -> u64 {
        let clamped = if bound < lower_bound {
            lower_bound
        } else {
            bound
        };
        // Only the start bound (`round_up`) is checked for gcd alignment.
        let misaligned =
            round_up && (External::to_u64(clamped) - self.min_value) % self.gcd != 0;
        let mapped: u64 = self.mapping(clamped);
        if misaligned {
            mapped + 1
        } else {
            mapped
        }
    };
    coerce(*inp.start(), true)..=coerce(*inp.end(), false)
}
}
/// Strictly monotonic mapping with a base value.
@@ -149,6 +208,17 @@ impl StrictlyMonotonicMappingToInternalBaseval {
impl<External: MonotonicallyMappableToU64> StrictlyMonotonicFn<External, u64>
for StrictlyMonotonicMappingToInternalBaseval
{
/// Maps a user-provided range of external values onto the internal value space.
///
/// Returns the empty range `1..=0` when the end of the range lies below `min_value`.
/// Otherwise the start is raised to at least `min_value` before both bounds are mapped.
#[inline]
#[allow(clippy::reversed_empty_ranges)]
fn mapping_coerce(&self, inp: RangeInclusive<External>) -> RangeInclusive<u64> {
    let end_u64 = External::to_u64(*inp.end());
    if end_u64 < self.min_value {
        // Deliberately reversed: `1..=0` serves as the "matches nothing" range.
        return 1..=0;
    }
    let start_u64 = std::cmp::max(External::to_u64(*inp.start()), self.min_value);
    // NOTE(review): `mapping` is called with plain `u64` values here, i.e. through the
    // `External = u64` instantiation of this impl rather than with `External` values.
    self.mapping(start_u64)..=self.mapping(end_u64)
}
fn mapping(&self, val: External) -> u64 {
    // Internal representation is the distance from the base value. The subtraction
    // underflows if the u64 form of `val` is below `min_value`.
    let raw = External::to_u64(val);
    raw - self.min_value
}
@@ -224,7 +294,7 @@ mod tests {
test_round_trip::<_, _, u64>(&mapping, 100u64);
}
fn test_round_trip<T: StrictlyMonotonicFn<K, L>, K: std::fmt::Debug + Eq + Copy, L>(
fn test_round_trip<T: StrictlyMonotonicFn<K, L>, K: std::fmt::Debug + Eq + Copy, L: Copy>(
mapping: &T,
test_val: K,
) {

View File

@@ -1,8 +1,11 @@
use std::fmt;
use std::net::Ipv6Addr;
/// Monotonically maps a value to the u128 value space.
/// Monotonic mapping enables `PartialOrd` on u128 space without conversion to original space.
pub trait MonotonicallyMappableToU128: 'static + PartialOrd + Copy + Send + Sync {
pub trait MonotonicallyMappableToU128:
'static + PartialOrd + Copy + Send + Sync + fmt::Debug
{
/// Converts a value to u128.
///
/// Internally all fast field values are encoded as u64.

View File

@@ -17,9 +17,9 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
use std::io;
use std::num::NonZeroU64;
use std::sync::Arc;
use std::{fmt, io};
use common::{BinarySerializable, VInt};
use log::warn;
@@ -168,7 +168,7 @@ impl BinarySerializable for Header {
/// Return estimated compression for given codec in the value range [0.0..1.0], where 1.0 means no
/// compression.
pub fn estimate<T: MonotonicallyMappableToU64>(
pub fn estimate<T: MonotonicallyMappableToU64 + fmt::Debug>(
typed_column: impl Column<T>,
codec_type: FastFieldCodecType,
) -> Option<f32> {
@@ -214,7 +214,7 @@ pub fn serialize_u128<F: Fn() -> I, I: Iterator<Item = u128>>(
}
/// Serializes the column with the codec with the best estimate on the data.
pub fn serialize<T: MonotonicallyMappableToU64>(
pub fn serialize<T: MonotonicallyMappableToU64 + fmt::Debug>(
typed_column: impl Column<T>,
output: &mut impl io::Write,
codecs: &[FastFieldCodecType],
@@ -294,7 +294,7 @@ fn serialize_given_codec(
}
/// Helper function to serialize a column (autodetect from all codecs) and then open it
pub fn serialize_and_load<T: MonotonicallyMappableToU64 + Ord + Default>(
pub fn serialize_and_load<T: MonotonicallyMappableToU64 + Ord + Default + fmt::Debug>(
column: &[T],
) -> Arc<dyn Column<T>> {
let mut buffer = Vec::new();

View File

@@ -145,7 +145,7 @@ impl FastFieldType {
mod tests {
use std::collections::HashMap;
use std::ops::Range;
use std::ops::{Range, RangeInclusive};
use std::path::Path;
use std::sync::Arc;
@@ -159,7 +159,9 @@ mod tests {
use super::*;
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
use crate::merge_policy::NoMergePolicy;
use crate::schema::{Cardinality, Document, Field, Schema, SchemaBuilder, FAST, STRING, TEXT};
use crate::schema::{
Cardinality, Document, Field, Schema, SchemaBuilder, FAST, INDEXED, STRING, TEXT,
};
use crate::time::OffsetDateTime;
use crate::{DateOptions, DatePrecision, Index, SegmentId, SegmentReader};
@@ -969,4 +971,117 @@ mod tests {
}
Ok(len)
}
#[test]
fn test_gcd_bug_regression_1757() {
    // Indexes three values that share a common divisor (100, 200, 300) and checks that
    // value-range lookups on the fast field agree with a naive filter, including ranges
    // whose bounds fall between or outside the stored values.
    let mut builder = Schema::builder();
    let num_field = builder.add_u64_field("url_norm_hash", FAST | INDEXED);
    let index = Index::create_in_ram(builder.build());

    let numbers = vec![100u64, 200, 300];
    {
        let mut writer = index.writer_for_tests().unwrap();
        for &value in &numbers {
            writer
                .add_document(doc! {
                    num_field => value,
                })
                .unwrap();
        }
        writer.commit().unwrap();
    }

    let reader = index.reader().unwrap();
    let searcher = reader.searcher();
    let segment = &searcher.segment_readers()[0];
    let field = segment.fast_fields().u64(num_field).unwrap();

    // Compares the number of doc ids returned for `range` against the expected count.
    let test_range = |range: RangeInclusive<u64>| {
        let expected_count = numbers.iter().filter(|num| range.contains(num)).count();
        let mut doc_ids = vec![];
        field.get_docids_for_value_range(range, 0..u32::MAX, &mut doc_ids);
        assert_eq!(doc_ids.len(), expected_count);
    };

    let cases: Vec<RangeInclusive<u64>> = vec![
        50..=50,
        150..=150,
        350..=350,
        100..=250,
        101..=200,
        101..=199,
        100..=300,
        100..=299,
    ];
    for range in cases {
        test_range(range);
    }
}
#[test]
fn test_mapping_bug_docids_for_value_range() {
    // Checks value-range lookups on a fast field against a naive filter, probing every
    // combination of bounds shifted by -1, 0 and +1 around a few interesting ranges.
    let mut builder = Schema::builder();
    let num_field = builder.add_u64_field("url_norm_hash", FAST | INDEXED);
    let index = Index::create_in_ram(builder.build());

    // Values without gcd, but with min_value
    let numbers = vec![1000u64, 1001, 1003];
    {
        let mut writer = index.writer_for_tests().unwrap();
        for &value in &numbers {
            writer
                .add_document(doc! {
                    num_field => value,
                })
                .unwrap();
        }
        writer.commit().unwrap();
    }

    let reader = index.reader().unwrap();
    let searcher = reader.searcher();
    let segment = &searcher.segment_readers()[0];
    let field = segment.fast_fields().u64(num_field).unwrap();

    // Compares the number of doc ids returned for `range` against the expected count.
    let test_range = |range: RangeInclusive<u64>| {
        let expected_count = numbers.iter().filter(|num| range.contains(num)).count();
        let mut doc_ids = vec![];
        field.get_docids_for_value_range(range, 0..u32::MAX, &mut doc_ids);
        assert_eq!(doc_ids.len(), expected_count);
    };

    // Runs `test_range` on all nine combinations of {start, start-1, start+1} and
    // {stop, stop-1, stop+1}; some of the resulting ranges are intentionally reversed.
    let test_range_variant = |start: u64, stop: u64| {
        for &lower in &[start, start - 1, start + 1] {
            for &upper in &[stop, stop - 1, stop + 1] {
                test_range(lower..=upper);
            }
        }
    };

    test_range_variant(50, 50);
    test_range_variant(1000, 1000);
    test_range_variant(1000, 1002);
}
}

View File

@@ -1,3 +1,4 @@
use std::fmt;
use std::io::{self, Write};
pub use fastfield_codecs::Column;
@@ -49,7 +50,7 @@ impl CompositeFastFieldSerializer {
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
/// automatically.
pub fn create_auto_detect_u64_fast_field<T: MonotonicallyMappableToU64>(
pub fn create_auto_detect_u64_fast_field<T: MonotonicallyMappableToU64 + fmt::Debug>(
&mut self,
field: Field,
fastfield_accessor: impl Column<T>,
@@ -59,7 +60,9 @@ impl CompositeFastFieldSerializer {
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
/// automatically.
pub fn create_auto_detect_u64_fast_field_with_idx<T: MonotonicallyMappableToU64>(
pub fn create_auto_detect_u64_fast_field_with_idx<
T: MonotonicallyMappableToU64 + fmt::Debug,
>(
&mut self,
field: Field,
fastfield_accessor: impl Column<T>,
@@ -72,7 +75,9 @@ impl CompositeFastFieldSerializer {
/// Serialize data into a new u64 fast field. The best compression codec of the provided
/// codecs will be chosen.
pub fn create_auto_detect_u64_fast_field_with_idx_and_codecs<T: MonotonicallyMappableToU64>(
pub fn create_auto_detect_u64_fast_field_with_idx_and_codecs<
T: MonotonicallyMappableToU64 + fmt::Debug,
>(
&mut self,
field: Field,
fastfield_accessor: impl Column<T>,

View File

@@ -90,7 +90,7 @@ impl CheckpointBlock {
return Ok(());
}
let mut doc = read_u32_vint(data);
let mut start_offset = read_u32_vint(data) as usize;
let mut start_offset = VInt::deserialize_u64(data)? as usize;
for _ in 0..len {
let num_docs = read_u32_vint(data);
let block_num_bytes = read_u32_vint(data) as usize;
@@ -147,6 +147,15 @@ mod tests {
test_aux_ser_deser(&checkpoints)
}
#[test]
fn test_block_serialize_large_byte_range() -> io::Result<()> {
    // The byte range starts beyond u32::MAX, so the start offset cannot be
    // represented in 32 bits and must round-trip as a wider integer.
    let large_checkpoint = Checkpoint {
        doc_range: 10..12,
        byte_range: 8_000_000_000..9_000_000_000,
    };
    let checkpoints = vec![large_checkpoint];
    test_aux_ser_deser(&checkpoints)
}
#[test]
fn test_block_serialize() -> io::Result<()> {
let offsets: Vec<usize> = (0..11).map(|i| i * i * i).collect();