mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-10 11:02:55 +00:00
group monotonic mapping and inverse
fix mapping inverse remove ip indexing add get_between_vals test
This commit is contained in:
@@ -139,16 +139,27 @@ where V: AsRef<[T]> + ?Sized
|
||||
|
||||
struct MonotonicMappingColumn<C, T, U, Input> {
|
||||
from_column: C,
|
||||
monotonic_mapping_to_output: T,
|
||||
monotonic_mapping_to_input: U,
|
||||
monotonic_mapping: T,
|
||||
monotonic_mapping_inv: U,
|
||||
_phantom: PhantomData<Input>,
|
||||
}
|
||||
|
||||
/// Creates a view of a column transformed by a monotonic mapping.
|
||||
/// E.g. apply a gcd monotonic_mapping([100, 200, 300]) == [1, 2, 3]
|
||||
/// The provided mappings need to be the inverse of each other.
|
||||
///
|
||||
/// The inverse of the mapping is required for:
|
||||
/// `fn get_between_vals(&self, range: RangeInclusive<T>) -> Vec<u64> `
|
||||
/// The user provides the original value range, and we must apply the same monotonic mapping the
|
||||
/// serialization does before calling the underlying column.
|
||||
///
|
||||
/// Note that when opening a codec, the monotonic_mapping should be the inverse of the mapping
|
||||
/// during serialization. And therefore the monotonic_mapping_inv when opening is the same as
|
||||
/// monotonic_mapping during serialization.
|
||||
pub fn monotonic_map_column<C, T, U, Input: PartialOrd, Output: PartialOrd + Clone>(
|
||||
from_column: C,
|
||||
monotonic_mapping_to_output: T,
|
||||
monotonic_mapping_to_input: U,
|
||||
monotonic_mapping: T,
|
||||
monotonic_mapping_inv: U,
|
||||
) -> impl Column<Output>
|
||||
where
|
||||
C: Column<Input>,
|
||||
@@ -159,8 +170,8 @@ where
|
||||
{
|
||||
MonotonicMappingColumn {
|
||||
from_column,
|
||||
monotonic_mapping_to_output,
|
||||
monotonic_mapping_to_input,
|
||||
monotonic_mapping,
|
||||
monotonic_mapping_inv,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
@@ -177,17 +188,17 @@ where
|
||||
#[inline]
|
||||
fn get_val(&self, idx: u64) -> Output {
|
||||
let from_val = self.from_column.get_val(idx);
|
||||
(self.monotonic_mapping_to_output)(from_val)
|
||||
(self.monotonic_mapping)(from_val)
|
||||
}
|
||||
|
||||
fn min_value(&self) -> Output {
|
||||
let from_min_value = self.from_column.min_value();
|
||||
(self.monotonic_mapping_to_output)(from_min_value)
|
||||
(self.monotonic_mapping)(from_min_value)
|
||||
}
|
||||
|
||||
fn max_value(&self) -> Output {
|
||||
let from_max_value = self.from_column.max_value();
|
||||
(self.monotonic_mapping_to_output)(from_max_value)
|
||||
(self.monotonic_mapping)(from_max_value)
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u64 {
|
||||
@@ -195,17 +206,13 @@ where
|
||||
}
|
||||
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = Output> + '_> {
|
||||
Box::new(
|
||||
self.from_column
|
||||
.iter()
|
||||
.map(&self.monotonic_mapping_to_output),
|
||||
)
|
||||
Box::new(self.from_column.iter().map(&self.monotonic_mapping))
|
||||
}
|
||||
|
||||
fn get_between_vals(&self, range: RangeInclusive<Output>) -> Vec<u64> {
|
||||
self.from_column.get_between_vals(
|
||||
(self.monotonic_mapping_to_input)(range.start().clone())
|
||||
..=(self.monotonic_mapping_to_input)(range.end().clone()),
|
||||
(self.monotonic_mapping_inv)(range.start().clone())
|
||||
..=(self.monotonic_mapping_inv)(range.end().clone()),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -258,7 +265,7 @@ mod tests {
|
||||
fn test_monotonic_mapping() {
|
||||
let vals = &[1u64, 3u64][..];
|
||||
let col = VecColumn::from(vals);
|
||||
let mapped = monotonic_map_column(col, |el| el + 4, |el| el);
|
||||
let mapped = monotonic_map_column(col, |el| el + 4, |_el| unimplemented!());
|
||||
assert_eq!(mapped.min_value(), 5u64);
|
||||
assert_eq!(mapped.max_value(), 7u64);
|
||||
assert_eq!(mapped.num_vals(), 2);
|
||||
@@ -278,7 +285,8 @@ mod tests {
|
||||
fn test_monotonic_mapping_iter() {
|
||||
let vals: Vec<u64> = (-1..99).map(i64::to_u64).collect();
|
||||
let col = VecColumn::from(&vals);
|
||||
let mapped = monotonic_map_column(col, |el| i64::from_u64(el) * 10i64, i64::to_u64);
|
||||
let mapped =
|
||||
monotonic_map_column(col, |el| i64::from_u64(el) * 10i64, |_| unimplemented!());
|
||||
let val_i64s: Vec<i64> = mapped.iter().collect();
|
||||
for i in 0..100 {
|
||||
assert_eq!(val_i64s[i as usize], mapped.get_val(i));
|
||||
@@ -289,7 +297,8 @@ mod tests {
|
||||
fn test_monotonic_mapping_get_range() {
|
||||
let vals: Vec<u64> = (-1..99).map(i64::to_u64).collect();
|
||||
let col = VecColumn::from(&vals);
|
||||
let mapped = monotonic_map_column(col, |el| i64::from_u64(el) * 10i64, i64::to_u64);
|
||||
let mapped =
|
||||
monotonic_map_column(col, |el| i64::from_u64(el) * 10i64, |_| unimplemented!());
|
||||
assert_eq!(mapped.min_value(), -10i64);
|
||||
assert_eq!(mapped.max_value(), 980i64);
|
||||
assert_eq!(mapped.num_vals(), 100);
|
||||
|
||||
@@ -13,6 +13,9 @@ use std::sync::Arc;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use compact_space::CompactSpaceDecompressor;
|
||||
use fastdivide::DividerU64;
|
||||
use monotonic_mapping::gcd_min_val_mapping_pairs::{from_gcd_normalized_u64, normalize_with_gcd};
|
||||
use monotonic_mapping::min_val_mapping_pairs::{from_normalized_u64, normalize};
|
||||
use ownedbytes::OwnedBytes;
|
||||
use serialize::Header;
|
||||
|
||||
@@ -78,11 +81,10 @@ impl FastFieldCodecType {
|
||||
pub fn open_u128<Item: MonotonicallyMappableToU128>(
|
||||
bytes: OwnedBytes,
|
||||
) -> io::Result<Arc<dyn Column<Item>>> {
|
||||
let monotonic_mapping = move |val: u128| Item::from_u128(val);
|
||||
let reader = CompactSpaceDecompressor::open(bytes)?;
|
||||
Ok(Arc::new(monotonic_map_column(
|
||||
reader,
|
||||
monotonic_mapping,
|
||||
Item::from_u128,
|
||||
Item::to_u128,
|
||||
)))
|
||||
}
|
||||
@@ -109,18 +111,17 @@ fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64>(
|
||||
let reader = C::open_from_bytes(bytes, normalized_header)?;
|
||||
let min_value = header.min_value;
|
||||
if let Some(gcd) = header.gcd {
|
||||
let monotonic_mapping = move |val: u64| Item::from_u64(min_value + val * gcd.get());
|
||||
let divider = DividerU64::divide_by(gcd.get());
|
||||
Ok(Arc::new(monotonic_map_column(
|
||||
reader,
|
||||
monotonic_mapping,
|
||||
Item::to_u64,
|
||||
move |val: u64| from_gcd_normalized_u64(val, min_value, gcd.get()),
|
||||
move |val| normalize_with_gcd(val, min_value, &divider),
|
||||
)))
|
||||
} else {
|
||||
let monotonic_mapping = move |val: u64| Item::from_u64(min_value + val);
|
||||
Ok(Arc::new(monotonic_map_column(
|
||||
reader,
|
||||
monotonic_mapping,
|
||||
Item::to_u64,
|
||||
move |val: u64| from_normalized_u64(val, min_value),
|
||||
move |val| normalize(val, min_value),
|
||||
)))
|
||||
}
|
||||
}
|
||||
@@ -161,6 +162,7 @@ pub const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use proptest::prelude::*;
|
||||
use proptest::strategy::Strategy;
|
||||
use proptest::{prop_oneof, proptest};
|
||||
@@ -195,6 +197,18 @@ mod tests {
|
||||
`{data:?}`",
|
||||
);
|
||||
}
|
||||
|
||||
if !data.is_empty() {
|
||||
let test_rand_idx = rand::thread_rng().gen_range(0..=data.len() - 1);
|
||||
let expected_positions: Vec<u64> = data
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, el)| **el == data[test_rand_idx])
|
||||
.map(|(pos, _)| pos as u64)
|
||||
.collect();
|
||||
let positions = reader.get_between_vals(data[test_rand_idx]..=data[test_rand_idx]);
|
||||
assert_eq!(expected_positions, positions);
|
||||
}
|
||||
Some((estimation, actual_compression))
|
||||
}
|
||||
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
use fastdivide::DividerU64;
|
||||
|
||||
pub trait MonotonicallyMappableToU64: 'static + PartialOrd + Copy + Send + Sync {
|
||||
/// Converts a value to u64.
|
||||
///
|
||||
@@ -11,6 +13,54 @@ pub trait MonotonicallyMappableToU64: 'static + PartialOrd + Copy + Send + Sync
|
||||
fn from_u64(val: u64) -> Self;
|
||||
}
|
||||
|
||||
// Mapping pairs for the case we subtract the min_value and apply a gcd (greatest common divisor)
|
||||
pub mod gcd_min_val_mapping_pairs {
|
||||
|
||||
use super::*;
|
||||
pub fn from_gcd_normalized_u64<Item: MonotonicallyMappableToU64>(
|
||||
val: u64,
|
||||
min_value: u64,
|
||||
gcd: u64,
|
||||
) -> Item {
|
||||
Item::from_u64(min_value + val * gcd)
|
||||
}
|
||||
|
||||
pub fn normalize_with_gcd<Item: MonotonicallyMappableToU64>(
|
||||
val: Item,
|
||||
min_value: u64,
|
||||
gcd_divider: &DividerU64,
|
||||
) -> u64 {
|
||||
gcd_divider.divide(Item::to_u64(val) - min_value)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn monotonic_mapping_roundtrip_test() {
|
||||
let gcd = std::num::NonZeroU64::new(10).unwrap();
|
||||
let divider = DividerU64::divide_by(gcd.get());
|
||||
|
||||
let orig_value: u64 = 500;
|
||||
let normalized_val: u64 = normalize_with_gcd(orig_value, 100, &divider);
|
||||
assert_eq!(normalized_val, 40);
|
||||
assert_eq!(
|
||||
from_gcd_normalized_u64::<u64>(normalized_val, 100, gcd.get()),
|
||||
500
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Mapping pairs for the case we subtract the min_value
|
||||
pub mod min_val_mapping_pairs {
|
||||
use super::*;
|
||||
|
||||
pub fn from_normalized_u64<Item: MonotonicallyMappableToU64>(val: u64, min_value: u64) -> Item {
|
||||
Item::from_u64(min_value + val)
|
||||
}
|
||||
|
||||
pub fn normalize<Item: MonotonicallyMappableToU64>(val: Item, min_value: u64) -> u64 {
|
||||
Item::to_u64(val) - min_value
|
||||
}
|
||||
}
|
||||
|
||||
impl MonotonicallyMappableToU64 for u64 {
|
||||
fn to_u64(self) -> u64 {
|
||||
self
|
||||
|
||||
@@ -30,6 +30,7 @@ use crate::bitpacked::BitpackedCodec;
|
||||
use crate::blockwise_linear::BlockwiseLinearCodec;
|
||||
use crate::compact_space::CompactSpaceCompressor;
|
||||
use crate::linear::LinearCodec;
|
||||
use crate::monotonic_mapping::gcd_min_val_mapping_pairs::normalize_with_gcd;
|
||||
use crate::{
|
||||
monotonic_map_column, Column, FastFieldCodec, FastFieldCodecType, MonotonicallyMappableToU64,
|
||||
VecColumn, ALL_CODEC_TYPES,
|
||||
@@ -57,8 +58,9 @@ pub(crate) struct Header {
|
||||
|
||||
impl Header {
|
||||
pub fn normalized(self) -> NormalizedHeader {
|
||||
let max_value =
|
||||
(self.max_value - self.min_value) / self.gcd.map(|gcd| gcd.get()).unwrap_or(1);
|
||||
let gcd = self.gcd.map(|gcd| gcd.get()).unwrap_or(1);
|
||||
let gcd_divider = DividerU64::divide_by(gcd);
|
||||
let max_value = normalize_with_gcd(self.max_value, self.min_value, &gcd_divider);
|
||||
NormalizedHeader {
|
||||
num_vals: self.num_vals,
|
||||
max_value,
|
||||
@@ -66,14 +68,7 @@ impl Header {
|
||||
}
|
||||
|
||||
pub fn normalize_column<C: Column>(&self, from_column: C) -> impl Column {
|
||||
let min_value = self.min_value;
|
||||
let gcd = self.gcd.map(|gcd| gcd.get()).unwrap_or(1);
|
||||
let divider = DividerU64::divide_by(gcd);
|
||||
monotonic_map_column(
|
||||
from_column,
|
||||
move |val| divider.divide(val - min_value),
|
||||
|val| val,
|
||||
)
|
||||
normalize_column(from_column, self.min_value, self.gcd)
|
||||
}
|
||||
|
||||
pub fn compute_header(
|
||||
@@ -85,10 +80,8 @@ impl Header {
|
||||
let max_value = column.max_value();
|
||||
let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value))
|
||||
.filter(|gcd| gcd.get() > 1u64);
|
||||
let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64));
|
||||
let shifted_column =
|
||||
monotonic_map_column(&column, |val| divider.divide(val - min_value), |val| val);
|
||||
let codec_type = detect_codec(shifted_column, codecs)?;
|
||||
let normalized_column = normalize_column(column, min_value, gcd);
|
||||
let codec_type = detect_codec(normalized_column, codecs)?;
|
||||
Some(Header {
|
||||
num_vals,
|
||||
min_value,
|
||||
@@ -99,6 +92,20 @@ impl Header {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn normalize_column<C: Column>(
|
||||
from_column: C,
|
||||
min_value: u64,
|
||||
gcd: Option<NonZeroU64>,
|
||||
) -> impl Column {
|
||||
let gcd = gcd.map(|gcd| gcd.get()).unwrap_or(1);
|
||||
let gcd_divider = DividerU64::divide_by(gcd);
|
||||
monotonic_map_column(
|
||||
from_column,
|
||||
move |val| normalize_with_gcd(val, min_value, &gcd_divider),
|
||||
move |_val| unimplemented!(), // This code is only used in serialization
|
||||
)
|
||||
}
|
||||
|
||||
impl BinarySerializable for Header {
|
||||
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
VInt(self.num_vals).serialize(writer)?;
|
||||
@@ -138,9 +145,12 @@ pub fn estimate<T: MonotonicallyMappableToU64>(
|
||||
let min_value = column.min_value();
|
||||
let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value))
|
||||
.filter(|gcd| gcd.get() > 1u64);
|
||||
let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64));
|
||||
let normalized_column =
|
||||
monotonic_map_column(&column, |val| divider.divide(val - min_value), |val| val);
|
||||
let gcd_divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64));
|
||||
let normalized_column = monotonic_map_column(
|
||||
&column,
|
||||
|val| normalize_with_gcd(val, min_value, &gcd_divider),
|
||||
|_val| unimplemented!(),
|
||||
);
|
||||
match codec_type {
|
||||
FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&normalized_column),
|
||||
FastFieldCodecType::Linear => LinearCodec::estimate(&normalized_column),
|
||||
|
||||
@@ -319,7 +319,7 @@ impl MultiValueU128FastFieldWriter {
|
||||
let value = field_value.value();
|
||||
let ip_addr = value
|
||||
.as_ip()
|
||||
.expect(&format!("expected and ip, but got {:?}", value));
|
||||
.unwrap_or_else(|| panic!("expected and ip, but got {:?}", value));
|
||||
let value = ip_addr.to_u128();
|
||||
self.add_val(value);
|
||||
}
|
||||
|
||||
@@ -331,7 +331,7 @@ impl U128FastFieldWriter {
|
||||
Some(v) => {
|
||||
let ip_addr = v
|
||||
.as_ip()
|
||||
.expect(&format!("expected and ip, but got {:?}", v));
|
||||
.unwrap_or_else(|| panic!("expected and ip, but got {:?}", v));
|
||||
|
||||
let value = ip_addr.to_u128();
|
||||
self.add_val(value);
|
||||
|
||||
@@ -294,13 +294,7 @@ impl SegmentWriter {
|
||||
ctx,
|
||||
)?;
|
||||
}
|
||||
FieldType::IpAddr(_) => {
|
||||
for value in values {
|
||||
let ip_val = value.as_ip().ok_or_else(make_schema_error)?;
|
||||
term_buffer.set_text(&ip_val.to_string());
|
||||
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
|
||||
}
|
||||
}
|
||||
FieldType::IpAddr(_) => {}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
|
||||
Reference in New Issue
Block a user