group monotonic mapping and inverse

fix mapping inverse
remove ip indexing
add get_between_vals test
This commit is contained in:
Pascal Seitz
2022-10-04 16:53:03 +08:00
parent 67f453b534
commit cdc8e3a8be
7 changed files with 130 additions and 53 deletions

View File

@@ -139,16 +139,27 @@ where V: AsRef<[T]> + ?Sized
struct MonotonicMappingColumn<C, T, U, Input> {
from_column: C,
monotonic_mapping_to_output: T,
monotonic_mapping_to_input: U,
monotonic_mapping: T,
monotonic_mapping_inv: U,
_phantom: PhantomData<Input>,
}
/// Creates a view of a column transformed by a monotonic mapping.
/// E.g. with a gcd of 100, monotonic_mapping([100, 200, 300]) == [1, 2, 3].
/// The two provided mappings need to be the inverse of each other.
///
/// The inverse mapping is required for:
/// `fn get_between_vals(&self, range: RangeInclusive<T>) -> Vec<u64>`
/// The user provides the range in terms of the original values, and we need to map it
/// monotonically in the same way serialization does before calling the underlying column.
///
/// Note that when opening a codec, the monotonic_mapping should be the inverse of the mapping
/// applied during serialization. Therefore the monotonic_mapping_inv used when opening is the
/// same as the monotonic_mapping used during serialization.
pub fn monotonic_map_column<C, T, U, Input: PartialOrd, Output: PartialOrd + Clone>(
from_column: C,
monotonic_mapping_to_output: T,
monotonic_mapping_to_input: U,
monotonic_mapping: T,
monotonic_mapping_inv: U,
) -> impl Column<Output>
where
C: Column<Input>,
@@ -159,8 +170,8 @@ where
{
MonotonicMappingColumn {
from_column,
monotonic_mapping_to_output,
monotonic_mapping_to_input,
monotonic_mapping,
monotonic_mapping_inv,
_phantom: PhantomData,
}
}
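As a usage illustration of the grouped mapping pair, here is a minimal sketch (not part of this commit), assuming the crate's VecColumn helper used in the tests below and the Column trait's get_between_vals method:

#[test]
fn monotonic_mapping_pair_sketch() {
    let vals = &[100u64, 200u64, 300u64][..];
    let col = VecColumn::from(vals);
    // The forward mapping divides by the gcd (the direction serialization applies);
    // the inverse multiplies by it, so the two closures are exact inverses.
    let mapped = monotonic_map_column(col, |el| el / 100, |el: u64| el * 100);
    assert_eq!(mapped.get_val(2), 3u64);
    // The caller's range in the mapped domain (1..=2) is translated back to 100..=200
    // via the inverse before the underlying column is queried.
    assert_eq!(mapped.get_between_vals(1u64..=2u64), vec![0, 1]);
}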
@@ -177,17 +188,17 @@ where
#[inline]
fn get_val(&self, idx: u64) -> Output {
let from_val = self.from_column.get_val(idx);
(self.monotonic_mapping_to_output)(from_val)
(self.monotonic_mapping)(from_val)
}
fn min_value(&self) -> Output {
let from_min_value = self.from_column.min_value();
(self.monotonic_mapping_to_output)(from_min_value)
(self.monotonic_mapping)(from_min_value)
}
fn max_value(&self) -> Output {
let from_max_value = self.from_column.max_value();
(self.monotonic_mapping_to_output)(from_max_value)
(self.monotonic_mapping)(from_max_value)
}
fn num_vals(&self) -> u64 {
@@ -195,17 +206,13 @@ where
}
fn iter(&self) -> Box<dyn Iterator<Item = Output> + '_> {
Box::new(
self.from_column
.iter()
.map(&self.monotonic_mapping_to_output),
)
Box::new(self.from_column.iter().map(&self.monotonic_mapping))
}
fn get_between_vals(&self, range: RangeInclusive<Output>) -> Vec<u64> {
self.from_column.get_between_vals(
(self.monotonic_mapping_to_input)(range.start().clone())
..=(self.monotonic_mapping_to_input)(range.end().clone()),
(self.monotonic_mapping_inv)(range.start().clone())
..=(self.monotonic_mapping_inv)(range.end().clone()),
)
}
@@ -258,7 +265,7 @@ mod tests {
fn test_monotonic_mapping() {
let vals = &[1u64, 3u64][..];
let col = VecColumn::from(vals);
let mapped = monotonic_map_column(col, |el| el + 4, |el| el);
let mapped = monotonic_map_column(col, |el| el + 4, |_el| unimplemented!());
assert_eq!(mapped.min_value(), 5u64);
assert_eq!(mapped.max_value(), 7u64);
assert_eq!(mapped.num_vals(), 2);
@@ -278,7 +285,8 @@ mod tests {
fn test_monotonic_mapping_iter() {
let vals: Vec<u64> = (-1..99).map(i64::to_u64).collect();
let col = VecColumn::from(&vals);
let mapped = monotonic_map_column(col, |el| i64::from_u64(el) * 10i64, i64::to_u64);
let mapped =
monotonic_map_column(col, |el| i64::from_u64(el) * 10i64, |_| unimplemented!());
let val_i64s: Vec<i64> = mapped.iter().collect();
for i in 0..100 {
assert_eq!(val_i64s[i as usize], mapped.get_val(i));
@@ -289,7 +297,8 @@ mod tests {
fn test_monotonic_mapping_get_range() {
let vals: Vec<u64> = (-1..99).map(i64::to_u64).collect();
let col = VecColumn::from(&vals);
let mapped = monotonic_map_column(col, |el| i64::from_u64(el) * 10i64, i64::to_u64);
let mapped =
monotonic_map_column(col, |el| i64::from_u64(el) * 10i64, |_| unimplemented!());
assert_eq!(mapped.min_value(), -10i64);
assert_eq!(mapped.max_value(), 980i64);
assert_eq!(mapped.num_vals(), 100);

View File

@@ -13,6 +13,9 @@ use std::sync::Arc;
use common::BinarySerializable;
use compact_space::CompactSpaceDecompressor;
use fastdivide::DividerU64;
use monotonic_mapping::gcd_min_val_mapping_pairs::{from_gcd_normalized_u64, normalize_with_gcd};
use monotonic_mapping::min_val_mapping_pairs::{from_normalized_u64, normalize};
use ownedbytes::OwnedBytes;
use serialize::Header;
@@ -78,11 +81,10 @@ impl FastFieldCodecType {
pub fn open_u128<Item: MonotonicallyMappableToU128>(
bytes: OwnedBytes,
) -> io::Result<Arc<dyn Column<Item>>> {
let monotonic_mapping = move |val: u128| Item::from_u128(val);
let reader = CompactSpaceDecompressor::open(bytes)?;
Ok(Arc::new(monotonic_map_column(
reader,
monotonic_mapping,
Item::from_u128,
Item::to_u128,
)))
}
@@ -109,18 +111,17 @@ fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64>(
let reader = C::open_from_bytes(bytes, normalized_header)?;
let min_value = header.min_value;
if let Some(gcd) = header.gcd {
let monotonic_mapping = move |val: u64| Item::from_u64(min_value + val * gcd.get());
let divider = DividerU64::divide_by(gcd.get());
Ok(Arc::new(monotonic_map_column(
reader,
monotonic_mapping,
Item::to_u64,
move |val: u64| from_gcd_normalized_u64(val, min_value, gcd.get()),
move |val| normalize_with_gcd(val, min_value, &divider),
)))
} else {
let monotonic_mapping = move |val: u64| Item::from_u64(min_value + val);
Ok(Arc::new(monotonic_map_column(
reader,
monotonic_mapping,
Item::to_u64,
move |val: u64| from_normalized_u64(val, min_value),
move |val| normalize(val, min_value),
)))
}
}
@@ -161,6 +162,7 @@ pub const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [
#[cfg(test)]
mod tests {
use proptest::prelude::*;
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest};
@@ -195,6 +197,18 @@ mod tests {
`{data:?}`",
);
}
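// get_between_vals check: pick one value at random from the data and verify that the
// returned positions are exactly the indices that hold that value.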
if !data.is_empty() {
let test_rand_idx = rand::thread_rng().gen_range(0..=data.len() - 1);
let expected_positions: Vec<u64> = data
.iter()
.enumerate()
.filter(|(_, el)| **el == data[test_rand_idx])
.map(|(pos, _)| pos as u64)
.collect();
let positions = reader.get_between_vals(data[test_rand_idx]..=data[test_rand_idx]);
assert_eq!(expected_positions, positions);
}
Some((estimation, actual_compression))
}

View File

@@ -1,3 +1,5 @@
use fastdivide::DividerU64;
pub trait MonotonicallyMappableToU64: 'static + PartialOrd + Copy + Send + Sync {
/// Converts a value to u64.
///
@@ -11,6 +13,54 @@ pub trait MonotonicallyMappableToU64: 'static + PartialOrd + Copy + Send + Sync
fn from_u64(val: u64) -> Self;
}
// Mapping pairs for the case where we subtract the min_value and apply a gcd (greatest common divisor).
pub mod gcd_min_val_mapping_pairs {
use super::*;
pub fn from_gcd_normalized_u64<Item: MonotonicallyMappableToU64>(
val: u64,
min_value: u64,
gcd: u64,
) -> Item {
Item::from_u64(min_value + val * gcd)
}
pub fn normalize_with_gcd<Item: MonotonicallyMappableToU64>(
val: Item,
min_value: u64,
gcd_divider: &DividerU64,
) -> u64 {
gcd_divider.divide(Item::to_u64(val) - min_value)
}
#[test]
fn monotonic_mapping_roundtrip_test() {
let gcd = std::num::NonZeroU64::new(10).unwrap();
let divider = DividerU64::divide_by(gcd.get());
let orig_value: u64 = 500;
let normalized_val: u64 = normalize_with_gcd(orig_value, 100, &divider);
assert_eq!(normalized_val, 40);
assert_eq!(
from_gcd_normalized_u64::<u64>(normalized_val, 100, gcd.get()),
500
);
}
}
// Mapping pairs for the case where we only subtract the min_value.
pub mod min_val_mapping_pairs {
use super::*;
pub fn from_normalized_u64<Item: MonotonicallyMappableToU64>(val: u64, min_value: u64) -> Item {
Item::from_u64(min_value + val)
}
pub fn normalize<Item: MonotonicallyMappableToU64>(val: Item, min_value: u64) -> u64 {
Item::to_u64(val) - min_value
}
}
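Mirroring the roundtrip test in the gcd module above, a minimal sketch of the equivalent check for the plain min_value pair (not part of this commit):

#[test]
fn min_val_mapping_roundtrip_test() {
    let orig_value: u64 = 145;
    // Normalization subtracts the min_value.
    let normalized_val: u64 = min_val_mapping_pairs::normalize(orig_value, 100);
    assert_eq!(normalized_val, 45);
    // The inverse adds it back.
    assert_eq!(
        min_val_mapping_pairs::from_normalized_u64::<u64>(normalized_val, 100),
        145
    );
}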
impl MonotonicallyMappableToU64 for u64 {
fn to_u64(self) -> u64 {
self

View File

@@ -30,6 +30,7 @@ use crate::bitpacked::BitpackedCodec;
use crate::blockwise_linear::BlockwiseLinearCodec;
use crate::compact_space::CompactSpaceCompressor;
use crate::linear::LinearCodec;
use crate::monotonic_mapping::gcd_min_val_mapping_pairs::normalize_with_gcd;
use crate::{
monotonic_map_column, Column, FastFieldCodec, FastFieldCodecType, MonotonicallyMappableToU64,
VecColumn, ALL_CODEC_TYPES,
@@ -57,8 +58,9 @@ pub(crate) struct Header {
impl Header {
pub fn normalized(self) -> NormalizedHeader {
let max_value =
(self.max_value - self.min_value) / self.gcd.map(|gcd| gcd.get()).unwrap_or(1);
let gcd = self.gcd.map(|gcd| gcd.get()).unwrap_or(1);
let gcd_divider = DividerU64::divide_by(gcd);
let max_value = normalize_with_gcd(self.max_value, self.min_value, &gcd_divider);
NormalizedHeader {
num_vals: self.num_vals,
max_value,
@@ -66,14 +68,7 @@ impl Header {
}
pub fn normalize_column<C: Column>(&self, from_column: C) -> impl Column {
let min_value = self.min_value;
let gcd = self.gcd.map(|gcd| gcd.get()).unwrap_or(1);
let divider = DividerU64::divide_by(gcd);
monotonic_map_column(
from_column,
move |val| divider.divide(val - min_value),
|val| val,
)
normalize_column(from_column, self.min_value, self.gcd)
}
pub fn compute_header(
@@ -85,10 +80,8 @@ impl Header {
let max_value = column.max_value();
let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value))
.filter(|gcd| gcd.get() > 1u64);
let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64));
let shifted_column =
monotonic_map_column(&column, |val| divider.divide(val - min_value), |val| val);
let codec_type = detect_codec(shifted_column, codecs)?;
let normalized_column = normalize_column(column, min_value, gcd);
let codec_type = detect_codec(normalized_column, codecs)?;
Some(Header {
num_vals,
min_value,
@@ -99,6 +92,20 @@ impl Header {
}
}
pub fn normalize_column<C: Column>(
from_column: C,
min_value: u64,
gcd: Option<NonZeroU64>,
) -> impl Column {
let gcd = gcd.map(|gcd| gcd.get()).unwrap_or(1);
let gcd_divider = DividerU64::divide_by(gcd);
monotonic_map_column(
from_column,
move |val| normalize_with_gcd(val, min_value, &gcd_divider),
move |_val| unimplemented!(), // Inverse not needed: this column is only used during serialization, which never calls get_between_vals.
)
}
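A minimal usage sketch of the new normalize_column helper (hypothetical values, not part of this commit), assuming the VecColumn helper imported above:

#[test]
fn normalize_column_sketch() {
    let vals = &[100u64, 110u64, 130u64][..];
    let column = VecColumn::from(vals);
    // min_value = 100 and gcd = 10, so the normalized view yields [0, 1, 3].
    let normalized = normalize_column(column, 100, NonZeroU64::new(10));
    let normalized_vals: Vec<u64> = normalized.iter().collect();
    assert_eq!(normalized_vals, vec![0, 1, 3]);
    // Calling get_between_vals on this view would hit the unimplemented!() inverse;
    // that is fine, since serialization only reads values and never queries ranges.
}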
impl BinarySerializable for Header {
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
VInt(self.num_vals).serialize(writer)?;
@@ -138,9 +145,12 @@ pub fn estimate<T: MonotonicallyMappableToU64>(
let min_value = column.min_value();
let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value))
.filter(|gcd| gcd.get() > 1u64);
let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64));
let normalized_column =
monotonic_map_column(&column, |val| divider.divide(val - min_value), |val| val);
let gcd_divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64));
let normalized_column = monotonic_map_column(
&column,
|val| normalize_with_gcd(val, min_value, &gcd_divider),
|_val| unimplemented!(),
);
match codec_type {
FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&normalized_column),
FastFieldCodecType::Linear => LinearCodec::estimate(&normalized_column),

View File

@@ -319,7 +319,7 @@ impl MultiValueU128FastFieldWriter {
let value = field_value.value();
let ip_addr = value
.as_ip()
.expect(&format!("expected and ip, but got {:?}", value));
.unwrap_or_else(|| panic!("expected and ip, but got {:?}", value));
let value = ip_addr.to_u128();
self.add_val(value);
}

View File

@@ -331,7 +331,7 @@ impl U128FastFieldWriter {
Some(v) => {
let ip_addr = v
.as_ip()
.expect(&format!("expected and ip, but got {:?}", v));
.unwrap_or_else(|| panic!("expected and ip, but got {:?}", v));
let value = ip_addr.to_u128();
self.add_val(value);

View File

@@ -294,13 +294,7 @@ impl SegmentWriter {
ctx,
)?;
}
FieldType::IpAddr(_) => {
for value in values {
let ip_val = value.as_ip().ok_or_else(make_schema_error)?;
term_buffer.set_text(&ip_val.to_string());
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
}
}
FieldType::IpAddr(_) => {}
}
}
Ok(())