Paul Masurel
2023-01-23 17:15:43 +09:00
parent 0e66423de8
commit d29d63a829
19 changed files with 203 additions and 815 deletions

View File

@@ -17,10 +17,12 @@ use std::sync::Arc;
use common::{BinarySerializable, OwnedBytes};
use compact_space::CompactSpaceDecompressor;
pub use monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
use monotonic_mapping::{
StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
StrictlyMonotonicMappingToInternalBaseval, StrictlyMonotonicMappingToInternalGCDBaseval,
};
pub use monotonic_mapping_u128::MonotonicallyMappableToU128;
use serialize::{Header, U128Header};
mod bitpacked;
@@ -37,8 +39,6 @@ mod gcd;
pub mod serialize;
pub use self::column::{monotonic_map_column, ColumnValues, IterColumn, VecColumn};
pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128;
#[cfg(test)]
pub use self::serialize::tests::serialize_and_load;
pub use self::serialize::{serialize_column_values, NormalizedHeader};

View File

@@ -20,7 +20,7 @@ mod value;
pub use column::{BytesColumn, Column, StrColumn};
pub use column_index::ColumnIndex;
pub use column_values::ColumnValues;
pub use column_values::{ColumnValues, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
pub use columnar::{
merge_columnar, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType,
MergeDocOrder,

View File

@@ -3,254 +3,3 @@ use std::marker::PhantomData;
use std::ops::{Range, RangeInclusive};
pub use columnar::ColumnValues as Column;
use tantivy_bitpacker::minmax;
use crate::monotonic_mapping::StrictlyMonotonicFn;
/// VecColumn provides `Column` over a slice.
pub struct VecColumn<'a, T = u64> {
values: &'a [T],
min_value: T,
max_value: T,
}
impl<'a, T: Copy + PartialOrd + Send + Sync + Debug> Column<T> for VecColumn<'a, T> {
fn get_val(&self, position: u32) -> T {
self.values[position as usize]
}
fn iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
Box::new(self.values.iter().copied())
}
fn min_value(&self) -> T {
self.min_value
}
fn max_value(&self) -> T {
self.max_value
}
fn num_vals(&self) -> u32 {
self.values.len() as u32
}
fn get_range(&self, start: u64, output: &mut [T]) {
output.copy_from_slice(&self.values[start as usize..][..output.len()])
}
}
impl<'a, T: Copy + PartialOrd + Default, V> From<&'a V> for VecColumn<'a, T>
where V: AsRef<[T]> + ?Sized
{
fn from(values: &'a V) -> Self {
let values = values.as_ref();
let (min_value, max_value) = minmax(values.iter().copied()).unwrap_or_default();
Self {
values,
min_value,
max_value,
}
}
}
struct MonotonicMappingColumn<C, T, Input> {
from_column: C,
monotonic_mapping: T,
_phantom: PhantomData<Input>,
}
/// Creates a view of a column transformed by a strictly monotonic mapping. See
/// [`StrictlyMonotonicFn`].
///
/// E.g. applying a GCD mapping with a GCD of 100: monotonic_mapping([100, 200, 300]) == [1, 2, 3].
/// monotonic_mapping.mapping() is expected to be injective, and we should always have
/// monotonic_mapping.inverse(monotonic_mapping.mapping(el)) == el.
///
/// The inverse of the mapping is required for
/// `fn get_docids_for_value_range(&self, range: RangeInclusive<Output>, ..)`:
/// the user provides the original value range, and we need to map it monotonically, in the same
/// way the serialization did, before querying the underlying column.
///
/// Note that when opening a codec, the monotonic_mapping should be the inverse of the mapping used
/// during serialization. Consequently, the monotonic_mapping_inv when opening is the same as the
/// monotonic_mapping used during serialization.
pub fn monotonic_map_column<C, T, Input, Output>(
from_column: C,
monotonic_mapping: T,
) -> impl Column<Output>
where
C: Column<Input>,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Send + Sync + Copy + Debug,
Output: PartialOrd + Send + Sync + Copy + Debug,
{
MonotonicMappingColumn {
from_column,
monotonic_mapping,
_phantom: PhantomData,
}
}
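// Example (not part of this commit): a minimal sketch of the mapping described in the doc
// comment above, mirroring the test module below. `StrictlyMonotonicMappingToInternalBaseval::new(2)`
// maps an external value `v` to the internal value `v - 2` (and `inverse` maps it back), so the
// column [3, 5] is viewed as [1, 3], and a value-range query for 1..=3 on the mapped view is
// translated back through `inverse` to 3..=5 before it reaches the underlying column.
#[cfg(test)]
fn _monotonic_map_column_sketch() {
    use crate::monotonic_mapping::StrictlyMonotonicMappingToInternalBaseval;
    let vals = &[3u64, 5u64][..];
    let mapped = monotonic_map_column(
        VecColumn::from(vals),
        StrictlyMonotonicMappingToInternalBaseval::new(2),
    );
    assert_eq!(mapped.get_val(0), 1u64);
    assert_eq!(mapped.get_val(1), 3u64);
}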
impl<C, T, Input, Output> Column<Output> for MonotonicMappingColumn<C, T, Input>
where
C: Column<Input>,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Send + Sync + Copy + Debug,
Output: PartialOrd + Send + Sync + Copy + Debug,
{
#[inline]
fn get_val(&self, idx: u32) -> Output {
let from_val = self.from_column.get_val(idx);
self.monotonic_mapping.mapping(from_val)
}
fn min_value(&self) -> Output {
let from_min_value = self.from_column.min_value();
self.monotonic_mapping.mapping(from_min_value)
}
fn max_value(&self) -> Output {
let from_max_value = self.from_column.max_value();
self.monotonic_mapping.mapping(from_max_value)
}
fn num_vals(&self) -> u32 {
self.from_column.num_vals()
}
fn iter(&self) -> Box<dyn Iterator<Item = Output> + '_> {
Box::new(
self.from_column
.iter()
.map(|el| self.monotonic_mapping.mapping(el)),
)
}
fn get_docids_for_value_range(
&self,
range: RangeInclusive<Output>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
if range.start() > &self.max_value() || range.end() < &self.min_value() {
return;
}
let range = self.monotonic_mapping.inverse_coerce(range);
if range.start() > range.end() {
return;
}
self.from_column
.get_docids_for_value_range(range, doc_id_range, positions)
}
// We voluntarily do not implement get_range as it yields a regression,
// and we do not have any specialized implementation anyway.
}
/// Wraps an iterator into a `Column`.
pub struct IterColumn<T>(T);
impl<T> From<T> for IterColumn<T>
where T: Iterator + Clone + ExactSizeIterator
{
fn from(iter: T) -> Self {
IterColumn(iter)
}
}
impl<T> Column<T::Item> for IterColumn<T>
where
T: Iterator + Clone + ExactSizeIterator + Send + Sync,
T::Item: PartialOrd + fmt::Debug,
{
fn get_val(&self, idx: u32) -> T::Item {
self.0.clone().nth(idx as usize).unwrap()
}
fn min_value(&self) -> T::Item {
self.0.clone().next().unwrap()
}
fn max_value(&self) -> T::Item {
self.0.clone().last().unwrap()
}
fn num_vals(&self) -> u32 {
self.0.len() as u32
}
fn iter(&self) -> Box<dyn Iterator<Item = T::Item> + '_> {
Box::new(self.0.clone())
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::monotonic_mapping::{
StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternalBaseval,
StrictlyMonotonicMappingToInternalGCDBaseval,
};
#[test]
fn test_monotonic_mapping() {
let vals = &[3u64, 5u64][..];
let col = VecColumn::from(vals);
let mapped = monotonic_map_column(col, StrictlyMonotonicMappingToInternalBaseval::new(2));
assert_eq!(mapped.min_value(), 1u64);
assert_eq!(mapped.max_value(), 3u64);
assert_eq!(mapped.num_vals(), 2);
assert_eq!(mapped.num_vals(), 2);
assert_eq!(mapped.get_val(0), 1);
assert_eq!(mapped.get_val(1), 3);
}
#[test]
fn test_range_as_col() {
let col = IterColumn::from(10..100);
assert_eq!(col.num_vals(), 90);
assert_eq!(col.max_value(), 99);
}
#[test]
fn test_monotonic_mapping_iter() {
let vals: Vec<u64> = (10..110u64).map(|el| el * 10).collect();
let col = VecColumn::from(&vals);
let mapped = monotonic_map_column(
col,
StrictlyMonotonicMappingInverter::from(
StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 100),
),
);
let val_i64s: Vec<u64> = mapped.iter().collect();
for i in 0..100 {
assert_eq!(val_i64s[i as usize], mapped.get_val(i));
}
}
#[test]
fn test_monotonic_mapping_get_range() {
let vals: Vec<u64> = (0..100u64).map(|el| el * 10).collect();
let col = VecColumn::from(&vals);
let mapped = monotonic_map_column(
col,
StrictlyMonotonicMappingInverter::from(
StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 0),
),
);
assert_eq!(mapped.min_value(), 0u64);
assert_eq!(mapped.max_value(), 9900u64);
assert_eq!(mapped.num_vals(), 100);
let val_u64s: Vec<u64> = mapped.iter().collect();
assert_eq!(val_u64s.len(), 100);
for i in 0..100 {
assert_eq!(val_u64s[i as usize], mapped.get_val(i));
assert_eq!(val_u64s[i as usize], vals[i as usize] * 10);
}
let mut buf = [0u64; 20];
mapped.get_range(7, &mut buf[..]);
assert_eq!(&val_u64s[7..][..20], &buf);
}
}

View File

@@ -14,208 +14,206 @@ extern crate more_asserts;
#[cfg(all(test, feature = "unstable"))]
extern crate test;
use std::io::Write;
use std::sync::Arc;
use std::{fmt, io};
// use common::{BinarySerializable, OwnedBytes};
// use compact_space::CompactSpaceDecompressor;
// use format_version::read_format_version;
// use monotonic_mapping::{
// StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
// StrictlyMonotonicMappingToInternalBaseval, StrictlyMonotonicMappingToInternalGCDBaseval,
// };
// use null_index_footer::read_null_index_footer;
// use serialize::{Header, U128Header};
use common::{BinarySerializable, OwnedBytes};
use compact_space::CompactSpaceDecompressor;
use format_version::read_format_version;
use monotonic_mapping::{
StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
StrictlyMonotonicMappingToInternalBaseval, StrictlyMonotonicMappingToInternalGCDBaseval,
};
use null_index_footer::read_null_index_footer;
use serialize::{Header, U128Header};
mod bitpacked;
mod blockwise_linear;
mod compact_space;
mod format_version;
mod line;
mod linear;
mod monotonic_mapping;
mod monotonic_mapping_u128;
#[allow(dead_code)]
mod null_index;
mod null_index_footer;
// mod bitpacked;
// mod blockwise_linear;
// mod compact_space;
// mod format_version;
// mod line;
// mod linear;
// mod monotonic_mapping;
// mod monotonic_mapping_u128;
// #[allow(dead_code)]
// mod null_index;
// mod null_index_footer;
mod column;
mod gcd;
pub mod serialize;
pub use column::Column;
use self::bitpacked::BitpackedCodec;
use self::blockwise_linear::BlockwiseLinearCodec;
pub use self::column::{monotonic_map_column, Column, IterColumn, VecColumn};
use self::linear::LinearCodec;
pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128;
pub use self::serialize::{
estimate, serialize, serialize_and_load, serialize_u128, NormalizedHeader,
};
// mod gcd;
// pub mod serialize;
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
#[repr(u8)]
/// Available codecs for encoding data converted to u64 (via [`MonotonicallyMappableToU64`]).
pub enum FastFieldCodecType {
/// Bitpack all values in the value range. The number of bits is defined by the amplitude
/// `column.max_value() - column.min_value()`
Bitpacked = 1,
/// Linear interpolation puts a line between the first and last value and then bitpacks the
/// values by the offset from the line. The number of bits is defined by the max deviation from
/// the line.
Linear = 2,
/// Same as [`FastFieldCodecType::Linear`], but encodes in blocks of 512 elements.
BlockwiseLinear = 3,
}
// use self::bitpacked::BitpackedCodec;
// use self::blockwise_linear::BlockwiseLinearCodec;
// pub use self::column::{monotonic_map_column, Column, IterColumn, VecColumn};
// use self::linear::LinearCodec;
// pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
// pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128;
// pub use self::serialize::{
// estimate, serialize, serialize_and_load, serialize_u128, NormalizedHeader,
// };
impl BinarySerializable for FastFieldCodecType {
fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
self.to_code().serialize(wrt)
}
// #[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
// #[repr(u8)]
// /// Available codecs to use to encode the u64 (via [`MonotonicallyMappableToU64`]) converted
// data. pub enum FastFieldCodecType {
// /// Bitpack all values in the value range. The number of bits is defined by the amplitude
// /// `column.max_value() - column.min_value()`
// Bitpacked = 1,
// /// Linear interpolation puts a line between the first and last value and then bitpacks the
// /// values by the offset from the line. The number of bits is defined by the max deviation
// from /// the line.
// Linear = 2,
// /// Same as [`FastFieldCodecType::Linear`], but encodes in blocks of 512 elements.
// BlockwiseLinear = 3,
// }
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let code = u8::deserialize(reader)?;
let codec_type: Self = Self::from_code(code)
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, format!("unknown codec code `{code}`")))?;
Ok(codec_type)
}
}
// impl BinarySerializable for FastFieldCodecType {
// fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
// self.to_code().serialize(wrt)
// }
impl FastFieldCodecType {
pub(crate) fn to_code(self) -> u8 {
self as u8
}
// fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
// let code = u8::deserialize(reader)?;
// let codec_type: Self = Self::from_code(code)
// .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code
// `{code}.`"))?; Ok(codec_type)
// }
// }
pub(crate) fn from_code(code: u8) -> Option<Self> {
match code {
1 => Some(Self::Bitpacked),
2 => Some(Self::Linear),
3 => Some(Self::BlockwiseLinear),
_ => None,
}
}
}
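// Example (not part of this commit): the codec type is persisted as a single byte, so
// `to_code` and `from_code` must round-trip exactly, and unknown bytes are rejected at
// deserialization time.
#[cfg(test)]
fn _codec_code_roundtrip_sketch() {
    assert_eq!(FastFieldCodecType::Linear.to_code(), 2u8);
    assert_eq!(
        FastFieldCodecType::from_code(2),
        Some(FastFieldCodecType::Linear)
    );
    assert_eq!(FastFieldCodecType::from_code(42), None);
}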
// impl FastFieldCodecType {
// pub(crate) fn to_code(self) -> u8 {
// self as u8
// }
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
#[repr(u8)]
/// Available codecs for encoding data converted to u128 (via [`MonotonicallyMappableToU128`]).
pub enum U128FastFieldCodecType {
/// This codec takes a large number space (u128) and reduces it to a compact number space, by
/// removing the holes.
CompactSpace = 1,
}
// pub(crate) fn from_code(code: u8) -> Option<Self> {
// match code {
// 1 => Some(Self::Bitpacked),
// 2 => Some(Self::Linear),
// 3 => Some(Self::BlockwiseLinear),
// _ => None,
// }
// }
// }
impl BinarySerializable for U128FastFieldCodecType {
fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
self.to_code().serialize(wrt)
}
// #[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
// #[repr(u8)]
// /// Available codecs to use to encode the u128 (via [`MonotonicallyMappableToU128`]) converted
// data. pub enum U128FastFieldCodecType {
// /// This codec takes a large number space (u128) and reduces it to a compact number space, by
// /// removing the holes.
// CompactSpace = 1,
// }
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let code = u8::deserialize(reader)?;
let codec_type: Self = Self::from_code(code)
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, format!("unknown codec code `{code}`")))?;
Ok(codec_type)
}
}
// impl BinarySerializable for U128FastFieldCodecType {
// fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
// self.to_code().serialize(wrt)
// }
impl U128FastFieldCodecType {
pub(crate) fn to_code(self) -> u8 {
self as u8
}
// fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
// let code = u8::deserialize(reader)?;
// let codec_type: Self = Self::from_code(code)
// .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code
// `{code}.`"))?; Ok(codec_type)
// }
// }
pub(crate) fn from_code(code: u8) -> Option<Self> {
match code {
1 => Some(Self::CompactSpace),
_ => None,
}
}
}
// impl U128FastFieldCodecType {
// pub(crate) fn to_code(self) -> u8 {
// self as u8
// }
/// Returns the correct codec reader wrapped in the `Arc` for the data.
pub fn open_u128<Item: MonotonicallyMappableToU128 + fmt::Debug>(
bytes: OwnedBytes,
) -> io::Result<Arc<dyn Column<Item>>> {
let (bytes, _format_version) = read_format_version(bytes)?;
let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
let header = U128Header::deserialize(&mut bytes)?;
assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace);
let reader = CompactSpaceDecompressor::open(bytes)?;
let inverted: StrictlyMonotonicMappingInverter<StrictlyMonotonicMappingToInternal<Item>> =
StrictlyMonotonicMappingToInternal::<Item>::new().into();
Ok(Arc::new(monotonic_map_column(reader, inverted)))
}
// pub(crate) fn from_code(code: u8) -> Option<Self> {
// match code {
// 1 => Some(Self::CompactSpace),
// _ => None,
// }
// }
// }
/// Returns the correct codec reader wrapped in the `Arc` for the data.
pub fn open<T: MonotonicallyMappableToU64 + fmt::Debug>(
bytes: OwnedBytes,
) -> io::Result<Arc<dyn Column<T>>> {
let (bytes, _format_version) = read_format_version(bytes)?;
let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
let header = Header::deserialize(&mut bytes)?;
match header.codec_type {
FastFieldCodecType::Bitpacked => open_specific_codec::<BitpackedCodec, _>(bytes, &header),
FastFieldCodecType::Linear => open_specific_codec::<LinearCodec, _>(bytes, &header),
FastFieldCodecType::BlockwiseLinear => {
open_specific_codec::<BlockwiseLinearCodec, _>(bytes, &header)
}
}
}
// /// Returns the correct codec reader wrapped in the `Arc` for the data.
// pub fn open_u128<Item: MonotonicallyMappableToU128 + fmt::Debug>(
// bytes: OwnedBytes,
// ) -> io::Result<Arc<dyn Column<Item>>> {
// let (bytes, _format_version) = read_format_version(bytes)?;
// let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
// let header = U128Header::deserialize(&mut bytes)?;
// assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace);
// let reader = CompactSpaceDecompressor::open(bytes)?;
// let inverted: StrictlyMonotonicMappingInverter<StrictlyMonotonicMappingToInternal<Item>> =
// StrictlyMonotonicMappingToInternal::<Item>::new().into();
// Ok(Arc::new(monotonic_map_column(reader, inverted)))
// }
fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64 + fmt::Debug>(
bytes: OwnedBytes,
header: &Header,
) -> io::Result<Arc<dyn Column<Item>>> {
let normalized_header = header.normalized();
let reader = C::open_from_bytes(bytes, normalized_header)?;
let min_value = header.min_value;
if let Some(gcd) = header.gcd {
let mapping = StrictlyMonotonicMappingInverter::from(
StrictlyMonotonicMappingToInternalGCDBaseval::new(gcd.get(), min_value),
);
Ok(Arc::new(monotonic_map_column(reader, mapping)))
} else {
let mapping = StrictlyMonotonicMappingInverter::from(
StrictlyMonotonicMappingToInternalBaseval::new(min_value),
);
Ok(Arc::new(monotonic_map_column(reader, mapping)))
}
}
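// Example (not part of this commit): a hedged sketch of the read path described above. Given
// the bytes of a serialized u64 fast field column (e.g. produced by the exported `serialize`
// function), `open` reads the format version, null index footer and header, picks the codec,
// and wraps the codec reader in the inverse monotonic mapping.
#[cfg(test)]
fn _open_column_sketch(bytes: OwnedBytes) -> io::Result<u64> {
    let column: Arc<dyn Column<u64>> = open(bytes)?;
    Ok(column.min_value())
}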
// /// Returns the correct codec reader wrapped in the `Arc` for the data.
// pub fn open<T: MonotonicallyMappableToU64 + fmt::Debug>(
// bytes: OwnedBytes,
// ) -> io::Result<Arc<dyn Column<T>>> {
// let (bytes, _format_version) = read_format_version(bytes)?;
// let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
// let header = Header::deserialize(&mut bytes)?;
// match header.codec_type {
// FastFieldCodecType::Bitpacked => open_specific_codec::<BitpackedCodec, _>(bytes,
// &header), FastFieldCodecType::Linear => open_specific_codec::<LinearCodec, _>(bytes,
// &header), FastFieldCodecType::BlockwiseLinear => {
// open_specific_codec::<BlockwiseLinearCodec, _>(bytes, &header)
// }
// }
// }
/// The `FastFieldCodec` trait is required on all variants
/// of fast field compression, to decide which one to choose.
trait FastFieldCodec: 'static {
/// A codec needs to provide a unique name and id, which are
/// used for debugging and de/serialization.
const CODEC_TYPE: FastFieldCodecType;
// fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64 + fmt::Debug>(
// bytes: OwnedBytes,
// header: &Header,
// ) -> io::Result<Arc<dyn Column<Item>>> {
// let normalized_header = header.normalized();
// let reader = C::open_from_bytes(bytes, normalized_header)?;
// let min_value = header.min_value;
// if let Some(gcd) = header.gcd {
// let mapping = StrictlyMonotonicMappingInverter::from(
// StrictlyMonotonicMappingToInternalGCDBaseval::new(gcd.get(), min_value),
// );
// Ok(Arc::new(monotonic_map_column(reader, mapping)))
// } else {
// let mapping = StrictlyMonotonicMappingInverter::from(
// StrictlyMonotonicMappingToInternalBaseval::new(min_value),
// );
// Ok(Arc::new(monotonic_map_column(reader, mapping)))
// }
// }
type Reader: Column<u64> + 'static;
// /// The FastFieldSerializerEstimate trait is required on all variants
// /// of fast field compressions, to decide which one to choose.
// trait FastFieldCodec: 'static {
// /// A codex needs to provide a unique name and id, which is
// /// used for debugging and de/serialization.
// const CODEC_TYPE: FastFieldCodecType;
/// Reads the metadata and returns the CodecReader
fn open_from_bytes(bytes: OwnedBytes, header: NormalizedHeader) -> io::Result<Self::Reader>;
// type Reader: Column<u64> + 'static;
/// Serializes the data using the serializer into `write`.
///
/// The column iterator should be preferred over the column's `get_val` method for
/// performance reasons.
fn serialize(column: &dyn Column, write: &mut impl Write) -> io::Result<()>;
// /// Reads the metadata and returns the CodecReader
// fn open_from_bytes(bytes: OwnedBytes, header: NormalizedHeader) -> io::Result<Self::Reader>;
/// Returns an estimate of the compression ratio.
/// If the codec is not applicable, returns `None`.
///
/// The baseline is uncompressed 64-bit data.
///
/// It could make sense to also return a value representing
/// computational complexity.
fn estimate(column: &dyn Column) -> Option<f32>;
}
// /// Serializes the data using the serializer into write.
// ///
// /// The column iterator should be preferred over using column `get_val` method for
// /// performance reasons.
// fn serialize(column: &dyn Column, write: &mut impl Write) -> io::Result<()>;
/// The list of all available codecs for u64-convertible data.
pub const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [
FastFieldCodecType::Bitpacked,
FastFieldCodecType::BlockwiseLinear,
FastFieldCodecType::Linear,
];
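// Example (not part of this commit): a hedged sketch of codec selection using the exported
// `estimate` helper (re-exported from the serialize module above), similar to
// `serialize_with_codec` in the benchmark binary further down in this commit. A lower
// estimated ratio means better expected compression; the baseline of 1.0 is uncompressed
// 64-bit data.
#[cfg(test)]
fn _pick_best_codec_sketch(column: &VecColumn<'_, u64>) -> Option<FastFieldCodecType> {
    ALL_CODEC_TYPES
        .iter()
        .copied()
        .filter_map(|codec_type| Some((estimate(column, codec_type)?, codec_type)))
        .min_by(|a, b| a.0.partial_cmp(&b.0).unwrap())
        .map(|(_estimate, codec_type)| codec_type)
}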
// /// Returns an estimate of the compression ratio.
// /// If the codec is not applicable, returns `None`.
// ///
// /// The baseline is uncompressed 64bit data.
// ///
// /// It could make sense to also return a value representing
// /// computational complexity.
// fn estimate(column: &dyn Column) -> Option<f32>;
// }
// /// The list of all available codecs for u64 convertible data.
// pub const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [
// FastFieldCodecType::Bitpacked,
// FastFieldCodecType::BlockwiseLinear,
// FastFieldCodecType::Linear,
// ];
#[cfg(test)]
mod tests {

View File

@@ -1,222 +0,0 @@
#[macro_use]
extern crate prettytable;
use std::collections::HashSet;
use std::env;
use std::io::BufRead;
use std::net::{IpAddr, Ipv6Addr};
use std::str::FromStr;
use common::OwnedBytes;
use fastfield_codecs::{open_u128, serialize_u128, Column, FastFieldCodecType, VecColumn};
use itertools::Itertools;
use measure_time::print_time;
use prettytable::{Cell, Row, Table};
fn print_set_stats(ip_addrs: &[u128]) {
println!("NumIps\t{}", ip_addrs.len());
let ip_addr_set: HashSet<u128> = ip_addrs.iter().cloned().collect();
println!("NumUniqueIps\t{}", ip_addr_set.len());
let ratio_unique = ip_addr_set.len() as f64 / ip_addrs.len() as f64;
println!("RatioUniqueOverTotal\t{ratio_unique:.4}");
// histogram
let mut ip_addrs = ip_addrs.to_vec();
ip_addrs.sort();
let mut cnts: Vec<usize> = ip_addrs
.into_iter()
.dedup_with_count()
.map(|(cnt, _)| cnt)
.collect();
cnts.sort();
let top_256_cnt: usize = cnts.iter().rev().take(256).sum();
let top_128_cnt: usize = cnts.iter().rev().take(128).sum();
let top_64_cnt: usize = cnts.iter().rev().take(64).sum();
let top_8_cnt: usize = cnts.iter().rev().take(8).sum();
let total: usize = cnts.iter().sum();
println!("{}", total);
println!("{}", top_256_cnt);
println!("{}", top_128_cnt);
println!("Percentage Top8 {:02}", top_8_cnt as f32 / total as f32);
println!("Percentage Top64 {:02}", top_64_cnt as f32 / total as f32);
println!("Percentage Top128 {:02}", top_128_cnt as f32 / total as f32);
println!("Percentage Top256 {:02}", top_256_cnt as f32 / total as f32);
let mut cnts: Vec<(usize, usize)> = cnts.into_iter().dedup_with_count().collect();
cnts.sort_by(|a, b| {
if a.1 == b.1 {
a.0.cmp(&b.0)
} else {
b.1.cmp(&a.1)
}
});
}
fn ip_dataset() -> Vec<u128> {
let mut ip_addr_v4 = 0;
let stdin = std::io::stdin();
let ip_addrs: Vec<u128> = stdin
.lock()
.lines()
.flat_map(|line| {
let line = line.unwrap();
let line = line.trim();
let ip_addr = IpAddr::from_str(line.trim()).ok()?;
if ip_addr.is_ipv4() {
ip_addr_v4 += 1;
}
let ip_addr_v6: Ipv6Addr = match ip_addr {
IpAddr::V4(v4) => v4.to_ipv6_mapped(),
IpAddr::V6(v6) => v6,
};
Some(ip_addr_v6)
})
.map(|ip_v6| u128::from_be_bytes(ip_v6.octets()))
.collect();
println!("IpAddrsAny\t{}", ip_addrs.len());
println!("IpAddrsV4\t{}", ip_addr_v4);
ip_addrs
}
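// Example (not part of this commit): a small worked example of the conversion above.
// 127.0.0.1 becomes the IPv6-mapped address ::ffff:127.0.0.1, whose big-endian octets
// yield the u128 0x0000_0000_0000_0000_0000_ffff_7f00_0001.
#[cfg(test)]
fn _ipv4_to_u128_sketch() {
    use std::net::Ipv4Addr;
    let ip_v6 = Ipv4Addr::new(127, 0, 0, 1).to_ipv6_mapped();
    let as_u128 = u128::from_be_bytes(ip_v6.octets());
    assert_eq!(as_u128, 0x0000_0000_0000_0000_0000_ffff_7f00_0001);
}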
fn bench_ip() {
let dataset = ip_dataset();
print_set_stats(&dataset);
// Chunks
{
let mut data = vec![];
for dataset in dataset.chunks(500_000) {
serialize_u128(|| dataset.iter().cloned(), dataset.len() as u32, &mut data).unwrap();
}
let compression = data.len() as f64 / (dataset.len() * 16) as f64;
println!("Compression 50_000 chunks {:.4}", compression);
println!(
"Num Bits per elem {:.2}",
(data.len() * 8) as f32 / dataset.len() as f32
);
}
let mut data = vec![];
{
print_time!("creation");
serialize_u128(|| dataset.iter().cloned(), dataset.len() as u32, &mut data).unwrap();
}
let compression = data.len() as f64 / (dataset.len() * 16) as f64;
println!("Compression {:.2}", compression);
println!(
"Num Bits per elem {:.2}",
(data.len() * 8) as f32 / dataset.len() as f32
);
let decompressor = open_u128::<u128>(OwnedBytes::new(data)).unwrap();
// Sample some ranges
let mut doc_values = Vec::new();
for value in dataset.iter().take(1110).skip(1100).cloned() {
doc_values.clear();
print_time!("get range");
decompressor.get_docids_for_value_range(
value..=value,
0..decompressor.num_vals(),
&mut doc_values,
);
println!("{:?}", doc_values.len());
}
}
fn main() {
if env::args().nth(1).unwrap() == "bench_ip" {
bench_ip();
return;
}
let mut table = Table::new();
// Add the header row
table.add_row(row!["", "Compression Ratio", "Compression Estimation"]);
for (data, data_set_name) in get_codec_test_data_sets() {
let results: Vec<(f32, f32, FastFieldCodecType)> = [
serialize_with_codec(&data, FastFieldCodecType::Bitpacked),
serialize_with_codec(&data, FastFieldCodecType::Linear),
serialize_with_codec(&data, FastFieldCodecType::BlockwiseLinear),
]
.into_iter()
.flatten()
.collect();
let best_compression_ratio_codec = results
.iter()
.min_by(|&res1, &res2| res1.partial_cmp(res2).unwrap())
.cloned()
.unwrap();
table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
for (est, comp, codec_type) in results {
let est_cell = est.to_string();
let ratio_cell = comp.to_string();
let style = if comp == best_compression_ratio_codec.1 {
"Fb"
} else {
""
};
table.add_row(Row::new(vec![
Cell::new(&format!("{codec_type:?}")).style_spec("bFg"),
Cell::new(&ratio_cell).style_spec(style),
Cell::new(&est_cell).style_spec(""),
]));
}
}
table.printstd();
}
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
let mut data_and_names = vec![];
let data = (1000..=200_000_u64).collect::<Vec<_>>();
data_and_names.push((data, "Autoincrement"));
let mut current_cumulative = 0;
let data = (1..=200_000_u64)
.map(|num| {
let num = (num as f32 + num as f32).log10() as u64;
current_cumulative += num;
current_cumulative
})
.collect::<Vec<_>>();
// let data = (1..=200000_u64).map(|num| num + num).collect::<Vec<_>>();
data_and_names.push((data, "Monotonically increasing concave"));
let mut current_cumulative = 0;
let data = (1..=200_000_u64)
.map(|num| {
let num = (200_000.0 - num as f32).log10() as u64;
current_cumulative += num;
current_cumulative
})
.collect::<Vec<_>>();
data_and_names.push((data, "Monotonically increasing convex"));
let data = (1000..=200_000_u64)
.map(|num| num + rand::random::<u8>() as u64)
.collect::<Vec<_>>();
data_and_names.push((data, "Almost monotonically increasing"));
data_and_names
}
pub fn serialize_with_codec(
data: &[u64],
codec_type: FastFieldCodecType,
) -> Option<(f32, f32, FastFieldCodecType)> {
let col = VecColumn::from(data);
let estimation = fastfield_codecs::estimate(&col, codec_type)?;
let mut out = Vec::new();
fastfield_codecs::serialize(&col, &mut out, &[codec_type]).ok()?;
let actual_compression = out.len() as f32 / (col.num_vals() * 8) as f32;
Some((estimation, actual_compression, codec_type))
}

View File

@@ -21,14 +21,14 @@
use std::net::Ipv6Addr;
use fastfield_codecs::MonotonicallyMappableToU64;
use columnar::MonotonicallyMappableToU64;
pub use fastfield_codecs::Column;
pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveBitSet};
// pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
pub use self::error::{FastFieldNotAvailableError, Result};
// pub use self::facet_reader::FacetReader;
pub use self::readers::FastFieldReaders;
pub use self::serializer::{Column, CompositeFastFieldSerializer};
pub use self::writer::FastFieldsWriter;
use crate::schema::{Type, Value};
use crate::DateTime;
@@ -38,7 +38,6 @@ mod alive_bitset;
mod error;
// mod facet_reader;
mod readers;
mod serializer;
mod writer;
/// Trait for types that provide a zero value.
@@ -71,7 +70,7 @@ impl MakeZero for Ipv6Addr {
/// Trait for types that are allowed for fast fields:
/// (u64, i64, f64, bool, DateTime).
pub trait FastValue:
MonotonicallyMappableToU64 + Copy + Send + Sync + PartialOrd + 'static
Copy + Send + Sync + columnar::MonotonicallyMappableToU64 + PartialOrd + 'static
{
/// Returns the `schema::Type` for this FastValue.
fn to_type() -> Type;
@@ -100,21 +99,21 @@ impl FastValue for bool {
Type::Bool
}
}
impl FastValue for DateTime {
fn to_type() -> Type {
Type::Date
}
}
impl MonotonicallyMappableToU64 for DateTime {
impl columnar::MonotonicallyMappableToU64 for DateTime {
fn to_u64(self) -> u64 {
self.timestamp_micros.to_u64()
}
fn from_u64(val: u64) -> Self {
let timestamp_micros = i64::from_u64(val);
DateTime { timestamp_micros }
}
}
impl FastValue for DateTime {
fn to_type() -> Type {
Type::Date
DateTime {
timestamp_micros: MonotonicallyMappableToU64::from_u64(val),
}
}
}
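// Example (not part of this commit): the impl above maps `DateTime` to u64 through its
// `timestamp_micros: i64`. A common order-preserving i64 -> u64 mapping (assumed here for
// illustration; tantivy's actual helper lives in the `common` crate) is to flip the sign
// bit, so that i64::MIN maps to 0 and i64::MAX maps to u64::MAX, preserving ordering.
#[cfg(test)]
fn _i64_to_u64_monotonic_sketch() {
    fn i64_to_u64(val: i64) -> u64 {
        (val as u64) ^ (1 << 63)
    }
    fn u64_to_i64(val: u64) -> i64 {
        (val ^ (1 << 63)) as i64
    }
    assert_eq!(i64_to_u64(i64::MIN), 0);
    assert_eq!(i64_to_u64(-1) + 1, i64_to_u64(0));
    assert_eq!(u64_to_i64(i64_to_u64(12345)), 12345);
}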
@@ -166,7 +165,6 @@ mod tests {
use std::sync::Arc;
use common::{HasLen, TerminatingWrite};
use fastfield_codecs::{open, FastFieldCodecType};
use once_cell::sync::Lazy;
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;

View File

@@ -4,15 +4,12 @@ use std::sync::Arc;
use columnar::{
BytesColumn, ColumnType, ColumnValues, ColumnarReader, DynamicColumn, DynamicColumnHandle,
HasAssociatedColumnType, NumericalType, StrColumn,
HasAssociatedColumnType, StrColumn,
};
use fastfield_codecs::{open, open_u128, Column};
use fastfield_codecs::Column;
use crate::directory::{CompositeFile, FileSlice};
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
use crate::schema::{Field, FieldType, Schema};
use crate::directory::FileSlice;
use crate::space_usage::PerFieldSpaceUsage;
use crate::{DateTime, TantivyError};
/// Provides access to all of the BitpackedFastFieldReader.
///

View File

@@ -1,122 +0,0 @@
use std::fmt;
use std::io::{self, Write};
pub use fastfield_codecs::Column;
use fastfield_codecs::{FastFieldCodecType, MonotonicallyMappableToU64, ALL_CODEC_TYPES};
use crate::directory::{CompositeWrite, WritePtr};
use crate::schema::Field;
/// `CompositeFastFieldSerializer` is in charge of serializing
/// fastfields on disk.
///
/// Fast fields have different encodings like bit-packing.
///
/// `FastFieldWriter`s are in charge of pushing the data to
/// the serializer.
/// The serializer expects to receive the following calls.
///
/// * `create_auto_detect_u64_fast_field(...)`
/// * `create_auto_detect_u64_fast_field(...)`
/// * ...
/// * `let bytes_fastfield = new_bytes_fast_field(...)`
/// * `bytes_fastfield.write_all(...)`
/// * `bytes_fastfield.write_all(...)`
/// * `bytes_fastfield.flush()`
/// * ...
/// * `close()`
pub struct CompositeFastFieldSerializer {
composite_write: CompositeWrite<WritePtr>,
codec_types: Vec<FastFieldCodecType>,
}
impl CompositeFastFieldSerializer {
/// New fast field serializer with all codec types
pub fn from_write(write: WritePtr) -> io::Result<CompositeFastFieldSerializer> {
Self::from_write_with_codec(write, &ALL_CODEC_TYPES)
}
/// New fast field serializer with allowed codec types
pub fn from_write_with_codec(
write: WritePtr,
codec_types: &[FastFieldCodecType],
) -> io::Result<CompositeFastFieldSerializer> {
let composite_write = CompositeWrite::wrap(write);
Ok(CompositeFastFieldSerializer {
composite_write,
codec_types: codec_types.to_vec(),
})
}
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
/// automatically.
pub fn create_auto_detect_u64_fast_field<T: MonotonicallyMappableToU64 + fmt::Debug>(
&mut self,
field: Field,
fastfield_accessor: impl Column<T>,
) -> io::Result<()> {
self.create_auto_detect_u64_fast_field_with_idx(field, fastfield_accessor, 0)
}
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
/// automatically.
pub fn create_auto_detect_u64_fast_field_with_idx<
T: MonotonicallyMappableToU64 + fmt::Debug,
>(
&mut self,
field: Field,
fastfield_accessor: impl Column<T>,
idx: usize,
) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
fastfield_codecs::serialize(fastfield_accessor, field_write, &self.codec_types)?;
Ok(())
}
/// Serialize data into a new u64 fast field. The best compression codec among the provided
/// ones will be chosen.
pub fn create_auto_detect_u64_fast_field_with_idx_and_codecs<
T: MonotonicallyMappableToU64 + fmt::Debug,
>(
&mut self,
field: Field,
fastfield_accessor: impl Column<T>,
idx: usize,
codec_types: &[FastFieldCodecType],
) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
fastfield_codecs::serialize(fastfield_accessor, field_write, codec_types)?;
Ok(())
}
/// Serialize data into a new u128 fast field. The codec will be the compact space compressor,
/// which is optimized for scanning the fast field for a given range.
pub fn create_u128_fast_field_with_idx<F: Fn() -> I, I: Iterator<Item = u128>>(
&mut self,
field: Field,
iter_gen: F,
num_vals: u32,
idx: usize,
) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
fastfield_codecs::serialize_u128(iter_gen, num_vals, field_write)?;
Ok(())
}
/// Start serializing a new `[u8]` fast field. Use the returned writer to write data into the
/// bytes field. To associate the bytes with documents, a separate index must be created at
/// index 0. See bytes/writer.rs::serialize for an example.
///
/// The bytes will be stored as is, no compression will be applied.
pub fn new_bytes_fast_field(&mut self, field: Field) -> impl Write + '_ {
self.composite_write.for_field_with_idx(field, 1)
}
/// Closes the serializer
///
/// After this call the data must be persistently saved on disk.
pub fn close(self) -> io::Result<()> {
self.composite_write.close()
}
}
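// Example (not part of this commit): a hedged sketch of the call sequence listed in the doc
// comment above, for a single u64 column. The `wrt`, `field` and `column` arguments are
// assumed to be provided by the caller.
#[cfg(test)]
fn _composite_serializer_sketch(
    wrt: WritePtr,
    field: Field,
    column: impl Column<u64>,
) -> io::Result<()> {
    let mut serializer = CompositeFastFieldSerializer::from_write(wrt)?;
    serializer.create_auto_detect_u64_fast_field(field, column)?;
    serializer.close()
}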

View File

@@ -3,12 +3,10 @@ use std::io;
use columnar::{ColumnType, ColumnarWriter, NumericalType, NumericalValue};
use common;
use fastfield_codecs::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
use rustc_hash::FxHashMap;
use tantivy_bitpacker::BlockedBitpacker;
use super::FastFieldType;
use crate::fastfield::CompositeFastFieldSerializer;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Document, Field, FieldEntry, FieldType, Schema, Type, Value};

View File

@@ -801,7 +801,6 @@ mod tests {
use std::collections::{HashMap, HashSet};
use std::net::Ipv6Addr;
use fastfield_codecs::MonotonicallyMappableToU128;
use proptest::prelude::*;
use proptest::prop_oneof;
use proptest::strategy::Strategy;

View File

@@ -1,4 +1,4 @@
use fastfield_codecs::MonotonicallyMappableToU64;
use columnar::MonotonicallyMappableToU64;
use murmurhash32::murmurhash2;
use rustc_hash::FxHashMap;

View File

@@ -1,20 +1,15 @@
use std::collections::HashMap;
use std::io::Write;
use std::sync::Arc;
use fastfield_codecs::VecColumn;
use itertools::Itertools;
use measure_time::debug_time;
use super::flat_map_with_buffer::FlatMapWithBufferIter;
// use super::sorted_doc_id_multivalue_column::RemappedDocIdMultiValueIndexColumn;
use crate::core::{Segment, SegmentReader};
use crate::directory::WritePtr;
use crate::docset::{DocSet, TERMINATED};
use crate::error::DataCorruption;
use crate::fastfield::{
AliveBitSet, Column, CompositeFastFieldSerializer, FastFieldNotAvailableError,
};
use crate::fastfield::{AliveBitSet, Column, FastFieldNotAvailableError};
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
use crate::indexer::doc_id_mapping::SegmentDocIdMapping;
// use crate::indexer::sorted_doc_id_multivalue_column::RemappedDocIdMultiValueColumn;

View File

@@ -2,7 +2,6 @@ use common::TerminatingWrite;
use crate::core::{Segment, SegmentComponent};
use crate::directory::WritePtr;
use crate::fastfield::CompositeFastFieldSerializer;
use crate::fieldnorm::FieldNormsSerializer;
use crate::postings::InvertedIndexSerializer;
use crate::store::StoreWriter;

View File

@@ -1,4 +1,4 @@
use fastfield_codecs::MonotonicallyMappableToU64;
use columnar::MonotonicallyMappableToU64;
use itertools::Itertools;
use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};

View File

@@ -5,9 +5,8 @@
use std::net::Ipv6Addr;
use std::ops::{Bound, RangeInclusive};
use columnar::Column;
use columnar::{Column, MonotonicallyMappableToU128};
use common::BinarySerializable;
use fastfield_codecs::MonotonicallyMappableToU128;
use super::map_bound;
use crate::query::range_query::fast_field_range_query::RangeDocSet;

View File

@@ -4,7 +4,7 @@
use std::ops::{Bound, RangeInclusive};
use fastfield_codecs::MonotonicallyMappableToU64;
use columnar::MonotonicallyMappableToU64;
use super::fast_field_range_query::RangeDocSet;
use super::map_bound;

View File

@@ -132,7 +132,7 @@ mod tests {
use std::net::{IpAddr, Ipv6Addr};
use std::str::FromStr;
use fastfield_codecs::MonotonicallyMappableToU128;
use columnar::MonotonicallyMappableToU128;
use crate::collector::{Count, TopDocs};
use crate::query::{Query, QueryParser, TermQuery};

View File

@@ -3,7 +3,7 @@ use std::hash::{Hash, Hasher};
use std::net::Ipv6Addr;
use std::{fmt, str};
use fastfield_codecs::MonotonicallyMappableToU128;
use columnar::MonotonicallyMappableToU128;
use super::Field;
use crate::fastfield::FastValue;

View File

@@ -319,8 +319,8 @@ mod binary_serialize {
use std::io::{self, Read, Write};
use std::net::Ipv6Addr;
use columnar::MonotonicallyMappableToU128;
use common::{f64_to_u64, u64_to_f64, BinarySerializable};
use fastfield_codecs::MonotonicallyMappableToU128;
use super::Value;
use crate::schema::Facet;