Compare commits

...

2 Commits

Author        SHA1        Message                                           Date
Paul Masurel  7b06db062b  Adding a special method to Arc<dyn ColumnValues>  2023-02-14 23:14:14 +09:00
Paul Masurel  097fd6138d  Fix clippy comments (#1872)                       2023-02-14 23:12:45 +09:00
36 changed files with 156 additions and 180 deletions

View File

@@ -84,7 +84,7 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
// Load rows
self.values
- .get_row_ids_for_value_range(value_range.clone(), rowid_range, docids);
+ .get_row_ids_for_value_range(value_range, rowid_range, docids);
// Convert rows to docids
self.idx
.select_batch_in_place(docids, selected_docid_range.start);
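Note: the hunk above matches clippy's redundant_clone lint: `value_range` is not used again after the call, so the `RangeInclusive` can be moved instead of cloned. A minimal sketch of the pattern, with hypothetical names:

    fn consume(range: std::ops::RangeInclusive<u32>) -> u32 {
        *range.start()
    }

    fn caller() {
        let value_range = 1u32..=10;
        // flagged by clippy: `value_range.clone()` here would be redundant,
        // since the original is never used again
        let first = consume(value_range); // move it instead of cloning
        assert_eq!(first, 1);
    }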

View File

@@ -93,11 +93,7 @@ fn iter_num_values<'a>(
match column_index {
ColumnIndex::Full => 1,
ColumnIndex::Optional(optional_index) => {
- if optional_index.contains(row_addr.row_id) {
- 1u32
- } else {
- 0u32
- }
+ u32::from(optional_index.contains(row_addr.row_id))
}
ColumnIndex::Multivalued(multivalued_index) => {
multivalued_index.range(row_addr.row_id).len() as u32
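Note: the `u32::from(..)` rewrite above is the fix suggested by clippy's bool_to_int_with_if lint: `From<bool>` is implemented for the integer types and maps `false`/`true` to `0`/`1`, so the if/else is redundant. A minimal sketch:

    fn num_values(present: bool) -> u32 {
        // u32::from(true) == 1, u32::from(false) == 0
        u32::from(present)
    }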

View File

@@ -48,7 +48,7 @@ impl ColumnIndex {
ColumnIndex::Full => true,
ColumnIndex::Optional(optional_index) => optional_index.contains(doc_id),
ColumnIndex::Multivalued(multivalued_index) => {
- multivalued_index.range(doc_id).len() > 0
+ !multivalued_index.range(doc_id).is_empty()
}
}
}
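Note: clippy's len_zero lint prefers `is_empty()` over comparing `len()` against zero; for the range returned by `range(doc_id)` both methods exist, and `is_empty()` states the intent directly. A minimal sketch:

    fn has_values(range: std::ops::Range<u32>) -> bool {
        // equivalent to `range.len() > 0`, but lint-clean
        !range.is_empty()
    }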

View File

@@ -220,7 +220,7 @@ impl Set<RowId> for OptionalIndex {
block_doc_idx_start + in_block_rank as u32
}
- fn select_cursor<'b>(&'b self) -> OptionalIndexSelectCursor<'b> {
+ fn select_cursor(&self) -> OptionalIndexSelectCursor<'_> {
OptionalIndexSelectCursor {
current_block_cursor: BlockSelectCursor::Sparse(
SparseBlockCodec::open(b"").select_cursor(),
@@ -255,7 +255,7 @@ impl OptionalIndex {
self.num_non_null_rows
}
- pub fn iter_rows<'a>(&'a self) -> impl Iterator<Item = RowId> + 'a {
+ pub fn iter_rows(&self) -> impl Iterator<Item = RowId> + '_ {
// TODO optimize
let mut select_batch = self.select_cursor();
(0..self.num_non_null_rows).map(move |rank| select_batch.select(rank))
@@ -268,7 +268,7 @@ impl OptionalIndex {
}
#[inline]
- fn block<'a>(&'a self, block_meta: BlockMeta) -> Block<'a> {
+ fn block(&self, block_meta: BlockMeta) -> Block<'_> {
let BlockMeta {
start_byte_offset,
block_variant,
@@ -351,7 +351,7 @@ fn serialize_optional_index_block(block_els: &[u16], out: &mut impl io::Write) -
Ok(())
}
- pub fn serialize_optional_index<'a, W: io::Write>(
+ pub fn serialize_optional_index<W: io::Write>(
non_null_rows: &dyn Iterable<RowId>,
num_rows: RowId,
output: &mut W,
@@ -427,7 +427,7 @@ impl SerializedBlockMeta {
}
#[inline]
- fn to_bytes(&self) -> [u8; SERIALIZED_BLOCK_META_NUM_BYTES] {
+ fn to_bytes(self) -> [u8; SERIALIZED_BLOCK_META_NUM_BYTES] {
assert!(self.num_non_null_rows > 0);
let mut bytes = [0u8; SERIALIZED_BLOCK_META_NUM_BYTES];
bytes[0..2].copy_from_slice(&self.block_id.to_le_bytes());
@@ -501,7 +501,7 @@ pub fn open_optional_index(bytes: OwnedBytes) -> io::Result<OptionalIndex> {
num_non_empty_block_bytes as usize * SERIALIZED_BLOCK_META_NUM_BYTES;
let (block_data, block_metas) = bytes.rsplit(block_metas_num_bytes);
let (block_metas, num_non_null_rows) =
- deserialize_optional_index_block_metadatas(block_metas.as_slice(), num_rows).into();
+ deserialize_optional_index_block_metadatas(block_metas.as_slice(), num_rows);
let optional_index = OptionalIndex {
num_rows,
num_non_null_rows,
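Note: the signature rewrites in this file (and in the codec files below) fix clippy's needless_lifetimes lint: when the only named lifetime ties the return value to `&self` or to a single reference argument, it can be elided with `'_`. The `to_bytes(&self)` to `to_bytes(self)` changes fix trivially_copy_pass_by_ref for small `Copy` types, and the dropped `.into()` fixes useless_conversion. A minimal sketch of the lifetime elision:

    struct Index {
        rows: Vec<u32>,
    }

    impl Index {
        // before: fn iter_rows<'a>(&'a self) -> impl Iterator<Item = u32> + 'a
        fn iter_rows(&self) -> impl Iterator<Item = u32> + '_ {
            self.rows.iter().copied()
        }
    }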

View File

@@ -10,7 +10,7 @@ pub trait SetCodec {
///
/// May panic if the elements are not sorted.
fn serialize(els: impl Iterator<Item = Self::Item>, wrt: impl io::Write) -> io::Result<()>;
- fn open<'a>(data: &'a [u8]) -> Self::Reader<'a>;
+ fn open(data: &[u8]) -> Self::Reader<'_>;
}
/// Stateful object that makes it possible to compute several selects in a row,
@@ -43,5 +43,5 @@ pub trait Set<T> {
fn select(&self, rank: T) -> T;
/// Creates a brand new select cursor.
- fn select_cursor<'b>(&'b self) -> Self::SelectCursor<'b>;
+ fn select_cursor(&self) -> Self::SelectCursor<'_>;
}

View File

@@ -45,7 +45,7 @@ impl SetCodec for DenseBlockCodec {
}
#[inline]
- fn open<'a>(data: &'a [u8]) -> Self::Reader<'a> {
+ fn open(data: &[u8]) -> Self::Reader<'_> {
assert_eq!(data.len(), DENSE_BLOCK_NUM_BYTES as usize);
DenseBlock(data)
}
@@ -94,7 +94,7 @@ impl DenseMiniBlock {
Self { bitvec, rank }
}
- fn to_bytes(&self) -> [u8; MINI_BLOCK_NUM_BYTES] {
+ fn to_bytes(self) -> [u8; MINI_BLOCK_NUM_BYTES] {
let mut bytes = [0u8; MINI_BLOCK_NUM_BYTES];
bytes[..MINI_BLOCK_BITVEC_NUM_BYTES].copy_from_slice(&self.bitvec.to_le_bytes());
bytes[MINI_BLOCK_BITVEC_NUM_BYTES..].copy_from_slice(&self.rank.to_le_bytes());
@@ -166,7 +166,7 @@ impl<'a> Set<u16> for DenseBlock<'a> {
}
#[inline(always)]
- fn select_cursor<'b>(&'b self) -> Self::SelectCursor<'b> {
+ fn select_cursor(&self) -> Self::SelectCursor<'_> {
DenseBlockSelectCursor {
block_id: 0,
dense_block: *self,

View File

@@ -16,7 +16,7 @@ impl SetCodec for SparseBlockCodec {
Ok(())
}
- fn open<'a>(data: &'a [u8]) -> Self::Reader<'a> {
+ fn open(data: &[u8]) -> Self::Reader<'_> {
SparseBlock(data)
}
}
@@ -56,7 +56,7 @@ impl<'a> Set<u16> for SparseBlock<'a> {
}
#[inline(always)]
- fn select_cursor<'b>(&'b self) -> Self::SelectCursor<'b> {
+ fn select_cursor(&self) -> Self::SelectCursor<'_> {
*self
}
}

View File

@@ -110,31 +110,14 @@ impl<T: Copy + PartialOrd + Debug> ColumnValues<T> for Arc<dyn ColumnValues<T>>
fn get_range(&self, start: u64, output: &mut [T]) {
self.as_ref().get_range(start, output)
}
- }
- impl<'a, C: ColumnValues<T> + ?Sized, T: Copy + PartialOrd + Debug> ColumnValues<T> for &'a C {
- fn get_val(&self, idx: u32) -> T {
- (*self).get_val(idx)
- }
- fn min_value(&self) -> T {
- (*self).min_value()
- }
- fn max_value(&self) -> T {
- (*self).max_value()
- }
- fn num_vals(&self) -> u32 {
- (*self).num_vals()
- }
- fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = T> + 'b> {
- (*self).iter()
- }
- fn get_range(&self, start: u64, output: &mut [T]) {
- (*self).get_range(start, output)
+ fn get_row_ids_for_value_range(
+ &self,
+ value_range: RangeInclusive<T>,
+ row_id_range: Range<RowId>,
+ row_id_hits: &mut Vec<RowId>,
+ ) {
+ self.as_ref().get_row_ids_for_value_range(value_range, row_id_range, row_id_hits)
}
}
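Note: this hunk is the substance of commit 7b06db062b: `get_row_ids_for_value_range` is now implemented on `Arc<dyn ColumnValues<T>>` by forwarding through `as_ref()`, so calls on the `Arc` reach the concrete codec's override instead of falling back to the trait's default body; the generic impl for `&'a C` is dropped in the same hunk. A minimal sketch of the forwarding pattern, with simplified hypothetical signatures:

    use std::sync::Arc;

    trait Values {
        fn get(&self, idx: u32) -> u64;
        // default implementation; codecs may override it with something faster
        fn collect_in_range(&self, range: std::ops::Range<u32>, hits: &mut Vec<u32>) {
            hits.extend(range.filter(|&i| self.get(i) > 0));
        }
    }

    impl Values for Arc<dyn Values> {
        fn get(&self, idx: u32) -> u64 {
            self.as_ref().get(idx)
        }
        // Without this forwarding method, calling `collect_in_range` on the
        // Arc would run the default body above rather than the inner
        // codec's specialized override.
        fn collect_in_range(&self, range: std::ops::Range<u32>, hits: &mut Vec<u32>) {
            self.as_ref().collect_in_range(range, hits)
        }
    }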

View File

@@ -1,5 +1,4 @@
#![warn(missing_docs)]
#![cfg_attr(all(feature = "unstable", test), feature(test))]
//! # `fastfield_codecs`
//!
@@ -26,10 +25,10 @@ mod stats;
pub(crate) mod u64_based;
mod column;
- pub mod serialize;
+ pub(crate) mod serialize;
pub use serialize::serialize_column_values_u128;
- pub use stats::Stats;
+ pub use stats::ColumnStats;
pub use u64_based::{
load_u64_based_column_values, serialize_and_load_u64_based_column_values,
serialize_u64_based_column_values, CodecType, ALL_U64_CODEC_TYPES,
@@ -137,7 +136,6 @@ mod bench {
use test::{self, Bencher};
use super::*;
- use crate::column_values::serialize::NormalizedHeader;
use crate::column_values::u64_based::*;
fn get_data() -> Vec<u64> {
@@ -154,7 +152,7 @@ mod bench {
data
}
- fn compute_stats(vals: impl Iterator<Item = u64>) -> Stats {
+ fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats {
let mut stats_collector = StatsCollector::default();
for val in vals {
stats_collector.collect(val);
@@ -166,7 +164,7 @@ mod bench {
fn value_iter() -> impl Iterator<Item = u64> {
0..20_000
}
- fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::Reader {
+ fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues {
let mut bytes = Vec::new();
let stats = compute_stats(data.iter().cloned());
let mut codec_serializer = Codec::estimator();

View File

@@ -123,6 +123,7 @@ pub(crate) struct StrictlyMonotonicMappingToInternalGCDBaseval {
min_value: u64,
}
impl StrictlyMonotonicMappingToInternalGCDBaseval {
+ /// Creates a linear mapping `x -> gcd*x + min_value`.
pub(crate) fn new(gcd: u64, min_value: u64) -> Self {
let gcd_divider = DividerU64::divide_by(gcd);
Self {
@@ -151,7 +152,9 @@ impl<External: MonotonicallyMappableToU64> StrictlyMonotonicFn<External, u64>
pub(crate) struct StrictlyMonotonicMappingToInternalBaseval {
min_value: u64,
}
impl StrictlyMonotonicMappingToInternalBaseval {
+ /// Creates a linear mapping `x -> x + min_value`.
#[inline(always)]
pub(crate) fn new(min_value: u64) -> Self {
Self { min_value }

View File

@@ -8,19 +8,6 @@ use crate::column_values::U128FastFieldCodecType;
use crate::iterable::Iterable;
use crate::MonotonicallyMappableToU128;
- /// The normalized header gives some parameters after applying the following
- /// normalization of the vector:
- /// `val -> (val - min_value) / gcd`
- ///
- /// By design, after normalization, `min_value = 0` and `gcd = 1`.
- #[derive(Debug, Copy, Clone)]
- pub struct NormalizedHeader {
- /// The number of values in the underlying column.
- pub num_vals: u32,
- /// The max value of the underlying column.
- pub max_value: u64,
- }
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub(crate) struct U128Header {
pub num_vals: u32,

View File

@@ -6,21 +6,28 @@ use common::{BinarySerializable, VInt};
use crate::RowId;
+ /// Column statistics.
#[derive(Debug, Clone, Eq, PartialEq)]
- pub struct Stats {
+ pub struct ColumnStats {
+ /// GCD of the elements `el - min(column)`.
pub gcd: NonZeroU64,
+ /// Minimum value of the column.
pub min_value: u64,
+ /// Maximum value of the column.
pub max_value: u64,
+ /// Number of rows in the column.
pub num_rows: RowId,
}
- impl Stats {
+ impl ColumnStats {
+ /// Amplitude of the values:
+ /// the difference between the maximum and the minimum value.
pub fn amplitude(&self) -> u64 {
self.max_value - self.min_value
}
}
- impl BinarySerializable for Stats {
+ impl BinarySerializable for ColumnStats {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
VInt(self.min_value).serialize(writer)?;
VInt(self.gcd.get()).serialize(writer)?;
@@ -37,7 +44,7 @@ impl BinarySerializable for Stats {
let amplitude = VInt::deserialize(reader)?.0 * gcd.get();
let max_value = min_value + amplitude;
let num_rows = VInt::deserialize(reader)?.0 as RowId;
- Ok(Stats {
+ Ok(ColumnStats {
min_value,
max_value,
num_rows,
@@ -52,21 +59,21 @@ mod tests {
use common::BinarySerializable;
- use crate::column_values::Stats;
+ use crate::column_values::ColumnStats;
#[track_caller]
- fn test_stats_ser_deser_aux(stats: &Stats, num_bytes: usize) {
+ fn test_stats_ser_deser_aux(stats: &ColumnStats, num_bytes: usize) {
let mut buffer: Vec<u8> = Vec::new();
stats.serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), num_bytes);
- let deser_stats = Stats::deserialize(&mut &buffer[..]).unwrap();
+ let deser_stats = ColumnStats::deserialize(&mut &buffer[..]).unwrap();
assert_eq!(stats, &deser_stats);
}
#[test]
fn test_stats_serialization() {
test_stats_ser_deser_aux(
- &(Stats {
+ &(ColumnStats {
gcd: NonZeroU64::new(3).unwrap(),
min_value: 1,
max_value: 3001,
@@ -75,7 +82,7 @@ mod tests {
5,
);
test_stats_ser_deser_aux(
- &(Stats {
+ &(ColumnStats {
gcd: NonZeroU64::new(1_000).unwrap(),
min_value: 1,
max_value: 3001,
@@ -84,7 +91,7 @@ mod tests {
5,
);
test_stats_ser_deser_aux(
- &(Stats {
+ &(ColumnStats {
gcd: NonZeroU64::new(1).unwrap(),
min_value: 0,
max_value: 0,

View File

@@ -4,7 +4,7 @@ use common::{BinarySerializable, OwnedBytes};
use fastdivide::DividerU64;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
- use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, Stats};
+ use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
use crate::{ColumnValues, RowId};
/// Depending on the field type, a different
@@ -13,7 +13,7 @@ use crate::{ColumnValues, RowId};
pub struct BitpackedReader {
data: OwnedBytes,
bit_unpacker: BitUnpacker,
- stats: Stats,
+ stats: ColumnStats,
}
impl ColumnValues for BitpackedReader {
@@ -36,7 +36,7 @@ impl ColumnValues for BitpackedReader {
}
}
- fn num_bits(stats: &Stats) -> u8 {
+ fn num_bits(stats: &ColumnStats) -> u8 {
compute_num_bits(stats.amplitude() / stats.gcd)
}
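Note: `num_bits` works on the normalized amplitude. A worked example consistent with the stats tests further down: for the column [10, 20, 30], min_value = 10 and gcd = 10, so amplitude() = 20 and amplitude / gcd = 2; the normalized values are 0, 1, 2 and fit in 2 bits each.

    // hypothetical check mirroring num_bits() for the column [10, 20, 30]
    fn amplitude_example() {
        let (min_value, max_value, gcd) = (10u64, 30u64, 10u64);
        let amplitude = max_value - min_value; // 20
        assert_eq!(amplitude / gcd, 2); // values normalize to 0, 1, 2
        // compute_num_bits(2) == 2: two bits per value suffice
    }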
@@ -46,14 +46,14 @@ pub struct BitpackedCodecEstimator;
impl ColumnCodecEstimator for BitpackedCodecEstimator {
fn collect(&mut self, _value: u64) {}
- fn estimate(&self, stats: &Stats) -> Option<u64> {
+ fn estimate(&self, stats: &ColumnStats) -> Option<u64> {
let num_bits_per_value = num_bits(stats);
Some(stats.num_bytes() + (stats.num_rows as u64 * (num_bits_per_value as u64) + 7) / 8)
}
fn serialize(
&self,
- stats: &Stats,
+ stats: &ColumnStats,
vals: &mut dyn Iterator<Item = u64>,
wrt: &mut dyn Write,
) -> io::Result<()> {
@@ -72,12 +72,12 @@ impl ColumnCodecEstimator for BitpackedCodecEstimator {
pub struct BitpackedCodec;
impl ColumnCodec for BitpackedCodec {
- type Reader = BitpackedReader;
+ type ColumnValues = BitpackedReader;
type Estimator = BitpackedCodecEstimator;
/// Opens a fast field given a file.
- fn load(mut data: OwnedBytes) -> io::Result<Self::Reader> {
- let stats = Stats::deserialize(&mut data)?;
+ fn load(mut data: OwnedBytes) -> io::Result<Self::ColumnValues> {
+ let stats = ColumnStats::deserialize(&mut data)?;
let num_bits = num_bits(&stats);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(BitpackedReader {

View File

@@ -7,7 +7,7 @@ use fastdivide::DividerU64;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::column_values::u64_based::line::Line;
- use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, Stats};
+ use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
use crate::column_values::{ColumnValues, VecColumn};
use crate::MonotonicallyMappableToU64;
@@ -84,7 +84,7 @@ impl ColumnCodecEstimator for BlockwiseLinearEstimator {
self.block.clear();
}
}
- fn estimate(&self, stats: &Stats) -> Option<u64> {
+ fn estimate(&self, stats: &ColumnStats) -> Option<u64> {
let mut estimate = 4 + stats.num_bytes() + self.meta_num_bytes + self.values_num_bytes;
if stats.gcd.get() > 1 {
let estimate_gain_from_gcd =
@@ -100,7 +100,7 @@ impl ColumnCodecEstimator for BlockwiseLinearEstimator {
fn serialize(
&self,
- stats: &Stats,
+ stats: &ColumnStats,
mut vals: &mut dyn Iterator<Item = u64>,
wrt: &mut dyn Write,
) -> io::Result<()> {
@@ -165,12 +165,12 @@ impl ColumnCodecEstimator for BlockwiseLinearEstimator {
pub struct BlockwiseLinearCodec;
impl ColumnCodec<u64> for BlockwiseLinearCodec {
- type Reader = BlockwiseLinearReader;
+ type ColumnValues = BlockwiseLinearReader;
type Estimator = BlockwiseLinearEstimator;
- fn load(mut bytes: OwnedBytes) -> io::Result<Self::Reader> {
- let stats = Stats::deserialize(&mut bytes)?;
+ fn load(mut bytes: OwnedBytes) -> io::Result<Self::ColumnValues> {
+ let stats = ColumnStats::deserialize(&mut bytes)?;
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
let footer_offset = bytes.len() - 4 - footer_len as usize;
let (data, mut footer) = bytes.split(footer_offset);
@@ -195,7 +195,7 @@ impl ColumnCodec<u64> for BlockwiseLinearCodec {
pub struct BlockwiseLinearReader {
blocks: Arc<[Block]>,
data: OwnedBytes,
- stats: Stats,
+ stats: ColumnStats,
}
impl ColumnValues for BlockwiseLinearReader {

View File

@@ -5,7 +5,7 @@ use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use super::line::Line;
use super::ColumnValues;
- use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, Stats};
+ use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
use crate::column_values::VecColumn;
use crate::RowId;
@@ -18,7 +18,7 @@ const LINE_ESTIMATION_BLOCK_LEN: usize = 512;
pub struct LinearReader {
data: OwnedBytes,
linear_params: LinearParams,
- stats: Stats,
+ stats: ColumnStats,
}
impl ColumnValues for LinearReader {
@@ -106,7 +106,7 @@ impl ColumnCodecEstimator for LinearCodecEstimator {
}
}
- fn estimate(&self, stats: &Stats) -> Option<u64> {
+ fn estimate(&self, stats: &ColumnStats) -> Option<u64> {
let line = self.line?;
let amplitude = self.max_deviation - self.min_deviation;
let num_bits = compute_num_bits(amplitude);
@@ -123,7 +123,7 @@ impl ColumnCodecEstimator for LinearCodecEstimator {
fn serialize(
&self,
- stats: &Stats,
+ stats: &ColumnStats,
vals: &mut dyn Iterator<Item = u64>,
wrt: &mut dyn io::Write,
) -> io::Result<()> {
@@ -184,12 +184,12 @@ impl LinearCodecEstimator {
}
impl ColumnCodec for LinearCodec {
- type Reader = LinearReader;
+ type ColumnValues = LinearReader;
type Estimator = LinearCodecEstimator;
- fn load(mut data: OwnedBytes) -> io::Result<Self::Reader> {
- let stats = Stats::deserialize(&mut data)?;
+ fn load(mut data: OwnedBytes) -> io::Result<Self::ColumnValues> {
+ let stats = ColumnStats::deserialize(&mut data)?;
let linear_params = LinearParams::deserialize(&mut data)?;
Ok(LinearReader {
stats,

View File

@@ -17,31 +17,57 @@ pub use crate::column_values::u64_based::bitpacked::BitpackedCodec;
pub use crate::column_values::u64_based::blockwise_linear::BlockwiseLinearCodec;
pub use crate::column_values::u64_based::linear::LinearCodec;
pub use crate::column_values::u64_based::stats_collector::StatsCollector;
- use crate::column_values::{monotonic_map_column, Stats};
+ use crate::column_values::{monotonic_map_column, ColumnStats};
use crate::iterable::Iterable;
use crate::{ColumnValues, MonotonicallyMappableToU64};
+ /// A `ColumnCodecEstimator` is in charge of gathering all
+ /// data required to serialize a column.
+ ///
+ /// This happens during a first pass over the column's elements.
+ /// During that pass, all column estimators receive a call to their
+ /// `.collect(el)`.
+ ///
+ /// After this first pass, `finalize` is called.
+ /// `.estimate(..)` should then return an accurate estimation of the
+ /// size of the serialized column (were we to pick this codec).
+ /// `.serialize(..)` then serializes the column using this codec.
pub trait ColumnCodecEstimator<T = u64>: 'static {
+ /// Records a new value for estimation.
+ /// This method will be called for each element of the column during
+ /// estimation.
fn collect(&mut self, value: u64);
- fn estimate(&self, stats: &Stats) -> Option<u64>;
+ /// Finalizes the first pass phase.
+ fn finalize(&mut self) {}
+ /// Returns an accurate estimation of the number of bytes that will
+ /// be used to represent this column.
+ fn estimate(&self, stats: &ColumnStats) -> Option<u64>;
+ /// Serializes the column using the given codec.
+ /// This constitutes a second pass over the column's values.
fn serialize(
&self,
- stats: &Stats,
+ stats: &ColumnStats,
vals: &mut dyn Iterator<Item = T>,
wrt: &mut dyn io::Write,
) -> io::Result<()>;
}
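Note: the new doc comments describe a two-pass protocol. A minimal sketch of how a caller might drive an estimator under these docs (the function and its bindings are hypothetical; in this crate the real driver is `serialize_u64_based_column_values`):

    use std::io;

    fn pick_and_serialize(
        column_values: &[u64],
        stats: &ColumnStats,
        wrt: &mut dyn io::Write,
    ) -> io::Result<()> {
        // First pass: feed every value to the estimator, then finalize.
        let mut estimator = BitpackedCodec::estimator();
        for &val in column_values {
            estimator.collect(val);
        }
        estimator.finalize();

        // Keep this codec only if it can produce a size estimate.
        if estimator.estimate(stats).is_some() {
            // Second pass: serialize the column with the chosen codec.
            estimator.serialize(stats, &mut column_values.iter().copied(), wrt)?;
        }
        Ok(())
    }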
+ /// A column codec describes a column serialization format.
pub trait ColumnCodec<T: PartialOrd = u64> {
- type Reader: ColumnValues<T> + 'static;
+ /// Specialized `ColumnValues` type.
+ type ColumnValues: ColumnValues<T> + 'static;
+ /// `Estimator` for the given codec.
type Estimator: ColumnCodecEstimator + Default;
- fn load(bytes: OwnedBytes) -> io::Result<Self::Reader>;
+ /// Loads a column that has been serialized using this codec.
+ fn load(bytes: OwnedBytes) -> io::Result<Self::ColumnValues>;
+ /// Returns an estimator.
fn estimator() -> Self::Estimator {
Self::Estimator::default()
}
+ /// Returns a boxed estimator.
fn boxed_estimator() -> Box<dyn ColumnCodecEstimator> {
Box::new(Self::estimator())
}
@@ -62,6 +88,7 @@ pub enum CodecType {
BlockwiseLinear = 2u8,
}
+ /// List of all available u64-based codecs.
pub const ALL_U64_CODEC_TYPES: [CodecType; 3] = [
CodecType::Bitpacked,
CodecType::Linear,
@@ -106,6 +133,7 @@ fn load_specific_codec<C: ColumnCodec, T: MonotonicallyMappableToU64>(
}
impl CodecType {
+ /// Returns a boxed codec estimator associated with a given `CodecType`.
pub fn estimator(&self) -> Box<dyn ColumnCodecEstimator> {
match self {
CodecType::Bitpacked => BitpackedCodec::boxed_estimator(),
@@ -115,7 +143,8 @@ impl CodecType {
}
}
- pub fn serialize_u64_based_column_values<'a, T: MonotonicallyMappableToU64>(
+ /// Serializes a given column of u64-mapped values.
+ pub fn serialize_u64_based_column_values<T: MonotonicallyMappableToU64>(
vals: &dyn Iterable<T>,
codec_types: &[CodecType],
wrt: &mut dyn Write,
@@ -156,11 +185,14 @@ pub fn serialize_u64_based_column_values<'a, T: MonotonicallyMappableToU64>(
Ok(())
}
+ /// Load u64-based column values.
+ ///
+ /// This method first identifies the codec from the first byte.
pub fn load_u64_based_column_values<T: MonotonicallyMappableToU64>(
mut bytes: OwnedBytes,
) -> io::Result<Arc<dyn ColumnValues<T>>> {
let codec_type: CodecType = bytes
- .get(0)
+ .first()
.copied()
.and_then(CodecType::try_from_code)
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Failed to read codec type"))?;

View File

@@ -2,7 +2,7 @@ use std::num::NonZeroU64;
use fastdivide::DividerU64;
- use crate::column_values::Stats;
+ use crate::column_values::ColumnStats;
use crate::RowId;
/// Computes the gcd of two non-null numbers.
@@ -33,14 +33,14 @@ pub struct StatsCollector {
}
impl StatsCollector {
- pub fn stats(&self) -> Stats {
+ pub fn stats(&self) -> ColumnStats {
let (min_value, max_value) = self.min_max_opt.unwrap_or((0u64, 0u64));
let increment_gcd = if let Some((increment_gcd, _)) = self.increment_gcd_opt {
increment_gcd
} else {
NonZeroU64::new(1u64).unwrap()
};
- Stats {
+ ColumnStats {
min_value,
max_value,
num_rows: self.num_rows,
@@ -97,9 +97,9 @@ mod tests {
use std::num::NonZeroU64;
use crate::column_values::u64_based::stats_collector::{compute_gcd, StatsCollector};
- use crate::column_values::u64_based::Stats;
+ use crate::column_values::u64_based::ColumnStats;
- fn compute_stats(vals: impl Iterator<Item = u64>) -> Stats {
+ fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats {
let mut stats_collector = StatsCollector::default();
for val in vals {
stats_collector.collect(val);
@@ -144,7 +144,7 @@ mod tests {
fn test_stats() {
assert_eq!(
compute_stats([].into_iter()),
- Stats {
+ ColumnStats {
gcd: NonZeroU64::new(1).unwrap(),
min_value: 0,
max_value: 0,
@@ -153,7 +153,7 @@ mod tests {
);
assert_eq!(
compute_stats([0, 1].into_iter()),
- Stats {
+ ColumnStats {
gcd: NonZeroU64::new(1).unwrap(),
min_value: 0,
max_value: 1,
@@ -162,7 +162,7 @@ mod tests {
);
assert_eq!(
compute_stats([0, 1].into_iter()),
- Stats {
+ ColumnStats {
gcd: NonZeroU64::new(1).unwrap(),
min_value: 0,
max_value: 1,
@@ -171,7 +171,7 @@ mod tests {
);
assert_eq!(
compute_stats([10, 20, 30].into_iter()),
- Stats {
+ ColumnStats {
gcd: NonZeroU64::new(10).unwrap(),
min_value: 10,
max_value: 30,
@@ -180,7 +180,7 @@ mod tests {
);
assert_eq!(
compute_stats([10, 50, 10, 30].into_iter()),
- Stats {
+ ColumnStats {
gcd: NonZeroU64::new(20).unwrap(),
min_value: 10,
max_value: 50,
@@ -189,7 +189,7 @@ mod tests {
);
assert_eq!(
compute_stats([10, 0, 30].into_iter()),
- Stats {
+ ColumnStats {
gcd: NonZeroU64::new(10).unwrap(),
min_value: 0,
max_value: 30,

View File

@@ -4,7 +4,7 @@ pub const VERSION_FOOTER_NUM_BYTES: usize = MAGIC_BYTES.len() + std::mem::size_o
/// We end the file with these 4 bytes just to somewhat identify that
/// this is indeed a columnar file.
- const MAGIC_BYTES: [u8; 4] = [2, 113, 119, 066];
+ const MAGIC_BYTES: [u8; 4] = [2, 113, 119, 66];
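Note: clippy's zero_prefixed_literal lint fires on `066` because, unlike C, a leading zero in Rust does not mean octal: `066` is simply decimal 66 (octal would be written `0o66`), so the zero is misleading and gets dropped.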
pub fn footer() -> [u8; VERSION_FOOTER_NUM_BYTES] {
let mut footer_bytes = [0u8; VERSION_FOOTER_NUM_BYTES];
@@ -27,8 +27,8 @@ pub enum Version {
}
impl Version {
- fn to_bytes(&self) -> [u8; 4] {
- (*self as u32).to_le_bytes()
+ fn to_bytes(self) -> [u8; 4] {
+ (self as u32).to_le_bytes()
}
fn try_from_bytes(bytes: [u8; 4]) -> Result<Version, InvalidData> {

View File

@@ -58,7 +58,7 @@ impl<'a> RemappedTermOrdinalsValues<'a> {
.enumerate()
.flat_map(|(segment_ord, byte_column)| {
let segment_ord = self.term_ord_mapping.get_segment(segment_ord as u32);
- byte_column.into_iter().flat_map(move |bytes_column| {
+ byte_column.iter().flat_map(move |bytes_column| {
bytes_column
.ords()
.values

View File

@@ -174,6 +174,7 @@ fn merge_column(
Ok(())
}
+ #[allow(clippy::type_complexity)]
fn group_columns_for_merge(
columnar_readers: &[&ColumnarReader],
) -> io::Result<BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>>> {

View File

@@ -162,7 +162,7 @@ mod tests {
}
#[test]
#[should_panic(expect = "Input type forbidden")]
#[should_panic(expected = "Input type forbidden")]
fn test_list_columns_strict_typing_panics_on_wrong_types() {
let mut columnar_writer = ColumnarWriter::default();
columnar_writer.record_column_type("count", ColumnType::U64, false);
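Note: the attribute key must be `expected`; with the misspelled `expect`, the test harness ignores the argument (with a warning) and accepts any panic, so the test could pass for the wrong reason. A minimal sketch of the corrected form:

    #[test]
    #[should_panic(expected = "Input type forbidden")]
    fn rejects_wrong_type() {
        // the harness now checks that the panic message
        // contains the `expected` string
        panic!("Input type forbidden");
    }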

View File

@@ -47,6 +47,7 @@ struct SpareBuffers {
/// let mut wrt: Vec<u8> = Vec::new();
/// columnar_writer.serialize(2u32, None, &mut wrt).unwrap();
/// ```
+ #[derive(Default)]
pub struct ColumnarWriter {
numerical_field_hash_map: ArenaHashMap,
datetime_field_hash_map: ArenaHashMap,
@@ -60,22 +61,6 @@ pub struct ColumnarWriter {
buffers: SpareBuffers,
}
- impl Default for ColumnarWriter {
- fn default() -> Self {
- ColumnarWriter {
- numerical_field_hash_map: ArenaHashMap::new(),
- bool_field_hash_map: ArenaHashMap::new(),
- ip_addr_field_hash_map: ArenaHashMap::new(),
- bytes_field_hash_map: ArenaHashMap::new(),
- str_field_hash_map: ArenaHashMap::new(),
- datetime_field_hash_map: ArenaHashMap::new(),
- dictionaries: Vec::new(),
- arena: MemoryArena::default(),
- buffers: SpareBuffers::default(),
- }
- }
- }
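Note: clippy's derivable_impls lint: a hand-written `Default` impl that merely defaults every field can be replaced by `#[derive(Default)]`. This works together with the last hunk of this compare, which turns `ArenaHashMap::new` into an `impl Default for ArenaHashMap`. A minimal sketch:

    use std::collections::HashMap;

    // flagged: impl Default for Writer { fn default() -> Self {
    //     Writer { map: HashMap::new(), buf: Vec::new() } } }

    // preferred: derive it, since every field type implements Default
    #[derive(Default)]
    struct Writer {
        map: HashMap<String, u64>,
        buf: Vec<u8>,
    }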
#[inline]
fn mutate_or_create_column<V, TMutator>(
arena_hash_map: &mut ArenaHashMap,
@@ -671,7 +656,7 @@ where
Ok(())
}
- fn sort_values_within_row_in_place(multivalued_index: &[RowId], values: &mut Vec<u64>) {
+ fn sort_values_within_row_in_place(multivalued_index: &[RowId], values: &mut [u64]) {
let mut start_index: usize = 0;
for end_index in multivalued_index.iter().copied() {
let end_index = end_index as usize;

View File

@@ -29,7 +29,7 @@ pub struct OptionalIndexBuilder {
}
impl OptionalIndexBuilder {
- pub fn finish<'a>(&'a mut self, num_rows: RowId) -> impl Iterable<RowId> + 'a {
+ pub fn finish(&mut self, num_rows: RowId) -> impl Iterable<RowId> + '_ {
debug_assert!(self
.docs
.last()

View File

@@ -166,9 +166,9 @@ impl StrictlyMonotonicFn<i64, u64> for MapI64ToU64 {
macro_rules! static_dynamic_conversions {
($typ:ty, $enum_name:ident) => {
- impl Into<Option<$typ>> for DynamicColumn {
- fn into(self) -> Option<$typ> {
- if let DynamicColumn::$enum_name(col) = self {
+ impl From<DynamicColumn> for Option<$typ> {
+ fn from(dynamic_column: DynamicColumn) -> Option<$typ> {
+ if let DynamicColumn::$enum_name(col) = dynamic_column {
Some(col)
} else {
None
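Note: clippy's from_over_into lint: implementing `From` also gives you the matching `Into` through the standard library's blanket impl, whereas a bare `Into` impl does not provide `From`. A minimal sketch:

    enum Dynamic {
        U64(u64),
    }

    // implementing `From` ...
    impl From<Dynamic> for Option<u64> {
        fn from(dynamic: Dynamic) -> Option<u64> {
            match dynamic {
                Dynamic::U64(val) => Some(val),
            }
        }
    }

    fn demo(dynamic: Dynamic) -> Option<u64> {
        // ... provides `Into` automatically via the blanket impl
        dynamic.into()
    }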

View File

@@ -66,7 +66,7 @@ impl BucketAggregationWithAccessor {
BucketAggregationType::Terms(TermsAggregation {
field: field_name, ..
}) => {
- str_dict_column = reader.fast_fields().str(&field_name)?;
+ str_dict_column = reader.fast_fields().str(field_name)?;
get_ff_reader_and_validate(reader, field_name)?
}
};

View File

@@ -74,9 +74,9 @@ use crate::{DocId, TantivyError};
/// ...
/// "aggregations": {
/// "genres": {
/// "doc_count_error_upper_bound": 0,
/// "sum_other_doc_count": 0,
/// "buckets": [
/// "doc_count_error_upper_bound": 0,
/// "sum_other_doc_count": 0,
/// "buckets": [
/// { "key": "drumnbass", "doc_count": 6 },
/// { "key": "raggae", "doc_count": 4 },
/// { "key": "jazz", "doc_count": 2 }
@@ -241,15 +241,6 @@ impl TermBucketEntry {
}
impl TermBuckets {
- pub(crate) fn from_req_and_validate(
- _sub_aggregation: &AggregationsWithAccessor,
- _max_term_id: usize,
- ) -> crate::Result<Self> {
- Ok(TermBuckets {
- entries: Default::default(),
- })
- }
fn force_flush(&mut self, agg_with_accessor: &AggregationsWithAccessor) -> crate::Result<()> {
for entry in &mut self.entries.values_mut() {
if let Some(sub_aggregations) = entry.sub_aggregations.as_mut() {

View File

@@ -196,6 +196,7 @@ impl MmapDirectory {
directory_path,
)));
}
+ #[allow(clippy::bind_instead_of_map)]
let canonical_path: PathBuf = directory_path.canonicalize().or_else(|io_err| {
let directory_path = directory_path.to_owned();

View File

@@ -49,11 +49,6 @@ impl AliveBitSet {
Self::open(alive_bitset_bytes)
}
- pub(crate) fn from_bitset(bitset: &BitSet) -> AliveBitSet {
- let readonly_bitset = ReadOnlyBitSet::from(bitset);
- AliveBitSet::from(readonly_bitset)
- }
/// Opens an alive bitset given its file.
pub fn open(bytes: OwnedBytes) -> AliveBitSet {
let bitset = ReadOnlyBitSet::open(bytes);

View File

@@ -54,6 +54,7 @@ impl FacetReader {
self.facet_column.ords().values(doc)
}
+ /// Accessor to the facet dictionary.
pub fn facet_dict(&self) -> &columnar::Dictionary {
self.facet_column.dictionary()
}

View File

@@ -156,8 +156,7 @@ impl FastFieldReaders {
.columnar
.read_columns(field_name)?
.into_iter()
- .filter(|column| column.column_type() == column_type)
- .next();
+ .find(|column| column.column_type() == column_type);
Ok(dynamic_column_handle_opt)
}
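Note: clippy's filter_next lint: `iter.filter(predicate).next()` is equivalent to `iter.find(predicate)`, which is shorter and makes the stop-at-first-match intent explicit. A minimal sketch:

    fn first_even(nums: &[u32]) -> Option<u32> {
        // instead of: nums.iter().copied().filter(|n| n % 2 == 0).next()
        nums.iter().copied().find(|n| n % 2 == 0)
    }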

View File

@@ -126,7 +126,7 @@ impl FastFieldsWriter {
self.columnar_writer.record_datetime(
doc_id,
field_name.as_str(),
- truncated_datetime.into(),
+ truncated_datetime,
);
}
Value::Facet(facet) => {

View File

@@ -110,8 +110,8 @@ impl DeltaComputer {
}
}
- fn convert_to_merge_order<'a>(
- columnars: &[&'a ColumnarReader],
+ fn convert_to_merge_order(
+ columnars: &[&ColumnarReader],
doc_id_mapping: SegmentDocIdMapping,
) -> MergeRowOrder {
match doc_id_mapping.mapping_type() {
@@ -369,11 +369,8 @@ impl IndexMerger {
.readers
.iter()
.map(|segment_reader| {
- if let Some(alive_bitset) = segment_reader.alive_bitset() {
- Some(alive_bitset.bitset().clone())
- } else {
- None
- }
+ let alive_bitset = segment_reader.alive_bitset()?;
+ Some(alive_bitset.bitset().clone())
})
.collect();
Ok(SegmentDocIdMapping::new(
@@ -416,11 +413,8 @@ impl IndexMerger {
.readers
.iter()
.map(|reader| {
- if let Some(bitset) = reader.alive_bitset() {
- Some(bitset.bitset().clone())
- } else {
- None
- }
+ let alive_bitset = reader.alive_bitset()?;
+ Some(alive_bitset.bitset().clone())
})
.collect();
Ok(SegmentDocIdMapping::new(
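Note: both hunks above replace an `if let Some(..) { Some(..) } else { None }` with the `?` operator, which can short-circuit inside a closure whose return type is `Option` (clippy's manual_map family of lints points here). A minimal sketch:

    fn clone_bitsets(readers: &[Option<Vec<u8>>]) -> Vec<Option<Vec<u8>>> {
        readers
            .iter()
            .map(|reader| {
                // `?` returns None from the closure early
                let bitset = reader.as_ref()?;
                Some(bitset.clone())
            })
            .collect()
    }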

View File

@@ -334,7 +334,7 @@ impl SegmentWriter {
/// As a user, you should rather use `IndexWriter`'s add_document.
pub fn add_document(&mut self, add_operation: AddOperation) -> crate::Result<()> {
let AddOperation { document, opstamp } = add_operation;
- self.doc_opstamps.push(add_operation.opstamp);
+ self.doc_opstamps.push(opstamp);
self.fast_field_writers.add_document(&document)?;
self.index_document(&document)?;
let doc_writer = self.segment_serializer.get_store_writer();

View File

@@ -27,8 +27,10 @@ pub struct FacetTokenStream<'a> {
impl Tokenizer for FacetTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
- let mut token = Token::default();
- token.position = 0;
+ let token = Token {
+ position: 0,
+ ..Default::default()
+ };
FacetTokenStream {
text,
state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet.
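Note: clippy's field_reassign_with_default lint prefers struct-update syntax over creating a default value and immediately reassigning a field; it also lets the binding stay immutable. A minimal sketch:

    #[derive(Default)]
    struct Token {
        position: usize,
        text: String,
    }

    fn make_token() -> Token {
        // instead of: let mut token = Token::default(); token.position = 0;
        Token {
            position: 0,
            ..Default::default()
        }
    }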

View File

@@ -48,8 +48,7 @@ impl Dictionary<VoidSSTable> {
dictionary_writer.insert(term, &()).unwrap();
}
dictionary_writer.finish().unwrap();
- let dictionary = Dictionary::from_bytes(OwnedBytes::new(buffer)).unwrap();
- dictionary
+ Dictionary::from_bytes(OwnedBytes::new(buffer)).unwrap()
}
}

View File

@@ -103,8 +103,8 @@ fn compute_previous_power_of_two(n: usize) -> usize {
1 << msb
}
- impl ArenaHashMap {
- pub fn new() -> ArenaHashMap {
+ impl Default for ArenaHashMap {
+ fn default() -> Self {
let memory_arena = MemoryArena::default();
ArenaHashMap {
table: Box::new([]),
@@ -114,7 +114,9 @@ impl ArenaHashMap {
len: 0,
}
}
+ }
+ impl ArenaHashMap {
pub fn with_capacity(table_size: usize) -> ArenaHashMap {
let table_size_power_of_2 = compute_previous_power_of_two(table_size);
let memory_arena = MemoryArena::default();
@@ -298,7 +300,7 @@ mod tests {
#[test]
fn test_hash_map() {
- let mut hash_map: ArenaHashMap = ArenaHashMap::new();
+ let mut hash_map: ArenaHashMap = ArenaHashMap::default();
hash_map.mutate_or_create(b"abc", |opt_val: Option<u32>| {
assert_eq!(opt_val, None);
3u32