Compare commits

..

5 Commits

Author SHA1 Message Date
Paul Masurel
9aefa349ca blop 2022-08-22 11:04:39 +02:00
Paul Masurel
b9a87d6dc6 Refactor Further 2022-08-21 13:04:12 +02:00
Paul Masurel
0ec2ebd791 Experimental refactor 2022-08-21 11:40:08 +02:00
Paul Masurel
6602786db8 Moved GCD fast field codec. Partial. 2022-08-20 20:08:07 +02:00
Paul Masurel
c71169b6e0 Fastfield refactoring. 2022-08-20 19:16:31 +02:00
73 changed files with 1823 additions and 2864 deletions

View File

@@ -12,14 +12,12 @@ jobs:
steps:
- uses: actions/checkout@v3
- name: Install Rust
run: rustup toolchain install nightly --profile minimal --component llvm-tools-preview
- uses: Swatinem/rust-cache@v2
run: rustup toolchain install nightly --component llvm-tools-preview
- uses: taiki-e/install-action@cargo-llvm-cov
- name: Generate code coverage
run: cargo +nightly llvm-cov --all-features --workspace --lcov --output-path lcov.info
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
continue-on-error: true
with:
token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
files: lcov.info

View File

@@ -19,10 +19,11 @@ jobs:
uses: actions-rs/toolchain@v1
with:
toolchain: stable
profile: minimal
override: true
components: rustfmt, clippy
- name: Run indexing_unsorted
run: cargo test indexing_unsorted -- --ignored
- name: Run indexing_sorted
run: cargo test indexing_sorted -- --ignored

View File

@@ -10,27 +10,34 @@ env:
CARGO_TERM_COLOR: always
jobs:
check:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Install nightly
- name: Install latest nightly to test also against unstable feature flag
uses: actions-rs/toolchain@v1
with:
toolchain: nightly
profile: minimal
override: true
components: rustfmt
- name: Install stable
uses: actions-rs/toolchain@v1
with:
toolchain: stable
profile: minimal
components: clippy
override: true
components: rustfmt, clippy
- uses: Swatinem/rust-cache@v2
- name: Build
run: cargo build --verbose --workspace
- name: Run tests
run: cargo +stable test --features mmap,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints --verbose --workspace
- name: Run tests quickwit feature
run: cargo +stable test --features mmap,quickwit,failpoints --verbose --workspace
- name: Check Formatting
run: cargo +nightly fmt --all -- --check
@@ -41,34 +48,3 @@ jobs:
token: ${{ secrets.GITHUB_TOKEN }}
args: --tests
test:
runs-on: ubuntu-latest
strategy:
matrix:
features: [
{ label: "all", flags: "mmap,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
{ label: "quickwit", flags: "mmap,quickwit,failpoints" }
]
name: test-${{ matrix.features.label}}
steps:
- uses: actions/checkout@v3
- name: Install stable
uses: actions-rs/toolchain@v1
with:
toolchain: stable
profile: minimal
override: true
- uses: taiki-e/install-action@nextest
- uses: Swatinem/rust-cache@v2
- name: Run tests
run: cargo +stable nextest run --features ${{ matrix.features.flags }} --verbose --workspace
- name: Run doctests
run: cargo +stable test --doc --features ${{ matrix.features.flags }} --verbose --workspace

View File

@@ -82,16 +82,14 @@ impl BitUnpacker {
}
}
pub fn bit_width(&self) -> u8 {
self.num_bits as u8
}
#[inline]
pub fn get(&self, idx: u64, data: &[u8]) -> u64 {
if self.num_bits == 0 {
return 0u64;
}
let addr_in_bits = idx * self.num_bits;
let num_bits = self.num_bits;
let mask = self.mask;
let addr_in_bits = idx * num_bits;
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
debug_assert!(
@@ -103,7 +101,7 @@ impl BitUnpacker {
.unwrap();
let val_unshifted_unmasked: u64 = u64::from_le_bytes(bytes);
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
val_shifted & self.mask
val_shifted & mask
}
}

View File

@@ -14,6 +14,7 @@ pub struct BlockedBitpacker {
buffer: Vec<u64>,
offset_and_bits: Vec<BlockedBitpackerEntryMetaData>,
}
impl Default for BlockedBitpacker {
fn default() -> Self {
BlockedBitpacker::new()
@@ -58,18 +59,13 @@ fn metadata_test() {
assert_eq!(meta.num_bits(), 6);
}
fn mem_usage<T>(items: &Vec<T>) -> usize {
items.capacity() * std::mem::size_of::<T>()
}
impl BlockedBitpacker {
pub fn new() -> Self {
let mut compressed_blocks = vec![];
compressed_blocks.resize(8, 0);
let compressed_blocks = vec![0u8; 8];
Self {
compressed_blocks,
buffer: vec![],
offset_and_bits: vec![],
buffer: Vec::new(),
offset_and_bits: Vec::new(),
}
}
@@ -77,8 +73,10 @@ impl BlockedBitpacker {
pub fn mem_usage(&self) -> usize {
std::mem::size_of::<BlockedBitpacker>()
+ self.compressed_blocks.capacity()
+ mem_usage(&self.offset_and_bits)
+ mem_usage(&self.buffer)
+ self.offset_and_bits.capacity()
* std::mem::size_of_val(&self.offset_and_bits.get(0).cloned().unwrap_or_default())
+ self.buffer.capacity()
* std::mem::size_of_val(&self.buffer.get(0).cloned().unwrap_or_default())
}
#[inline]

View File

@@ -62,7 +62,7 @@ impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
pub struct AntiCallToken(());
/// Trait used to indicate when no more write need to be done on a writer
pub trait TerminatingWrite: Write + Send + Sync {
pub trait TerminatingWrite: Write + Send {
/// Indicate that the writer will no longer be used. Internally call terminate_ref.
fn terminate(mut self) -> io::Result<()>
where Self: Sized {

View File

@@ -7,11 +7,10 @@
// Of course, you can have a look at the tantivy's built-in collectors
// such as the `CountCollector` for more examples.
use fastfield_codecs::Column;
// ---
// Importing tantivy...
use tantivy::collector::{Collector, SegmentCollector};
use tantivy::fastfield::DynamicFastFieldReader;
use tantivy::fastfield::{FastFieldReader, FastFieldReaderImpl};
use tantivy::query::QueryParser;
use tantivy::schema::{Field, Schema, FAST, INDEXED, TEXT};
use tantivy::{doc, Index, Score, SegmentReader};
@@ -96,7 +95,7 @@ impl Collector for StatsCollector {
}
struct StatsSegmentCollector {
fast_field_reader: DynamicFastFieldReader<u64>,
fast_field_reader: FastFieldReaderImpl<u64>,
stats: Stats,
}
@@ -104,7 +103,7 @@ impl SegmentCollector for StatsSegmentCollector {
type Fruit = Option<Stats>;
fn collect(&mut self, doc: u32, _score: Score) {
let value = self.fast_field_reader.get_val(doc as u64) as f64;
let value = self.fast_field_reader.get(doc) as f64;
self.stats.count += 1;
self.stats.sum += value;
self.stats.squared_sum += value * value;

View File

@@ -2,8 +2,8 @@ use std::cmp::Reverse;
use std::collections::{HashMap, HashSet};
use std::sync::{Arc, RwLock, Weak};
use fastfield_codecs::Column;
use tantivy::collector::TopDocs;
use tantivy::fastfield::FastFieldReader;
use tantivy::query::QueryParser;
use tantivy::schema::{Field, Schema, FAST, TEXT};
use tantivy::{
@@ -52,7 +52,7 @@ impl Warmer for DynamicPriceColumn {
let product_id_reader = segment.fast_fields().u64(self.field)?;
let product_ids: Vec<ProductId> = segment
.doc_ids_alive()
.map(|doc| product_id_reader.get_val(doc as u64))
.map(|doc| product_id_reader.get(doc))
.collect();
let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
let mut price_vals: Vec<Price> = Vec::new();

View File

@@ -14,10 +14,10 @@ tantivy-bitpacker = { version="0.2", path = "../bitpacker/" }
ownedbytes = { version = "0.3.0", path = "../ownedbytes" }
prettytable-rs = {version="0.9.0", optional= true}
rand = {version="0.8.3", optional= true}
fastdivide = "0.4"
[dev-dependencies]
more-asserts = "0.3.0"
proptest = "1.0.0"
rand = "0.8.3"
[features]

View File

@@ -4,9 +4,11 @@ extern crate test;
#[cfg(test)]
mod tests {
use fastfield_codecs::bitpacked::BitpackedCodec;
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::bitpacked::{BitpackedFastFieldCodec, BitpackedFastFieldReader};
use fastfield_codecs::linearinterpol::{LinearInterpolCodec, LinearInterpolFastFieldReader};
use fastfield_codecs::multilinearinterpol::{
MultiLinearInterpolFastFieldCodec, MultiLinearInterpolFastFieldReader,
};
use fastfield_codecs::*;
fn get_data() -> Vec<u64> {
@@ -25,59 +27,69 @@ mod tests {
fn value_iter() -> impl Iterator<Item = u64> {
0..20_000
}
fn bench_get<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
fn bench_get<S: FastFieldCodec, R: FastFieldCodecReader>(b: &mut Bencher, data: &[u64]) {
let mut bytes = vec![];
Codec::serialize(&mut bytes, &data).unwrap();
let reader = Codec::open_from_bytes(OwnedBytes::new(bytes)).unwrap();
S::serialize(
&mut bytes,
&data,
stats_from_vec(data),
data.iter().cloned(),
data.iter().cloned(),
)
.unwrap();
let reader = R::open_from_bytes(&bytes).unwrap();
b.iter(|| {
let mut sum = 0u64;
for pos in value_iter() {
let val = reader.get_val(pos as u64);
debug_assert_eq!(data[pos as usize], val);
sum = sum.wrapping_add(val);
reader.get_u64(pos as u64, &bytes);
}
sum
});
}
fn bench_create<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
let mut bytes = Vec::new();
fn bench_create<S: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
let mut bytes = vec![];
b.iter(|| {
bytes.clear();
Codec::serialize(&mut bytes, &data).unwrap();
S::serialize(
&mut bytes,
&data,
stats_from_vec(data),
data.iter().cloned(),
data.iter().cloned(),
)
.unwrap();
});
}
use ownedbytes::OwnedBytes;
use test::Bencher;
#[bench]
fn bench_fastfield_bitpack_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<BitpackedCodec>(b, &data);
bench_create::<BitpackedFastFieldCodec>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<LinearCodec>(b, &data);
bench_create::<LinearInterpolCodec>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<BlockwiseLinearCodec>(b, &data);
bench_create::<MultiLinearInterpolFastFieldCodec>(b, &data);
}
#[bench]
fn bench_fastfield_bitpack_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<BitpackedCodec>(b, &data);
bench_get::<BitpackedFastFieldCodec, BitpackedFastFieldReader>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<LinearCodec>(b, &data);
bench_get::<LinearInterpolCodec, LinearInterpolFastFieldReader>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<BlockwiseLinearCodec>(b, &data);
bench_get::<MultiLinearInterpolFastFieldCodec, MultiLinearInterpolFastFieldReader>(
b, &data,
);
}
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
let min_value = data.iter().cloned().min().unwrap_or(0);

View File

@@ -4,22 +4,21 @@ use common::BinarySerializable;
use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::{Column, FastFieldCodec, FastFieldCodecType};
use crate::{FastFieldCodec, FastFieldCodecReader, FastFieldStats};
/// Depending on the field type, a different
/// fast field is required.
#[derive(Clone)]
pub struct BitpackedReader {
pub struct BitpackedFastFieldReader {
data: OwnedBytes,
bit_unpacker: BitUnpacker,
min_value_u64: u64,
max_value_u64: u64,
num_vals: u64,
pub min_value_u64: u64,
pub max_value_u64: u64,
}
impl Column for BitpackedReader {
impl FastFieldCodecReader for BitpackedFastFieldReader {
#[inline]
fn get_val(&self, doc: u64) -> u64 {
fn get_u64(&self, doc: u64) -> u64 {
self.min_value_u64 + self.bit_unpacker.get(doc, &self.data)
}
#[inline]
@@ -30,21 +29,16 @@ impl Column for BitpackedReader {
fn max_value(&self) -> u64 {
self.max_value_u64
}
#[inline]
fn num_vals(&self) -> u64 {
self.num_vals
}
}
pub struct BitpackedSerializerLegacy<'a, W: 'a + Write> {
pub struct BitpackedFastFieldSerializerLegacy<'a, W: 'a + Write> {
bit_packer: BitPacker,
write: &'a mut W,
min_value: u64,
num_vals: u64,
amplitude: u64,
num_bits: u8,
}
impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> {
impl<'a, W: Write> BitpackedFastFieldSerializerLegacy<'a, W> {
/// Creates a new fast field serializer.
///
/// The serializer in fact encode the values by bitpacking
@@ -57,16 +51,15 @@ impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> {
write: &'a mut W,
min_value: u64,
max_value: u64,
) -> io::Result<BitpackedSerializerLegacy<'a, W>> {
) -> io::Result<BitpackedFastFieldSerializerLegacy<'a, W>> {
assert!(min_value <= max_value);
let amplitude = max_value - min_value;
let num_bits = compute_num_bits(amplitude);
let bit_packer = BitPacker::new();
Ok(BitpackedSerializerLegacy {
Ok(BitpackedFastFieldSerializerLegacy {
bit_packer,
write,
min_value,
num_vals: 0,
amplitude,
num_bits,
})
@@ -77,42 +70,37 @@ impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> {
let val_to_write: u64 = val - self.min_value;
self.bit_packer
.write(val_to_write, self.num_bits, &mut self.write)?;
self.num_vals += 1;
Ok(())
}
pub fn close_field(mut self) -> io::Result<()> {
self.bit_packer.close(&mut self.write)?;
self.min_value.serialize(&mut self.write)?;
self.amplitude.serialize(&mut self.write)?;
self.num_vals.serialize(&mut self.write)?;
Ok(())
}
}
pub struct BitpackedCodec;
pub struct BitpackedFastFieldCodec;
impl FastFieldCodec for BitpackedCodec {
/// The CODEC_TYPE is an enum value used for serialization.
const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Bitpacked;
impl FastFieldCodec for BitpackedFastFieldCodec {
const NAME: &'static str = "Bitpacked";
type Reader = BitpackedReader;
type Reader = BitpackedFastFieldReader;
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
let footer_offset = bytes.len() - 24;
let footer_offset = bytes.len() - 16;
let (data, mut footer) = bytes.split(footer_offset);
let min_value = u64::deserialize(&mut footer)?;
let amplitude = u64::deserialize(&mut footer)?;
let num_vals = u64::deserialize(&mut footer)?;
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(BitpackedReader {
Ok(BitpackedFastFieldReader {
data,
bit_unpacker,
min_value_u64: min_value,
max_value_u64: max_value,
num_vals,
bit_unpacker,
})
}
@@ -124,41 +112,45 @@ impl FastFieldCodec for BitpackedCodec {
/// It requires a `min_value` and a `max_value` to compute
/// compute the minimum number of bits required to encode
/// values.
fn serialize(write: &mut impl Write, fastfield_accessor: &dyn Column) -> io::Result<()> {
let mut serializer = BitpackedSerializerLegacy::open(
write,
fastfield_accessor.min_value(),
fastfield_accessor.max_value(),
)?;
fn serialize(
&self,
write: &mut impl io::Write,
vals: &[u64],
stats: FastFieldStats,
) -> io::Result<()> {
let mut serializer =
BitpackedFastFieldSerializerLegacy::open(write, stats.min_value, stats.max_value)?;
for val in fastfield_accessor.iter() {
for &val in vals {
serializer.add_val(val)?;
}
serializer.close_field()?;
Ok(())
}
fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
let amplitude = fastfield_accessor.max_value() - fastfield_accessor.min_value();
fn is_applicable(_vals: &[u64], _stats: FastFieldStats) -> bool {
true
}
fn estimate(_vals: &[u64], stats: FastFieldStats) -> f32 {
let amplitude = stats.max_value - stats.min_value;
let num_bits = compute_num_bits(amplitude);
let num_bits_uncompressed = 64;
Some(num_bits as f32 / num_bits_uncompressed as f32)
num_bits as f32 / num_bits_uncompressed as f32
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tests::get_codec_test_datasets;
use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) {
crate::tests::create_and_validate::<BitpackedCodec>(data, name);
crate::tests::create_and_validate(&BitpackedFastFieldCodec, data, name);
}
#[test]
fn test_with_codec_data_sets() {
let data_sets = get_codec_test_datasets();
let data_sets = get_codec_test_data_sets();
for (mut data, name) in data_sets {
create_and_validate(&data, name);
data.reverse();

View File

@@ -1,49 +0,0 @@
pub trait Column<T = u64> {
/// Return the value associated to the given idx.
///
/// This accessor should return as fast as possible.
///
/// # Panics
///
/// May panic if `idx` is greater than the column length.
fn get_val(&self, idx: u64) -> T;
/// Fills an output buffer with the fast field values
/// associated with the `DocId` going from
/// `start` to `start + output.len()`.
///
/// Regardless of the type of `Item`, this method works
/// - transmuting the output array
/// - extracting the `Item`s as if they were `u64`
/// - possibly converting the `u64` value to the right type.
///
/// # Panics
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
fn get_range(&self, start: u64, output: &mut [T]) {
for (out, idx) in output.iter_mut().zip(start..) {
*out = self.get_val(idx);
}
}
/// Returns the minimum value for this fast field.
///
/// The min value does not take in account of possible
/// deleted document, and should be considered as a lower bound
/// of the actual minimum value.
fn min_value(&self) -> T;
/// Returns the maximum value for this fast field.
///
/// The max value does not take in account of possible
/// deleted document, and should be considered as an upper bound
/// of the actual maximum value
fn max_value(&self) -> T;
fn num_vals(&self) -> u64;
/// Returns a iterator over the data
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = T> + 'a> {
Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
}
}

View File

@@ -0,0 +1,254 @@
// Copyright (C) 2022 Quickwit, Inc.
//
// Quickwit is offered under the AGPL v3.0 and as commercial software.
// For commercial licensing, contact us at hello@quickwit.io.
//
// AGPL:
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
use std::io;
use std::num::NonZeroU64;
use std::sync::Arc;
use common::BinarySerializable;
use fastdivide::DividerU64;
use ownedbytes::OwnedBytes;
use crate::bitpacked::BitpackedFastFieldCodec;
use crate::gcd::{find_gcd, GCDFastFieldCodecReader, GCDParams};
use crate::linearinterpol::LinearInterpolCodec;
use crate::multilinearinterpol::MultiLinearInterpolFastFieldCodec;
use crate::{FastFieldCodec, FastFieldCodecReader, FastFieldStats};
pub struct DynamicFastFieldCodec;
impl FastFieldCodec for DynamicFastFieldCodec {
const NAME: &'static str = "dynamic";
type Reader = DynamicFastFieldReader;
fn is_applicable(_vals: &[u64], _stats: crate::FastFieldStats) -> bool {
true
}
fn estimate(_vals: &[u64], _stats: crate::FastFieldStats) -> f32 {
0f32
}
fn serialize(
&self,
wrt: &mut impl io::Write,
vals: &[u64],
stats: crate::FastFieldStats,
) -> io::Result<()> {
let gcd: NonZeroU64 = find_gcd(vals.iter().copied().map(|val| val - stats.min_value))
.unwrap_or(unsafe { NonZeroU64::new_unchecked(1) });
if gcd.get() > 1 {
let gcd_divider = DividerU64::divide_by(gcd.get());
let scaled_vals: Vec<u64> = vals
.iter()
.copied()
.map(|val| gcd_divider.divide(val - stats.min_value))
.collect();
<CodecType as BinarySerializable>::serialize(&CodecType::Gcd, wrt)?;
let gcd_params = GCDParams {
min_value: stats.min_value,
gcd,
};
gcd_params.serialize(wrt)?;
let codec_type = choose_codec(stats, &scaled_vals);
<CodecType as BinarySerializable>::serialize(&codec_type, wrt)?;
let scaled_stats = FastFieldStats::compute(&scaled_vals);
codec_type.serialize(wrt, &scaled_vals, scaled_stats)?;
} else {
let codec_type = choose_codec(stats, vals);
wrt.write_all(&[codec_type.to_code()])?;
codec_type.serialize(wrt, vals, stats)?;
}
Ok(())
}
fn open_from_bytes(mut bytes: OwnedBytes) -> io::Result<Self::Reader> {
let codec_code = bytes.read_u8();
let codec_type = CodecType::from_code(codec_code).ok_or_else(|| {
io::Error::new(
io::ErrorKind::InvalidData,
format!("Unknown codec code `{codec_code}`"),
)
})?;
let fast_field_reader: Arc<dyn FastFieldCodecReader> = match codec_type {
CodecType::Bitpacked => Arc::new(BitpackedFastFieldCodec::open_from_bytes(bytes)?),
CodecType::LinearInterpol => Arc::new(LinearInterpolCodec::open_from_bytes(bytes)?),
CodecType::MultiLinearInterpol => {
Arc::new(MultiLinearInterpolFastFieldCodec::open_from_bytes(bytes)?)
}
CodecType::Gcd => {
let gcd_params = GCDParams::deserialize(&mut bytes)?;
let inner_codec_type = <CodecType as BinarySerializable>::deserialize(&mut bytes)?;
match inner_codec_type {
CodecType::Bitpacked => Arc::new(GCDFastFieldCodecReader {
params: gcd_params,
reader: BitpackedFastFieldCodec::open_from_bytes(bytes)?,
}),
CodecType::LinearInterpol => Arc::new(GCDFastFieldCodecReader {
params: gcd_params,
reader: LinearInterpolCodec::open_from_bytes(bytes)?,
}),
CodecType::MultiLinearInterpol => Arc::new(GCDFastFieldCodecReader {
params: gcd_params,
reader: MultiLinearInterpolFastFieldCodec::open_from_bytes(bytes)?,
}),
CodecType::Gcd => {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"A GCD codec may not wrap another GCD codec.",
));
}
}
}
};
Ok(DynamicFastFieldReader(fast_field_reader))
}
}
#[derive(Clone)]
/// DynamicFastFieldReader wraps different readers to access
/// the various encoded fastfield data
pub struct DynamicFastFieldReader(Arc<dyn FastFieldCodecReader>);
#[repr(u8)]
#[derive(Debug, Clone, Copy)]
pub enum CodecType {
Bitpacked = 0,
LinearInterpol = 1,
MultiLinearInterpol = 2,
Gcd = 3,
}
impl BinarySerializable for CodecType {
fn serialize<W: io::Write>(&self, wrt: &mut W) -> io::Result<()> {
wrt.write_all(&[self.to_code()])?;
Ok(())
}
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let codec_code = u8::deserialize(reader)?;
let codec_type = CodecType::from_code(codec_code).ok_or_else(|| {
io::Error::new(
io::ErrorKind::InvalidData,
format!("Invalid codec type code {codec_code}"),
)
})?;
Ok(codec_type)
}
}
impl CodecType {
pub fn from_code(code: u8) -> Option<Self> {
match code {
0 => Some(CodecType::Bitpacked),
1 => Some(CodecType::LinearInterpol),
2 => Some(CodecType::MultiLinearInterpol),
3 => Some(CodecType::Gcd),
_ => None,
}
}
pub fn to_code(self) -> u8 {
self as u8
}
fn codec_estimation(
&self,
stats: FastFieldStats,
vals: &[u64],
estimations: &mut Vec<(f32, CodecType)>,
) {
let estimate_opt: Option<f32> = match self {
CodecType::Bitpacked => codec_estimation::<BitpackedFastFieldCodec>(stats, vals),
CodecType::LinearInterpol => codec_estimation::<LinearInterpolCodec>(stats, vals),
CodecType::MultiLinearInterpol => {
codec_estimation::<MultiLinearInterpolFastFieldCodec>(stats, vals)
}
CodecType::Gcd => None,
};
if let Some(estimate) = estimate_opt {
if !estimate.is_nan() && estimate.is_finite() {
estimations.push((estimate, *self));
}
}
}
fn serialize(
&self,
wrt: &mut impl io::Write,
fastfield_accessor: &[u64],
stats: FastFieldStats,
) -> io::Result<()> {
match self {
CodecType::Bitpacked => {
BitpackedFastFieldCodec.serialize(wrt, fastfield_accessor, stats)?;
}
CodecType::LinearInterpol => {
LinearInterpolCodec.serialize(wrt, fastfield_accessor, stats)?;
}
CodecType::MultiLinearInterpol => {
MultiLinearInterpolFastFieldCodec.serialize(wrt, fastfield_accessor, stats)?;
}
CodecType::Gcd => {
panic!("GCD should never be called that way.");
}
}
Ok(())
}
}
impl FastFieldCodecReader for DynamicFastFieldReader {
fn get_u64(&self, doc: u64) -> u64 {
self.0.get_u64(doc)
}
fn min_value(&self) -> u64 {
self.0.min_value()
}
fn max_value(&self) -> u64 {
self.0.max_value()
}
}
fn codec_estimation<T: FastFieldCodec>(stats: FastFieldStats, vals: &[u64]) -> Option<f32> {
if !T::is_applicable(vals, stats.clone()) {
return None;
}
let ratio = T::estimate(vals, stats);
Some(ratio)
}
const CODEC_TYPES: [CodecType; 3] = [
CodecType::Bitpacked,
CodecType::LinearInterpol,
CodecType::MultiLinearInterpol,
];
fn choose_codec(stats: FastFieldStats, vals: &[u64]) -> CodecType {
let mut estimations = Vec::new();
for codec_type in &CODEC_TYPES {
codec_type.codec_estimation(stats, vals, &mut estimations);
}
estimations.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
let (_ratio, codec_type) = estimations[0];
codec_type
}

247
fastfield_codecs/src/gcd.rs Normal file
View File

@@ -0,0 +1,247 @@
use std::io::{self, Write};
use std::num::NonZeroU64;
use common::BinarySerializable;
use fastdivide::DividerU64;
use crate::FastFieldCodecReader;
/// Wrapper for accessing a fastfield.
///
/// Holds the data and the codec to the read the data.
#[derive(Clone)]
pub struct GCDFastFieldCodecReader<CodecReader> {
pub params: GCDParams,
pub reader: CodecReader,
}
impl<C: FastFieldCodecReader> FastFieldCodecReader for GCDFastFieldCodecReader<C> {
#[inline]
fn get_u64(&self, doc: u64) -> u64 {
self.params.min_value + self.params.gcd.get() * self.reader.get_u64(doc)
}
fn min_value(&self) -> u64 {
self.params.min_value + self.params.gcd.get() * self.reader.min_value()
}
fn max_value(&self) -> u64 {
self.params.min_value + self.params.gcd.get() * self.reader.max_value()
}
}
#[derive(Debug, Copy, Clone)]
pub struct GCDParams {
pub min_value: u64,
pub gcd: NonZeroU64,
}
impl BinarySerializable for GCDParams {
fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
self.gcd.get().serialize(wrt)?;
self.min_value.serialize(wrt)?;
Ok(())
}
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let gcd = NonZeroU64::new(u64::deserialize(reader)?)
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "GCD=0 is invalid."))?;
let min_value = u64::deserialize(reader)?;
Ok(GCDParams { min_value, gcd })
}
}
fn compute_gcd(mut left: u64, mut right: u64) -> u64 {
while right != 0 {
(left, right) = (right, left % right);
}
left
}
// Find GCD for iterator of numbers
//
// If all numbers are '0' (or if there are not numbers, return None).
pub fn find_gcd(numbers: impl Iterator<Item = u64>) -> Option<NonZeroU64> {
let mut numbers = numbers.filter(|n| *n != 0);
let mut gcd = numbers.next()?;
if gcd == 1 {
return NonZeroU64::new(gcd);
}
let mut gcd_divider = DividerU64::divide_by(gcd);
for val in numbers {
let remainder = val - gcd_divider.divide(val) * gcd;
if remainder == 0 {
continue;
}
gcd = compute_gcd(gcd, val);
if gcd == 1 {
return NonZeroU64::new(1);
}
gcd_divider = DividerU64::divide_by(gcd);
}
NonZeroU64::new(gcd)
}
#[cfg(test)]
mod tests {
// TODO Move test
//
// use std::collections::HashMap;
// use std::path::Path;
//
// use crate::directory::{CompositeFile, RamDirectory, WritePtr};
// use crate::fastfield::serializer::FastFieldCodecEnableCheck;
// use crate::fastfield::tests::{FIELD, FIELDI64, SCHEMA, SCHEMAI64};
// use super::{
// find_gcd, CompositeFastFieldSerializer, DynamicFastFieldReader, FastFieldCodecName,
// FastFieldReader, FastFieldsWriter, ALL_CODECS,
// };
// use crate::schema::Schema;
// use crate::Directory;
//
// fn get_index(
// docs: &[crate::Document],
// schema: &Schema,
// codec_enable_checker: FastFieldCodecEnableCheck,
// ) -> crate::Result<RamDirectory> {
// let directory: RamDirectory = RamDirectory::create();
// {
// let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
// let mut serializer =
// CompositeFastFieldSerializer::from_write_with_codec(write, codec_enable_checker)
// .unwrap();
// let mut fast_field_writers = FastFieldsWriter::from_schema(schema);
// for doc in docs {
// fast_field_writers.add_document(doc);
// }
// fast_field_writers
// .serialize(&mut serializer, &HashMap::new(), None)
// .unwrap();
// serializer.close().unwrap();
// }
// Ok(directory)
// }
//
// fn test_fastfield_gcd_i64_with_codec(
// codec_name: FastFieldCodecName,
// num_vals: usize,
// ) -> crate::Result<()> {
// let path = Path::new("test");
// let mut docs = vec![];
// for i in 1..=num_vals {
// let val = i as i64 * 1000i64;
// docs.push(doc!(*FIELDI64=>val));
// }
// let directory = get_index(&docs, &SCHEMAI64, codec_name.clone().into())?;
// let file = directory.open_read(path).unwrap();
// assert_eq!(file.len(), 118);
// let composite_file = CompositeFile::open(&file)?;
// let file = composite_file.open_read(*FIELD).unwrap();
// let fast_field_reader = DynamicFastFieldReader::<i64>::open(file)?;
// assert_eq!(fast_field_reader.get(0), 1000i64);
// assert_eq!(fast_field_reader.get(1), 2000i64);
// assert_eq!(fast_field_reader.get(2), 3000i64);
// assert_eq!(fast_field_reader.max_value(), num_vals as i64 * 1000);
// assert_eq!(fast_field_reader.min_value(), 1000i64);
// let file = directory.open_read(path).unwrap();
//
// Can't apply gcd
// let path = Path::new("test");
// docs.pop();
// docs.push(doc!(*FIELDI64=>2001i64));
// let directory = get_index(&docs, &SCHEMAI64, codec_name.into())?;
// let file2 = directory.open_read(path).unwrap();
// assert!(file2.len() > file.len());
//
// Ok(())
// }
//
// #[test]
// fn test_fastfield_gcd_i64() -> crate::Result<()> {
// for codec_name in ALL_CODECS {
// test_fastfield_gcd_i64_with_codec(codec_name.clone(), 5005)?;
// }
// Ok(())
// }
//
// fn test_fastfield_gcd_u64_with_codec(
// codec_name: FastFieldCodecName,
// num_vals: usize,
// ) -> crate::Result<()> {
// let path = Path::new("test");
// let mut docs = vec![];
// for i in 1..=num_vals {
// let val = i as u64 * 1000u64;
// docs.push(doc!(*FIELD=>val));
// }
// let directory = get_index(&docs, &SCHEMA, codec_name.clone().into())?;
// let file = directory.open_read(path).unwrap();
// assert_eq!(file.len(), 118);
// let composite_file = CompositeFile::open(&file)?;
// let file = composite_file.open_read(*FIELD).unwrap();
// let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
// assert_eq!(fast_field_reader.get(0), 1000u64);
// assert_eq!(fast_field_reader.get(1), 2000u64);
// assert_eq!(fast_field_reader.get(2), 3000u64);
// assert_eq!(fast_field_reader.max_value(), num_vals as u64 * 1000);
// assert_eq!(fast_field_reader.min_value(), 1000u64);
// let file = directory.open_read(path).unwrap();
//
// Can't apply gcd
// let path = Path::new("test");
// docs.pop();
// docs.push(doc!(*FIELDI64=>2001u64));
// let directory = get_index(&docs, &SCHEMA, codec_name.into())?;
// let file2 = directory.open_read(path).unwrap();
// assert!(file2.len() > file.len());
//
// Ok(())
// }
//
// #[test]
// fn test_fastfield_gcd_u64() -> crate::Result<()> {
// for codec_name in ALL_CODECS {
// test_fastfield_gcd_u64_with_codec(codec_name.clone(), 5005)?;
// }
// Ok(())
// }
//
// #[test]
// pub fn test_fastfield2() {
// let test_fastfield = DynamicFastFieldReader::<u64>::from(vec![100, 200, 300]);
// assert_eq!(test_fastfield.get(0), 100);
// assert_eq!(test_fastfield.get(1), 200);
// assert_eq!(test_fastfield.get(2), 300);
// }
use std::num::NonZeroU64;
use crate::gcd::{compute_gcd, find_gcd};
#[test]
fn test_compute_gcd() {
assert_eq!(compute_gcd(0, 0), 0);
assert_eq!(compute_gcd(4, 0), 4);
assert_eq!(compute_gcd(0, 4), 4);
assert_eq!(compute_gcd(1, 4), 1);
assert_eq!(compute_gcd(4, 1), 1);
assert_eq!(compute_gcd(4, 2), 2);
assert_eq!(compute_gcd(10, 25), 5);
assert_eq!(compute_gcd(25, 10), 5);
assert_eq!(compute_gcd(25, 25), 25);
}
#[test]
fn find_gcd_test() {
assert_eq!(find_gcd([0].into_iter()), None);
assert_eq!(find_gcd([0, 10].into_iter()), NonZeroU64::new(10));
assert_eq!(find_gcd([10, 0].into_iter()), NonZeroU64::new(10));
assert_eq!(find_gcd([].into_iter()), None);
assert_eq!(find_gcd([15, 30, 5, 10].into_iter()), NonZeroU64::new(5));
assert_eq!(find_gcd([15, 16, 10].into_iter()), NonZeroU64::new(1));
assert_eq!(find_gcd([0, 5, 5, 5].into_iter()), NonZeroU64::new(5));
assert_eq!(find_gcd([0, 0].into_iter()), None);
}
}

View File

@@ -3,189 +3,125 @@
extern crate more_asserts;
use std::io;
use std::io::Write;
use common::BinarySerializable;
use ownedbytes::OwnedBytes;
pub mod bitpacked;
pub mod blockwise_linear;
pub mod linear;
pub mod dynamic;
pub mod gcd;
pub mod linearinterpol;
pub mod multilinearinterpol;
mod column;
// Unify with FastFieldReader
pub use self::column::Column;
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
#[repr(u8)]
pub enum FastFieldCodecType {
Bitpacked = 1,
Linear = 2,
BlockwiseLinear = 3,
Gcd = 4,
}
impl BinarySerializable for FastFieldCodecType {
fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
self.to_code().serialize(wrt)
}
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let code = u8::deserialize(reader)?;
let codec_type: Self = Self::from_code(code)
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code `{code}.`"))?;
Ok(codec_type)
}
}
impl FastFieldCodecType {
pub fn to_code(self) -> u8 {
self as u8
}
pub fn from_code(code: u8) -> Option<Self> {
match code {
1 => Some(Self::Bitpacked),
2 => Some(Self::Linear),
3 => Some(Self::BlockwiseLinear),
4 => Some(Self::Gcd),
_ => None,
}
}
pub trait FastFieldCodecReader {
/// reads the metadata and returns the CodecReader
fn get_u64(&self, doc: u64) -> u64;
fn min_value(&self) -> u64;
fn max_value(&self) -> u64;
}
/// The FastFieldSerializerEstimate trait is required on all variants
/// of fast field compressions, to decide which one to choose.
pub trait FastFieldCodec {
/// A codex needs to provide a unique name and id, which is
/// used for debugging and de/serialization.
const CODEC_TYPE: FastFieldCodecType;
/// A codex needs to provide a unique name used for debugging.
const NAME: &'static str;
type Reader: Column<u64>;
type Reader: FastFieldCodecReader;
/// Reads the metadata and returns the CodecReader
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader>;
/// Serializes the data using the serializer into write.
///
/// The fastfield_accessor iterator should be preferred over using fastfield_accessor for
/// performance reasons.
fn serialize(write: &mut impl Write, fastfield_accessor: &dyn Column<u64>) -> io::Result<()>;
/// Check if the Codec is able to compress the data
fn is_applicable(vals: &[u64], stats: FastFieldStats) -> bool;
/// Returns an estimate of the compression ratio.
/// If the codec is not applicable, returns `None`.
///
/// The baseline is uncompressed 64bit data.
///
/// It could make sense to also return a value representing
/// computational complexity.
fn estimate(fastfield_accessor: &impl Column) -> Option<f32>;
fn estimate(vals: &[u64], stats: FastFieldStats) -> f32;
/// Serializes the data using the serializer into write.
/// There are multiple iterators, in case the codec needs to read the data multiple times.
/// The iterators should be preferred over using fastfield_accessor for performance reasons.
fn serialize(
&self,
write: &mut impl io::Write,
vals: &[u64],
stats: FastFieldStats,
) -> io::Result<()>;
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader>;
}
#[derive(Debug, Clone)]
/// Statistics are used in codec detection and stored in the fast field footer.
#[derive(Clone, Copy, Default, Debug)]
pub struct FastFieldStats {
pub min_value: u64,
pub max_value: u64,
pub num_vals: u64,
}
struct VecColum<'a>(&'a [u64]);
impl<'a> Column for VecColum<'a> {
fn get_val(&self, position: u64) -> u64 {
self.0[position as usize]
impl FastFieldStats {
pub fn compute(vals: &[u64]) -> Self {
if vals.is_empty() {
return FastFieldStats::default();
}
let first_val = vals[0];
let mut fast_field_stats = FastFieldStats {
min_value: first_val,
max_value: first_val,
num_vals: 1,
};
for &val in &vals[1..] {
fast_field_stats.record(val);
}
fast_field_stats
}
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
Box::new(self.0.iter().cloned())
}
fn min_value(&self) -> u64 {
self.0.iter().min().cloned().unwrap_or(0)
}
fn max_value(&self) -> u64 {
self.0.iter().max().cloned().unwrap_or(0)
}
fn num_vals(&self) -> u64 {
self.0.len() as u64
}
}
impl<'a> From<&'a [u64]> for VecColum<'a> {
fn from(data: &'a [u64]) -> Self {
Self(data)
pub fn record(&mut self, val: u64) {
self.num_vals += 1;
self.min_value = self.min_value.min(val);
self.max_value = self.max_value.max(val);
}
}
#[cfg(test)]
mod tests {
use proptest::prelude::*;
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest};
use crate::bitpacked::BitpackedFastFieldCodec;
use crate::linearinterpol::LinearInterpolCodec;
use crate::multilinearinterpol::MultiLinearInterpolFastFieldCodec;
use crate::bitpacked::BitpackedCodec;
use crate::blockwise_linear::BlockwiseLinearCodec;
use crate::linear::LinearCodec;
pub fn create_and_validate<Codec: FastFieldCodec>(
pub fn create_and_validate<S: FastFieldCodec>(
codec: &S,
data: &[u64],
name: &str,
) -> Option<(f32, f32)> {
let estimation = Codec::estimate(&VecColum::from(data))?;
) -> (f32, f32) {
if !S::is_applicable(&data, crate::tests::stats_from_vec(data)) {
return (f32::MAX, 0.0);
}
let estimation = S::estimate(&data, crate::tests::stats_from_vec(data));
let mut out: Vec<u8> = Vec::new();
Codec::serialize(&mut out, &VecColum::from(data)).unwrap();
codec
.serialize(&mut out, &data, crate::tests::stats_from_vec(data))
.unwrap();
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
let reader = Codec::open_from_bytes(OwnedBytes::new(out)).unwrap();
assert_eq!(reader.num_vals(), data.len() as u64);
for (doc, orig_val) in data.iter().copied().enumerate() {
let val = reader.get_val(doc as u64);
assert_eq!(
val, orig_val,
"val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data \
`{data:?}`",
);
let reader = S::open_from_bytes(OwnedBytes::new(out)).unwrap();
for (doc, orig_val) in data.iter().enumerate() {
let val = reader.get_u64(doc as u64);
if val != *orig_val {
panic!(
"val {:?} does not match orig_val {:?}, in data set {}, data {:?}",
val, orig_val, name, data
);
}
}
Some((estimation, actual_compression))
(estimation, actual_compression)
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(100))]
#[test]
fn test_proptest_small(data in proptest::collection::vec(num_strategy(), 1..10)) {
create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
}
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(10))]
#[test]
fn test_proptest_large(data in proptest::collection::vec(num_strategy(), 1..6000)) {
create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
}
}
fn num_strategy() -> impl Strategy<Value = u64> {
prop_oneof![
1 => prop::num::u64::ANY.prop_map(|num| u64::MAX - (num % 10) ),
1 => prop::num::u64::ANY.prop_map(|num| num % 10 ),
20 => prop::num::u64::ANY,
]
}
pub fn get_codec_test_datasets() -> Vec<(Vec<u64>, &'static str)> {
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
let mut data_and_names = vec![];
let data = (10..=10_000_u64).collect::<Vec<_>>();
let data = (10..=20_u64).collect::<Vec<_>>();
data_and_names.push((data, "simple monotonically increasing"));
data_and_names.push((
@@ -195,93 +131,88 @@ mod tests {
data_and_names.push((vec![5, 50, 3, 13, 1, 1000, 35], "rand small"));
data_and_names.push((vec![10], "single value"));
data_and_names.push((
vec![1572656989877777, 1170935903116329, 720575940379279, 0],
"overflow error",
));
data_and_names
}
fn test_codec<C: FastFieldCodec>() {
let codec_name = format!("{:?}", C::CODEC_TYPE);
for (data, dataset_name) in get_codec_test_datasets() {
let estimate_actual_opt: Option<(f32, f32)> =
crate::tests::create_and_validate::<C>(&data, dataset_name);
let result = if let Some((estimate, actual)) = estimate_actual_opt {
format!("Estimate `{estimate}` Actual `{actual}`")
} else {
fn test_codec<C: FastFieldCodec>(codec: &C) {
let codec_name = C::NAME;
for (data, data_set_name) in get_codec_test_data_sets() {
let (estimate, actual) = crate::tests::create_and_validate(codec, &data, data_set_name);
let result = if estimate == f32::MAX {
"Disabled".to_string()
} else {
format!("Estimate {:?} Actual {:?} ", estimate, actual)
};
println!("Codec {codec_name}, DataSet {dataset_name}, {result}");
println!(
"Codec {}, DataSet {}, {}",
codec_name, data_set_name, result
);
}
}
#[test]
fn test_codec_bitpacking() {
test_codec::<BitpackedCodec>();
test_codec(&BitpackedFastFieldCodec);
}
#[test]
fn test_codec_interpolation() {
test_codec::<LinearCodec>();
test_codec(&LinearInterpolCodec);
}
#[test]
fn test_codec_multi_interpolation() {
test_codec::<BlockwiseLinearCodec>();
test_codec(&MultiLinearInterpolFastFieldCodec);
}
use super::*;
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
let min_value = data.iter().cloned().min().unwrap_or(0);
let max_value = data.iter().cloned().max().unwrap_or(0);
FastFieldStats {
min_value,
max_value,
num_vals: data.len() as u64,
}
}
#[test]
fn estimation_good_interpolation_case() {
let data = (10..=20000_u64).collect::<Vec<_>>();
let data: VecColum = data.as_slice().into();
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
let linear_interpol_estimation =
LinearInterpolCodec::estimate(&data, stats_from_vec(&data));
assert_le!(linear_interpol_estimation, 0.01);
let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data).unwrap();
let multi_linear_interpol_estimation =
MultiLinearInterpolFastFieldCodec::estimate(&&data[..], stats_from_vec(&data));
assert_le!(multi_linear_interpol_estimation, 0.2);
assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation);
let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
let bitpacked_estimation = BitpackedFastFieldCodec::estimate(&data, stats_from_vec(&data));
assert_le!(linear_interpol_estimation, bitpacked_estimation);
}
#[test]
fn estimation_test_bad_interpolation_case() {
let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20];
let data = vec![200, 10, 10, 10, 10, 1000, 20];
let data: VecColum = data.into();
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
let linear_interpol_estimation =
LinearInterpolCodec::estimate(&data, stats_from_vec(&data));
assert_le!(linear_interpol_estimation, 0.32);
let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
let bitpacked_estimation = BitpackedFastFieldCodec::estimate(&data, stats_from_vec(&data));
assert_le!(bitpacked_estimation, linear_interpol_estimation);
}
#[test]
fn estimation_test_bad_interpolation_case_monotonically_increasing() {
let mut data: Vec<u64> = (200..=20000_u64).collect();
let mut data = (200..=20000_u64).collect::<Vec<_>>();
data.push(1_000_000);
let data: VecColum = data.as_slice().into();
// in this case the linear interpolation can't in fact not be worse than bitpacking,
// but the estimator adds some threshold, which leads to estimated worse behavior
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
let linear_interpol_estimation =
LinearInterpolCodec::estimate(&data, stats_from_vec(&data));
assert_le!(linear_interpol_estimation, 0.35);
let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
let bitpacked_estimation = BitpackedFastFieldCodec::estimate(&data, stats_from_vec(&data));
assert_le!(bitpacked_estimation, 0.32);
assert_le!(bitpacked_estimation, linear_interpol_estimation);
}
#[test]
fn test_fast_field_codec_type_to_code() {
let mut count_codec = 0;
for code in 0..=255 {
if let Some(codec_type) = FastFieldCodecType::from_code(code) {
assert_eq!(codec_type.to_code(), code);
count_codec += 1;
}
}
assert_eq!(count_codec, 4);
}
}

View File

@@ -5,20 +5,20 @@ use common::{BinarySerializable, FixedSize};
use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::{Column, FastFieldCodec, FastFieldCodecType};
use crate::{FastFieldCodec, FastFieldCodecReader, FastFieldStats};
/// Depending on the field type, a different
/// fast field is required.
#[derive(Clone)]
pub struct LinearReader {
pub struct LinearInterpolFastFieldReader {
data: OwnedBytes,
bit_unpacker: BitUnpacker,
pub footer: LinearFooter,
pub footer: LinearInterpolFooter,
pub slope: f32,
}
#[derive(Clone, Debug)]
pub struct LinearFooter {
pub struct LinearInterpolFooter {
pub relative_max_value: u64,
pub offset: u64,
pub first_val: u64,
@@ -28,7 +28,7 @@ pub struct LinearFooter {
pub max_value: u64,
}
impl BinarySerializable for LinearFooter {
impl BinarySerializable for LinearInterpolFooter {
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
self.relative_max_value.serialize(write)?;
self.offset.serialize(write)?;
@@ -40,8 +40,8 @@ impl BinarySerializable for LinearFooter {
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<LinearFooter> {
Ok(LinearFooter {
fn deserialize<R: Read>(reader: &mut R) -> io::Result<LinearInterpolFooter> {
Ok(LinearInterpolFooter {
relative_max_value: u64::deserialize(reader)?,
offset: u64::deserialize(reader)?,
first_val: u64::deserialize(reader)?,
@@ -53,13 +53,13 @@ impl BinarySerializable for LinearFooter {
}
}
impl FixedSize for LinearFooter {
impl FixedSize for LinearInterpolFooter {
const SIZE_IN_BYTES: usize = 56;
}
impl Column for LinearReader {
impl FastFieldCodecReader for LinearInterpolFastFieldReader {
#[inline]
fn get_val(&self, doc: u64) -> u64 {
fn get_u64(&self, doc: u64) -> u64 {
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
(calculated_value + self.bit_unpacker.get(doc, &self.data)) - self.footer.offset
}
@@ -72,69 +72,41 @@ impl Column for LinearReader {
fn max_value(&self) -> u64 {
self.footer.max_value
}
#[inline]
fn num_vals(&self) -> u64 {
self.footer.num_vals
}
}
/// Fastfield serializer, which tries to guess values by linear interpolation
/// and stores the difference bitpacked.
pub struct LinearCodec;
pub struct LinearInterpolCodec;
#[inline]
pub(crate) fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
if num_vals <= 1 {
return 0.0;
}
// We calculate the slope with f64 high precision and use the result in lower precision f32
// This is done in order to handle estimations for very large values like i64::MAX
let diff = diff(last_val, first_val);
(diff / (num_vals - 1) as f64) as f32
}
/// Delay the cast, to improve precision for very large u64 values.
///
/// Since i64 is mapped monotonically to u64 space, 0i64 is after the mapping i64::MAX.
/// So very large values are not uncommon.
///
/// ```rust
/// let val1 = i64::MAX;
/// let val2 = i64::MAX - 100;
/// assert_eq!(val1 - val2, 100);
/// assert_eq!(val1 as f64 - val2 as f64, 0.0);
/// ```
fn diff(val1: u64, val2: u64) -> f64 {
if val1 >= val2 {
(val1 - val2) as f64
} else {
(val2 - val1) as f64 * -1.0
}
((last_val as f64 - first_val as f64) / (num_vals as u64 - 1) as f64) as f32
}
#[inline]
pub fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
if slope < 0.0 {
first_val.saturating_sub((pos as f32 * -slope) as u64)
} else {
first_val.saturating_add((pos as f32 * slope) as u64)
}
fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
first_val + (pos as f32 * slope) as u64
}
impl FastFieldCodec for LinearCodec {
const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Linear;
impl FastFieldCodec for LinearInterpolCodec {
const NAME: &'static str = "LinearInterpol";
type Reader = LinearReader;
type Reader = LinearInterpolFastFieldReader;
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
let footer_offset = bytes.len() - LinearFooter::SIZE_IN_BYTES;
let footer_offset = bytes.len() - LinearInterpolFooter::SIZE_IN_BYTES;
let (data, mut footer) = bytes.split(footer_offset);
let footer = LinearFooter::deserialize(&mut footer)?;
let footer = LinearInterpolFooter::deserialize(&mut footer)?;
let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals);
let num_bits = compute_num_bits(footer.relative_max_value);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(LinearReader {
Ok(LinearInterpolFastFieldReader {
data,
bit_unpacker,
footer,
@@ -143,16 +115,22 @@ impl FastFieldCodec for LinearCodec {
}
/// Creates a new fast field serializer.
fn serialize(write: &mut impl Write, fastfield_accessor: &dyn Column) -> io::Result<()> {
assert!(fastfield_accessor.min_value() <= fastfield_accessor.max_value());
fn serialize(
&self,
write: &mut impl Write,
vals: &[u64],
stats: FastFieldStats,
) -> io::Result<()> {
assert!(stats.min_value <= stats.max_value);
let first_val = fastfield_accessor.get_val(0);
let last_val = fastfield_accessor.get_val(fastfield_accessor.num_vals() as u64 - 1);
let slope = get_slope(first_val, last_val, fastfield_accessor.num_vals());
let first_val = vals[0];
let last_val = vals[vals.len() - 1];
let slope = get_slope(first_val, last_val, stats.num_vals);
// calculate offset to ensure all values are positive
let mut offset = 0;
let mut rel_positive_max = 0;
for (pos, actual_value) in fastfield_accessor.iter().enumerate() {
for (pos, actual_value) in vals.iter().copied().enumerate() {
let calculated_value = get_calculated_value(first_val, pos as u64, slope);
if calculated_value > actual_value {
// negative value we need to apply an offset
@@ -170,64 +148,62 @@ impl FastFieldCodec for LinearCodec {
let num_bits = compute_num_bits(relative_max_value);
let mut bit_packer = BitPacker::new();
for (pos, val) in fastfield_accessor.iter().enumerate() {
for (pos, val) in vals.iter().copied().enumerate() {
let calculated_value = get_calculated_value(first_val, pos as u64, slope);
let diff = (val + offset) - calculated_value;
bit_packer.write(diff, num_bits, write)?;
}
bit_packer.close(write)?;
let footer = LinearFooter {
let footer = LinearInterpolFooter {
relative_max_value,
offset,
first_val,
last_val,
num_vals: fastfield_accessor.num_vals(),
min_value: fastfield_accessor.min_value(),
max_value: fastfield_accessor.max_value(),
num_vals: stats.num_vals,
min_value: stats.min_value,
max_value: stats.max_value,
};
footer.serialize(write)?;
Ok(())
}
/// estimation for linear interpolation is hard because, you don't know
/// where the local maxima for the deviation of the calculated value are and
/// the offset to shift all values to >=0 is also unknown.
#[allow(clippy::question_mark)]
fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
if fastfield_accessor.num_vals() < 3 {
return None; // disable compressor for this case
fn is_applicable(_vals: &[u64], stats: FastFieldStats) -> bool {
if stats.num_vals < 3 {
return false; // disable compressor for this case
}
// On serialisation the offset is added to the actual value.
// We need to make sure this won't run into overflow calculation issues.
// For this we take the maximum theroretical offset and add this to the max value.
// If this doesn't overflow the algorithm should be fine
let theorethical_maximum_offset =
fastfield_accessor.max_value() - fastfield_accessor.min_value();
if fastfield_accessor
.max_value()
let theorethical_maximum_offset = stats.max_value - stats.min_value;
if stats
.max_value
.checked_add(theorethical_maximum_offset)
.is_none()
{
return None;
return false;
}
let first_val = fastfield_accessor.get_val(0);
let last_val = fastfield_accessor.get_val(fastfield_accessor.num_vals() as u64 - 1);
let slope = get_slope(first_val, last_val, fastfield_accessor.num_vals());
true
}
/// estimation for linear interpolation is hard because, you don't know
/// where the local maxima for the deviation of the calculated value are and
/// the offset to shift all values to >=0 is also unknown.
fn estimate(vals: &[u64], stats: FastFieldStats) -> f32 {
let first_val = vals[0];
let last_val = vals[vals.len() - 1];
let slope = get_slope(first_val, last_val, stats.num_vals);
// let's sample at 0%, 5%, 10% .. 95%, 100%
let num_vals = fastfield_accessor.num_vals() as f32 / 100.0;
let sample_positions = (0..20)
let num_vals = stats.num_vals as f32 / 100.0;
let sample_positions: Vec<usize> = (0..20)
.map(|pos| (num_vals * pos as f32 * 5.0) as usize)
.collect::<Vec<_>>();
let max_distance = sample_positions
.iter()
.into_iter()
.map(|pos| {
let calculated_value = get_calculated_value(first_val, *pos as u64, slope);
let actual_value = fastfield_accessor.get_val(*pos as u64);
let calculated_value = get_calculated_value(first_val, pos as u64, slope);
let actual_value = vals[pos];
distance(calculated_value, actual_value)
})
.max()
@@ -240,11 +216,10 @@ impl FastFieldCodec for LinearCodec {
//
let relative_max_value = (max_distance as f32 * 1.5) * 2.0;
let num_bits = compute_num_bits(relative_max_value as u64) as u64
* fastfield_accessor.num_vals()
+ LinearFooter::SIZE_IN_BYTES as u64;
let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
Some(num_bits as f32 / num_bits_uncompressed as f32)
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
+ LinearInterpolFooter::SIZE_IN_BYTES as u64;
let num_bits_uncompressed = 64 * stats.num_vals;
num_bits as f32 / num_bits_uncompressed as f32
}
}
@@ -259,48 +234,26 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
#[cfg(test)]
mod tests {
use rand::RngCore;
use super::*;
use crate::tests::get_codec_test_datasets;
use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) -> Option<(f32, f32)> {
crate::tests::create_and_validate::<LinearCodec>(data, name)
}
#[test]
fn get_calculated_value_test() {
// pos slope
assert_eq!(get_calculated_value(100, 10, 5.0), 150);
// neg slope
assert_eq!(get_calculated_value(100, 10, -5.0), 50);
// pos slope, very high values
assert_eq!(
get_calculated_value(i64::MAX as u64, 10, 5.0),
i64::MAX as u64 + 50
);
// neg slope, very high values
assert_eq!(
get_calculated_value(i64::MAX as u64, 10, -5.0),
i64::MAX as u64 - 50
);
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
crate::tests::create_and_validate(&LinearInterpolCodec, data, name)
}
#[test]
fn test_compression() {
let data = (10..=6_000_u64).collect::<Vec<_>>();
let (estimate, actual_compression) =
create_and_validate(&data, "simple monotonically large").unwrap();
create_and_validate(&data, "simple monotonically large");
assert!(actual_compression < 0.01);
assert!(estimate < 0.01);
}
#[test]
fn test_with_codec_datasets() {
let data_sets = get_codec_test_datasets();
fn test_with_codec_data_sets() {
let data_sets = get_codec_test_data_sets();
for (mut data, name) in data_sets {
create_and_validate(&data, name);
data.reverse();
@@ -317,13 +270,6 @@ mod tests {
create_and_validate(&data, "large amplitude");
}
#[test]
fn overflow_error_test() {
let data = vec![1572656989877777, 1170935903116329, 720575940379279, 0];
create_and_validate(&data, "overflow test");
}
#[test]
fn linear_interpol_fast_concave_data() {
let data = vec![0, 1, 2, 5, 8, 10, 20, 50];
@@ -343,10 +289,10 @@ mod tests {
#[test]
fn linear_interpol_fast_field_rand() {
let mut rng = rand::thread_rng();
for _ in 0..50 {
let mut data = (0..10_000).map(|_| rng.next_u64()).collect::<Vec<_>>();
for _ in 0..5000 {
let mut data = (0..50).map(|_| rand::random::<u64>()).collect::<Vec<_>>();
create_and_validate(&data, "random");
data.reverse();
create_and_validate(&data, "random");
}

View File

@@ -1,35 +1,11 @@
#[macro_use]
extern crate prettytable;
use fastfield_codecs::bitpacked::BitpackedCodec;
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::{Column, FastFieldCodec, FastFieldCodecType, FastFieldStats};
// use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
// use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
use fastfield_codecs::bitpacked::BitpackedFastFieldCodec;
use fastfield_codecs::{FastFieldCodec, FastFieldStats};
use prettytable::{Cell, Row, Table};
struct Data<'a>(&'a [u64]);
impl<'a> Column for Data<'a> {
fn get_val(&self, position: u64) -> u64 {
self.0[position as usize]
}
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
Box::new(self.0.iter().cloned())
}
fn min_value(&self) -> u64 {
*self.0.iter().min().unwrap_or(&0)
}
fn max_value(&self) -> u64 {
*self.0.iter().max().unwrap_or(&0)
}
fn num_vals(&self) -> u64 {
self.0.len() as u64
}
}
fn main() {
let mut table = Table::new();
@@ -37,32 +13,39 @@ fn main() {
table.add_row(row!["", "Compression Ratio", "Compression Estimation"]);
for (data, data_set_name) in get_codec_test_data_sets() {
let results: Vec<(f32, f32, FastFieldCodecType)> = [
serialize_with_codec::<LinearCodec>(&data),
serialize_with_codec::<BlockwiseLinearCodec>(&data),
serialize_with_codec::<BlockwiseLinearCodec>(&data),
serialize_with_codec::<BitpackedCodec>(&data),
]
.into_iter()
.flatten()
.collect();
let mut results = Vec::new();
// let res = serialize_with_codec::<LinearInterpolFastFieldSerializer>(&data);
// results.push(res);
// let res = serialize_with_codec::<MultiLinearInterpolFastFieldSerializer>(&data);
// results.push(res);
let res = serialize_with_codec(&BitpackedFastFieldCodec, &data);
results.push(res);
// let best_estimation_codec = results
//.iter()
//.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap())
//.unwrap();
let best_compression_ratio_codec = results
.iter()
.min_by(|&res1, &res2| res1.partial_cmp(res2).unwrap())
.min_by(|res1, res2| res1.partial_cmp(res2).unwrap())
.cloned()
.unwrap();
table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
for (est, comp, codec_type) in results {
let est_cell = est.to_string();
let ratio_cell = comp.to_string();
for (is_applicable, est, comp, name) in results {
let (est_cell, ratio_cell) = if !is_applicable {
("Codec Disabled".to_string(), "".to_string())
} else {
(est.to_string(), comp.to_string())
};
let style = if comp == best_compression_ratio_codec.1 {
"Fb"
} else {
""
};
table.add_row(Row::new(vec![
Cell::new(&format!("{codec_type:?}")).style_spec("bFg"),
Cell::new(name).style_spec("bFg"),
Cell::new(&ratio_cell).style_spec(style),
Cell::new(&est_cell).style_spec(""),
]));
@@ -107,15 +90,22 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
data_and_names
}
pub fn serialize_with_codec<C: FastFieldCodec>(
pub fn serialize_with_codec<S: FastFieldCodec>(
codec: &S,
data: &[u64],
) -> Option<(f32, f32, FastFieldCodecType)> {
let data = Data(data);
let estimation = C::estimate(&data)?;
let mut out = Vec::new();
C::serialize(&mut out, &data).unwrap();
let actual_compression = out.len() as f32 / (data.num_vals() * 8) as f32;
Some((estimation, actual_compression, C::CODEC_TYPE))
) -> (bool, f32, f32, &'static str) {
let is_applicable = S::is_applicable(&data, stats_from_vec(data));
if !is_applicable {
return (false, 0.0, 0.0, S::NAME);
}
let estimation = S::estimate(&data, stats_from_vec(data));
let mut out = vec![];
codec
.serialize(&mut out, &data, stats_from_vec(data))
.unwrap();
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
(true, estimation, actual_compression, S::NAME)
}
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {

View File

@@ -1,4 +1,4 @@
//! The BlockwiseLinear codec uses linear interpolation to guess a values and stores the
//! MultiLinearInterpol compressor uses linear interpolation to guess a values and stores the
//! offset, but in blocks of 512.
//!
//! With a CHUNK_SIZE of 512 and 29 byte metadata per block, we get a overhead for metadata of 232 /
@@ -17,17 +17,16 @@ use common::{BinarySerializable, CountingWriter, DeserializeFrom};
use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::linear::{get_calculated_value, get_slope};
use crate::{Column, FastFieldCodec, FastFieldCodecType};
use crate::{FastFieldCodec, FastFieldCodecReader, FastFieldStats};
const CHUNK_SIZE: u64 = 512;
const CHUNK_SIZE: usize = 512;
/// Depending on the field type, a different
/// fast field is required.
#[derive(Clone)]
pub struct BlockwiseLinearReader {
pub struct MultiLinearInterpolFastFieldReader {
data: OwnedBytes,
pub footer: BlockwiseLinearFooter,
pub footer: MultiLinearInterpolFooter,
}
#[derive(Clone, Debug, Default)]
@@ -102,14 +101,14 @@ impl BinarySerializable for Function {
}
#[derive(Clone, Debug)]
pub struct BlockwiseLinearFooter {
pub struct MultiLinearInterpolFooter {
pub num_vals: u64,
pub min_value: u64,
pub max_value: u64,
interpolations: Vec<Function>,
}
impl BinarySerializable for BlockwiseLinearFooter {
impl BinarySerializable for MultiLinearInterpolFooter {
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
let mut out = vec![];
self.num_vals.serialize(&mut out)?;
@@ -121,45 +120,35 @@ impl BinarySerializable for BlockwiseLinearFooter {
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<BlockwiseLinearFooter> {
let mut footer = BlockwiseLinearFooter {
fn deserialize<R: Read>(reader: &mut R) -> io::Result<MultiLinearInterpolFooter> {
let mut footer = MultiLinearInterpolFooter {
num_vals: u64::deserialize(reader)?,
min_value: u64::deserialize(reader)?,
max_value: u64::deserialize(reader)?,
interpolations: Vec::<Function>::deserialize(reader)?,
};
for (num, interpol) in footer.interpolations.iter_mut().enumerate() {
interpol.start_pos = CHUNK_SIZE * num as u64;
interpol.start_pos = (CHUNK_SIZE * num) as u64;
}
Ok(footer)
}
}
#[inline]
fn get_interpolation_position(doc: u64) -> usize {
let index = doc / CHUNK_SIZE;
index as usize
}
#[inline]
fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Function {
&interpolations[get_interpolation_position(doc)]
&interpolations[doc as usize / CHUNK_SIZE]
}
impl Column for BlockwiseLinearReader {
impl FastFieldCodecReader for MultiLinearInterpolFastFieldReader {
#[inline]
fn get_val(&self, idx: u64) -> u64 {
let interpolation = get_interpolation_function(idx, &self.footer.interpolations);
let in_block_idx = idx - interpolation.start_pos;
let calculated_value = get_calculated_value(
interpolation.value_start_pos,
in_block_idx,
interpolation.slope,
);
let diff = interpolation.bit_unpacker.get(
in_block_idx,
&self.data[interpolation.data_start_offset as usize..],
);
fn get_u64(&self, doc: u64) -> u64 {
let interpolation = get_interpolation_function(doc, &self.footer.interpolations);
let doc = doc - interpolation.start_pos;
let calculated_value =
get_calculated_value(interpolation.value_start_pos, doc, interpolation.slope);
let diff = interpolation
.bit_unpacker
.get(doc, &self.data[interpolation.data_start_offset as usize..]);
(calculated_value + diff) - interpolation.positive_val_offset
}
@@ -171,38 +160,49 @@ impl Column for BlockwiseLinearReader {
fn max_value(&self) -> u64 {
self.footer.max_value
}
#[inline]
fn num_vals(&self) -> u64 {
self.footer.num_vals
}
}
/// Same as LinearSerializer, but working on chunks of CHUNK_SIZE elements.
pub struct BlockwiseLinearCodec;
#[inline]
fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
((last_val as f64 - first_val as f64) / (num_vals as u64 - 1) as f64) as f32
}
impl FastFieldCodec for BlockwiseLinearCodec {
const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::BlockwiseLinear;
#[inline]
fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
(first_val as i64 + (pos as f32 * slope) as i64) as u64
}
type Reader = BlockwiseLinearReader;
/// Same as LinearInterpolFastFieldSerializer, but working on chunks of CHUNK_SIZE elements.
pub struct MultiLinearInterpolFastFieldCodec;
impl FastFieldCodec for MultiLinearInterpolFastFieldCodec {
const NAME: &'static str = "MultiLinearInterpol";
type Reader = MultiLinearInterpolFastFieldReader;
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
let footer_offset = bytes.len() - 4 - footer_len as usize;
let (data, mut footer) = bytes.split(footer_offset);
let footer = BlockwiseLinearFooter::deserialize(&mut footer)?;
Ok(BlockwiseLinearReader { data, footer })
let footer = MultiLinearInterpolFooter::deserialize(&mut footer)?;
Ok(MultiLinearInterpolFastFieldReader { data, footer })
}
/// Creates a new fast field serializer.
fn serialize(write: &mut impl Write, fastfield_accessor: &dyn Column) -> io::Result<()> {
assert!(fastfield_accessor.min_value() <= fastfield_accessor.max_value());
fn serialize(
&self,
write: &mut impl io::Write,
vals: &[u64],
stats: FastFieldStats,
) -> io::Result<()> {
assert!(stats.min_value <= stats.max_value);
let first_val = fastfield_accessor.get_val(0);
let last_val = fastfield_accessor.get_val(fastfield_accessor.num_vals() as u64 - 1);
let first_val = vals[0];
let last_val = vals[vals.len() - 1];
let mut first_function = Function {
end_pos: fastfield_accessor.num_vals(),
end_pos: stats.num_vals,
value_start_pos: first_val,
value_end_pos: last_val,
..Default::default()
@@ -210,16 +210,11 @@ impl FastFieldCodec for BlockwiseLinearCodec {
first_function.calc_slope();
let mut interpolations = vec![first_function];
// Since we potentially apply multiple passes over the data, the data is cached.
// Multiple iteration can be expensive (merge with index sorting can add lot of overhead per
// iteration)
let data = fastfield_accessor.iter().collect::<Vec<_>>();
//// let's split this into chunks of CHUNK_SIZE
for data_pos in (0..data.len() as u64).step_by(CHUNK_SIZE as usize).skip(1) {
for vals_pos in (0..vals.len()).step_by(CHUNK_SIZE).skip(1) {
let new_fun = {
let current_interpolation = interpolations.last_mut().unwrap();
current_interpolation.split(data_pos, data[data_pos as usize])
current_interpolation.split(vals_pos as u64, vals[vals_pos])
};
interpolations.push(new_fun);
}
@@ -227,7 +222,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
for interpolation in &mut interpolations {
let mut offset = 0;
let mut rel_positive_max = 0;
for (pos, actual_value) in data
for (pos, actual_value) in vals
[interpolation.start_pos as usize..interpolation.end_pos as usize]
.iter()
.cloned()
@@ -258,7 +253,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
for interpolation in &mut interpolations {
interpolation.data_start_offset = write.written_bytes();
let num_bits = interpolation.num_bits;
for (pos, actual_value) in data
for (pos, actual_value) in vals
[interpolation.start_pos as usize..interpolation.end_pos as usize]
.iter()
.cloned()
@@ -276,47 +271,46 @@ impl FastFieldCodec for BlockwiseLinearCodec {
}
bit_packer.close(write)?;
let footer = BlockwiseLinearFooter {
num_vals: fastfield_accessor.num_vals(),
min_value: fastfield_accessor.min_value(),
max_value: fastfield_accessor.max_value(),
let footer = MultiLinearInterpolFooter {
num_vals: stats.num_vals,
min_value: stats.min_value,
max_value: stats.max_value,
interpolations,
};
footer.serialize(write)?;
Ok(())
}
/// estimation for linear interpolation is hard because, you don't know
/// where the local maxima are for the deviation of the calculated value and
/// the offset is also unknown.
#[allow(clippy::question_mark)]
fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
if fastfield_accessor.num_vals() < 10 * CHUNK_SIZE {
return None;
fn is_applicable(_vals: &[u64], stats: FastFieldStats) -> bool {
if stats.num_vals < 5_000 {
return false;
}
// On serialization the offset is added to the actual value.
// We need to make sure this won't run into overflow calculation issues.
// For this we take the maximum theroretical offset and add this to the max value.
// If this doesn't overflow the algorithm should be fine
let theorethical_maximum_offset =
fastfield_accessor.max_value() - fastfield_accessor.min_value();
if fastfield_accessor
.max_value()
let theorethical_maximum_offset = stats.max_value - stats.min_value;
if stats
.max_value
.checked_add(theorethical_maximum_offset)
.is_none()
{
return None;
return false;
}
let first_val_in_first_block = fastfield_accessor.get_val(0);
let last_elem_in_first_chunk = CHUNK_SIZE.min(fastfield_accessor.num_vals());
let last_val_in_first_block =
fastfield_accessor.get_val(last_elem_in_first_chunk as u64 - 1);
true
}
/// estimation for linear interpolation is hard because, you don't know
/// where the local maxima are for the deviation of the calculated value and
/// the offset is also unknown.
fn estimate(vals: &[u64], stats: FastFieldStats) -> f32 {
// TODO simplify now that we have a vals array.
let first_val_in_first_block = vals[0];
let last_elem_in_first_chunk = CHUNK_SIZE.min(vals.len());
let last_val_in_first_block = vals[last_elem_in_first_chunk - 1];
let slope = get_slope(
first_val_in_first_block,
last_val_in_first_block,
fastfield_accessor.num_vals(),
stats.num_vals,
);
// let's sample at 0%, 5%, 10% .. 95%, 100%, but for the first block only
@@ -326,10 +320,11 @@ impl FastFieldCodec for BlockwiseLinearCodec {
let max_distance = sample_positions
.iter()
.copied()
.map(|pos| {
let calculated_value =
get_calculated_value(first_val_in_first_block, *pos as u64, slope);
let actual_value = fastfield_accessor.get_val(*pos as u64);
get_calculated_value(first_val_in_first_block, pos as u64, slope);
let actual_value = vals[pos];
distance(calculated_value, actual_value)
})
.max()
@@ -343,11 +338,11 @@ impl FastFieldCodec for BlockwiseLinearCodec {
//
let relative_max_value = (max_distance as f32 * 1.5) * 2.0;
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * fastfield_accessor.num_vals() as u64
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
// function metadata per block
+ 29 * (fastfield_accessor.num_vals() / CHUNK_SIZE);
let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
Some(num_bits as f32 / num_bits_uncompressed as f32)
+ 29 * (stats.num_vals / CHUNK_SIZE as u64);
let num_bits_uncompressed = 64 * stats.num_vals;
num_bits as f32 / num_bits_uncompressed as f32
}
}
@@ -362,35 +357,17 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
#[cfg(test)]
mod tests {
use super::*;
use crate::tests::get_codec_test_datasets;
use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) -> Option<(f32, f32)> {
crate::tests::create_and_validate::<BlockwiseLinearCodec>(data, name)
}
const HIGHEST_BIT: u64 = 1 << 63;
pub fn i64_to_u64(val: i64) -> u64 {
(val as u64) ^ HIGHEST_BIT
}
#[test]
fn test_compression_i64() {
let data = (i64::MAX - 600_000..=i64::MAX - 550_000)
.map(i64_to_u64)
.collect::<Vec<_>>();
let (estimate, actual_compression) =
create_and_validate(&data, "simple monotonically large i64").unwrap();
assert!(actual_compression < 0.2);
assert!(estimate < 0.20);
assert!(estimate > 0.15);
assert!(actual_compression > 0.01);
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
crate::tests::create_and_validate(&MultiLinearInterpolFastFieldCodec, data, name)
}
#[test]
fn test_compression() {
let data = (10..=6_000_u64).collect::<Vec<_>>();
let (estimate, actual_compression) =
create_and_validate(&data, "simple monotonically large").unwrap();
create_and_validate(&data, "simple monotonically large");
assert!(actual_compression < 0.2);
assert!(estimate < 0.20);
assert!(estimate > 0.15);
@@ -399,7 +376,7 @@ mod tests {
#[test]
fn test_with_codec_data_sets() {
let data_sets = get_codec_test_datasets();
let data_sets = get_codec_test_data_sets();
for (mut data, name) in data_sets {
create_and_validate(&data, name);
data.reverse();

View File

@@ -6,7 +6,7 @@ use std::{fmt, io, mem};
use stable_deref_trait::StableDeref;
/// An OwnedBytes simply wraps an object that owns a slice of data and exposes
/// this data as a slice.
/// this data as a static slice.
///
/// The backing object is required to be `StableDeref`.
#[derive(Clone)]

View File

@@ -1,5 +1,3 @@
#![allow(clippy::derive_partial_eq_without_eq)]
mod occur;
mod query_grammar;
mod user_input_ast;

View File

@@ -10,7 +10,7 @@ use super::metric::{AverageAggregation, StatsAggregation};
use super::segment_agg_result::BucketCount;
use super::VecWithNames;
use crate::fastfield::{
type_and_cardinality, DynamicFastFieldReader, FastType, MultiValuedFastFieldReader,
type_and_cardinality, FastFieldReaderImpl, FastType, MultiValuedFastFieldReader,
};
use crate::schema::{Cardinality, Type};
use crate::{InvertedIndexReader, SegmentReader, TantivyError};
@@ -37,10 +37,10 @@ impl AggregationsWithAccessor {
#[derive(Clone)]
pub(crate) enum FastFieldAccessor {
Multi(MultiValuedFastFieldReader<u64>),
Single(DynamicFastFieldReader<u64>),
Single(FastFieldReaderImpl<u64>),
}
impl FastFieldAccessor {
pub fn as_single(&self) -> Option<&DynamicFastFieldReader<u64>> {
pub fn as_single(&self) -> Option<&FastFieldReaderImpl<u64>> {
match self {
FastFieldAccessor::Multi(_) => None,
FastFieldAccessor::Single(reader) => Some(reader),
@@ -118,7 +118,7 @@ impl BucketAggregationWithAccessor {
pub struct MetricAggregationWithAccessor {
pub metric: MetricAggregation,
pub field_type: Type,
pub accessor: DynamicFastFieldReader<u64>,
pub accessor: FastFieldReaderImpl<u64>,
}
impl MetricAggregationWithAccessor {

View File

@@ -1,7 +1,6 @@
use std::cmp::Ordering;
use std::fmt::Display;
use fastfield_codecs::Column;
use itertools::Itertools;
use serde::{Deserialize, Serialize};
@@ -15,7 +14,7 @@ use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResults, IntermediateBucketResult, IntermediateHistogramBucketEntry,
};
use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector;
use crate::fastfield::DynamicFastFieldReader;
use crate::fastfield::{FastFieldReader, FastFieldReaderImpl};
use crate::schema::Type;
use crate::{DocId, TantivyError};
@@ -264,7 +263,7 @@ impl SegmentHistogramCollector {
req: &HistogramAggregation,
sub_aggregation: &AggregationsWithAccessor,
field_type: Type,
accessor: &DynamicFastFieldReader<u64>,
accessor: &FastFieldReaderImpl<u64>,
) -> crate::Result<Self> {
req.validate()?;
let min = f64_from_fastfield_u64(accessor.min_value(), &field_type);
@@ -332,10 +331,10 @@ impl SegmentHistogramCollector {
.expect("unexpected fast field cardinatility");
let mut iter = doc.chunks_exact(4);
for docs in iter.by_ref() {
let val0 = self.f64_from_fastfield_u64(accessor.get_val(docs[0] as u64));
let val1 = self.f64_from_fastfield_u64(accessor.get_val(docs[1] as u64));
let val2 = self.f64_from_fastfield_u64(accessor.get_val(docs[2] as u64));
let val3 = self.f64_from_fastfield_u64(accessor.get_val(docs[3] as u64));
let val0 = self.f64_from_fastfield_u64(accessor.get(docs[0]));
let val1 = self.f64_from_fastfield_u64(accessor.get(docs[1]));
let val2 = self.f64_from_fastfield_u64(accessor.get(docs[2]));
let val3 = self.f64_from_fastfield_u64(accessor.get(docs[3]));
let bucket_pos0 = get_bucket_num(val0);
let bucket_pos1 = get_bucket_num(val1);
@@ -371,8 +370,8 @@ impl SegmentHistogramCollector {
&bucket_with_accessor.sub_aggregation,
)?;
}
for &doc in iter.remainder() {
let val = f64_from_fastfield_u64(accessor.get_val(doc as u64), &self.field_type);
for doc in iter.remainder() {
let val = f64_from_fastfield_u64(accessor.get(*doc), &self.field_type);
if !bounds.contains(val) {
continue;
}
@@ -383,7 +382,7 @@ impl SegmentHistogramCollector {
self.buckets[bucket_pos].key,
get_bucket_val(val, self.interval, self.offset) as f64
);
self.increment_bucket(bucket_pos, doc, &bucket_with_accessor.sub_aggregation)?;
self.increment_bucket(bucket_pos, *doc, &bucket_with_accessor.sub_aggregation)?;
}
if force_flush {
if let Some(sub_aggregations) = self.sub_aggregations.as_mut() {

View File

@@ -1,7 +1,6 @@
use std::fmt::Debug;
use std::ops::Range;
use fastfield_codecs::Column;
use fnv::FnvHashMap;
use serde::{Deserialize, Serialize};
@@ -13,6 +12,7 @@ use crate::aggregation::intermediate_agg_result::{
};
use crate::aggregation::segment_agg_result::{BucketCount, SegmentAggregationResultsCollector};
use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64, Key, SerializedKey};
use crate::fastfield::FastFieldReader;
use crate::schema::Type;
use crate::{DocId, TantivyError};
@@ -264,10 +264,10 @@ impl SegmentRangeCollector {
.as_single()
.expect("unexpected fast field cardinatility");
for docs in iter.by_ref() {
let val1 = accessor.get_val(docs[0] as u64);
let val2 = accessor.get_val(docs[1] as u64);
let val3 = accessor.get_val(docs[2] as u64);
let val4 = accessor.get_val(docs[3] as u64);
let val1 = accessor.get(docs[0]);
let val2 = accessor.get(docs[1]);
let val3 = accessor.get(docs[2]);
let val4 = accessor.get(docs[3]);
let bucket_pos1 = self.get_bucket_pos(val1);
let bucket_pos2 = self.get_bucket_pos(val2);
let bucket_pos3 = self.get_bucket_pos(val3);
@@ -278,10 +278,10 @@ impl SegmentRangeCollector {
self.increment_bucket(bucket_pos3, docs[2], &bucket_with_accessor.sub_aggregation)?;
self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation)?;
}
for &doc in iter.remainder() {
let val = accessor.get_val(doc as u64);
for doc in iter.remainder() {
let val = accessor.get(*doc);
let bucket_pos = self.get_bucket_pos(val);
self.increment_bucket(bucket_pos, doc, &bucket_with_accessor.sub_aggregation)?;
self.increment_bucket(bucket_pos, *doc, &bucket_with_accessor.sub_aggregation)?;
}
if force_flush {
for bucket in &mut self.buckets {

View File

@@ -1,10 +1,9 @@
use std::fmt::Debug;
use fastfield_codecs::Column;
use serde::{Deserialize, Serialize};
use crate::aggregation::f64_from_fastfield_u64;
use crate::fastfield::DynamicFastFieldReader;
use crate::fastfield::{FastFieldReader, FastFieldReaderImpl};
use crate::schema::Type;
use crate::DocId;
@@ -44,7 +43,7 @@ pub(crate) struct SegmentAverageCollector {
}
impl Debug for SegmentAverageCollector {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
f.debug_struct("AverageCollector")
.field("data", &self.data)
.finish()
@@ -58,13 +57,13 @@ impl SegmentAverageCollector {
data: Default::default(),
}
}
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader<u64>) {
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &FastFieldReaderImpl<u64>) {
let mut iter = doc.chunks_exact(4);
for docs in iter.by_ref() {
let val1 = field.get_val(docs[0] as u64);
let val2 = field.get_val(docs[1] as u64);
let val3 = field.get_val(docs[2] as u64);
let val4 = field.get_val(docs[3] as u64);
let val1 = field.get(docs[0]);
let val2 = field.get(docs[1]);
let val3 = field.get(docs[2]);
let val4 = field.get(docs[3]);
let val1 = f64_from_fastfield_u64(val1, &self.field_type);
let val2 = f64_from_fastfield_u64(val2, &self.field_type);
let val3 = f64_from_fastfield_u64(val3, &self.field_type);
@@ -74,8 +73,8 @@ impl SegmentAverageCollector {
self.data.collect(val3);
self.data.collect(val4);
}
for &doc in iter.remainder() {
let val = field.get_val(doc as u64);
for doc in iter.remainder() {
let val = field.get(*doc);
let val = f64_from_fastfield_u64(val, &self.field_type);
self.data.collect(val);
}

View File

@@ -1,8 +1,7 @@
use fastfield_codecs::Column;
use serde::{Deserialize, Serialize};
use crate::aggregation::f64_from_fastfield_u64;
use crate::fastfield::DynamicFastFieldReader;
use crate::fastfield::{FastFieldReader, FastFieldReaderImpl};
use crate::schema::Type;
use crate::{DocId, TantivyError};
@@ -164,13 +163,13 @@ impl SegmentStatsCollector {
stats: IntermediateStats::default(),
}
}
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader<u64>) {
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &FastFieldReaderImpl<u64>) {
let mut iter = doc.chunks_exact(4);
for docs in iter.by_ref() {
let val1 = field.get_val(docs[0] as u64);
let val2 = field.get_val(docs[1] as u64);
let val3 = field.get_val(docs[2] as u64);
let val4 = field.get_val(docs[3] as u64);
let val1 = field.get(docs[0]);
let val2 = field.get(docs[1]);
let val3 = field.get(docs[2]);
let val4 = field.get(docs[3]);
let val1 = f64_from_fastfield_u64(val1, &self.field_type);
let val2 = f64_from_fastfield_u64(val2, &self.field_type);
let val3 = f64_from_fastfield_u64(val3, &self.field_type);
@@ -180,8 +179,8 @@ impl SegmentStatsCollector {
self.stats.collect(val3);
self.stats.collect(val4);
}
for &doc in iter.remainder() {
let val = field.get_val(doc as u64);
for doc in iter.remainder() {
let val = field.get(*doc);
let val = f64_from_fastfield_u64(val, &self.field_type);
self.stats.collect(val);
}

View File

@@ -11,10 +11,8 @@
// Importing tantivy...
use std::marker::PhantomData;
use fastfield_codecs::Column;
use crate::collector::{Collector, SegmentCollector};
use crate::fastfield::{DynamicFastFieldReader, FastValue};
use crate::fastfield::{FastFieldReader, FastFieldReaderImpl, FastValue};
use crate::schema::Field;
use crate::{Score, SegmentReader, TantivyError};
@@ -160,7 +158,7 @@ where
TPredicate: 'static,
TPredicateValue: FastValue,
{
fast_field_reader: DynamicFastFieldReader<TPredicateValue>,
fast_field_reader: FastFieldReaderImpl<TPredicateValue>,
segment_collector: TSegmentCollector,
predicate: TPredicate,
t_predicate_value: PhantomData<TPredicateValue>,
@@ -176,7 +174,7 @@ where
type Fruit = TSegmentCollector::Fruit;
fn collect(&mut self, doc: u32, score: Score) {
let value = self.fast_field_reader.get_val(doc as u64);
let value = self.fast_field_reader.get(doc);
if (self.predicate)(value) {
self.segment_collector.collect(doc, score)
}

View File

@@ -1,8 +1,7 @@
use fastdivide::DividerU64;
use fastfield_codecs::Column;
use crate::collector::{Collector, SegmentCollector};
use crate::fastfield::{DynamicFastFieldReader, FastValue};
use crate::fastfield::{FastFieldReader, FastFieldReaderImpl, FastValue};
use crate::schema::{Field, Type};
use crate::{DocId, Score};
@@ -85,14 +84,14 @@ impl HistogramComputer {
}
pub struct SegmentHistogramCollector {
histogram_computer: HistogramComputer,
ff_reader: DynamicFastFieldReader<u64>,
ff_reader: FastFieldReaderImpl<u64>,
}
impl SegmentCollector for SegmentHistogramCollector {
type Fruit = Vec<u64>;
fn collect(&mut self, doc: DocId, _score: Score) {
let value = self.ff_reader.get_val(doc as u64);
let value = self.ff_reader.get(doc);
self.histogram_computer.add_value(value);
}

View File

@@ -1,9 +1,7 @@
use fastfield_codecs::Column;
use super::*;
use crate::collector::{Count, FilterCollector, TopDocs};
use crate::core::SegmentReader;
use crate::fastfield::{BytesFastFieldReader, DynamicFastFieldReader};
use crate::fastfield::{BytesFastFieldReader, FastFieldReader, FastFieldReaderImpl};
use crate::query::{AllQuery, QueryParser};
use crate::schema::{Field, Schema, FAST, TEXT};
use crate::time::format_description::well_known::Rfc3339;
@@ -158,7 +156,7 @@ pub struct FastFieldTestCollector {
pub struct FastFieldSegmentCollector {
vals: Vec<u64>,
reader: DynamicFastFieldReader<u64>,
reader: FastFieldReaderImpl<u64>,
}
impl FastFieldTestCollector {
@@ -199,7 +197,7 @@ impl SegmentCollector for FastFieldSegmentCollector {
type Fruit = Vec<u64>;
fn collect(&mut self, doc: DocId, _score: Score) {
let val = self.reader.get_val(doc as u64);
let val = self.reader.get(doc);
self.vals.push(val);
}

View File

@@ -2,8 +2,6 @@ use std::collections::BinaryHeap;
use std::fmt;
use std::marker::PhantomData;
use fastfield_codecs::Column;
use super::Collector;
use crate::collector::custom_score_top_collector::CustomScoreTopCollector;
use crate::collector::top_collector::{ComparableDoc, TopCollector, TopSegmentCollector};
@@ -11,7 +9,7 @@ use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
use crate::collector::{
CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
};
use crate::fastfield::{DynamicFastFieldReader, FastValue};
use crate::fastfield::{FastFieldReader, FastFieldReaderImpl, FastValue};
use crate::query::Weight;
use crate::schema::Field;
use crate::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};
@@ -131,12 +129,12 @@ impl fmt::Debug for TopDocs {
}
struct ScorerByFastFieldReader {
ff_reader: DynamicFastFieldReader<u64>,
ff_reader: FastFieldReaderImpl<u64>,
}
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
fn score(&mut self, doc: DocId) -> u64 {
self.ff_reader.get_val(doc as u64)
self.ff_reader.get(doc)
}
}
@@ -409,7 +407,7 @@ impl TopDocs {
/// # use tantivy::query::QueryParser;
/// use tantivy::SegmentReader;
/// use tantivy::collector::TopDocs;
/// use tantivy::fastfield::Column;
/// use tantivy::fastfield::FastFieldReader;
/// use tantivy::schema::Field;
///
/// fn create_schema() -> Schema {
@@ -458,7 +456,7 @@ impl TopDocs {
///
/// // We can now define our actual scoring function
/// move |doc: DocId, original_score: Score| {
/// let popularity: u64 = popularity_reader.get_val(doc as u64);
/// let popularity: u64 = popularity_reader.get(doc);
/// // Well.. For the sake of the example we use a simple logarithm
/// // function.
/// let popularity_boost_score = ((2u64 + popularity) as Score).log2();
@@ -517,7 +515,7 @@ impl TopDocs {
/// use tantivy::SegmentReader;
/// use tantivy::collector::TopDocs;
/// use tantivy::schema::Field;
/// use fastfield_codecs::Column;
/// use tantivy::fastfield::FastFieldReader;
///
/// # fn create_schema() -> Schema {
/// # let mut schema_builder = Schema::builder();
@@ -569,8 +567,8 @@ impl TopDocs {
///
/// // We can now define our actual scoring function
/// move |doc: DocId| {
/// let popularity: u64 = popularity_reader.get_val(doc as u64);
/// let boosted: u64 = boosted_reader.get_val(doc as u64);
/// let popularity: u64 = popularity_reader.get(doc);
/// let boosted: u64 = boosted_reader.get(doc);
/// // Score do not have to be `f64` in tantivy.
/// // Here we return a couple to get lexicographical order
/// // for free.

View File

@@ -7,7 +7,6 @@ use std::sync::Arc;
use super::segment::Segment;
use super::IndexSettings;
use crate::core::single_segment_index_writer::SingleSegmentIndexWriter;
use crate::core::{
Executor, IndexMeta, SegmentId, SegmentMeta, SegmentMetaInventory, META_FILEPATH,
};
@@ -17,7 +16,7 @@ use crate::directory::MmapDirectory;
use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_LOCK};
use crate::error::{DataCorruption, TantivyError};
use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_ARENA_NUM_BYTES_MIN};
use crate::indexer::segment_updater::save_metas;
use crate::indexer::segment_updater::save_new_metas;
use crate::reader::{IndexReader, IndexReaderBuilder};
use crate::schema::{Field, FieldType, Schema};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
@@ -48,34 +47,6 @@ fn load_metas(
.map_err(From::from)
}
/// Save the index meta file.
/// This operation is atomic :
/// Either
/// - it fails, in which case an error is returned,
/// and the `meta.json` remains untouched,
/// - it succeeds, and `meta.json` is written
/// and flushed.
///
/// This method is not part of tantivy's public API
fn save_new_metas(
schema: Schema,
index_settings: IndexSettings,
directory: &dyn Directory,
) -> crate::Result<()> {
save_metas(
&IndexMeta {
index_settings,
segments: Vec::new(),
schema,
opstamp: 0u64,
payload: None,
},
directory,
)?;
directory.sync_directory()?;
Ok(())
}
/// IndexBuilder can be used to create an index.
///
/// Use in conjunction with `SchemaBuilder`. Global index settings
@@ -164,25 +135,6 @@ impl IndexBuilder {
self.create(mmap_directory)
}
/// Dragons ahead!!!
///
/// The point of this API is to let users create a simple index with a single segment
/// and without starting any thread.
///
/// Do not use this method if you are not sure what you are doing.
///
/// It expects an originally empty directory, and will not run any GC operation.
#[doc(hidden)]
pub fn single_segment_index_writer(
self,
dir: impl Into<Box<dyn Directory>>,
mem_budget: usize,
) -> crate::Result<SingleSegmentIndexWriter> {
let index = self.create(dir)?;
let index_simple_writer = SingleSegmentIndexWriter::new(index, mem_budget)?;
Ok(index_simple_writer)
}
/// Creates a new index in a temp directory.
///
/// The index will use the `MMapDirectory` in a newly created directory.
@@ -628,12 +580,10 @@ impl fmt::Debug for Index {
#[cfg(test)]
mod tests {
use crate::collector::Count;
use crate::directory::{RamDirectory, WatchCallback};
use crate::query::TermQuery;
use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, TEXT};
use crate::schema::{Field, Schema, INDEXED, TEXT};
use crate::tokenizer::TokenizerManager;
use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy, Term};
use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy};
#[test]
fn test_indexer_for_field() {
@@ -899,28 +849,4 @@ mod tests {
);
Ok(())
}
#[test]
fn test_single_segment_index_writer() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let directory = RamDirectory::default();
let mut single_segment_index_writer = Index::builder()
.schema(schema)
.single_segment_index_writer(directory, 10_000_000)?;
for _ in 0..10 {
let doc = doc!(text_field=>"hello");
single_segment_index_writer.add_document(doc)?;
}
let index = single_segment_index_writer.finalize()?;
let searcher = index.reader()?.searcher();
let term_query = TermQuery::new(
Term::from_field_text(text_field, "hello"),
IndexRecordOption::Basic,
);
let count = searcher.search(&term_query, &Count)?;
assert_eq!(count, 10);
Ok(())
}
}

View File

@@ -7,7 +7,6 @@ mod segment;
mod segment_component;
mod segment_id;
mod segment_reader;
mod single_segment_index_writer;
use std::path::Path;
@@ -24,7 +23,6 @@ pub use self::segment::Segment;
pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId;
pub use self::segment_reader::SegmentReader;
pub use self::single_segment_index_writer::SingleSegmentIndexWriter;
/// The meta file contains all the information about the list of segments and the schema
/// of the index.

View File

@@ -247,14 +247,6 @@ impl SearcherInner {
generation: TrackedObject<SearcherGeneration>,
doc_store_cache_size: usize,
) -> io::Result<SearcherInner> {
assert_eq!(
&segment_readers
.iter()
.map(|reader| (reader.segment_id(), reader.delete_opstamp()))
.collect::<BTreeMap<_, _>>(),
generation.segments(),
"Set of segments referenced by this Searcher and its SearcherGeneration must match"
);
let store_readers: Vec<StoreReader> = segment_readers
.iter()
.map(|segment_reader| segment_reader.get_store_reader(doc_store_cache_size))

View File

@@ -1,47 +0,0 @@
use crate::indexer::operation::AddOperation;
use crate::indexer::segment_updater::save_metas;
use crate::indexer::SegmentWriter;
use crate::{Directory, Document, Index, IndexMeta, Opstamp, Segment};
#[doc(hidden)]
pub struct SingleSegmentIndexWriter {
segment_writer: SegmentWriter,
segment: Segment,
opstamp: Opstamp,
}
impl SingleSegmentIndexWriter {
pub fn new(index: Index, mem_budget: usize) -> crate::Result<Self> {
let segment = index.new_segment();
let segment_writer = SegmentWriter::for_segment(mem_budget, segment.clone())?;
Ok(Self {
segment_writer,
segment,
opstamp: 0,
})
}
pub fn add_document(&mut self, document: Document) -> crate::Result<()> {
let opstamp = self.opstamp;
self.opstamp += 1;
self.segment_writer
.add_document(AddOperation { opstamp, document })
}
pub fn finalize(self) -> crate::Result<Index> {
let max_doc = self.segment_writer.max_doc();
self.segment_writer.finalize()?;
let segment: Segment = self.segment.with_max_doc(max_doc);
let index = segment.index();
let index_meta = IndexMeta {
index_settings: index.settings().clone(),
segments: vec![segment.meta().clone()],
schema: index.schema(),
opstamp: 0,
payload: None,
};
save_metas(&index_meta, index.directory())?;
index.directory().sync_directory()?;
Ok(segment.index().clone())
}
}

View File

@@ -38,7 +38,7 @@ impl BinarySerializable for FileAddr {
/// A `CompositeWrite` is used to write a `CompositeFile`.
pub struct CompositeWrite<W = WritePtr> {
write: CountingWriter<W>,
offsets: Vec<(FileAddr, u64)>,
offsets: HashMap<FileAddr, u64>,
}
impl<W: TerminatingWrite + Write> CompositeWrite<W> {
@@ -47,7 +47,7 @@ impl<W: TerminatingWrite + Write> CompositeWrite<W> {
pub fn wrap(w: W) -> CompositeWrite<W> {
CompositeWrite {
write: CountingWriter::wrap(w),
offsets: Vec::new(),
offsets: HashMap::new(),
}
}
@@ -60,8 +60,8 @@ impl<W: TerminatingWrite + Write> CompositeWrite<W> {
pub fn for_field_with_idx(&mut self, field: Field, idx: usize) -> &mut CountingWriter<W> {
let offset = self.write.written_bytes();
let file_addr = FileAddr::new(field, idx);
assert!(!self.offsets.iter().any(|el| el.0 == file_addr));
self.offsets.push((file_addr, offset));
assert!(!self.offsets.contains_key(&file_addr));
self.offsets.insert(file_addr, offset);
&mut self.write
}
@@ -73,8 +73,16 @@ impl<W: TerminatingWrite + Write> CompositeWrite<W> {
let footer_offset = self.write.written_bytes();
VInt(self.offsets.len() as u64).serialize(&mut self.write)?;
let mut offset_fields: Vec<_> = self
.offsets
.iter()
.map(|(file_addr, offset)| (*offset, *file_addr))
.collect();
offset_fields.sort();
let mut prev_offset = 0;
for (file_addr, offset) in self.offsets {
for (offset, file_addr) in offset_fields {
VInt((offset - prev_offset) as u64).serialize(&mut self.write)?;
file_addr.serialize(&mut self.write)?;
prev_offset = offset;
@@ -98,14 +106,6 @@ pub struct CompositeFile {
offsets_index: HashMap<FileAddr, Range<usize>>,
}
impl std::fmt::Debug for CompositeFile {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("CompositeFile")
.field("offsets_index", &self.offsets_index)
.finish()
}
}
impl CompositeFile {
/// Opens a composite file stored in a given
/// `FileSlice`.
@@ -233,56 +233,4 @@ mod test {
}
Ok(())
}
#[test]
fn test_composite_file_bug() -> crate::Result<()> {
let path = Path::new("test_path");
let directory = RamDirectory::create();
{
let w = directory.open_write(path).unwrap();
let mut composite_write = CompositeWrite::wrap(w);
let mut write = composite_write.for_field_with_idx(Field::from_field_id(1u32), 0);
VInt(32431123u64).serialize(&mut write)?;
write.flush()?;
let write = composite_write.for_field_with_idx(Field::from_field_id(1u32), 1);
write.flush()?;
let mut write = composite_write.for_field_with_idx(Field::from_field_id(0u32), 0);
VInt(1_000_000).serialize(&mut write)?;
write.flush()?;
composite_write.close()?;
}
{
let r = directory.open_read(path)?;
let composite_file = CompositeFile::open(&r)?;
{
let file = composite_file
.open_read_with_idx(Field::from_field_id(1u32), 0)
.unwrap()
.read_bytes()?;
let mut file0_buf = file.as_slice();
let payload_0 = VInt::deserialize(&mut file0_buf)?.0;
assert_eq!(file0_buf.len(), 0);
assert_eq!(payload_0, 32431123u64);
}
{
let file = composite_file
.open_read_with_idx(Field::from_field_id(1u32), 1)
.unwrap()
.read_bytes()?;
let file = file.as_slice();
assert_eq!(file.len(), 0);
}
{
let file = composite_file
.open_read_with_idx(Field::from_field_id(0u32), 0)
.unwrap()
.read_bytes()?;
let file = file.as_slice();
assert_eq!(file.len(), 3);
}
}
Ok(())
}
}

View File

@@ -1,7 +1,5 @@
use fastfield_codecs::Column;
use crate::directory::{FileSlice, OwnedBytes};
use crate::fastfield::{DynamicFastFieldReader, MultiValueLength};
use crate::fastfield::{FastFieldReader, FastFieldReaderImpl, MultiValueLength};
use crate::DocId;
/// Reader for byte array fast fields
@@ -16,13 +14,13 @@ use crate::DocId;
/// and the start index for the next document, and keeping the bytes in between.
#[derive(Clone)]
pub struct BytesFastFieldReader {
idx_reader: DynamicFastFieldReader<u64>,
idx_reader: FastFieldReaderImpl<u64>,
values: OwnedBytes,
}
impl BytesFastFieldReader {
pub(crate) fn open(
idx_reader: DynamicFastFieldReader<u64>,
idx_reader: FastFieldReaderImpl<u64>,
values_file: FileSlice,
) -> crate::Result<BytesFastFieldReader> {
let values = values_file.read_bytes()?;
@@ -30,9 +28,8 @@ impl BytesFastFieldReader {
}
fn range(&self, doc: DocId) -> (usize, usize) {
let idx = doc as u64;
let start = self.idx_reader.get_val(idx) as usize;
let stop = self.idx_reader.get_val(idx + 1) as usize;
let start = self.idx_reader.get(doc) as usize;
let stop = self.idx_reader.get(doc + 1) as usize;
(start, stop)
}

View File

@@ -1,361 +0,0 @@
use std::io::{self, Write};
use std::num::NonZeroU64;
use common::BinarySerializable;
use fastdivide::DividerU64;
use fastfield_codecs::{Column, FastFieldCodec};
use ownedbytes::OwnedBytes;
pub const GCD_DEFAULT: u64 = 1;
/// Wrapper for accessing a fastfield.
///
/// Holds the data and the codec to the read the data.
#[derive(Clone)]
pub struct GCDReader<CodecReader: Column> {
gcd_params: GCDParams,
reader: CodecReader,
}
#[derive(Debug, Clone, Copy)]
struct GCDParams {
gcd: u64,
min_value: u64,
num_vals: u64,
}
impl GCDParams {
pub fn eval(&self, val: u64) -> u64 {
self.min_value + self.gcd * val
}
}
impl BinarySerializable for GCDParams {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
self.gcd.serialize(writer)?;
self.min_value.serialize(writer)?;
self.num_vals.serialize(writer)?;
Ok(())
}
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let gcd: u64 = u64::deserialize(reader)?;
let min_value: u64 = u64::deserialize(reader)?;
let num_vals: u64 = u64::deserialize(reader)?;
Ok(Self {
gcd,
min_value,
num_vals,
})
}
}
pub fn open_gcd_from_bytes<WrappedCodec: FastFieldCodec>(
bytes: OwnedBytes,
) -> io::Result<GCDReader<WrappedCodec::Reader>> {
let footer_offset = bytes.len() - 24;
let (body, mut footer) = bytes.split(footer_offset);
let gcd_params = GCDParams::deserialize(&mut footer)?;
let reader: WrappedCodec::Reader = WrappedCodec::open_from_bytes(body)?;
Ok(GCDReader { gcd_params, reader })
}
impl<C: Column + Clone> Column for GCDReader<C> {
#[inline]
fn get_val(&self, doc: u64) -> u64 {
let val = self.reader.get_val(doc);
self.gcd_params.eval(val)
}
fn min_value(&self) -> u64 {
self.gcd_params.eval(self.reader.min_value())
}
fn max_value(&self) -> u64 {
self.gcd_params.eval(self.reader.max_value())
}
fn num_vals(&self) -> u64 {
self.gcd_params.num_vals
}
}
pub fn write_gcd_header<W: Write>(
field_write: &mut W,
min_value: u64,
gcd: u64,
num_vals: u64,
) -> io::Result<()> {
gcd.serialize(field_write)?;
min_value.serialize(field_write)?;
num_vals.serialize(field_write)?;
Ok(())
}
/// Compute the gcd of two non null numbers.
///
/// It is recommended, but not required, to feed values such that `large >= small`.
fn compute_gcd(mut large: NonZeroU64, mut small: NonZeroU64) -> NonZeroU64 {
loop {
let rem: u64 = large.get() % small;
if let Some(new_small) = NonZeroU64::new(rem) {
(large, small) = (small, new_small);
} else {
return small;
}
}
}
// Find GCD for iterator of numbers
pub fn find_gcd(numbers: impl Iterator<Item = u64>) -> Option<NonZeroU64> {
let mut numbers = numbers.flat_map(NonZeroU64::new);
let mut gcd: NonZeroU64 = numbers.next()?;
if gcd.get() == 1 {
return Some(gcd);
}
let mut gcd_divider = DividerU64::divide_by(gcd.get());
for val in numbers {
let remainder = val.get() - (gcd_divider.divide(val.get())) * gcd.get();
if remainder == 0 {
continue;
}
gcd = compute_gcd(val, gcd);
if gcd.get() == 1 {
return Some(gcd);
}
gcd_divider = DividerU64::divide_by(gcd.get());
}
Some(gcd)
}
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use std::num::NonZeroU64;
use std::path::Path;
use std::time::{Duration, SystemTime};
use common::HasLen;
use fastfield_codecs::Column;
use crate::directory::{CompositeFile, RamDirectory, WritePtr};
use crate::fastfield::gcd::compute_gcd;
use crate::fastfield::serializer::FastFieldCodecEnableCheck;
use crate::fastfield::tests::{FIELD, FIELDI64, SCHEMA, SCHEMAI64};
use crate::fastfield::{
find_gcd, CompositeFastFieldSerializer, DynamicFastFieldReader, FastFieldCodecType,
FastFieldsWriter, ALL_CODECS,
};
use crate::schema::{Cardinality, Schema};
use crate::{DateOptions, DatePrecision, DateTime, Directory};
fn get_index(
docs: &[crate::Document],
schema: &Schema,
codec_enable_checker: FastFieldCodecEnableCheck,
) -> crate::Result<RamDirectory> {
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer =
CompositeFastFieldSerializer::from_write_with_codec(write, codec_enable_checker)
.unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(schema);
for doc in docs {
fast_field_writers.add_document(doc);
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}
Ok(directory)
}
fn test_fastfield_gcd_i64_with_codec(
code_type: FastFieldCodecType,
num_vals: usize,
) -> crate::Result<()> {
let path = Path::new("test");
let mut docs = vec![];
for i in 1..=num_vals {
let val = (i as i64 - 5) * 1000i64;
docs.push(doc!(*FIELDI64=>val));
}
let directory = get_index(&docs, &SCHEMAI64, code_type.into())?;
let file = directory.open_read(path).unwrap();
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<i64>::open(file)?;
assert_eq!(fast_field_reader.get_val(0), -4000i64);
assert_eq!(fast_field_reader.get_val(1), -3000i64);
assert_eq!(fast_field_reader.get_val(2), -2000i64);
assert_eq!(fast_field_reader.max_value(), (num_vals as i64 - 5) * 1000);
assert_eq!(fast_field_reader.min_value(), -4000i64);
let file = directory.open_read(path).unwrap();
// Can't apply gcd
let path = Path::new("test");
docs.pop();
docs.push(doc!(*FIELDI64=>2001i64));
let directory = get_index(&docs, &SCHEMAI64, code_type.into())?;
let file2 = directory.open_read(path).unwrap();
assert!(file2.len() > file.len());
Ok(())
}
#[test]
fn test_fastfield_gcd_i64() -> crate::Result<()> {
for &code_type in ALL_CODECS {
test_fastfield_gcd_i64_with_codec(code_type, 5500)?;
}
Ok(())
}
fn test_fastfield_gcd_u64_with_codec(
code_type: FastFieldCodecType,
num_vals: usize,
) -> crate::Result<()> {
let path = Path::new("test");
let mut docs = vec![];
for i in 1..=num_vals {
let val = i as u64 * 1000u64;
docs.push(doc!(*FIELD=>val));
}
let directory = get_index(&docs, &SCHEMA, code_type.into())?;
let file = directory.open_read(path).unwrap();
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
assert_eq!(fast_field_reader.get_val(0), 1000u64);
assert_eq!(fast_field_reader.get_val(1), 2000u64);
assert_eq!(fast_field_reader.get_val(2), 3000u64);
assert_eq!(fast_field_reader.max_value(), num_vals as u64 * 1000);
assert_eq!(fast_field_reader.min_value(), 1000u64);
let file = directory.open_read(path).unwrap();
// Can't apply gcd
let path = Path::new("test");
docs.pop();
docs.push(doc!(*FIELDI64=>2001u64));
let directory = get_index(&docs, &SCHEMA, code_type.into())?;
let file2 = directory.open_read(path).unwrap();
assert!(file2.len() > file.len());
Ok(())
}
#[test]
fn test_fastfield_gcd_u64() -> crate::Result<()> {
for &code_type in ALL_CODECS {
test_fastfield_gcd_u64_with_codec(code_type, 5500)?;
}
Ok(())
}
#[test]
pub fn test_fastfield2() {
let test_fastfield = DynamicFastFieldReader::<u64>::from(vec![100, 200, 300]);
assert_eq!(test_fastfield.get_val(0), 100);
assert_eq!(test_fastfield.get_val(1), 200);
assert_eq!(test_fastfield.get_val(2), 300);
}
#[test]
pub fn test_gcd_date() -> crate::Result<()> {
let size_prec_sec =
test_gcd_date_with_codec(FastFieldCodecType::Bitpacked, DatePrecision::Seconds)?;
let size_prec_micro =
test_gcd_date_with_codec(FastFieldCodecType::Bitpacked, DatePrecision::Microseconds)?;
assert!(size_prec_sec < size_prec_micro);
let size_prec_sec =
test_gcd_date_with_codec(FastFieldCodecType::Linear, DatePrecision::Seconds)?;
let size_prec_micro =
test_gcd_date_with_codec(FastFieldCodecType::Linear, DatePrecision::Microseconds)?;
assert!(size_prec_sec < size_prec_micro);
Ok(())
}
fn test_gcd_date_with_codec(
codec_type: FastFieldCodecType,
precision: DatePrecision,
) -> crate::Result<usize> {
let time1 = DateTime::from_timestamp_micros(
SystemTime::now()
.duration_since(SystemTime::UNIX_EPOCH)
.unwrap()
.as_secs() as i64,
);
let time2 = DateTime::from_timestamp_micros(
SystemTime::now()
.checked_sub(Duration::from_micros(4111))
.unwrap()
.duration_since(SystemTime::UNIX_EPOCH)
.unwrap()
.as_secs() as i64,
);
let time3 = DateTime::from_timestamp_micros(
SystemTime::now()
.checked_sub(Duration::from_millis(2000))
.unwrap()
.duration_since(SystemTime::UNIX_EPOCH)
.unwrap()
.as_secs() as i64,
);
let mut schema_builder = Schema::builder();
let date_options = DateOptions::default()
.set_fast(Cardinality::SingleValue)
.set_precision(precision);
let field = schema_builder.add_date_field("field", date_options);
let schema = schema_builder.build();
let docs = vec![doc!(field=>time1), doc!(field=>time2), doc!(field=>time3)];
let directory = get_index(&docs, &schema, codec_type.into())?;
let path = Path::new("test");
let file = directory.open_read(path).unwrap();
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap();
let len = file.len();
let test_fastfield = DynamicFastFieldReader::<DateTime>::open(file)?;
assert_eq!(test_fastfield.get_val(0), time1.truncate(precision));
assert_eq!(test_fastfield.get_val(1), time2.truncate(precision));
assert_eq!(test_fastfield.get_val(2), time3.truncate(precision));
Ok(len)
}
#[test]
fn test_compute_gcd() {
let test_compute_gcd_aux = |large, small, expected| {
let large = NonZeroU64::new(large).unwrap();
let small = NonZeroU64::new(small).unwrap();
let expected = NonZeroU64::new(expected).unwrap();
assert_eq!(compute_gcd(small, large), expected);
assert_eq!(compute_gcd(large, small), expected);
};
test_compute_gcd_aux(1, 4, 1);
test_compute_gcd_aux(2, 4, 2);
test_compute_gcd_aux(10, 25, 5);
test_compute_gcd_aux(25, 25, 25);
}
#[test]
fn find_gcd_test() {
assert_eq!(find_gcd([0].into_iter()), None);
assert_eq!(find_gcd([0, 10].into_iter()), NonZeroU64::new(10));
assert_eq!(find_gcd([10, 0].into_iter()), NonZeroU64::new(10));
assert_eq!(find_gcd([].into_iter()), None);
assert_eq!(find_gcd([15, 30, 5, 10].into_iter()), NonZeroU64::new(5));
assert_eq!(find_gcd([15, 16, 10].into_iter()), NonZeroU64::new(1));
assert_eq!(find_gcd([0, 5, 5, 5].into_iter()), NonZeroU64::new(5));
assert_eq!(find_gcd([0, 0].into_iter()), None);
}
}

View File

@@ -20,18 +20,18 @@
//!
//! Read access performance is comparable to that of an array lookup.
use fastfield_codecs::FastFieldCodecType;
use fastfield_codecs::dynamic::DynamicFastFieldCodec;
pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveBitSet};
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader;
pub(crate) use self::gcd::{find_gcd, GCDReader, GCD_DEFAULT};
pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
pub use self::reader::DynamicFastFieldReader;
pub use self::reader::FastFieldReader;
pub use self::readers::FastFieldReaders;
pub(crate) use self::readers::{type_and_cardinality, FastType};
pub use self::serializer::{Column, CompositeFastFieldSerializer, FastFieldStats};
pub use self::serializer::{CompositeFastFieldSerializer, FastFieldStats};
pub use self::wrapper::FastFieldReaderWrapper;
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
use crate::schema::{Cardinality, FieldType, Type, Value};
use crate::{DateTime, DocId};
@@ -40,19 +40,13 @@ mod alive_bitset;
mod bytes;
mod error;
mod facet_reader;
mod gcd;
mod multivalued;
mod reader;
mod readers;
mod serializer;
mod wrapper;
mod writer;
pub(crate) const ALL_CODECS: &[FastFieldCodecType; 3] = &[
FastFieldCodecType::Bitpacked,
FastFieldCodecType::Linear,
FastFieldCodecType::BlockwiseLinear,
];
/// Trait for `BytesFastFieldReader` and `MultiValuedFastFieldReader` to return the length of data
/// for a doc_id
pub trait MultiValueLength {
@@ -122,6 +116,9 @@ impl FastValue for u64 {
}
}
// TODO rename
pub type FastFieldReaderImpl<V> = FastFieldReaderWrapper<V, DynamicFastFieldCodec>;
impl FastValue for i64 {
fn from_u64(val: u64) -> Self {
common::u64_to_i64(val)
@@ -286,21 +283,14 @@ mod tests {
schema_builder.build()
});
pub static SCHEMAI64: Lazy<Schema> = Lazy::new(|| {
let mut schema_builder = Schema::builder();
schema_builder.add_i64_field("field", FAST);
schema_builder.build()
});
pub static FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("field").unwrap());
pub static FIELDI64: Lazy<Field> = Lazy::new(|| SCHEMAI64.get_field("field").unwrap());
#[test]
pub fn test_fastfield() {
let test_fastfield = DynamicFastFieldReader::<u64>::from(vec![100, 200, 300]);
assert_eq!(test_fastfield.get_val(0u64), 100);
assert_eq!(test_fastfield.get_val(1u64), 200);
assert_eq!(test_fastfield.get_val(2u64), 300);
let test_fastfield = FastFieldReaderImpl::<u64>::from(&[100, 200, 300]);
assert_eq!(test_fastfield.get(0), 100);
assert_eq!(test_fastfield.get(1), 200);
assert_eq!(test_fastfield.get(2), 300);
}
#[test]
@@ -326,13 +316,13 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 45);
assert_eq!(file.len(), 37);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
assert_eq!(fast_field_reader.get_val(0), 13u64);
assert_eq!(fast_field_reader.get_val(1), 14u64);
assert_eq!(fast_field_reader.get_val(2), 2u64);
let fast_field_reader = FastFieldReaderImpl::<u64>::open(file)?;
assert_eq!(fast_field_reader.get(0), 13u64);
assert_eq!(fast_field_reader.get(1), 14u64);
assert_eq!(fast_field_reader.get(2), 2u64);
Ok(())
}
@@ -357,20 +347,20 @@ mod tests {
serializer.close()?;
}
let file = directory.open_read(path)?;
assert_eq!(file.len(), 70);
assert_eq!(file.len(), 62);
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
assert_eq!(fast_field_reader.get_val(0), 4u64);
assert_eq!(fast_field_reader.get_val(1), 14_082_001u64);
assert_eq!(fast_field_reader.get_val(2), 3_052u64);
assert_eq!(fast_field_reader.get_val(3), 9002u64);
assert_eq!(fast_field_reader.get_val(4), 15_001u64);
assert_eq!(fast_field_reader.get_val(5), 777u64);
assert_eq!(fast_field_reader.get_val(6), 1_002u64);
assert_eq!(fast_field_reader.get_val(7), 1_501u64);
assert_eq!(fast_field_reader.get_val(8), 215u64);
let fast_field_reader = FastFieldReaderImpl::<u64>::open(data)?;
assert_eq!(fast_field_reader.get(0), 4u64);
assert_eq!(fast_field_reader.get(1), 14_082_001u64);
assert_eq!(fast_field_reader.get(2), 3_052u64);
assert_eq!(fast_field_reader.get(3), 9002u64);
assert_eq!(fast_field_reader.get(4), 15_001u64);
assert_eq!(fast_field_reader.get(5), 777u64);
assert_eq!(fast_field_reader.get(6), 1_002u64);
assert_eq!(fast_field_reader.get(7), 1_501u64);
assert_eq!(fast_field_reader.get(8), 215u64);
}
Ok(())
}
@@ -393,13 +383,13 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 43);
assert_eq!(file.len(), 35);
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
let fast_field_reader = FastFieldReaderImpl::<u64>::open(data)?;
for doc in 0..10_000 {
assert_eq!(fast_field_reader.get_val(doc), 100_000u64);
assert_eq!(fast_field_reader.get(doc), 100_000u64);
}
}
Ok(())
@@ -425,15 +415,15 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 80051);
assert_eq!(file.len(), 80043);
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
assert_eq!(fast_field_reader.get_val(0), 0u64);
let fast_field_reader = FastFieldReaderImpl::<u64>::open(data)?;
assert_eq!(fast_field_reader.get(0), 0u64);
for doc in 1..10_001 {
assert_eq!(
fast_field_reader.get_val(doc),
fast_field_reader.get(doc),
5_000_000_000_000_000_000u64 + doc as u64 - 1u64
);
}
@@ -465,17 +455,16 @@ mod tests {
}
let file = directory.open_read(path).unwrap();
// assert_eq!(file.len(), 17710 as usize); //bitpacked size
// assert_eq!(file.len(), 10175_usize); // linear interpol size
assert_eq!(file.len(), 75_usize); // linear interpol size after calc improvement
assert_eq!(file.len(), 10175_usize); // linear interpol size
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(i64_field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<i64>::open(data)?;
let fast_field_reader = FastFieldReaderImpl::<i64>::open(data)?;
assert_eq!(fast_field_reader.min_value(), -100i64);
assert_eq!(fast_field_reader.max_value(), 9_999i64);
for (doc, i) in (-100i64..10_000i64).enumerate() {
assert_eq!(fast_field_reader.get_val(doc as u64), i);
assert_eq!(fast_field_reader.get(doc as u32), i);
}
let mut buffer = vec![0i64; 100];
fast_field_reader.get_range(53, &mut buffer[..]);
@@ -510,8 +499,8 @@ mod tests {
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(i64_field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<i64>::open(data)?;
assert_eq!(fast_field_reader.get_val(0), 0i64);
let fast_field_reader = FastFieldReaderImpl::<i64>::open(data)?;
assert_eq!(fast_field_reader.get(0u32), 0i64);
}
Ok(())
}
@@ -548,10 +537,10 @@ mod tests {
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
let fast_field_reader = FastFieldReaderImpl::<u64>::open(data)?;
for a in 0..n {
assert_eq!(fast_field_reader.get_val(a as u64), permutation[a as usize]);
assert_eq!(fast_field_reader.get(a as u32), permutation[a as usize]);
}
}
Ok(())
@@ -842,19 +831,19 @@ mod tests {
let dates_fast_field = fast_fields.dates(multi_date_field).unwrap();
let mut dates = vec![];
{
assert_eq!(date_fast_field.get_val(0).into_timestamp_micros(), 1i64);
assert_eq!(date_fast_field.get(0u32).into_timestamp_micros(), 1i64);
dates_fast_field.get_vals(0u32, &mut dates);
assert_eq!(dates.len(), 2);
assert_eq!(dates[0].into_timestamp_micros(), 2i64);
assert_eq!(dates[1].into_timestamp_micros(), 3i64);
}
{
assert_eq!(date_fast_field.get_val(1).into_timestamp_micros(), 4i64);
assert_eq!(date_fast_field.get(1u32).into_timestamp_micros(), 4i64);
dates_fast_field.get_vals(1u32, &mut dates);
assert!(dates.is_empty());
}
{
assert_eq!(date_fast_field.get_val(2).into_timestamp_micros(), 0i64);
assert_eq!(date_fast_field.get(2u32).into_timestamp_micros(), 0i64);
dates_fast_field.get_vals(2u32, &mut dates);
assert_eq!(dates.len(), 2);
assert_eq!(dates[0].into_timestamp_micros(), 5i64);
@@ -865,11 +854,11 @@ mod tests {
#[test]
pub fn test_fastfield_bool() {
let test_fastfield = DynamicFastFieldReader::<bool>::from(vec![true, false, true, false]);
assert_eq!(test_fastfield.get_val(0), true);
assert_eq!(test_fastfield.get_val(1), false);
assert_eq!(test_fastfield.get_val(2), true);
assert_eq!(test_fastfield.get_val(3), false);
let test_fastfield = FastFieldReaderImpl::<bool>::from(&[true, false, true, false]);
assert_eq!(test_fastfield.get(0), true);
assert_eq!(test_fastfield.get(1), false);
assert_eq!(test_fastfield.get(2), true);
assert_eq!(test_fastfield.get(3), false);
}
#[test]
@@ -896,14 +885,14 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 44);
assert_eq!(file.len(), 36);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
assert_eq!(fast_field_reader.get_val(0), true);
assert_eq!(fast_field_reader.get_val(1), false);
assert_eq!(fast_field_reader.get_val(2), true);
assert_eq!(fast_field_reader.get_val(3), false);
let fast_field_reader = FastFieldReaderImpl::<bool>::open(file)?;
assert_eq!(fast_field_reader.get(0), true);
assert_eq!(fast_field_reader.get(1), false);
assert_eq!(fast_field_reader.get(2), true);
assert_eq!(fast_field_reader.get(3), false);
Ok(())
}
@@ -932,13 +921,13 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 56);
assert_eq!(file.len(), 48);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
let fast_field_reader = FastFieldReaderImpl::<bool>::open(file)?;
for i in 0..25 {
assert_eq!(fast_field_reader.get_val(i * 2), true);
assert_eq!(fast_field_reader.get_val(i * 2 + 1), false);
assert_eq!(fast_field_reader.get(i * 2), true);
assert_eq!(fast_field_reader.get(i * 2 + 1), false);
}
Ok(())
@@ -966,11 +955,11 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 43);
assert_eq!(file.len(), 35);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
assert_eq!(fast_field_reader.get_val(0), false);
let fast_field_reader = FastFieldReaderImpl::<bool>::open(file)?;
assert_eq!(fast_field_reader.get(0), false);
Ok(())
}
@@ -978,20 +967,37 @@ mod tests {
#[cfg(all(test, feature = "unstable"))]
mod bench {
use fastfield_codecs::Column;
use std::collections::HashMap;
use std::path::Path;
use test::{self, Bencher};
use super::tests::generate_permutation;
use super::tests::{generate_permutation, FIELD, SCHEMA};
use super::*;
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
use crate::fastfield::tests::generate_permutation_gcd;
use crate::fastfield::FastFieldReader;
#[bench]
fn bench_intfastfield_jumpy_veclookup(b: &mut Bencher) {
fn bench_intfastfield_linear_veclookup(b: &mut Bencher) {
let permutation = generate_permutation();
let n = permutation.len();
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u64;
for _ in 0..n {
for i in (0u32..n / 7).map(|v| v * 7) {
a ^= permutation[i as usize];
}
a
});
}
#[bench]
fn bench_intfastfield_veclookup(b: &mut Bencher) {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(1000u32);
let mut a = 0u64;
for _ in 0u32..n {
a = permutation[a as usize];
}
a
@@ -999,83 +1005,102 @@ mod bench {
}
#[bench]
fn bench_intfastfield_jumpy_fflookup(b: &mut Bencher) {
fn bench_intfastfield_linear_fflookup(b: &mut Bencher) {
let path = Path::new("test");
let permutation = generate_permutation();
let n = permutation.len();
let column = DynamicFastFieldReader::from(permutation);
b.iter(|| {
let mut a = 0u64;
for _ in 0..n {
a = column.get_val(a as u64);
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
}
a
});
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u64;
for i in (0u32..n / 7).map(|val| val * 7) {
a ^= fast_field_reader.get(i);
}
a
});
}
}
#[bench]
fn bench_intfastfield_stride7_vec(b: &mut Bencher) {
fn bench_intfastfield_fflookup(b: &mut Bencher) {
let path = Path::new("test");
let permutation = generate_permutation();
let n = permutation.len();
b.iter(|| {
let mut a = 0u64;
for i in (0..n / 7).map(|val| val * 7) {
a += permutation[i as usize];
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
}
a
});
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
b.iter(|| {
let mut a = 0u32;
for i in 0u32..permutation.len() as u32 {
a = fast_field_reader.get(i) as u32;
}
a
});
}
}
#[bench]
fn bench_intfastfield_stride7_fflookup(b: &mut Bencher) {
let permutation = generate_permutation();
let n = permutation.len();
let column = DynamicFastFieldReader::from(permutation);
b.iter(|| {
let mut a = 0u64;
for i in (0..n / 7).map(|val| val * 7) {
a += column.get_val(i as u64);
}
a
});
}
#[bench]
fn bench_intfastfield_scan_all_fflookup(b: &mut Bencher) {
let permutation = generate_permutation();
let n = permutation.len();
let column = DynamicFastFieldReader::from(permutation);
b.iter(|| {
let mut a = 0u64;
for i in 0u64..n as u64 {
a += column.get_val(i);
}
a
});
}
#[bench]
fn bench_intfastfield_scan_all_fflookup_gcd(b: &mut Bencher) {
fn bench_intfastfield_fflookup_gcd(b: &mut Bencher) {
let path = Path::new("test");
let permutation = generate_permutation_gcd();
let n = permutation.len();
let column = DynamicFastFieldReader::from(permutation);
b.iter(|| {
let mut a = 0u64;
for i in 0..n as u64 {
a += column.get_val(i);
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
}
a
});
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
#[bench]
fn bench_intfastfield_scan_all_vec(b: &mut Bencher) {
let permutation = generate_permutation();
b.iter(|| {
let mut a = 0u64;
for i in 0..permutation.len() {
a += permutation[i as usize] as u64;
}
a
});
b.iter(|| {
let mut a = 0u32;
for i in 0u32..permutation.len() as u32 {
a = fast_field_reader.get(i) as u32;
}
a
});
}
}
}

View File

@@ -346,26 +346,32 @@ mod tests {
assert!(test_multivalued_no_panic(&ops[..]).is_ok());
}
}
#[test]
fn test_proptest_merge_multivalued_bug() {
use IndexingOp::*;
let ops = &[AddDoc { id: 7 }, AddDoc { id: 4 }, Merge];
assert!(test_multivalued_no_panic(ops).is_ok());
}
#[test]
fn test_multivalued_proptest_gcd() {
use IndexingOp::*;
let ops = [AddDoc { id: 9 }, AddDoc { id: 9 }, Merge];
assert!(test_multivalued_no_panic(&ops[..]).is_ok());
let ops = &[AddDoc { id: 9 }, AddDoc { id: 9 }, Merge];
assert!(test_multivalued_no_panic(ops).is_ok());
}
#[test]
fn test_multivalued_proptest_off_by_one_bug_1151() {
use IndexingOp::*;
let ops = [
let ops = &[
AddDoc { id: 3 },
AddDoc { id: 1 },
AddDoc { id: 3 },
Commit,
Merge,
];
assert!(test_multivalued_no_panic(&ops[..]).is_ok());
assert!(test_multivalued_no_panic(ops).is_ok());
}
#[test]

View File

@@ -1,8 +1,6 @@
use std::ops::Range;
use fastfield_codecs::Column;
use crate::fastfield::{DynamicFastFieldReader, FastValue, MultiValueLength};
use crate::fastfield::{FastFieldReader, FastFieldReaderImpl, FastValue, MultiValueLength};
use crate::DocId;
/// Reader for a multivalued `u64` fast field.
@@ -14,14 +12,14 @@ use crate::DocId;
/// The `idx_reader` associated, for each document, the index of its first value.
#[derive(Clone)]
pub struct MultiValuedFastFieldReader<Item: FastValue> {
idx_reader: DynamicFastFieldReader<u64>,
vals_reader: DynamicFastFieldReader<Item>,
idx_reader: FastFieldReaderImpl<u64>,
vals_reader: FastFieldReaderImpl<Item>,
}
impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
pub(crate) fn open(
idx_reader: DynamicFastFieldReader<u64>,
vals_reader: DynamicFastFieldReader<Item>,
idx_reader: FastFieldReaderImpl<u64>,
vals_reader: FastFieldReaderImpl<Item>,
) -> MultiValuedFastFieldReader<Item> {
MultiValuedFastFieldReader {
idx_reader,
@@ -33,9 +31,8 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
/// to the given document are `start..end`.
#[inline]
fn range(&self, doc: DocId) -> Range<u64> {
let idx = doc as u64;
let start = self.idx_reader.get_val(idx);
let end = self.idx_reader.get_val(idx + 1);
let start = self.idx_reader.get(doc);
let end = self.idx_reader.get(doc + 1);
start..end
}

View File

@@ -3,7 +3,7 @@ use std::io;
use fnv::FnvHashMap;
use tantivy_bitpacker::minmax;
use crate::fastfield::serializer::BitpackedSerializerLegacy;
use crate::fastfield::serializer::BitpackedFastFieldSerializerLegacy;
use crate::fastfield::{value_to_u64, CompositeFastFieldSerializer, FastFieldType, FastValue};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
@@ -171,7 +171,7 @@ impl MultiValuedFastFieldWriter {
}
{
// writing the values themselves.
let mut value_serializer: BitpackedSerializerLegacy<'_, _>;
let mut value_serializer: BitpackedFastFieldSerializerLegacy<'_, _>;
if let Some(mapping) = mapping_opt {
value_serializer = serializer.new_u64_fast_field_with_idx(
self.field,

View File

@@ -1,193 +1,8 @@
use std::collections::HashMap;
use std::marker::PhantomData;
use std::path::Path;
use common::BinarySerializable;
use fastfield_codecs::bitpacked::{BitpackedCodec, BitpackedReader};
use fastfield_codecs::blockwise_linear::{BlockwiseLinearCodec, BlockwiseLinearReader};
use fastfield_codecs::linear::{LinearCodec, LinearReader};
use fastfield_codecs::{Column, FastFieldCodec, FastFieldCodecType};
use super::gcd::open_gcd_from_bytes;
use super::FastValue;
use crate::directory::{CompositeFile, Directory, FileSlice, OwnedBytes, RamDirectory, WritePtr};
use crate::error::DataCorruption;
use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter, GCDReader};
use crate::schema::{Schema, FAST};
use crate::DocId;
#[derive(Clone)]
/// DynamicFastFieldReader wraps different readers to access
/// the various encoded fastfield data
pub enum DynamicFastFieldReader<Item: FastValue> {
/// Bitpacked compressed fastfield data.
Bitpacked(FastFieldReaderCodecWrapper<Item, BitpackedReader>),
/// Linear interpolated values + bitpacked
Linear(FastFieldReaderCodecWrapper<Item, LinearReader>),
/// Blockwise linear interpolated values + bitpacked
BlockwiseLinear(FastFieldReaderCodecWrapper<Item, BlockwiseLinearReader>),
/// GCD and Bitpacked compressed fastfield data.
BitpackedGCD(FastFieldReaderCodecWrapper<Item, GCDReader<BitpackedReader>>),
/// GCD and Linear interpolated values + bitpacked
LinearGCD(FastFieldReaderCodecWrapper<Item, GCDReader<LinearReader>>),
/// GCD and Blockwise linear interpolated values + bitpacked
BlockwiseLinearGCD(FastFieldReaderCodecWrapper<Item, GCDReader<BlockwiseLinearReader>>),
}
impl<Item: FastValue> DynamicFastFieldReader<Item> {
/// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data.
pub fn open_from_id(
mut bytes: OwnedBytes,
codec_type: FastFieldCodecType,
) -> crate::Result<DynamicFastFieldReader<Item>> {
let reader = match codec_type {
FastFieldCodecType::Bitpacked => {
DynamicFastFieldReader::Bitpacked(BitpackedCodec::open_from_bytes(bytes)?.into())
}
FastFieldCodecType::Linear => {
DynamicFastFieldReader::Linear(LinearCodec::open_from_bytes(bytes)?.into())
}
FastFieldCodecType::BlockwiseLinear => DynamicFastFieldReader::BlockwiseLinear(
BlockwiseLinearCodec::open_from_bytes(bytes)?.into(),
),
FastFieldCodecType::Gcd => {
let codec_type = FastFieldCodecType::deserialize(&mut bytes)?;
match codec_type {
FastFieldCodecType::Bitpacked => DynamicFastFieldReader::BitpackedGCD(
open_gcd_from_bytes::<BitpackedCodec>(bytes)?.into(),
),
FastFieldCodecType::Linear => DynamicFastFieldReader::LinearGCD(
open_gcd_from_bytes::<LinearCodec>(bytes)?.into(),
),
FastFieldCodecType::BlockwiseLinear => {
DynamicFastFieldReader::BlockwiseLinearGCD(
open_gcd_from_bytes::<BlockwiseLinearCodec>(bytes)?.into(),
)
}
FastFieldCodecType::Gcd => {
return Err(DataCorruption::comment_only(
"Gcd codec wrapped into another gcd codec. This combination is not \
allowed.",
)
.into())
}
}
}
};
Ok(reader)
}
/// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data.
pub fn open(file: FileSlice) -> crate::Result<DynamicFastFieldReader<Item>> {
let mut bytes = file.read_bytes()?;
let codec_type = FastFieldCodecType::deserialize(&mut bytes)?;
Self::open_from_id(bytes, codec_type)
}
}
impl<Item: FastValue> Column<Item> for DynamicFastFieldReader<Item> {
#[inline]
fn get_val(&self, idx: u64) -> Item {
match self {
Self::Bitpacked(reader) => reader.get_val(idx),
Self::Linear(reader) => reader.get_val(idx),
Self::BlockwiseLinear(reader) => reader.get_val(idx),
Self::BitpackedGCD(reader) => reader.get_val(idx),
Self::LinearGCD(reader) => reader.get_val(idx),
Self::BlockwiseLinearGCD(reader) => reader.get_val(idx),
}
}
#[inline]
fn get_range(&self, start: u64, output: &mut [Item]) {
match self {
Self::Bitpacked(reader) => reader.get_range(start, output),
Self::Linear(reader) => reader.get_range(start, output),
Self::BlockwiseLinear(reader) => reader.get_range(start, output),
Self::BitpackedGCD(reader) => reader.get_range(start, output),
Self::LinearGCD(reader) => reader.get_range(start, output),
Self::BlockwiseLinearGCD(reader) => reader.get_range(start, output),
}
}
fn min_value(&self) -> Item {
match self {
Self::Bitpacked(reader) => reader.min_value(),
Self::Linear(reader) => reader.min_value(),
Self::BlockwiseLinear(reader) => reader.min_value(),
Self::BitpackedGCD(reader) => reader.min_value(),
Self::LinearGCD(reader) => reader.min_value(),
Self::BlockwiseLinearGCD(reader) => reader.min_value(),
}
}
fn max_value(&self) -> Item {
match self {
Self::Bitpacked(reader) => reader.max_value(),
Self::Linear(reader) => reader.max_value(),
Self::BlockwiseLinear(reader) => reader.max_value(),
Self::BitpackedGCD(reader) => reader.max_value(),
Self::LinearGCD(reader) => reader.max_value(),
Self::BlockwiseLinearGCD(reader) => reader.max_value(),
}
}
fn num_vals(&self) -> u64 {
match self {
Self::Bitpacked(reader) => reader.num_vals(),
Self::Linear(reader) => reader.num_vals(),
Self::BlockwiseLinear(reader) => reader.num_vals(),
Self::BitpackedGCD(reader) => reader.num_vals(),
Self::LinearGCD(reader) => reader.num_vals(),
Self::BlockwiseLinearGCD(reader) => reader.num_vals(),
}
}
}
/// Wrapper for accessing a fastfield.
///
/// Holds the data and the codec to the read the data.
#[derive(Clone)]
pub struct FastFieldReaderCodecWrapper<Item: FastValue, CodecReader> {
reader: CodecReader,
_phantom: PhantomData<Item>,
}
impl<Item: FastValue, CodecReader> From<CodecReader>
for FastFieldReaderCodecWrapper<Item, CodecReader>
{
fn from(reader: CodecReader) -> Self {
FastFieldReaderCodecWrapper {
reader,
_phantom: PhantomData,
}
}
}
impl<Item: FastValue, D: Column> FastFieldReaderCodecWrapper<Item, D> {
#[inline]
pub(crate) fn get_u64(&self, idx: u64) -> Item {
let data = self.reader.get_val(idx);
Item::from_u64(data)
}
/// Internally `multivalued` also use SingleValue Fast fields.
/// It works as follows... A first column contains the list of start index
/// for each document, a second column contains the actual values.
///
/// The values associated to a given doc, are then
/// `second_column[first_column.get(doc)..first_column.get(doc+1)]`.
///
/// Which means single value fast field reader can be indexed internally with
/// something different from a `DocId`. For this use case, we want to use `u64`
/// values.
///
/// See `get_range` for an actual documentation about this method.
pub(crate) fn get_range_u64(&self, start: u64, output: &mut [Item]) {
for (i, out) in output.iter_mut().enumerate() {
*out = self.get_u64(start + (i as u64));
}
}
}
impl<Item: FastValue, C: Column + Clone> Column<Item> for FastFieldReaderCodecWrapper<Item, C> {
/// FastFieldReader is the trait to access fast field data.
pub trait FastFieldReader<Item: FastValue> {
/// Return the value associated to the given document.
///
/// This accessor should return as fast as possible.
@@ -195,10 +10,7 @@ impl<Item: FastValue, C: Column + Clone> Column<Item> for FastFieldReaderCodecWr
/// # Panics
///
/// May panic if `doc` is greater than the segment
// `maxdoc`.
fn get_val(&self, idx: u64) -> Item {
self.get_u64(idx)
}
fn get(&self, doc: DocId) -> Item;
/// Fills an output buffer with the fast field values
/// associated with the `DocId` going from
@@ -213,66 +25,19 @@ impl<Item: FastValue, C: Column + Clone> Column<Item> for FastFieldReaderCodecWr
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
fn get_range(&self, start: u64, output: &mut [Item]) {
self.get_range_u64(start, output);
}
fn get_range(&self, start: u64, output: &mut [Item]);
/// Returns the minimum value for this fast field.
///
/// The max value does not take in account of possible
/// deleted document, and should be considered as an upper bound
/// of the actual maximum value.
fn min_value(&self) -> Item {
Item::from_u64(self.reader.min_value())
}
/// The min value does not take in account of possible
/// deleted document, and should be considered as a lower bound
/// of the actual minimum value.
fn min_value(&self) -> Item;
/// Returns the maximum value for this fast field.
///
/// The max value does not take in account of possible
/// deleted document, and should be considered as an upper bound
/// of the actual maximum value.
fn max_value(&self) -> Item {
Item::from_u64(self.reader.max_value())
}
fn num_vals(&self) -> u64 {
self.reader.num_vals()
}
}
impl<Item: FastValue> From<Vec<Item>> for DynamicFastFieldReader<Item> {
fn from(vals: Vec<Item>) -> DynamicFastFieldReader<Item> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build();
let path = Path::new("__dummy__");
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory
.open_write(path)
.expect("With a RamDirectory, this should never fail.");
let mut serializer = CompositeFastFieldSerializer::from_write(write)
.expect("With a RamDirectory, this should never fail.");
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
{
let fast_field_writer = fast_field_writers
.get_field_writer_mut(field)
.expect("With a RamDirectory, this should never fail.");
for val in vals {
fast_field_writer.add_val(val.to_u64());
}
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(path).expect("Failed to open the file");
let composite_file = CompositeFile::open(&file).expect("Failed to read the composite file");
let field_file = composite_file
.open_read(field)
.expect("File component not found");
DynamicFastFieldReader::open(field_file).unwrap()
}
fn max_value(&self) -> Item;
}

View File

@@ -1,7 +1,7 @@
use super::reader::DynamicFastFieldReader;
use crate::directory::{CompositeFile, FileSlice};
use crate::fastfield::{
BytesFastFieldReader, FastFieldNotAvailableError, FastValue, MultiValuedFastFieldReader,
BytesFastFieldReader, FastFieldNotAvailableError, FastFieldReaderImpl, FastValue,
MultiValuedFastFieldReader,
};
use crate::schema::{Cardinality, Field, FieldType, Schema};
use crate::space_usage::PerFieldSpaceUsage;
@@ -109,14 +109,15 @@ impl FastFieldReaders {
&self,
field: Field,
index: usize,
) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
) -> crate::Result<FastFieldReaderImpl<TFastValue>> {
let fast_field_slice = self.fast_field_data(field, index)?;
DynamicFastFieldReader::open(fast_field_slice)
let fast_field_data = fast_field_slice.read_bytes()?;
FastFieldReaderImpl::open_from_bytes(fast_field_data)
}
pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
&self,
field: Field,
) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
) -> crate::Result<FastFieldReaderImpl<TFastValue>> {
self.typed_fast_field_reader_with_idx(field, 0)
}
@@ -132,7 +133,7 @@ impl FastFieldReaders {
/// Returns the `u64` fast field reader reader associated to `field`.
///
/// If `field` is not a u64 fast field, this method returns an Error.
pub fn u64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<u64>> {
pub fn u64(&self, field: Field) -> crate::Result<FastFieldReaderImpl<u64>> {
self.check_type(field, FastType::U64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
@@ -142,14 +143,14 @@ impl FastFieldReaders {
///
/// If not, the fastfield reader will returns the u64-value associated to the original
/// FastValue.
pub fn u64_lenient(&self, field: Field) -> crate::Result<DynamicFastFieldReader<u64>> {
pub fn u64_lenient(&self, field: Field) -> crate::Result<FastFieldReaderImpl<u64>> {
self.typed_fast_field_reader(field)
}
/// Returns the `i64` fast field reader reader associated to `field`.
///
/// If `field` is not a i64 fast field, this method returns an Error.
pub fn i64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<i64>> {
pub fn i64(&self, field: Field) -> crate::Result<FastFieldReaderImpl<i64>> {
self.check_type(field, FastType::I64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
@@ -157,7 +158,7 @@ impl FastFieldReaders {
/// Returns the `date` fast field reader reader associated to `field`.
///
/// If `field` is not a date fast field, this method returns an Error.
pub fn date(&self, field: Field) -> crate::Result<DynamicFastFieldReader<DateTime>> {
pub fn date(&self, field: Field) -> crate::Result<FastFieldReaderImpl<DateTime>> {
self.check_type(field, FastType::Date, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
@@ -165,7 +166,7 @@ impl FastFieldReaders {
/// Returns the `f64` fast field reader reader associated to `field`.
///
/// If `field` is not a f64 fast field, this method returns an Error.
pub fn f64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<f64>> {
pub fn f64(&self, field: Field) -> crate::Result<FastFieldReaderImpl<f64>> {
self.check_type(field, FastType::F64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
@@ -173,7 +174,7 @@ impl FastFieldReaders {
/// Returns the `bool` fast field reader reader associated to `field`.
///
/// If `field` is not a bool fast field, this method returns an Error.
pub fn bool(&self, field: Field) -> crate::Result<DynamicFastFieldReader<bool>> {
pub fn bool(&self, field: Field) -> crate::Result<FastFieldReaderImpl<bool>> {
self.check_type(field, FastType::Bool, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
@@ -241,7 +242,8 @@ impl FastFieldReaders {
)));
}
let fast_field_idx_file = self.fast_field_data(field, 0)?;
let idx_reader = DynamicFastFieldReader::open(fast_field_idx_file)?;
let fast_field_idx_bytes = fast_field_idx_file.read_bytes()?;
let idx_reader = FastFieldReaderImpl::open_from_bytes(fast_field_idx_bytes)?;
let data = self.fast_field_data(field, 1)?;
BytesFastFieldReader::open(idx_reader, data)
} else {

View File

@@ -1,17 +1,13 @@
use std::io::{self, Write};
use std::num::NonZeroU64;
use common::{BinarySerializable, CountingWriter};
use fastdivide::DividerU64;
pub use fastfield_codecs::bitpacked::{BitpackedCodec, BitpackedSerializerLegacy};
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::FastFieldCodecType;
pub use fastfield_codecs::{Column, FastFieldCodec, FastFieldStats};
pub use fastfield_codecs::bitpacked::{
BitpackedFastFieldCodec, BitpackedFastFieldSerializerLegacy,
};
use fastfield_codecs::dynamic::{CodecType, DynamicFastFieldCodec};
pub use fastfield_codecs::{FastFieldCodec, FastFieldStats};
use super::{find_gcd, ALL_CODECS, GCD_DEFAULT};
use crate::directory::{CompositeWrite, WritePtr};
use crate::fastfield::gcd::write_gcd_header;
use crate::schema::Field;
/// `CompositeFastFieldSerializer` is in charge of serializing
@@ -36,60 +32,13 @@ use crate::schema::Field;
/// * `close()`
pub struct CompositeFastFieldSerializer {
composite_write: CompositeWrite<WritePtr>,
codec_enable_checker: FastFieldCodecEnableCheck,
}
#[derive(Debug, Clone)]
pub struct FastFieldCodecEnableCheck {
enabled_codecs: Vec<FastFieldCodecType>,
}
impl FastFieldCodecEnableCheck {
fn allow_all() -> Self {
FastFieldCodecEnableCheck {
enabled_codecs: ALL_CODECS.to_vec(),
}
}
fn is_enabled(&self, code_type: FastFieldCodecType) -> bool {
self.enabled_codecs.contains(&code_type)
}
}
impl From<FastFieldCodecType> for FastFieldCodecEnableCheck {
fn from(code_type: FastFieldCodecType) -> Self {
FastFieldCodecEnableCheck {
enabled_codecs: vec![code_type],
}
}
}
// use this, when this is merged and stabilized explicit_generic_args_with_impl_trait
// https://github.com/rust-lang/rust/pull/86176
fn codec_estimation<C: FastFieldCodec>(
fastfield_accessor: &impl Column,
estimations: &mut Vec<(f32, FastFieldCodecType)>,
) {
if let Some(ratio) = C::estimate(fastfield_accessor) {
estimations.push((ratio, C::CODEC_TYPE));
}
}
impl CompositeFastFieldSerializer {
/// Constructor
pub fn from_write(write: WritePtr) -> io::Result<CompositeFastFieldSerializer> {
Self::from_write_with_codec(write, FastFieldCodecEnableCheck::allow_all())
}
/// Constructor
pub fn from_write_with_codec(
write: WritePtr,
codec_enable_checker: FastFieldCodecEnableCheck,
) -> io::Result<CompositeFastFieldSerializer> {
// just making room for the pointer to header.
let composite_write = CompositeWrite::wrap(write);
Ok(CompositeFastFieldSerializer {
composite_write,
codec_enable_checker,
})
Ok(CompositeFastFieldSerializer { composite_write })
}
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
@@ -97,19 +46,10 @@ impl CompositeFastFieldSerializer {
pub fn create_auto_detect_u64_fast_field(
&mut self,
field: Field,
fastfield_accessor: impl Column,
stats: FastFieldStats,
vals: &[u64],
) -> io::Result<()> {
self.create_auto_detect_u64_fast_field_with_idx(field, fastfield_accessor, 0)
}
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
/// automatically.
pub fn write_header<W: Write>(
field_write: &mut W,
codec_type: FastFieldCodecType,
) -> io::Result<()> {
codec_type.to_code().serialize(field_write)?;
Ok(())
self.create_auto_detect_u64_fast_field_with_idx(field, stats, vals, 0)
}
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
@@ -117,133 +57,12 @@ impl CompositeFastFieldSerializer {
pub fn create_auto_detect_u64_fast_field_with_idx(
&mut self,
field: Field,
fastfield_accessor: impl Column,
stats: FastFieldStats,
vals: &[u64],
idx: usize,
) -> io::Result<()> {
let min_value = fastfield_accessor.min_value();
let field_write = self.composite_write.for_field_with_idx(field, idx);
let gcd = find_gcd(fastfield_accessor.iter().map(|val| val - min_value))
.map(NonZeroU64::get)
.unwrap_or(GCD_DEFAULT);
if gcd == 1 {
return Self::create_auto_detect_u64_fast_field_with_idx_gcd(
self.codec_enable_checker.clone(),
field,
field_write,
fastfield_accessor,
);
}
Self::write_header(field_write, FastFieldCodecType::Gcd)?;
struct GCDWrappedFFAccess<T: Column> {
fastfield_accessor: T,
base_value: u64,
max_value: u64,
num_vals: u64,
gcd: DividerU64,
}
impl<T: Column> Column for GCDWrappedFFAccess<T> {
fn get_val(&self, position: u64) -> u64 {
self.gcd
.divide(self.fastfield_accessor.get_val(position) - self.base_value)
}
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
Box::new(
self.fastfield_accessor
.iter()
.map(|val| self.gcd.divide(val - self.base_value)),
)
}
fn min_value(&self) -> u64 {
0
}
fn max_value(&self) -> u64 {
self.max_value
}
fn num_vals(&self) -> u64 {
self.num_vals
}
}
let num_vals = fastfield_accessor.num_vals();
let base_value = fastfield_accessor.min_value();
let max_value = (fastfield_accessor.max_value() - fastfield_accessor.min_value()) / gcd;
let fastfield_accessor = GCDWrappedFFAccess {
fastfield_accessor,
base_value,
max_value,
num_vals,
gcd: DividerU64::divide_by(gcd),
};
Self::create_auto_detect_u64_fast_field_with_idx_gcd(
self.codec_enable_checker.clone(),
field,
field_write,
fastfield_accessor,
)?;
write_gcd_header(field_write, base_value, gcd, num_vals)?;
Ok(())
}
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
/// automatically.
pub fn create_auto_detect_u64_fast_field_with_idx_gcd<W: Write>(
codec_enable_checker: FastFieldCodecEnableCheck,
field: Field,
field_write: &mut CountingWriter<W>,
fastfield_accessor: impl Column,
) -> io::Result<()> {
let mut estimations = vec![];
if codec_enable_checker.is_enabled(FastFieldCodecType::Bitpacked) {
codec_estimation::<BitpackedCodec>(&fastfield_accessor, &mut estimations);
}
if codec_enable_checker.is_enabled(FastFieldCodecType::Linear) {
codec_estimation::<LinearCodec>(&fastfield_accessor, &mut estimations);
}
if codec_enable_checker.is_enabled(FastFieldCodecType::BlockwiseLinear) {
codec_estimation::<BlockwiseLinearCodec>(&fastfield_accessor, &mut estimations);
}
if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan())
{
warn!(
"broken estimation for fast field codec {:?}",
broken_estimation.1
);
}
// removing nan values for codecs with broken calculations, and max values which disables
// codecs
estimations.retain(|estimation| !estimation.0.is_nan() && estimation.0 != f32::MAX);
estimations.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
let (_ratio, codec_type) = estimations[0];
debug!("choosing fast field codec {codec_type:?} for field_id {field:?}"); // todo print actual field name
Self::write_header(field_write, codec_type)?;
match codec_type {
FastFieldCodecType::Bitpacked => {
BitpackedCodec::serialize(field_write, &fastfield_accessor)?;
}
FastFieldCodecType::Linear => {
LinearCodec::serialize(field_write, &fastfield_accessor)?;
}
FastFieldCodecType::BlockwiseLinear => {
BlockwiseLinearCodec::serialize(field_write, &fastfield_accessor)?;
}
FastFieldCodecType::Gcd => {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"GCD codec not supported.",
));
}
}
field_write.flush()?;
DynamicFastFieldCodec.serialize(field_write, vals, stats)?;
Ok(())
}
@@ -253,7 +72,7 @@ impl CompositeFastFieldSerializer {
field: Field,
min_value: u64,
max_value: u64,
) -> io::Result<BitpackedSerializerLegacy<'_, CountingWriter<WritePtr>>> {
) -> io::Result<BitpackedFastFieldSerializerLegacy<'_, CountingWriter<WritePtr>>> {
self.new_u64_fast_field_with_idx(field, min_value, max_value, 0)
}
@@ -263,7 +82,7 @@ impl CompositeFastFieldSerializer {
field: Field,
min_value: u64,
max_value: u64,
) -> io::Result<BitpackedSerializerLegacy<'_, CountingWriter<WritePtr>>> {
) -> io::Result<BitpackedFastFieldSerializerLegacy<'_, CountingWriter<WritePtr>>> {
self.new_u64_fast_field_with_idx(field, min_value, max_value, 0)
}
@@ -274,11 +93,11 @@ impl CompositeFastFieldSerializer {
min_value: u64,
max_value: u64,
idx: usize,
) -> io::Result<BitpackedSerializerLegacy<'_, CountingWriter<WritePtr>>> {
) -> io::Result<BitpackedFastFieldSerializerLegacy<'_, CountingWriter<WritePtr>>> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
// Prepend codec id to field data for compatibility with DynamicFastFieldReader.
FastFieldCodecType::Bitpacked.serialize(field_write)?;
BitpackedSerializerLegacy::open(field_write, min_value, max_value)
CodecType::Bitpacked.serialize(field_write)?;
BitpackedFastFieldSerializerLegacy::open(field_write, min_value, max_value)
}
/// Start serializing a new [u8] fast field

184
src/fastfield/wrapper.rs Normal file
View File

@@ -0,0 +1,184 @@
// Copyright (C) 2022 Quickwit, Inc.
//
// Quickwit is offered under the AGPL v3.0 and as commercial software.
// For commercial licensing, contact us at hello@quickwit.io.
//
// AGPL:
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
use std::marker::PhantomData;
use fastfield_codecs::dynamic::DynamicFastFieldCodec;
use fastfield_codecs::{FastFieldCodec, FastFieldCodecReader, FastFieldStats};
use ownedbytes::OwnedBytes;
use crate::directory::FileSlice;
use crate::fastfield::{FastFieldReader, FastFieldReaderImpl, FastValue};
use crate::DocId;
/// Wrapper for accessing a fastfield.
///
/// Holds the data and the codec to the read the data.
pub struct FastFieldReaderWrapper<Item: FastValue, Codec: FastFieldCodec> {
reader: Codec::Reader,
_phantom: PhantomData<Item>,
_codec: PhantomData<Codec>,
}
impl<Item: FastValue, Codec: FastFieldCodec> FastFieldReaderWrapper<Item, Codec> {
fn new(reader: Codec::Reader) -> Self {
Self {
reader,
_phantom: PhantomData,
_codec: PhantomData,
}
}
}
impl<Item: FastValue, Codec: FastFieldCodec> Clone for FastFieldReaderWrapper<Item, Codec>
where Codec::Reader: Clone
{
fn clone(&self) -> Self {
Self {
reader: self.reader.clone(),
_phantom: PhantomData,
_codec: PhantomData,
}
}
}
impl<Item: FastValue, C: FastFieldCodec> FastFieldReader<Item> for FastFieldReaderWrapper<Item, C> {
/// Return the value associated to the given document.
///
/// This accessor should return as fast as possible.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment
// `maxdoc`.
fn get(&self, doc: DocId) -> Item {
self.get_u64(u64::from(doc))
}
/// Fills an output buffer with the fast field values
/// associated with the `DocId` going from
/// `start` to `start + output.len()`.
///
/// Regardless of the type of `Item`, this method works
/// - transmuting the output array
/// - extracting the `Item`s as if they were `u64`
/// - possibly converting the `u64` value to the right type.
///
/// # Panics
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
fn get_range(&self, start: u64, output: &mut [Item]) {
self.get_range_u64(start, output);
}
/// Returns the minimum value for this fast field.
///
/// The max value does not take in account of possible
/// deleted document, and should be considered as an upper bound
/// of the actual maximum value.
fn min_value(&self) -> Item {
Item::from_u64(self.reader.min_value())
}
/// Returns the maximum value for this fast field.
///
/// The max value does not take in account of possible
/// deleted document, and should be considered as an upper bound
/// of the actual maximum value.
fn max_value(&self) -> Item {
Item::from_u64(self.reader.max_value())
}
}
impl<Item: FastValue, Codec: FastFieldCodec> FastFieldReaderWrapper<Item, Codec> {
/// Opens a fast field given a file.
pub fn open(file: FileSlice) -> crate::Result<Self> {
let mut bytes = file.read_bytes()?;
// TODO
// let codec_id = bytes.read_u8();
// assert_eq!(
// 0u8, codec_id,
// "Tried to open fast field as bitpacked encoded (id=1), but got serializer with \
// different id"
// );
Self::open_from_bytes(bytes)
}
/// Opens a fast field given the bytes.
pub fn open_from_bytes(bytes: OwnedBytes) -> crate::Result<Self> {
let reader = Codec::open_from_bytes(bytes)?;
Ok(FastFieldReaderWrapper {
reader,
_codec: PhantomData,
_phantom: PhantomData,
})
}
#[inline]
pub(crate) fn get_u64(&self, doc: u64) -> Item {
let data = self.reader.get_u64(doc);
Item::from_u64(data)
}
/// Internally `multivalued` also use SingleValue Fast fields.
/// It works as follows... A first column contains the list of start index
/// for each document, a second column contains the actual values.
///
/// The values associated to a given doc, are then
/// `second_column[first_column.get(doc)..first_column.get(doc+1)]`.
///
/// Which means single value fast field reader can be indexed internally with
/// something different from a `DocId`. For this use case, we want to use `u64`
/// values.
///
/// See `get_range` for an actual documentation about this method.
pub(crate) fn get_range_u64(&self, start: u64, output: &mut [Item]) {
for (i, out) in output.iter_mut().enumerate() {
*out = self.get_u64(start + (i as u64));
}
}
}
use itertools::Itertools;
impl<Item: FastValue, Arr: AsRef<[Item]>> From<Arr> for FastFieldReaderImpl<Item> {
fn from(vals: Arr) -> FastFieldReaderImpl<Item> {
let mut buffer = Vec::new();
let vals_u64: Vec<u64> = vals.as_ref().iter().map(|val| val.to_u64()).collect();
let (min_value, max_value) = vals_u64
.iter()
.copied()
.minmax()
.into_option()
.expect("Expected non empty");
let stats = FastFieldStats {
min_value,
max_value,
num_vals: vals_u64.len() as u64,
};
DynamicFastFieldCodec
.serialize(&mut buffer, &vals_u64, stats)
.unwrap();
let bytes = OwnedBytes::new(buffer);
let fast_field_reader = DynamicFastFieldCodec::open_from_bytes(bytes).unwrap();
FastFieldReaderImpl::new(fast_field_reader)
}
}

View File

@@ -2,7 +2,6 @@ use std::collections::HashMap;
use std::io;
use common;
use fastfield_codecs::Column;
use fnv::FnvHashMap;
use tantivy_bitpacker::BlockedBitpacker;
@@ -218,12 +217,13 @@ impl FastFieldsWriter {
) -> io::Result<()> {
for field_writer in &self.term_id_writers {
let field = field_writer.field();
dbg!("multifield", field);
field_writer.serialize(serializer, mapping.get(&field), doc_id_map)?;
}
for field_writer in &self.single_value_writers {
dbg!("singlefield");
field_writer.serialize(serializer, doc_id_map)?;
}
for field_writer in &self.multi_values_writers {
let field = field_writer.field();
field_writer.serialize(serializer, mapping.get(&field), doc_id_map)?;
@@ -360,71 +360,26 @@ impl IntFastFieldWriter {
(self.val_min, self.val_max)
};
let vals = compute_fast_field_vals(&self.vals, doc_id_map);
let stats = FastFieldStats {
min_value: min,
max_value: max,
num_vals: self.val_count as u64,
};
let fastfield_accessor = WriterFastFieldAccessProvider {
doc_id_map,
vals: &self.vals,
stats,
};
serializer.create_auto_detect_u64_fast_field(self.field, fastfield_accessor)?;
dbg!(&stats);
dbg!(&vals);
serializer.create_auto_detect_u64_fast_field(self.field, stats, &vals)?;
Ok(())
}
}
#[derive(Clone)]
struct WriterFastFieldAccessProvider<'map, 'bitp> {
doc_id_map: Option<&'map DocIdMapping>,
vals: &'bitp BlockedBitpacker,
stats: FastFieldStats,
}
impl<'map, 'bitp> Column for WriterFastFieldAccessProvider<'map, 'bitp> {
/// Return the value associated to the given doc.
///
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance
/// reasons.
///
/// # Panics
///
/// May panic if `doc` is greater than the index.
fn get_val(&self, doc: u64) -> u64 {
if let Some(doc_id_map) = self.doc_id_map {
self.vals
.get(doc_id_map.get_old_doc_id(doc as u32) as usize) // consider extra
// FastFieldReader wrapper for
// non doc_id_map
} else {
self.vals.get(doc as usize)
}
}
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
if let Some(doc_id_map) = self.doc_id_map {
Box::new(
doc_id_map
.iter_old_doc_ids()
.map(|doc_id| self.vals.get(doc_id as usize)),
)
} else {
Box::new(self.vals.iter())
}
}
fn min_value(&self) -> u64 {
self.stats.min_value
}
fn max_value(&self) -> u64 {
self.stats.max_value
}
fn num_vals(&self) -> u64 {
self.stats.num_vals
fn compute_fast_field_vals(vals: &BlockedBitpacker, doc_id_map: Option<&DocIdMapping>) -> Vec<u64> {
if let Some(doc_id_mapping) = doc_id_map {
doc_id_mapping
.iter_old_doc_ids()
.map(|old_doc_id| vals.get(old_doc_id as usize))
.collect()
} else {
vals.iter().collect()
}
}

View File

@@ -2,42 +2,35 @@
//! to get mappings from old doc_id to new doc_id and vice versa, after sorting
use std::cmp::Reverse;
use std::ops::Index;
use super::SegmentWriter;
use crate::schema::{Field, Schema};
use crate::{DocAddress, DocId, IndexSortByField, Order, TantivyError};
use crate::{DocId, IndexSortByField, Order, SegmentOrdinal, TantivyError};
/// Struct to provide mapping from new doc_id to old doc_id and segment.
#[derive(Clone)]
pub(crate) struct SegmentDocIdMapping {
new_doc_id_to_old_doc_addr: Vec<DocAddress>,
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentOrdinal)>,
is_trivial: bool,
}
impl SegmentDocIdMapping {
pub(crate) fn new(new_doc_id_to_old_and_segment: Vec<DocAddress>, is_trivial: bool) -> Self {
pub(crate) fn new(
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentOrdinal)>,
is_trivial: bool,
) -> Self {
Self {
new_doc_id_to_old_doc_addr: new_doc_id_to_old_and_segment,
new_doc_id_to_old_and_segment,
is_trivial,
}
}
/// Returns an iterator over the old document addresses, ordered by the new document ids.
///
/// In the returned `DocAddress`, the `segment_ord` is the ordinal of targetted segment
/// in the list of merged segments.
pub(crate) fn iter_old_doc_addrs(&self) -> impl Iterator<Item = DocAddress> + '_ {
self.new_doc_id_to_old_doc_addr.iter().copied()
pub(crate) fn iter(&self) -> impl Iterator<Item = &(DocId, SegmentOrdinal)> {
self.new_doc_id_to_old_and_segment.iter()
}
pub(crate) fn len(&self) -> usize {
self.new_doc_id_to_old_doc_addr.len()
self.new_doc_id_to_old_and_segment.len()
}
pub(crate) fn get_old_doc_addr(&self, new_doc_id: DocId) -> DocAddress {
self.new_doc_id_to_old_doc_addr[new_doc_id as usize]
}
/// This flags means the segments are simply stacked in the order of their ordinal.
/// e.g. [(0, 1), .. (n, 1), (0, 2)..., (m, 2)]
///
@@ -46,6 +39,21 @@ impl SegmentDocIdMapping {
self.is_trivial
}
}
impl Index<usize> for SegmentDocIdMapping {
type Output = (DocId, SegmentOrdinal);
fn index(&self, idx: usize) -> &Self::Output {
&self.new_doc_id_to_old_and_segment[idx]
}
}
impl IntoIterator for SegmentDocIdMapping {
type Item = (DocId, SegmentOrdinal);
type IntoIter = std::vec::IntoIter<Self::Item>;
fn into_iter(self) -> Self::IntoIter {
self.new_doc_id_to_old_and_segment.into_iter()
}
}
/// Struct to provide mapping from old doc_id to new doc_id and vice versa within a segment.
pub struct DocIdMapping {
@@ -143,9 +151,8 @@ pub(crate) fn get_doc_id_mapping_from_field(
#[cfg(test)]
mod tests_indexsorting {
use fastfield_codecs::Column;
use crate::collector::TopDocs;
use crate::fastfield::FastFieldReader;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::query::QueryParser;
use crate::schema::{Schema, *};
@@ -465,9 +472,9 @@ mod tests_indexsorting {
let my_number = index.schema().get_field("my_number").unwrap();
let fast_field = fast_fields.u64(my_number).unwrap();
assert_eq!(fast_field.get_val(0), 10u64);
assert_eq!(fast_field.get_val(1), 20u64);
assert_eq!(fast_field.get_val(2), 30u64);
assert_eq!(fast_field.get(0u32), 10u64);
assert_eq!(fast_field.get(1u32), 20u64);
assert_eq!(fast_field.get(2u32), 30u64);
let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
let multifield = fast_fields.u64s(multi_numbers).unwrap();

View File

@@ -174,7 +174,9 @@ fn index_documents(
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor,
) -> crate::Result<()> {
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone())?;
let schema = segment.schema();
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), schema)?;
for document_group in grouped_document_iterator {
for doc in document_group {
segment_writer.add_document(doc)?;
@@ -775,7 +777,6 @@ impl Drop for IndexWriter {
mod tests {
use std::collections::{HashMap, HashSet};
use fastfield_codecs::Column;
use proptest::prelude::*;
use proptest::prop_oneof;
use proptest::strategy::Strategy;
@@ -784,6 +785,7 @@ mod tests {
use crate::collector::TopDocs;
use crate::directory::error::LockError;
use crate::error::*;
use crate::fastfield::FastFieldReader;
use crate::indexer::NoMergePolicy;
use crate::query::{QueryParser, TermQuery};
use crate::schema::{
@@ -1325,7 +1327,7 @@ mod tests {
let fast_field_reader = segment_reader.fast_fields().u64(id_field)?;
let in_order_alive_ids: Vec<u64> = segment_reader
.doc_ids_alive()
.map(|doc| fast_field_reader.get_val(doc as u64))
.map(|doc| fast_field_reader.get(doc))
.collect();
assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 1, 0]);
Ok(())
@@ -1491,7 +1493,7 @@ mod tests {
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
segment_reader
.doc_ids_alive()
.map(move |doc| ff_reader.get_val(doc as u64))
.map(move |doc| ff_reader.get(doc))
})
.collect();
@@ -1502,7 +1504,7 @@ mod tests {
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
segment_reader
.doc_ids_alive()
.map(move |doc| ff_reader.get_val(doc as u64))
.map(move |doc| ff_reader.get(doc))
})
.collect();
@@ -1620,7 +1622,7 @@ mod tests {
facet_reader
.facet_from_ord(facet_ords[0], &mut facet)
.unwrap();
let id = ff_reader.get_val(doc_id as u64);
let id = ff_reader.get(doc_id);
let facet_expected = Facet::from(&("/cola/".to_string() + &id.to_string()));
assert_eq!(facet, facet_expected);

View File

@@ -4,13 +4,14 @@ use std::sync::Arc;
use itertools::Itertools;
use measure_time::debug_time;
use tantivy_bitpacker::minmax;
use crate::core::{Segment, SegmentReader};
use crate::docset::{DocSet, TERMINATED};
use crate::error::DataCorruption;
use crate::fastfield::{
AliveBitSet, Column, CompositeFastFieldSerializer, DynamicFastFieldReader, FastFieldStats,
MultiValueLength, MultiValuedFastFieldReader,
AliveBitSet, CompositeFastFieldSerializer, FastFieldReader, FastFieldReaderImpl,
FastFieldStats, MultiValueLength, MultiValuedFastFieldReader,
};
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
use crate::indexer::doc_id_mapping::{expect_field_id_for_sort_field, SegmentDocIdMapping};
@@ -20,8 +21,8 @@ use crate::schema::{Cardinality, Field, FieldType, Schema};
use crate::store::StoreWriter;
use crate::termdict::{TermMerger, TermOrdinal};
use crate::{
DocAddress, DocId, IndexSettings, IndexSortByField, InvertedIndexReader, Order,
SegmentComponent, SegmentOrdinal,
DocId, IndexSettings, IndexSortByField, InvertedIndexReader, Order, SegmentComponent,
SegmentOrdinal,
};
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
@@ -87,7 +88,7 @@ pub struct IndexMerger {
}
fn compute_min_max_val(
u64_reader: &impl Column<u64>,
u64_reader: &impl FastFieldReader<u64>,
segment_reader: &SegmentReader,
) -> Option<(u64, u64)> {
if segment_reader.max_doc() == 0 {
@@ -101,11 +102,11 @@ fn compute_min_max_val(
}
// some deleted documents,
// we need to recompute the max / min
segment_reader
.doc_ids_alive()
.map(|doc_id| u64_reader.get_val(doc_id as u64))
.minmax()
.into_option()
minmax(
segment_reader
.doc_ids_alive()
.map(|doc_id| u64_reader.get(doc_id)),
)
}
struct TermOrdinalMapping {
@@ -133,7 +134,7 @@ impl TermOrdinalMapping {
fn max_term_ord(&self) -> TermOrdinal {
self.per_segment_new_term_ordinals
.iter()
.flat_map(|term_ordinals| term_ordinals.iter().max().cloned())
.flat_map(|term_ordinals| term_ordinals.iter().cloned().max())
.max()
.unwrap_or_default()
}
@@ -163,6 +164,30 @@ impl DeltaComputer {
}
}
fn compute_sorted_multivalued_vals(
doc_id_mapping: &SegmentDocIdMapping,
fast_field_readers: &Vec<MultiValuedFastFieldReader<u64>>,
) -> Vec<u64> {
let mut vals = Vec::new();
let mut buf: Vec<u64> = Vec::new();
for &(doc_id, segment_ord) in doc_id_mapping.iter() {
fast_field_readers[segment_ord as usize].get_vals(doc_id, &mut buf);
vals.extend_from_slice(&buf);
}
vals
}
fn compute_vals_sorted(
doc_id_mapping: &SegmentDocIdMapping,
fast_field_readers: &[FastFieldReaderImpl<u64>],
) -> Vec<u64> {
let mut vals = Vec::with_capacity(doc_id_mapping.len());
for &(doc_id, segment_ord) in doc_id_mapping.iter() {
vals.push(fast_field_readers[segment_ord as usize].get_u64(doc_id as u64));
}
vals
}
impl IndexMerger {
pub fn open(
schema: Schema,
@@ -259,9 +284,9 @@ impl IndexMerger {
.iter()
.map(|reader| reader.get_fieldnorms_reader(field))
.collect::<Result<_, _>>()?;
for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
let fieldnorms_reader = &fieldnorms_readers[old_doc_addr.segment_ord as usize];
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(old_doc_addr.doc_id);
for (doc_id, reader_ordinal) in doc_id_mapping.iter() {
let fieldnorms_reader = &fieldnorms_readers[*reader_ordinal as usize];
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(*doc_id);
fieldnorms_data.push(fieldnorm_id);
}
@@ -341,7 +366,7 @@ impl IndexMerger {
.readers
.iter()
.filter_map(|reader| {
let u64_reader: DynamicFastFieldReader<u64> =
let u64_reader: FastFieldReaderImpl<u64> =
reader.fast_fields().typed_fast_field_reader(field).expect(
"Failed to find a reader for single fast field. This is a tantivy bug and \
it should never happen.",
@@ -355,7 +380,7 @@ impl IndexMerger {
.readers
.iter()
.map(|reader| {
let u64_reader: DynamicFastFieldReader<u64> =
let u64_reader: crate::fastfield::FastFieldReaderImpl<u64> =
reader.fast_fields().typed_fast_field_reader(field).expect(
"Failed to find a reader for single fast field. This is a tantivy bug and \
it should never happen.",
@@ -369,50 +394,9 @@ impl IndexMerger {
max_value,
num_vals: doc_id_mapping.len() as u64,
};
#[derive(Clone)]
struct SortedDocIdFieldAccessProvider<'a> {
doc_id_mapping: &'a SegmentDocIdMapping,
fast_field_readers: &'a Vec<DynamicFastFieldReader<u64>>,
stats: FastFieldStats,
}
impl<'a> Column for SortedDocIdFieldAccessProvider<'a> {
fn get_val(&self, doc: u64) -> u64 {
let DocAddress {
doc_id,
segment_ord,
} = self.doc_id_mapping.get_old_doc_addr(doc as u32);
self.fast_field_readers[segment_ord as usize].get_val(doc_id as u64)
}
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
Box::new(
self.doc_id_mapping
.iter_old_doc_addrs()
.map(|old_doc_addr| {
let fast_field_reader =
&self.fast_field_readers[old_doc_addr.segment_ord as usize];
fast_field_reader.get_val(old_doc_addr.doc_id as u64)
}),
)
}
fn min_value(&self) -> u64 {
self.stats.min_value
}
fn max_value(&self) -> u64 {
self.stats.max_value
}
fn num_vals(&self) -> u64 {
self.stats.num_vals
}
}
let fastfield_accessor = SortedDocIdFieldAccessProvider {
doc_id_mapping,
fast_field_readers: &fast_field_readers,
stats,
};
fast_field_serializer.create_auto_detect_u64_fast_field(field, fastfield_accessor)?;
let vals = compute_vals_sorted(doc_id_mapping, &fast_field_readers);
fast_field_serializer.create_auto_detect_u64_fast_field(field, stats, &vals)?;
Ok(())
}
@@ -428,7 +412,7 @@ impl IndexMerger {
let everything_is_in_order = reader_ordinal_and_field_accessors
.into_iter()
.map(|(_, col)| Arc::new(col))
.map(|reader| reader.1)
.tuple_windows()
.all(|(field_accessor1, field_accessor2)| {
if sort_by_field.order.is_asc() {
@@ -443,7 +427,7 @@ impl IndexMerger {
pub(crate) fn get_sort_field_accessor(
reader: &SegmentReader,
sort_by_field: &IndexSortByField,
) -> crate::Result<impl Column> {
) -> crate::Result<FastFieldReaderImpl<u64>> {
let field_id = expect_field_id_for_sort_field(reader.schema(), sort_by_field)?; // for now expect fastfield, but not strictly required
let value_accessor = reader.fast_fields().u64_lenient(field_id)?;
Ok(value_accessor)
@@ -452,7 +436,7 @@ impl IndexMerger {
pub(crate) fn get_reader_with_sort_field_accessor(
&self,
sort_by_field: &IndexSortByField,
) -> crate::Result<Vec<(SegmentOrdinal, impl Column)>> {
) -> crate::Result<Vec<(SegmentOrdinal, FastFieldReaderImpl<u64>)>> {
let reader_ordinal_and_field_accessors = self
.readers
.iter()
@@ -485,11 +469,15 @@ impl IndexMerger {
let doc_id_reader_pair =
reader_ordinal_and_field_accessors
.iter()
.map(|(reader_ord, ff_reader)| {
let reader = &self.readers[*reader_ord as usize];
reader
.doc_ids_alive()
.map(move |doc_id| (doc_id, reader_ord, ff_reader))
.map(|reader_and_field_accessor| {
let reader = &self.readers[reader_and_field_accessor.0 as usize];
reader.doc_ids_alive().map(move |doc_id| {
(
doc_id,
reader_and_field_accessor.0,
&reader_and_field_accessor.1,
)
})
});
let total_num_new_docs = self
@@ -498,25 +486,22 @@ impl IndexMerger {
.map(|reader| reader.num_docs() as usize)
.sum();
let mut sorted_doc_ids: Vec<DocAddress> = Vec::with_capacity(total_num_new_docs);
let mut sorted_doc_ids = Vec::with_capacity(total_num_new_docs);
// create iterator tuple of (old doc_id, reader) in order of the new doc_ids
sorted_doc_ids.extend(
doc_id_reader_pair
.into_iter()
.kmerge_by(|a, b| {
let val1 = a.2.get_val(a.0 as u64);
let val2 = b.2.get_val(b.0 as u64);
let val1 = a.2.get(a.0);
let val2 = b.2.get(b.0);
if sort_by_field.order == Order::Asc {
val1 < val2
} else {
val1 > val2
}
})
.map(|(doc_id, &segment_ord, _)| DocAddress {
doc_id,
segment_ord,
}),
.map(|(doc_id, reader_with_id, _)| (doc_id, reader_with_id)),
);
Ok(SegmentDocIdMapping::new(sorted_doc_ids, false))
}
@@ -563,46 +548,16 @@ impl IndexMerger {
// access on the fly or 2. change the codec api to make random access optional, but
// they both have also major drawbacks.
let mut offsets = Vec::with_capacity(doc_id_mapping.len());
let mut offsets: Vec<u64> = Vec::with_capacity(doc_id_mapping.len());
let mut offset = 0;
for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
let reader = &reader_and_field_accessors[old_doc_addr.segment_ord as usize].1;
for (doc_id, reader) in doc_id_mapping.iter() {
let reader = &reader_and_field_accessors[*reader as usize].1;
offsets.push(offset);
offset += reader.get_len(old_doc_addr.doc_id) as u64;
offset += reader.get_len(*doc_id) as u64;
}
offsets.push(offset);
#[derive(Clone)]
struct FieldIndexAccessProvider<'a> {
offsets: &'a [u64],
stats: FastFieldStats,
}
impl<'a> Column for FieldIndexAccessProvider<'a> {
fn get_val(&self, doc: u64) -> u64 {
self.offsets[doc as usize]
}
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
Box::new(self.offsets.iter().cloned())
}
fn min_value(&self) -> u64 {
self.stats.min_value
}
fn max_value(&self) -> u64 {
self.stats.max_value
}
fn num_vals(&self) -> u64 {
self.stats.num_vals
}
}
let fastfield_accessor = FieldIndexAccessProvider {
offsets: &offsets,
stats,
};
fast_field_serializer.create_auto_detect_u64_fast_field(field, fastfield_accessor)?;
fast_field_serializer.create_auto_detect_u64_fast_field(field, stats, &offsets[..])?;
Ok(offsets)
}
/// Returns the fastfield index (index for the data, not the data).
@@ -611,7 +566,7 @@ impl IndexMerger {
field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &SegmentDocIdMapping,
) -> crate::Result<Vec<u64>> {
) -> crate::Result<()> {
let reader_ordinal_and_field_accessors = self
.readers
.iter()
@@ -632,7 +587,8 @@ impl IndexMerger {
fast_field_serializer,
doc_id_mapping,
&reader_ordinal_and_field_accessors,
)
)?;
Ok(())
}
fn write_term_id_fast_field(
@@ -670,12 +626,12 @@ impl IndexMerger {
fast_field_serializer.new_u64_fast_field_with_idx(field, 0u64, max_term_ord, 1)?;
let mut vals = Vec::with_capacity(100);
for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
for (old_doc_id, reader_ordinal) in doc_id_mapping.iter() {
let term_ordinal_mapping: &[TermOrdinal] =
term_ordinal_mappings.get_segment(old_doc_addr.segment_ord as usize);
term_ordinal_mappings.get_segment(*reader_ordinal as usize);
let ff_reader = &fast_field_reader[old_doc_addr.segment_ord as usize];
ff_reader.get_vals(old_doc_addr.doc_id, &mut vals);
let ff_reader = &fast_field_reader[*reader_ordinal as usize];
ff_reader.get_vals(*old_doc_id, &mut vals);
for &prev_term_ord in &vals {
let new_term_ord = term_ordinal_mapping[prev_term_ord as usize];
serialize_vals.add_val(new_term_ord)?;
@@ -696,17 +652,16 @@ impl IndexMerger {
.map(|reader| reader.num_docs() as usize)
.sum();
let mut mapping: Vec<DocAddress> = Vec::with_capacity(total_num_new_docs);
let mut mapping = Vec::with_capacity(total_num_new_docs);
mapping.extend(
self.readers
.iter()
.enumerate()
.flat_map(|(segment_ord, reader)| {
reader.doc_ids_alive().map(move |doc_id| DocAddress {
segment_ord: segment_ord as u32,
doc_id,
})
.flat_map(|(reader_ordinal, reader)| {
reader
.doc_ids_alive()
.map(move |doc_id| (doc_id, reader_ordinal as SegmentOrdinal))
}),
);
Ok(SegmentDocIdMapping::new(mapping, true))
@@ -722,12 +677,7 @@ impl IndexMerger {
// The second contains the actual values.
// First we merge the idx fast field.
let offsets =
self.write_multi_value_fast_field_idx(field, fast_field_serializer, doc_id_mapping)?;
let mut min_value = u64::MAX;
let mut max_value = u64::MIN;
let mut num_vals = 0;
self.write_multi_value_fast_field_idx(field, fast_field_serializer, doc_id_mapping)?;
let mut vals = Vec::with_capacity(100);
@@ -749,94 +699,18 @@ impl IndexMerger {
);
for doc in reader.doc_ids_alive() {
ff_reader.get_vals(doc, &mut vals);
for &val in &vals {
min_value = cmp::min(val, min_value);
max_value = cmp::max(val, max_value);
}
num_vals += vals.len();
}
ff_readers.push(ff_reader);
// TODO optimize when no deletes
}
if min_value > max_value {
min_value = 0;
max_value = 0;
}
let vals = compute_sorted_multivalued_vals(doc_id_mapping, &ff_readers);
let stats = FastFieldStats::compute(&vals);
// We can now initialize our serializer, and push it the different values
let stats = FastFieldStats {
max_value,
num_vals: num_vals as u64,
min_value,
};
struct SortedDocIdMultiValueAccessProvider<'a> {
doc_id_mapping: &'a SegmentDocIdMapping,
fast_field_readers: &'a Vec<MultiValuedFastFieldReader<u64>>,
offsets: Vec<u64>,
stats: FastFieldStats,
}
impl<'a> Column for SortedDocIdMultiValueAccessProvider<'a> {
fn get_val(&self, pos: u64) -> u64 {
// use the offsets index to find the doc_id which will contain the position.
// the offsets are strictly increasing so we can do a simple search on it.
let new_doc_id: DocId =
self.offsets
.iter()
.position(|&offset| offset > pos)
.expect("pos is out of bounds") as DocId
- 1u32;
// now we need to find the position of `pos` in the multivalued bucket
let num_pos_covered_until_now = self.offsets[new_doc_id as usize];
let pos_in_values = pos - num_pos_covered_until_now;
let old_doc_addr = self.doc_id_mapping.get_old_doc_addr(new_doc_id);
let num_vals = self.fast_field_readers[old_doc_addr.segment_ord as usize]
.get_len(old_doc_addr.doc_id);
assert!(num_vals >= pos_in_values);
let mut vals = Vec::new();
self.fast_field_readers[old_doc_addr.segment_ord as usize]
.get_vals(old_doc_addr.doc_id, &mut vals);
vals[pos_in_values as usize]
}
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
Box::new(
self.doc_id_mapping
.iter_old_doc_addrs()
.flat_map(|old_doc_addr| {
let ff_reader =
&self.fast_field_readers[old_doc_addr.segment_ord as usize];
let mut vals = Vec::new();
ff_reader.get_vals(old_doc_addr.doc_id, &mut vals);
vals.into_iter()
}),
)
}
fn min_value(&self) -> u64 {
self.stats.min_value
}
fn max_value(&self) -> u64 {
self.stats.max_value
}
fn num_vals(&self) -> u64 {
self.stats.num_vals
}
}
let fastfield_accessor = SortedDocIdMultiValueAccessProvider {
doc_id_mapping,
fast_field_readers: &ff_readers,
offsets,
stats,
};
fast_field_serializer.create_auto_detect_u64_fast_field_with_idx(
field,
fastfield_accessor,
stats,
&vals[..],
1,
)?;
@@ -869,9 +743,9 @@ impl IndexMerger {
)?;
let mut serialize_vals = fast_field_serializer.new_bytes_fast_field_with_idx(field, 1);
for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
let bytes_reader = &reader_and_field_accessors[old_doc_addr.segment_ord as usize].1;
let val = bytes_reader.get_bytes(old_doc_addr.doc_id);
for (doc_id, reader_ordinal) in doc_id_mapping.iter() {
let bytes_reader = &reader_and_field_accessors[*reader_ordinal as usize].1;
let val = bytes_reader.get_bytes(*doc_id);
serialize_vals.write_all(val)?;
}
@@ -927,9 +801,9 @@ impl IndexMerger {
segment_local_map
})
.collect();
for (new_doc_id, old_doc_addr) in doc_id_mapping.iter_old_doc_addrs().enumerate() {
let segment_map = &mut merged_doc_id_map[old_doc_addr.segment_ord as usize];
segment_map[old_doc_addr.doc_id as usize] = Some(new_doc_id as DocId);
for (new_doc_id, (old_doc_id, segment_ord)) in doc_id_mapping.iter().enumerate() {
let segment_map = &mut merged_doc_id_map[*segment_ord as usize];
segment_map[*old_doc_id as usize] = Some(new_doc_id as DocId);
}
// Note that the total number of tokens is not exact.
@@ -1104,15 +978,15 @@ impl IndexMerger {
.map(|(i, store)| store.iter_raw(self.readers[i].alive_bitset()))
.collect();
for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
let doc_bytes_it = &mut document_iterators[old_doc_addr.segment_ord as usize];
for (old_doc_id, reader_ordinal) in doc_id_mapping.iter() {
let doc_bytes_it = &mut document_iterators[*reader_ordinal as usize];
if let Some(doc_bytes_res) = doc_bytes_it.next() {
let doc_bytes = doc_bytes_res?;
store_writer.store_bytes(&doc_bytes)?;
} else {
return Err(DataCorruption::comment_only(&format!(
"unexpected missing document in docstore on merge, doc address \
{old_doc_addr:?}",
"unexpected missing document in docstore on merge, doc id {:?}",
old_doc_id
))
.into());
}
@@ -1199,7 +1073,6 @@ impl IndexMerger {
#[cfg(test)]
mod tests {
use byteorder::{BigEndian, ReadBytesExt};
use fastfield_codecs::Column;
use schema::FAST;
use crate::collector::tests::{
@@ -1207,6 +1080,7 @@ mod tests {
};
use crate::collector::{Count, FacetCollector};
use crate::core::Index;
use crate::fastfield::FastFieldReader;
use crate::query::{AllQuery, BooleanQuery, Scorer, TermQuery};
use crate::schema::{
Cardinality, Document, Facet, FacetOptions, IndexRecordOption, NumericOptions, Term,

View File

@@ -1,10 +1,8 @@
#[cfg(test)]
mod tests {
use fastfield_codecs::Column;
use crate::collector::TopDocs;
use crate::core::Index;
use crate::fastfield::{AliveBitSet, MultiValuedFastFieldReader};
use crate::fastfield::{AliveBitSet, FastFieldReader, MultiValuedFastFieldReader};
use crate::query::QueryParser;
use crate::schema::{
self, BytesOptions, Cardinality, Facet, FacetOptions, IndexRecordOption, NumericOptions,
@@ -188,17 +186,17 @@ mod tests {
let fast_fields = segment_reader.fast_fields();
let fast_field = fast_fields.u64(int_field).unwrap();
assert_eq!(fast_field.get_val(5), 1u64);
assert_eq!(fast_field.get_val(4), 2u64);
assert_eq!(fast_field.get_val(3), 3u64);
assert_eq!(fast_field.get(5u32), 1u64);
assert_eq!(fast_field.get(4u32), 2u64);
assert_eq!(fast_field.get(3u32), 3u64);
if force_disjunct_segment_sort_values {
assert_eq!(fast_field.get_val(2u64), 20u64);
assert_eq!(fast_field.get_val(1u64), 100u64);
assert_eq!(fast_field.get(2u32), 20u64);
assert_eq!(fast_field.get(1u32), 100u64);
} else {
assert_eq!(fast_field.get_val(2u64), 10u64);
assert_eq!(fast_field.get_val(1u64), 20u64);
assert_eq!(fast_field.get(2u32), 10u64);
assert_eq!(fast_field.get(1u32), 20u64);
}
assert_eq!(fast_field.get_val(0u64), 1_000u64);
assert_eq!(fast_field.get(0u32), 1_000u64);
// test new field norm mapping
{
@@ -375,12 +373,12 @@ mod tests {
let fast_fields = segment_reader.fast_fields();
let fast_field = fast_fields.u64(int_field).unwrap();
assert_eq!(fast_field.get_val(0), 1u64);
assert_eq!(fast_field.get_val(1), 2u64);
assert_eq!(fast_field.get_val(2), 3u64);
assert_eq!(fast_field.get_val(3), 10u64);
assert_eq!(fast_field.get_val(4), 20u64);
assert_eq!(fast_field.get_val(5), 1_000u64);
assert_eq!(fast_field.get(0u32), 1u64);
assert_eq!(fast_field.get(1u32), 2u64);
assert_eq!(fast_field.get(2u32), 3u64);
assert_eq!(fast_field.get(3u32), 10u64);
assert_eq!(fast_field.get(4u32), 20u64);
assert_eq!(fast_field.get(5u32), 1_000u64);
let get_vals = |fast_field: &MultiValuedFastFieldReader<u64>, doc_id: u32| -> Vec<u64> {
let mut vals = vec![];
@@ -480,13 +478,13 @@ mod tests {
#[cfg(all(test, feature = "unstable"))]
mod bench_sorted_index_merge {
use fastfield_codecs::Column;
use test::{self, Bencher};
use crate::core::Index;
use crate::fastfield::DynamicFastFieldReader;
// use cratedoc_id, readerdoc_id_mappinglet vals = reader.fate::schema;
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
use crate::indexer::merger::IndexMerger;
use crate::schema::{Cardinality, NumericOptions, Schema};
use crate::schema::{Cardinality, Document, NumericOptions, Schema};
use crate::{IndexSettings, IndexSortByField, IndexWriter, Order};
fn create_index(sort_by_field: Option<IndexSortByField>) -> Index {
let mut schema_builder = Schema::builder();
@@ -505,7 +503,9 @@ mod bench_sorted_index_merge {
{
let mut index_writer = index.writer_for_tests().unwrap();
let index_doc = |index_writer: &mut IndexWriter, val: u64| {
index_writer.add_document(doc!(int_field=>val)).unwrap();
let mut doc = Document::default();
doc.add_u64(int_field, val);
index_writer.add_document(doc).unwrap();
};
// 3 segments with 10_000 values in the fast fields
for _ in 0..3 {
@@ -518,7 +518,6 @@ mod bench_sorted_index_merge {
}
index
}
#[bench]
fn create_sorted_index_walk_overkmerge_on_merge_fastfield(
b: &mut Bencher,
@@ -534,19 +533,19 @@ mod bench_sorted_index_merge {
IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?;
let doc_id_mapping = merger.generate_doc_id_mapping(&sort_by_field).unwrap();
b.iter(|| {
let sorted_doc_ids = doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| {
let reader = &merger.readers[doc_addr.segment_ord as usize];
let sorted_doc_ids = doc_id_mapping.iter().map(|(doc_id, ordinal)| {
let reader = &merger.readers[*ordinal as usize];
let u64_reader: DynamicFastFieldReader<u64> =
reader.fast_fields().typed_fast_field_reader(field).expect(
"Failed to find a reader for single fast field. This is a tantivy bug and \
it should never happen.",
);
(doc_addr.doc_id, reader, u64_reader)
(doc_id, reader, u64_reader)
});
// add values in order of the new doc_ids
let mut val = 0;
for (doc_id, _reader, field_reader) in sorted_doc_ids {
val = field_reader.get_val(doc_id as u64);
val = field_reader.get(*doc_id);
}
val

View File

@@ -25,10 +25,39 @@ use crate::indexer::{
DefaultMergePolicy, MergeCandidate, MergeOperation, MergePolicy, SegmentEntry,
SegmentSerializer,
};
use crate::schema::Schema;
use crate::{FutureResult, Opstamp};
const NUM_MERGE_THREADS: usize = 4;
/// Save the index meta file.
/// This operation is atomic :
/// Either
/// - it fails, in which case an error is returned,
/// and the `meta.json` remains untouched,
/// - it succeeds, and `meta.json` is written
/// and flushed.
///
/// This method is not part of tantivy's public API
pub fn save_new_metas(
schema: Schema,
index_settings: IndexSettings,
directory: &dyn Directory,
) -> crate::Result<()> {
save_metas(
&IndexMeta {
index_settings,
segments: Vec::new(),
schema,
opstamp: 0u64,
payload: None,
},
directory,
)?;
directory.sync_directory()?;
Ok(())
}
/// Save the index meta file.
/// This operation is atomic:
/// Either
@@ -38,7 +67,7 @@ const NUM_MERGE_THREADS: usize = 4;
/// and flushed.
///
/// This method is not part of tantivy's public API
pub(crate) fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> {
fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> {
info!("save metas");
let mut buffer = serde_json::to_vec_pretty(metas)?;
// Just adding a new line at the end of the buffer.

View File

@@ -80,8 +80,8 @@ impl SegmentWriter {
pub fn for_segment(
memory_budget_in_bytes: usize,
segment: Segment,
schema: Schema,
) -> crate::Result<SegmentWriter> {
let schema = segment.schema();
let tokenizer_manager = segment.index().tokenizers().clone();
let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
let segment_serializer = SegmentSerializer::for_segment(segment, false)?;

View File

@@ -11,7 +11,6 @@
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
#![warn(missing_docs)]
#![allow(clippy::len_without_is_empty)]
#![allow(clippy::derive_partial_eq_without_eq)]
//! # `tantivy`
//!
@@ -301,7 +300,7 @@ pub use self::docset::{DocSet, TERMINATED};
pub use crate::core::{
Executor, Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader,
Order, Searcher, SearcherGeneration, Segment, SegmentComponent, SegmentId, SegmentMeta,
SegmentReader, SingleSegmentIndexWriter,
SegmentReader,
};
pub use crate::directory::Directory;
pub use crate::indexer::demuxer::*;
@@ -421,7 +420,6 @@ pub struct DocAddress {
#[cfg(test)]
pub mod tests {
use common::{BinarySerializable, FixedSize};
use fastfield_codecs::Column;
use rand::distributions::{Bernoulli, Uniform};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
@@ -430,6 +428,7 @@ pub mod tests {
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
use crate::core::SegmentReader;
use crate::docset::{DocSet, TERMINATED};
use crate::fastfield::FastFieldReader;
use crate::merge_policy::NoMergePolicy;
use crate::query::BooleanQuery;
use crate::schema::*;
@@ -1036,21 +1035,21 @@ pub mod tests {
let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_unsigned);
assert!(fast_field_reader_opt.is_ok());
let fast_field_reader = fast_field_reader_opt.unwrap();
assert_eq!(fast_field_reader.get_val(0), 4u64)
assert_eq!(fast_field_reader.get(0), 4u64)
}
{
let fast_field_reader_res = segment_reader.fast_fields().i64(fast_field_signed);
assert!(fast_field_reader_res.is_ok());
let fast_field_reader = fast_field_reader_res.unwrap();
assert_eq!(fast_field_reader.get_val(0), 4i64)
assert_eq!(fast_field_reader.get(0), 4i64)
}
{
let fast_field_reader_res = segment_reader.fast_fields().f64(fast_field_float);
assert!(fast_field_reader_res.is_ok());
let fast_field_reader = fast_field_reader_res.unwrap();
assert_eq!(fast_field_reader.get_val(0), 4f64)
assert_eq!(fast_field_reader.get(0), 4f64)
}
Ok(())
}

View File

@@ -227,7 +227,7 @@ pub mod tests {
{
let mut segment_writer =
SegmentWriter::for_segment(3_000_000, segment.clone()).unwrap();
SegmentWriter::for_segment(3_000_000, segment.clone(), schema).unwrap();
{
// checking that position works if the field has two values
let op = AddOperation {

View File

@@ -116,7 +116,7 @@ pub(crate) struct IndexingPosition {
/// and building a `Segment` in anonymous memory.
///
/// `PostingsWriter` writes in a `MemoryArena`.
pub(crate) trait PostingsWriter: Send + Sync {
pub(crate) trait PostingsWriter {
/// Record that a document contains a term at a given position.
///
/// * doc - the document id

View File

@@ -56,7 +56,7 @@ impl<'a> Iterator for VInt32Reader<'a> {
/// * the document id
/// * the term frequency
/// * the term positions
pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static {
pub(crate) trait Recorder: Copy + Default + 'static {
/// Returns the current document
fn current_doc(&self) -> u32;
/// Starts recording information about a new document

View File

@@ -371,7 +371,7 @@ mod tests {
fn compute_checkpoints_manual(term_scorers: Vec<TermScorer>, n: usize) -> Vec<(DocId, Score)> {
let mut heap: BinaryHeap<Float> = BinaryHeap::with_capacity(n);
let mut checkpoints: Vec<(DocId, Score)> = Vec::new();
let mut scorer = Union::build(term_scorers, SumCombiner::default);
let mut scorer: Union<TermScorer, SumCombiner> = Union::from(term_scorers);
let mut limit = Score::MIN;
loop {

View File

@@ -1,5 +1,7 @@
use std::collections::BTreeMap;
use super::boolean_weight::BooleanWeight;
use crate::query::{Occur, Query, SumWithCoordsCombiner, TermQuery, Weight};
use crate::query::{Occur, Query, TermQuery, Weight};
use crate::schema::{IndexRecordOption, Term};
use crate::Searcher;
@@ -151,16 +153,12 @@ impl Query for BooleanQuery {
Ok((*occur, subquery.weight(searcher, scoring_enabled)?))
})
.collect::<crate::Result<_>>()?;
Ok(Box::new(BooleanWeight::new(
sub_weights,
scoring_enabled,
Box::new(SumWithCoordsCombiner::default),
)))
Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled)))
}
fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
for (_occur, subquery) in &self.subqueries {
subquery.query_terms(visitor);
subquery.query_terms(terms);
}
}
}

View File

@@ -3,7 +3,7 @@ use std::collections::HashMap;
use crate::core::SegmentReader;
use crate::postings::FreqReadingOption;
use crate::query::explanation::does_not_match;
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner};
use crate::query::term_query::TermScorer;
use crate::query::weight::{for_each_pruning_scorer, for_each_scorer};
use crate::query::{
@@ -17,13 +17,8 @@ enum SpecializedScorer {
Other(Box<dyn Scorer>),
}
fn scorer_union<TScoreCombiner>(
scorers: Vec<Box<dyn Scorer>>,
score_combiner_fn: impl Fn() -> TScoreCombiner,
) -> SpecializedScorer
where
TScoreCombiner: ScoreCombiner,
{
fn scorer_union<TScoreCombiner>(scorers: Vec<Box<dyn Scorer>>) -> SpecializedScorer
where TScoreCombiner: ScoreCombiner {
assert!(!scorers.is_empty());
if scorers.len() == 1 {
return SpecializedScorer::Other(scorers.into_iter().next().unwrap()); //< we checked the size beforehand
@@ -43,45 +38,35 @@ where
// Block wand is only available if we read frequencies.
return SpecializedScorer::TermUnion(scorers);
} else {
return SpecializedScorer::Other(Box::new(Union::build(
return SpecializedScorer::Other(Box::new(Union::<_, TScoreCombiner>::from(
scorers,
score_combiner_fn,
)));
}
}
}
SpecializedScorer::Other(Box::new(Union::build(scorers, score_combiner_fn)))
SpecializedScorer::Other(Box::new(Union::<_, TScoreCombiner>::from(scorers)))
}
fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
scorer: SpecializedScorer,
score_combiner_fn: impl Fn() -> TScoreCombiner,
) -> Box<dyn Scorer> {
fn into_box_scorer<TScoreCombiner: ScoreCombiner>(scorer: SpecializedScorer) -> Box<dyn Scorer> {
match scorer {
SpecializedScorer::TermUnion(term_scorers) => {
let union_scorer = Union::build(term_scorers, score_combiner_fn);
let union_scorer = Union::<TermScorer, TScoreCombiner>::from(term_scorers);
Box::new(union_scorer)
}
SpecializedScorer::Other(scorer) => scorer,
}
}
pub struct BooleanWeight<TScoreCombiner: ScoreCombiner> {
pub struct BooleanWeight {
weights: Vec<(Occur, Box<dyn Weight>)>,
scoring_enabled: bool,
score_combiner_fn: Box<dyn Fn() -> TScoreCombiner + Sync + Send>,
}
impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
pub fn new(
weights: Vec<(Occur, Box<dyn Weight>)>,
scoring_enabled: bool,
score_combiner_fn: Box<dyn Fn() -> TScoreCombiner + Sync + Send + 'static>,
) -> BooleanWeight<TScoreCombiner> {
impl BooleanWeight {
pub fn new(weights: Vec<(Occur, Box<dyn Weight>)>, scoring_enabled: bool) -> BooleanWeight {
BooleanWeight {
weights,
scoring_enabled,
score_combiner_fn,
}
}
@@ -101,23 +86,21 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
Ok(per_occur_scorers)
}
fn complex_scorer<TComplexScoreCombiner: ScoreCombiner>(
fn complex_scorer<TScoreCombiner: ScoreCombiner>(
&self,
reader: &SegmentReader,
boost: Score,
score_combiner_fn: impl Fn() -> TComplexScoreCombiner,
) -> crate::Result<SpecializedScorer> {
let mut per_occur_scorers = self.per_occur_scorers(reader, boost)?;
let should_scorer_opt: Option<SpecializedScorer> = per_occur_scorers
.remove(&Occur::Should)
.map(|scorers| scorer_union(scorers, &score_combiner_fn));
.map(scorer_union::<TScoreCombiner>);
let exclude_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
.remove(&Occur::MustNot)
.map(|scorers| scorer_union(scorers, DoNothingCombiner::default))
.map(|specialized_scorer| {
into_box_scorer(specialized_scorer, DoNothingCombiner::default)
});
.map(scorer_union::<DoNothingCombiner>)
.map(into_box_scorer::<DoNothingCombiner>);
let must_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
.remove(&Occur::Must)
@@ -129,10 +112,10 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
SpecializedScorer::Other(Box::new(RequiredOptionalScorer::<
Box<dyn Scorer>,
Box<dyn Scorer>,
TComplexScoreCombiner,
TScoreCombiner,
>::new(
must_scorer,
into_box_scorer(should_scorer, &score_combiner_fn),
into_box_scorer::<TScoreCombiner>(should_scorer),
)))
} else {
SpecializedScorer::Other(must_scorer)
@@ -146,7 +129,8 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
};
if let Some(exclude_scorer) = exclude_scorer_opt {
let positive_scorer_boxed = into_box_scorer(positive_scorer, &score_combiner_fn);
let positive_scorer_boxed: Box<dyn Scorer> =
into_box_scorer::<TScoreCombiner>(positive_scorer);
Ok(SpecializedScorer::Other(Box::new(Exclude::new(
positive_scorer_boxed,
exclude_scorer,
@@ -157,7 +141,7 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
}
}
impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombiner> {
impl Weight for BooleanWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
if self.weights.is_empty() {
Ok(Box::new(EmptyScorer))
@@ -169,15 +153,13 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
weight.scorer(reader, boost)
}
} else if self.scoring_enabled {
self.complex_scorer(reader, boost, &self.score_combiner_fn)
self.complex_scorer::<SumWithCoordsCombiner>(reader, boost)
.map(|specialized_scorer| {
into_box_scorer(specialized_scorer, &self.score_combiner_fn)
into_box_scorer::<SumWithCoordsCombiner>(specialized_scorer)
})
} else {
self.complex_scorer(reader, boost, &DoNothingCombiner::default)
.map(|specialized_scorer| {
into_box_scorer(specialized_scorer, &DoNothingCombiner::default)
})
self.complex_scorer::<DoNothingCombiner>(reader, boost)
.map(into_box_scorer::<DoNothingCombiner>)
}
}
@@ -206,10 +188,11 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score),
) -> crate::Result<()> {
let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?;
let scorer = self.complex_scorer::<SumWithCoordsCombiner>(reader, 1.0)?;
match scorer {
SpecializedScorer::TermUnion(term_scorers) => {
let mut union_scorer = Union::build(term_scorers, &self.score_combiner_fn);
let mut union_scorer =
Union::<TermScorer, SumWithCoordsCombiner>::from(term_scorers);
for_each_scorer(&mut union_scorer, callback);
}
SpecializedScorer::Other(mut scorer) => {
@@ -235,7 +218,7 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score) -> Score,
) -> crate::Result<()> {
let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?;
let scorer = self.complex_scorer::<SumWithCoordsCombiner>(reader, 1.0)?;
match scorer {
SpecializedScorer::TermUnion(term_scorers) => {
super::block_wand(term_scorers, threshold, callback);

View File

@@ -4,7 +4,6 @@ mod boolean_weight;
pub(crate) use self::block_wand::{block_wand, block_wand_single_scorer};
pub use self::boolean_query::BooleanQuery;
pub(crate) use self::boolean_weight::BooleanWeight;
#[cfg(test)]
mod tests {

View File

@@ -1,3 +1,4 @@
use std::collections::BTreeMap;
use std::fmt;
use crate::fastfield::AliveBitSet;
@@ -48,8 +49,8 @@ impl Query for BoostQuery {
Ok(boosted_weight)
}
fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {
self.query.query_terms(visitor)
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
self.query.query_terms(terms)
}
}

View File

@@ -1,176 +0,0 @@
use std::fmt;
use crate::query::{Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Score, Searcher, SegmentReader, TantivyError, Term};
/// `ConstScoreQuery` is a wrapper over a query to provide a constant score.
/// It can avoid unnecessary score computation on the wrapped query.
///
/// The document set matched by the `ConstScoreQuery` is strictly the same as the underlying query.
/// The configured score is used for each document.
pub struct ConstScoreQuery {
query: Box<dyn Query>,
score: Score,
}
impl ConstScoreQuery {
/// Builds a const score query.
pub fn new(query: Box<dyn Query>, score: Score) -> ConstScoreQuery {
ConstScoreQuery { query, score }
}
}
impl Clone for ConstScoreQuery {
fn clone(&self) -> Self {
ConstScoreQuery {
query: self.query.box_clone(),
score: self.score,
}
}
}
impl fmt::Debug for ConstScoreQuery {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "Const(score={}, query={:?})", self.score, self.query)
}
}
impl Query for ConstScoreQuery {
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> crate::Result<Box<dyn Weight>> {
let inner_weight = self.query.weight(searcher, scoring_enabled)?;
Ok(if scoring_enabled {
Box::new(ConstWeight::new(inner_weight, self.score))
} else {
inner_weight
})
}
fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {
self.query.query_terms(visitor);
}
}
struct ConstWeight {
weight: Box<dyn Weight>,
score: Score,
}
impl ConstWeight {
pub fn new(weight: Box<dyn Weight>, score: Score) -> Self {
ConstWeight { weight, score }
}
}
impl Weight for ConstWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let inner_scorer = self.weight.scorer(reader, boost)?;
Ok(Box::new(ConstScorer::new(inner_scorer, boost * self.score)))
}
fn explain(&self, reader: &SegmentReader, doc: u32) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(TantivyError::InvalidArgument(format!(
"Document #({}) does not match",
doc
)));
}
let mut explanation = Explanation::new("Const", self.score);
let underlying_explanation = self.weight.explain(reader, doc)?;
explanation.add_detail(underlying_explanation);
Ok(explanation)
}
fn count(&self, reader: &SegmentReader) -> crate::Result<u32> {
self.weight.count(reader)
}
}
/// Wraps a `DocSet` and simply returns a constant `Scorer`.
/// The `ConstScorer` is useful if you have a `DocSet` where
/// you needed a scorer.
///
/// The `ConstScorer`'s constant score can be set
/// by calling `.set_score(...)`.
pub struct ConstScorer<TDocSet: DocSet> {
docset: TDocSet,
score: Score,
}
impl<TDocSet: DocSet> ConstScorer<TDocSet> {
/// Creates a new `ConstScorer`.
pub fn new(docset: TDocSet, score: Score) -> ConstScorer<TDocSet> {
ConstScorer { docset, score }
}
}
impl<TDocSet: DocSet> From<TDocSet> for ConstScorer<TDocSet> {
fn from(docset: TDocSet) -> Self {
ConstScorer::new(docset, 1.0)
}
}
impl<TDocSet: DocSet> DocSet for ConstScorer<TDocSet> {
fn advance(&mut self) -> DocId {
self.docset.advance()
}
fn seek(&mut self, target: DocId) -> DocId {
self.docset.seek(target)
}
fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize {
self.docset.fill_buffer(buffer)
}
fn doc(&self) -> DocId {
self.docset.doc()
}
fn size_hint(&self) -> u32 {
self.docset.size_hint()
}
}
impl<TDocSet: DocSet + 'static> Scorer for ConstScorer<TDocSet> {
fn score(&mut self) -> Score {
self.score
}
}
#[cfg(test)]
mod tests {
use super::ConstScoreQuery;
use crate::query::{AllQuery, Query};
use crate::schema::Schema;
use crate::{DocAddress, Document, Index};
#[test]
fn test_const_score_query_explain() -> crate::Result<()> {
let schema = Schema::builder().build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(Document::new())?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let query = ConstScoreQuery::new(Box::new(AllQuery), 0.42);
let explanation = query.explain(&searcher, DocAddress::new(0, 0u32)).unwrap();
assert_eq!(
explanation.to_pretty_json(),
r#"{
"value": 0.42,
"description": "Const",
"details": [
{
"value": 1.0,
"description": "AllQuery",
"context": []
}
],
"context": []
}"#
);
Ok(())
}
}

View File

@@ -1,131 +0,0 @@
use tantivy_query_grammar::Occur;
use crate::query::{BooleanWeight, DisjunctionMaxCombiner, Query, Weight};
use crate::{Score, Searcher, Term};
/// The disjunction max query кeturns documents matching one or more wrapped queries,
/// called query clauses or clauses.
///
/// If a returned document matches multiple query clauses,
/// the `DisjunctionMaxQuery` assigns the document the highest relevance score from any matching
/// clause, plus a tie breaking increment for any additional matching subqueries.
///
/// ```rust
/// use tantivy::collector::TopDocs;
/// use tantivy::doc;
/// use tantivy::query::{DisjunctionMaxQuery, Query, QueryClone, TermQuery};
/// use tantivy::schema::{IndexRecordOption, Schema, TEXT};
/// use tantivy::Term;
/// use tantivy::Index;
///
/// fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let body = schema_builder.add_text_field("body", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of Girl",
/// ))?;
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ))?;
/// index_writer.add_document(doc!(
/// title => "The Diary of Girl",
/// ))?;
/// index_writer.commit()?;
/// }
///
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// // Make TermQuery's for "girl" and "diary" in the title
/// let girl_term_query: Box<dyn Query> = Box::new(TermQuery::new(
/// Term::from_field_text(title, "girl"),
/// IndexRecordOption::Basic,
/// ));
/// let diary_term_query: Box<dyn Query> = Box::new(TermQuery::new(
/// Term::from_field_text(title, "diary"),
/// IndexRecordOption::Basic,
/// ));
///
/// // TermQuery "diary" and "girl" should be present and only one should be accounted in score
/// let queries1 = vec![diary_term_query.box_clone(), girl_term_query.box_clone()];
/// let diary_and_girl = DisjunctionMaxQuery::new(queries1);
/// let documents = searcher.search(&diary_and_girl, &TopDocs::with_limit(3))?;
/// assert_eq!(documents[0].0, documents[1].0);
/// assert_eq!(documents[1].0, documents[2].0);
///
/// // TermQuery "diary" and "girl" should be present
/// // and one should be accounted with multiplier 0.7
/// let queries2 = vec![diary_term_query.box_clone(), girl_term_query.box_clone()];
/// let tie_breaker = 0.7;
/// let diary_and_girl_with_tie_breaker = DisjunctionMaxQuery::with_tie_breaker(queries2, tie_breaker);
/// let documents = searcher.search(&diary_and_girl_with_tie_breaker, &TopDocs::with_limit(3))?;
/// assert_eq!(documents[1].0, documents[2].0);
/// // For this test all terms brings the same score. So we can do easy math and assume that
/// // `DisjunctionMaxQuery` with tie breakers score should be equal
/// // to term1 score + `tie_breaker` * term2 score or (1.0 + tie_breaker) * term score
/// assert!(f32::abs(documents[0].0 - documents[1].0 * (1.0 + tie_breaker)) < 0.001);
/// Ok(())
/// }
/// ```
#[derive(Debug)]
pub struct DisjunctionMaxQuery {
disjuncts: Vec<Box<dyn Query>>,
tie_breaker: Score,
}
impl Clone for DisjunctionMaxQuery {
fn clone(&self) -> Self {
DisjunctionMaxQuery::with_tie_breaker(
self.disjuncts
.iter()
.map(|disjunct| disjunct.box_clone())
.collect::<Vec<_>>(),
self.tie_breaker,
)
}
}
impl Query for DisjunctionMaxQuery {
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> crate::Result<Box<dyn Weight>> {
let disjuncts = self
.disjuncts
.iter()
.map(|disjunct| Ok((Occur::Should, disjunct.weight(searcher, scoring_enabled)?)))
.collect::<crate::Result<_>>()?;
let tie_breaker = self.tie_breaker;
Ok(Box::new(BooleanWeight::new(
disjuncts,
scoring_enabled,
Box::new(move || DisjunctionMaxCombiner::with_tie_breaker(tie_breaker)),
)))
}
fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {
for disjunct in &self.disjuncts {
disjunct.query_terms(visitor);
}
}
}
impl DisjunctionMaxQuery {
/// Creates a new `DisjunctionMaxQuery` with tie breaker.
pub fn with_tie_breaker(
disjuncts: Vec<Box<dyn Query>>,
tie_breaker: Score,
) -> DisjunctionMaxQuery {
DisjunctionMaxQuery {
disjuncts,
tie_breaker,
}
}
/// Creates a new `DisjunctionMaxQuery` with no tie breaker.
pub fn new(disjuncts: Vec<Box<dyn Query>>) -> DisjunctionMaxQuery {
DisjunctionMaxQuery::with_tie_breaker(disjuncts, 0.0)
}
}

View File

@@ -6,8 +6,6 @@ mod bitset;
mod bm25;
mod boolean_query;
mod boost_query;
mod const_score_query;
mod disjunction_max_query;
mod empty_query;
mod exclude;
mod explanation;
@@ -36,10 +34,7 @@ pub use self::automaton_weight::AutomatonWeight;
pub use self::bitset::BitSetDocSet;
pub(crate) use self::bm25::Bm25Weight;
pub use self::boolean_query::BooleanQuery;
pub(crate) use self::boolean_query::BooleanWeight;
pub use self::boost_query::BoostQuery;
pub use self::const_score_query::{ConstScoreQuery, ConstScorer};
pub use self::disjunction_max_query::DisjunctionMaxQuery;
pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight};
pub use self::exclude::Exclude;
pub use self::explanation::Explanation;
@@ -54,10 +49,7 @@ pub use self::query_parser::{QueryParser, QueryParserError};
pub use self::range_query::RangeQuery;
pub use self::regex_query::RegexQuery;
pub use self::reqopt_scorer::RequiredOptionalScorer;
pub use self::score_combiner::{
DisjunctionMaxCombiner, ScoreCombiner, SumCombiner, SumWithCoordsCombiner,
};
pub use self::scorer::Scorer;
pub use self::scorer::{ConstScorer, Scorer};
pub use self::term_query::TermQuery;
pub use self::union::Union;
#[cfg(test)]
@@ -66,6 +58,8 @@ pub use self::weight::Weight;
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
use crate::query::QueryParser;
use crate::schema::{Schema, TEXT};
use crate::{Index, Term};
@@ -80,34 +74,49 @@ mod tests {
let term_a = Term::from_field_text(text_field, "a");
let term_b = Term::from_field_text(text_field, "b");
{
let query = query_parser.parse_query("a").unwrap();
let mut terms = Vec::new();
query.query_terms(&mut |term, pos| terms.push((term, pos)));
assert_eq!(vec![(&term_a, false)], terms);
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("a")
.unwrap()
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &false)], terms);
}
{
let query = query_parser.parse_query("a b").unwrap();
let mut terms = Vec::new();
query.query_terms(&mut |term, pos| terms.push((term, pos)));
assert_eq!(vec![(&term_a, false), (&term_b, false)], terms);
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("a b")
.unwrap()
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &false), (&term_b, &false)], terms);
}
{
let query = query_parser.parse_query("\"a b\"").unwrap();
let mut terms = Vec::new();
query.query_terms(&mut |term, pos| terms.push((term, pos)));
assert_eq!(vec![(&term_a, true), (&term_b, true)], terms);
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("\"a b\"")
.unwrap()
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &true), (&term_b, &true)], terms);
}
{
let query = query_parser.parse_query("a a a a a").unwrap();
let mut terms = Vec::new();
query.query_terms(&mut |term, pos| terms.push((term, pos)));
assert_eq!(vec![(&term_a, false); 5], terms);
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("a a a a a")
.unwrap()
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &false)], terms);
}
{
let query = query_parser.parse_query("a -b").unwrap();
let mut terms = Vec::new();
query.query_terms(&mut |term, pos| terms.push((term, pos)));
assert_eq!(vec![(&term_a, false), (&term_b, false)], terms);
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("a -b")
.unwrap()
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &false), (&term_b, &false)], terms);
}
}
}

View File

@@ -1,3 +1,5 @@
use std::collections::BTreeMap;
use super::PhraseWeight;
use crate::core::searcher::Searcher;
use crate::query::bm25::Bm25Weight;
@@ -127,9 +129,9 @@ impl Query for PhraseQuery {
Ok(Box::new(phrase_weight))
}
fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
for (_, term) in &self.phrase_terms {
visitor(term, true);
terms.insert(term.clone(), true);
}
}
}

View File

@@ -1,3 +1,4 @@
use std::collections::BTreeMap;
use std::fmt;
use downcast_rs::impl_downcast;
@@ -66,15 +67,12 @@ pub trait Query: QueryClone + Send + Sync + downcast_rs::Downcast + fmt::Debug {
Ok(result)
}
/// Extract all of the terms associated to the query and pass them to the
/// given closure.
/// Extract all of the terms associated to the query and insert them in the
/// term set given in arguments.
///
/// Each term is associated with a boolean indicating whether
/// positions are required or not.
///
/// Note that there can be multiple instances of any given term
/// in a query and deduplication must be handled by the visitor.
fn query_terms<'a>(&'a self, _visitor: &mut dyn FnMut(&'a Term, bool)) {}
/// Positions are required or not.
fn query_terms(&self, _term_set: &mut BTreeMap<Term, bool>) {}
}
/// Implements `box_clone`.
@@ -100,8 +98,8 @@ impl Query for Box<dyn Query> {
self.as_ref().count(searcher)
}
fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {
self.as_ref().query_terms(visitor);
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
self.as_ref().query_terms(terms);
}
}

View File

@@ -184,7 +184,7 @@ pub struct QueryParser {
fn all_negative(ast: &LogicalAst) -> bool {
match ast {
LogicalAst::Leaf(_) => false,
LogicalAst::Boost(ref child_ast, _) => all_negative(child_ast),
LogicalAst::Boost(ref child_ast, _) => all_negative(&*child_ast),
LogicalAst::Clause(children) => children
.iter()
.all(|(ref occur, child)| (*occur == Occur::MustNot) || all_negative(child)),

View File

@@ -77,40 +77,3 @@ impl ScoreCombiner for SumWithCoordsCombiner {
self.score
}
}
/// Take max score of different scorers
/// and optionally sum it with other matches multiplied by `tie_breaker`
#[derive(Default, Clone, Copy)]
pub struct DisjunctionMaxCombiner {
max: Score,
sum: Score,
tie_breaker: Score,
}
impl DisjunctionMaxCombiner {
/// Creates `DisjunctionMaxCombiner` with tie breaker
pub fn with_tie_breaker(tie_breaker: Score) -> DisjunctionMaxCombiner {
DisjunctionMaxCombiner {
max: 0.0,
sum: 0.0,
tie_breaker,
}
}
}
impl ScoreCombiner for DisjunctionMaxCombiner {
fn update<TScorer: Scorer>(&mut self, scorer: &mut TScorer) {
let score = scorer.score();
self.max = Score::max(score, self.max);
self.sum += score;
}
fn clear(&mut self) {
self.max = 0.0;
self.sum = 0.0;
}
fn score(&self) -> Score {
self.max + (self.sum - self.max) * self.tie_breaker
}
}

View File

@@ -3,7 +3,7 @@ use std::ops::DerefMut;
use downcast_rs::impl_downcast;
use crate::docset::DocSet;
use crate::Score;
use crate::{DocId, Score};
/// Scored set of documents matching a query within a specific segment.
///
@@ -22,3 +22,55 @@ impl Scorer for Box<dyn Scorer> {
self.deref_mut().score()
}
}
/// Wraps a `DocSet` and simply returns a constant `Scorer`.
/// The `ConstScorer` is useful if you have a `DocSet` where
/// you needed a scorer.
///
/// The `ConstScorer`'s constant score can be set
/// by calling `.set_score(...)`.
pub struct ConstScorer<TDocSet: DocSet> {
docset: TDocSet,
score: Score,
}
impl<TDocSet: DocSet> ConstScorer<TDocSet> {
/// Creates a new `ConstScorer`.
pub fn new(docset: TDocSet, score: Score) -> ConstScorer<TDocSet> {
ConstScorer { docset, score }
}
}
impl<TDocSet: DocSet> From<TDocSet> for ConstScorer<TDocSet> {
fn from(docset: TDocSet) -> Self {
ConstScorer::new(docset, 1.0)
}
}
impl<TDocSet: DocSet> DocSet for ConstScorer<TDocSet> {
fn advance(&mut self) -> DocId {
self.docset.advance()
}
fn seek(&mut self, target: DocId) -> DocId {
self.docset.seek(target)
}
fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize {
self.docset.fill_buffer(buffer)
}
fn doc(&self) -> DocId {
self.docset.doc()
}
fn size_hint(&self) -> u32 {
self.docset.size_hint()
}
}
impl<TDocSet: DocSet + 'static> Scorer for ConstScorer<TDocSet> {
fn score(&mut self) -> Score {
self.score
}
}

View File

@@ -1,3 +1,4 @@
use std::collections::BTreeMap;
use std::fmt;
use super::term_weight::TermWeight;
@@ -120,7 +121,7 @@ impl Query for TermQuery {
self.specialized_weight(searcher, scoring_enabled)?,
))
}
fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {
visitor(&self.term, false);
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
terms.insert(self.term.clone(), false);
}
}

View File

@@ -36,6 +36,34 @@ pub struct Union<TScorer, TScoreCombiner = DoNothingCombiner> {
score: Score,
}
impl<TScorer, TScoreCombiner> From<Vec<TScorer>> for Union<TScorer, TScoreCombiner>
where
TScoreCombiner: ScoreCombiner,
TScorer: Scorer,
{
fn from(docsets: Vec<TScorer>) -> Union<TScorer, TScoreCombiner> {
let non_empty_docsets: Vec<TScorer> = docsets
.into_iter()
.filter(|docset| docset.doc() != TERMINATED)
.collect();
let mut union = Union {
docsets: non_empty_docsets,
bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]),
scores: Box::new([TScoreCombiner::default(); HORIZON as usize]),
cursor: HORIZON_NUM_TINYBITSETS,
offset: 0,
doc: 0,
score: 0.0,
};
if union.refill() {
union.advance();
} else {
union.doc = TERMINATED;
}
union
}
}
fn refill<TScorer: Scorer, TScoreCombiner: ScoreCombiner>(
scorers: &mut Vec<TScorer>,
bitsets: &mut [TinySet; HORIZON_NUM_TINYBITSETS],
@@ -62,31 +90,6 @@ fn refill<TScorer: Scorer, TScoreCombiner: ScoreCombiner>(
}
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Union<TScorer, TScoreCombiner> {
pub(crate) fn build(
docsets: Vec<TScorer>,
score_combiner_fn: impl FnOnce() -> TScoreCombiner,
) -> Union<TScorer, TScoreCombiner> {
let non_empty_docsets: Vec<TScorer> = docsets
.into_iter()
.filter(|docset| docset.doc() != TERMINATED)
.collect();
let mut union = Union {
docsets: non_empty_docsets,
bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]),
scores: Box::new([score_combiner_fn(); HORIZON as usize]),
cursor: HORIZON_NUM_TINYBITSETS,
offset: 0,
doc: 0,
score: 0.0,
};
if union.refill() {
union.advance();
} else {
union.doc = TERMINATED;
}
union
}
fn refill(&mut self) -> bool {
if let Some(min_doc) = self.docsets.iter().map(DocSet::doc).min() {
self.offset = min_doc;
@@ -176,6 +179,7 @@ where
// The target is outside of the buffered horizon.
// advance all docsets to a doc >= to the target.
#[cfg_attr(feature = "cargo-clippy", allow(clippy::clippy::collapsible_if))]
unordered_drain_filter(&mut self.docsets, |docset| {
if docset.doc() < target {
docset.seek(target);
@@ -262,13 +266,12 @@ mod tests {
let union_vals: Vec<u32> = val_set.into_iter().collect();
let mut union_expected = VecDocSet::from(union_vals);
let make_union = || {
Union::build(
Union::from(
vals.iter()
.cloned()
.map(VecDocSet::from)
.map(|docset| ConstScorer::new(docset, 1.0))
.collect::<Vec<ConstScorer<VecDocSet>>>(),
DoNothingCombiner::default,
)
};
let mut union: Union<_, DoNothingCombiner> = make_union();
@@ -309,14 +312,13 @@ mod tests {
btree_set.extend(docs.iter().cloned());
}
let docset_factory = || {
let res: Box<dyn DocSet> = Box::new(Union::build(
let res: Box<dyn DocSet> = Box::new(Union::<_, DoNothingCombiner>::from(
docs_list
.iter()
.cloned()
.map(VecDocSet::from)
.map(|docset| ConstScorer::new(docset, 1.0))
.collect::<Vec<_>>(),
DoNothingCombiner::default,
));
res
};
@@ -344,13 +346,10 @@ mod tests {
#[test]
fn test_union_skip_corner_case3() {
let mut docset = Union::build(
vec![
ConstScorer::from(VecDocSet::from(vec![0u32, 5u32])),
ConstScorer::from(VecDocSet::from(vec![1u32, 4u32])),
],
DoNothingCombiner::default,
);
let mut docset = Union::<_, DoNothingCombiner>::from(vec![
ConstScorer::from(VecDocSet::from(vec![0u32, 5u32])),
ConstScorer::from(VecDocSet::from(vec![1u32, 4u32])),
]);
assert_eq!(docset.doc(), 0u32);
assert_eq!(docset.seek(0u32), 0u32);
assert_eq!(docset.seek(0u32), 0u32);
@@ -406,13 +405,12 @@ mod bench {
tests::sample_with_seed(100_000, 0.2, 1),
];
bench.iter(|| {
let mut v = Union::build(
let mut v = Union::<_, DoNothingCombiner>::from(
union_docset
.iter()
.map(|doc_ids| VecDocSet::from(doc_ids.clone()))
.map(|docset| ConstScorer::new(docset, 1.0))
.collect::<Vec<_>>(),
DoNothingCombiner::default,
);
while v.doc() != TERMINATED {
v.advance();
@@ -427,13 +425,12 @@ mod bench {
tests::sample_with_seed(100_000, 0.001, 2),
];
bench.iter(|| {
let mut v = Union::build(
let mut v = Union::<_, DoNothingCombiner>::from(
union_docset
.iter()
.map(|doc_ids| VecDocSet::from(doc_ids.clone()))
.map(|docset| ConstScorer::new(docset, 1.0))
.collect::<Vec<_>>(),
DoNothingCombiner::default,
);
while v.doc() != TERMINATED {
v.advance();

View File

@@ -164,18 +164,21 @@ impl InnerIndexReader {
doc_store_cache_size: usize,
index: Index,
warming_state: WarmingState,
// The searcher_generation_inventory is not used as source, but as target to track the
// loaded segments.
searcher_generation_inventory: Inventory<SearcherGeneration>,
) -> crate::Result<Self> {
let searcher_generation_counter: Arc<AtomicU64> = Default::default();
let segment_readers = Self::open_segment_readers(&index)?;
let searcher_generation = Self::create_new_searcher_generation(
&segment_readers,
&searcher_generation_counter,
&searcher_generation_inventory,
);
let searcher = Self::create_searcher(
&index,
doc_store_cache_size,
&warming_state,
&searcher_generation_counter,
&searcher_generation_inventory,
searcher_generation,
)?;
Ok(InnerIndexReader {
doc_store_cache_size,
@@ -201,12 +204,12 @@ impl InnerIndexReader {
Ok(segment_readers)
}
fn track_segment_readers_in_inventory(
fn create_new_searcher_generation(
segment_readers: &[SegmentReader],
searcher_generation_counter: &Arc<AtomicU64>,
searcher_generation_inventory: &Inventory<SearcherGeneration>,
) -> TrackedObject<SearcherGeneration> {
let generation_id = searcher_generation_counter.fetch_add(1, atomic::Ordering::AcqRel);
let generation_id = searcher_generation_counter.fetch_add(1, atomic::Ordering::Relaxed);
let searcher_generation =
SearcherGeneration::from_segment_readers(segment_readers, generation_id);
searcher_generation_inventory.track(searcher_generation)
@@ -216,16 +219,9 @@ impl InnerIndexReader {
index: &Index,
doc_store_cache_size: usize,
warming_state: &WarmingState,
searcher_generation_counter: &Arc<AtomicU64>,
searcher_generation_inventory: &Inventory<SearcherGeneration>,
searcher_generation: TrackedObject<SearcherGeneration>,
) -> crate::Result<Arc<SearcherInner>> {
let segment_readers = Self::open_segment_readers(index)?;
let searcher_generation = Self::track_segment_readers_in_inventory(
&segment_readers,
searcher_generation_counter,
searcher_generation_inventory,
);
let schema = index.schema();
let searcher = Arc::new(SearcherInner::new(
schema,
@@ -240,12 +236,17 @@ impl InnerIndexReader {
}
fn reload(&self) -> crate::Result<()> {
let segment_readers = Self::open_segment_readers(&self.index)?;
let searcher_generation = Self::create_new_searcher_generation(
&segment_readers,
&self.searcher_generation_counter,
&self.searcher_generation_inventory,
);
let searcher = Self::create_searcher(
&self.index,
self.doc_store_cache_size,
&self.warming_state,
&self.searcher_generation_counter,
&self.searcher_generation_inventory,
searcher_generation,
)?;
self.searcher.store(searcher);

View File

@@ -7,7 +7,6 @@ use std::string::FromUtf8Error;
use common::BinarySerializable;
use once_cell::sync::Lazy;
use regex::Regex;
use serde::de::Error as _;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
const SLASH_BYTE: u8 = b'/';
@@ -231,9 +230,7 @@ impl Serialize for Facet {
impl<'de> Deserialize<'de> for Facet {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de> {
<Cow<'de, str> as Deserialize<'de>>::deserialize(deserializer).and_then(|path| {
Facet::from_text(&*path).map_err(|err| D::Error::custom(err.to_string()))
})
<&'de str as Deserialize<'de>>::deserialize(deserializer).map(Facet::from)
}
}
@@ -330,25 +327,4 @@ mod tests {
assert!(Facet::from("/").is_prefix_of(&Facet::from("/foobar")));
assert!(!Facet::from("/").is_prefix_of(&Facet::from("/")));
}
#[test]
fn deserialize_from_borrowed_string() {
let facet = serde_json::from_str::<Facet>(r#""/foo/bar""#).unwrap();
assert_eq!(facet, Facet::from_path(["foo", "bar"]));
}
#[test]
fn deserialize_from_owned_string() {
let facet = serde_json::from_str::<Facet>(r#""/foo/\u263A""#).unwrap();
assert_eq!(facet, Facet::from_path(["foo", ""]));
}
#[test]
fn deserialize_from_invalid_string() {
let error = serde_json::from_str::<Facet>(r#""foo/bar""#).unwrap_err();
assert_eq!(
error.to_string(),
"Failed to parse the facet string: 'foo/bar'"
);
}
}

View File

@@ -1,5 +1,5 @@
use std::cmp::Ordering;
use std::collections::{BTreeMap, BTreeSet};
use std::collections::BTreeMap;
use std::ops::Range;
use htmlescape::encode_minimal;
@@ -7,7 +7,7 @@ use htmlescape::encode_minimal;
use crate::query::Query;
use crate::schema::{Field, Value};
use crate::tokenizer::{TextAnalyzer, Token};
use crate::{Document, Score, Searcher, Term};
use crate::{Document, Score, Searcher};
const DEFAULT_MAX_NUM_CHARS: usize = 150;
@@ -79,7 +79,7 @@ impl Snippet {
let mut html = String::new();
let mut start_from: usize = 0;
for item in collapse_overlapped_ranges(&self.highlighted) {
for item in self.highlighted.iter() {
html.push_str(&encode_minimal(&self.fragment[start_from..item.start]));
html.push_str(HIGHLIGHTEN_PREFIX);
html.push_str(&encode_minimal(&self.fragment[item.clone()]));
@@ -186,53 +186,6 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
}
}
/// Returns ranges that are collapsed into non-overlapped ranges.
///
/// ## Examples
/// - [0..1, 2..3] -> [0..1, 2..3] # no overlap
/// - [0..1, 1..2] -> [0..1, 1..2] # no overlap
/// - [0..2, 1..2] -> [0..2] # collapsed
/// - [0..2, 1..3] -> [0..3] # collapsed
/// - [0..3, 1..2] -> [0..3] # second range's end is also inside of the first range
///
/// Note: This function assumes `ranges` is sorted by `Range.start` in ascending order.
fn collapse_overlapped_ranges(ranges: &[Range<usize>]) -> Vec<Range<usize>> {
debug_assert!(is_sorted(ranges.iter().map(|range| range.start)));
let mut result = Vec::new();
let mut ranges_it = ranges.iter();
let mut current = match ranges_it.next() {
Some(range) => range.clone(),
None => return result,
};
for range in ranges {
if current.end > range.start {
current = current.start..std::cmp::max(current.end, range.end);
} else {
result.push(current);
current = range.clone();
}
}
result.push(current);
result
}
fn is_sorted(mut it: impl Iterator<Item = usize>) -> bool {
if let Some(first) = it.next() {
let mut prev = first;
for item in it {
if item < prev {
return false;
}
prev = item;
}
}
true
}
/// `SnippetGenerator`
///
/// # Example
@@ -302,20 +255,19 @@ impl SnippetGenerator {
query: &dyn Query,
field: Field,
) -> crate::Result<SnippetGenerator> {
let mut terms: BTreeSet<&Term> = BTreeSet::new();
query.query_terms(&mut |term, _| {
if term.field() == field {
terms.insert(term);
}
});
let mut terms = BTreeMap::new();
query.query_terms(&mut terms);
let mut terms_text: BTreeMap<String, Score> = Default::default();
for term in terms {
for (term, _) in terms {
if term.field() != field {
continue;
}
let term_str = if let Some(term_str) = term.as_str() {
term_str
} else {
continue;
};
let doc_freq = searcher.doc_freq(term)?;
let doc_freq = searcher.doc_freq(&term)?;
if doc_freq > 0 {
let score = 1.0 / (1.0 + doc_freq as Score);
terms_text.insert(term_str.to_string(), score);
@@ -367,10 +319,10 @@ mod tests {
use maplit::btreemap;
use super::{collapse_overlapped_ranges, search_fragments, select_best_fragment_combination};
use super::{search_fragments, select_best_fragment_combination};
use crate::query::QueryParser;
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT};
use crate::tokenizer::{NgramTokenizer, SimpleTokenizer};
use crate::tokenizer::SimpleTokenizer;
use crate::{Index, SnippetGenerator};
const TEST_TEXT: &str = r#"Rust is a systems programming language sponsored by
@@ -635,44 +587,4 @@ Survey in 2016, 2017, and 2018."#;
}
Ok(())
}
#[test]
fn test_collapse_overlapped_ranges() {
assert_eq!(&collapse_overlapped_ranges(&[0..1, 2..3,]), &[0..1, 2..3]);
assert_eq!(
collapse_overlapped_ranges(&vec![0..1, 1..2,]),
vec![0..1, 1..2]
);
assert_eq!(collapse_overlapped_ranges(&[0..2, 1..2,]), vec![0..2]);
assert_eq!(collapse_overlapped_ranges(&[0..2, 1..3,]), vec![0..3]);
assert_eq!(collapse_overlapped_ranges(&[0..3, 1..2,]), vec![0..3]);
}
#[test]
fn test_snippet_with_overlapped_highlighted_ranges() {
let text = "abc";
let mut terms = BTreeMap::new();
terms.insert(String::from("ab"), 0.9);
terms.insert(String::from("bc"), 1.0);
let fragments = search_fragments(
&From::from(NgramTokenizer::all_ngrams(2, 2)),
text,
&terms,
3,
);
assert_eq!(fragments.len(), 1);
{
let first = &fragments[0];
assert_eq!(first.score, 1.9);
assert_eq!(first.start_offset, 0);
assert_eq!(first.stop_offset, 3);
}
let snippet = select_best_fragment_combination(&fragments[..], text);
assert_eq!(snippet.fragment, "abc");
assert_eq!(snippet.to_html(), "<b>abc</b>");
}
}