Compare commits

..

13 Commits

Author SHA1 Message Date
Paul Masurel 8303bd8e6d Fixed num_vals 2023-01-19 11:30:45 +09:00
Paul Masurel 1e27387c0f Date fast field working. Precision still broken 2023-01-18 18:23:21 +09:00
Paul Masurel ae28aab67c Demuxer 2023-01-18 16:13:21 +09:00
Paul Masurel 8bedcab91d blop 2023-01-18 16:13:21 +09:00
Paul Masurel 23ace8bd44 connected datetime 2023-01-18 16:13:21 +09:00
Paul Masurel 58eec2c214 blop 2023-01-18 16:13:21 +09:00
Paul Masurel 7587656f1e Sisiphe work 2023-01-18 16:13:21 +09:00
Paul Masurel e435b6fdd1 Disconnected facet / fast field merges / examples 2023-01-18 16:13:21 +09:00
Paul Masurel 007168ff4c Changed add_document 2023-01-18 16:13:21 +09:00
Paul Masurel 29c1a76d5a Removed cardinality from fast field options. 2023-01-18 16:13:21 +09:00
Paul Masurel a0c1ba46c7 added columnar to workspace 2023-01-18 16:13:21 +09:00
Paul Masurel 5110ee7456 Removing cardinality. 2023-01-18 16:13:21 +09:00
Paul Masurel 575931bee4 Added solution to force the type of a column. 2023-01-18 16:13:21 +09:00
115 changed files with 3226 additions and 5329 deletions

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.19.1-quickwit"
version = "0.19.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -23,7 +23,7 @@ regex = { version = "1.5.5", default-features = false, features = ["std", "unico
aho-corasick = "0.7"
tantivy-fst = "0.4.0"
memmap2 = { version = "0.5.3", optional = true }
lz4_flex = { version = "0.10", default-features = false, features = ["checked-decode"], optional = true }
lz4_flex = { version = "0.9.2", default-features = false, features = ["checked-decode"], optional = true }
brotli = { version = "3.3.4", optional = true }
zstd = { version = "0.12", optional = true, default-features = false }
snap = { version = "1.0.5", optional = true }
@@ -55,11 +55,11 @@ measure_time = "0.8.2"
async-trait = "0.1.53"
arc-swap = "1.5.0"
#columnar = { version="0.1", path="./columnar", package ="tantivy-columnar" }
sstable = { version="0.1", path="./sstable", package ="tantivy-sstable", optional = true }
stacker = { version="0.1", path="./stacker", package ="tantivy-stacker" }
tantivy-query-grammar = { version= "0.19.0", path="./query-grammar" }
tantivy-bitpacker = { version= "0.3", path="./bitpacker" }
columnar = { version= "0.1", path="./columnar", package="tantivy-columnar" }
common = { version= "0.5", path = "./common/", package = "tantivy-common" }
fastfield_codecs = { version= "0.3", path="./fastfield_codecs", default-features = false }
tokenizer-api = { version="0.1", path="./tokenizer-api", package="tantivy-tokenizer-api" }

View File

@@ -41,7 +41,7 @@ Your mileage WILL vary depending on the nature of queries and their load.
- SIMD integer compression when the platform/CPU includes the SSE2 instruction set
- Single valued and multivalued u64, i64, and f64 fast fields (equivalent of doc values in Lucene)
- `&[u8]` fast fields
- Text, i64, u64, f64, dates, ip, bool, and hierarchical facet fields
- Text, i64, u64, f64, dates, and hierarchical facet fields
- Compressed document store (LZ4, Zstd, None, Brotli, Snap)
- Range queries
- Faceted search
@@ -80,21 +80,56 @@ There are many ways to support this project.
# Contributing code
We use the GitHub Pull Request workflow: reference a GitHub ticket and/or include a comprehensive commit message when opening a PR.
Feel free to update CHANGELOG.md with your contribution.
## Tokenizer
When implementing a tokenizer for tantivy, depend on the `tantivy-tokenizer-api` crate.
## Minimum supported Rust version
Tantivy currently requires Rust 1.62 or later to compile.
## Clone and build locally
Tantivy compiles on stable Rust.
To check out and run tests, you can simply run:
```bash
git clone https://github.com/quickwit-oss/tantivy.git
cd tantivy
cargo test
git clone https://github.com/quickwit-oss/tantivy.git
cd tantivy
cargo build
```
## Run tests
Some tests will not run with just `cargo test` because of `fail-rs`.
To run the tests exhaustively, run `./run-tests.sh`.
## Debug
You might find it useful to step through the program with a debugger.
### A failing test
Make sure you haven't run `cargo clean` after the most recent `cargo test` or `cargo build` to guarantee that the `target/` directory exists. Use this bash script to find the name of the most recent debug build of Tantivy and run it under `rust-gdb`:
```bash
find target/debug/ -maxdepth 1 -executable -type f -name "tantivy*" -printf '%TY-%Tm-%Td %TT %p\n' | sort -r | cut -d " " -f 3 | xargs -I RECENT_DBG_TANTIVY rust-gdb RECENT_DBG_TANTIVY
```
Now that you are in `rust-gdb`, you can set breakpoints on lines and methods that match your source code and run the debug executable with flags that you normally pass to `cargo test` like this:
```bash
(gdb) run --test-threads 1 --test $NAME_OF_TEST
```
### An example
By default, `rustc` compiles everything in the `examples/` directory in debug mode. This makes it easy to write examples that reproduce bugs:
```bash
rust-gdb target/debug/examples/$EXAMPLE_NAME
(gdb) run
```
# Companies Using Tantivy

TODO.txt Normal file
View File

@@ -0,0 +1,18 @@
Make schema_builder API fluent.
fix doc serialization and prevent compression problems
u64, etc. should return Result<Option> now that we support optional values; a missing column is really not an error
remove fastfield codecs
ditch the first_or_default trick. if it is still useful, improve its implementation.
rename FastFieldReaders::open to load
remove fast field reader
find a way to unify the two DateTime.
readd type check in the filter wrapper
add unit test on columnar list columns.
make sure sort works

View File

@@ -15,7 +15,3 @@ homepage = "https://github.com/quickwit-oss/tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
[dev-dependencies]
rand = "0.8"
proptest = "1"

View File

@@ -4,39 +4,9 @@ extern crate test;
#[cfg(test)]
mod tests {
use rand::seq::IteratorRandom;
use rand::thread_rng;
use tantivy_bitpacker::{BitPacker, BitUnpacker, BlockedBitpacker};
use tantivy_bitpacker::BlockedBitpacker;
use test::Bencher;
#[inline(never)]
fn create_bitpacked_data(bit_width: u8, num_els: u32) -> Vec<u8> {
let mut bitpacker = BitPacker::new();
let mut buffer = Vec::new();
for _ in 0..num_els {
// the values do not matter.
bitpacker.write(0u64, bit_width, &mut buffer).unwrap();
bitpacker.flush(&mut buffer).unwrap();
}
buffer
}
#[bench]
fn bench_bitpacking_read(b: &mut Bencher) {
let bit_width = 3;
let num_els = 1_000_000u32;
let bit_unpacker = BitUnpacker::new(bit_width);
let data = create_bitpacked_data(bit_width, num_els);
let idxs: Vec<u32> = (0..num_els).choose_multiple(&mut thread_rng(), 100_000);
b.iter(|| {
let mut out = 0u64;
for &idx in &idxs {
out = out.wrapping_add(bit_unpacker.get(idx, &data[..]));
}
out
});
}
#[bench]
fn bench_blockedbitp_read(b: &mut Bencher) {
let mut blocked_bitpacker = BlockedBitpacker::new();
@@ -44,9 +14,9 @@ mod tests {
blocked_bitpacker.add(val * val);
}
b.iter(|| {
let mut out = 0u64;
let mut out = 0;
for val in 0..=21500 {
out = out.wrapping_add(blocked_bitpacker.get(val));
out = blocked_bitpacker.get(val);
}
out
});

View File

@@ -56,31 +56,27 @@ impl BitPacker {
pub fn close<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
self.flush(output)?;
// Padding the output to simplify reads.
output.write_all(&[0u8; 7])?;
Ok(())
}
}
#[derive(Clone, Debug, Default, Copy)]
#[derive(Clone, Debug, Default)]
pub struct BitUnpacker {
num_bits: u32,
num_bits: u64,
mask: u64,
}
impl BitUnpacker {
/// Creates a bit unpacker that assumes the same bit width for all values.
///
/// The bit unpacker works by doing an unaligned read of 8 bytes.
/// For this reason, values of `num_bits` in the range
/// 57..=63 are forbidden.
pub fn new(num_bits: u8) -> BitUnpacker {
assert!(num_bits <= 7 * 8 || num_bits == 64);
let mask: u64 = if num_bits == 64 {
!0u64
} else {
(1u64 << num_bits) - 1u64
};
BitUnpacker {
num_bits: u32::from(num_bits),
num_bits: u64::from(num_bits),
mask,
}
}
@@ -91,40 +87,28 @@ impl BitUnpacker {
#[inline]
pub fn get(&self, idx: u32, data: &[u8]) -> u64 {
let addr_in_bits = idx * self.num_bits;
let addr = (addr_in_bits >> 3) as usize;
if addr + 8 > data.len() {
if self.num_bits == 0 {
return 0;
}
let bit_shift = addr_in_bits & 7;
return self.get_slow_path(addr, bit_shift, data);
if self.num_bits == 0 {
return 0u64;
}
let addr_in_bits = idx * self.num_bits as u32;
let addr = (addr_in_bits >> 3) as usize;
let bit_shift = addr_in_bits & 7;
debug_assert!(
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
let bytes: [u8; 8] = (&data[addr..addr + 8]).try_into().unwrap();
let val_unshifted_unmasked: u64 = u64::from_le_bytes(bytes);
let val_shifted = val_unshifted_unmasked >> bit_shift;
val_shifted & self.mask
}
#[inline(never)]
fn get_slow_path(&self, addr: usize, bit_shift: u32, data: &[u8]) -> u64 {
let mut bytes: [u8; 8] = [0u8; 8];
let available_bytes = data.len() - addr;
// This function is meant to only be called if we did not have 8 bytes to load.
debug_assert!(available_bytes < 8);
bytes[..available_bytes].copy_from_slice(&data[addr..]);
let val_unshifted_unmasked: u64 = u64::from_le_bytes(bytes);
let val_shifted = val_unshifted_unmasked >> bit_shift;
val_shifted & self.mask
}
}
#[cfg(test)]
mod test {
use super::{BitPacker, BitUnpacker};
fn create_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker, Vec<u64>, Vec<u8>) {
fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker, Vec<u64>, Vec<u8>) {
let mut data = Vec::new();
let mut bitpacker = BitPacker::new();
let max_val: u64 = (1u64 << num_bits as u64) - 1u64;
@@ -135,13 +119,13 @@ mod test {
bitpacker.write(val, num_bits, &mut data).unwrap();
}
bitpacker.close(&mut data).unwrap();
assert_eq!(data.len(), ((num_bits as usize) * len + 7) / 8);
assert_eq!(data.len(), ((num_bits as usize) * len + 7) / 8 + 7);
let bitunpacker = BitUnpacker::new(num_bits);
(bitunpacker, vals, data)
}
fn test_bitpacker_util(len: usize, num_bits: u8) {
let (bitunpacker, vals, data) = create_bitpacker(len, num_bits);
let (bitunpacker, vals, data) = create_fastfield_bitpacker(len, num_bits);
for (i, val) in vals.iter().enumerate() {
assert_eq!(bitunpacker.get(i as u32, &data), *val);
}
@@ -155,49 +139,4 @@ mod test {
test_bitpacker_util(6, 14);
test_bitpacker_util(1000, 14);
}
use proptest::prelude::*;
fn num_bits_strategy() -> impl Strategy<Value = u8> {
prop_oneof!(Just(0), Just(1), 2u8..56u8, Just(56), Just(64),)
}
fn vals_strategy() -> impl Strategy<Value = (u8, Vec<u64>)> {
(num_bits_strategy(), 0usize..100usize).prop_flat_map(|(num_bits, len)| {
let max_val = if num_bits == 64 {
u64::MAX
} else {
(1u64 << num_bits as u32) - 1
};
let vals = proptest::collection::vec(0..=max_val, len);
vals.prop_map(move |vals| (num_bits, vals))
})
}
fn test_bitpacker_aux(num_bits: u8, vals: &[u64]) {
let mut buffer: Vec<u8> = Vec::new();
let mut bitpacker = BitPacker::new();
for &val in vals {
bitpacker.write(val, num_bits, &mut buffer).unwrap();
}
bitpacker.flush(&mut buffer).unwrap();
assert_eq!(buffer.len(), (vals.len() * num_bits as usize + 7) / 8);
let bitunpacker = BitUnpacker::new(num_bits);
let max_val = if num_bits == 64 {
u64::MAX
} else {
(1u64 << num_bits) - 1
};
for (i, val) in vals.iter().copied().enumerate() {
assert!(val <= max_val);
assert_eq!(bitunpacker.get(i as u32, &buffer), val);
}
}
proptest::proptest! {
#[test]
fn test_bitpacker_proptest((num_bits, vals) in vals_strategy()) {
test_bitpacker_aux(num_bits, &vals);
}
}
}
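Note: the hunk above removes the slow path and relies entirely on the 7 padding bytes written by `close()`. A minimal self-contained sketch of the padded unaligned-read scheme the comments describe (illustrative only, not the crate's API):
```rust
/// Toy bit unpacker: values are `num_bits` wide, packed back to back, and the
/// buffer ends with 7 zero bytes of padding so that an unaligned 8-byte read
/// at any value boundary stays in bounds.
fn get(num_bits: u8, idx: u32, data: &[u8]) -> u64 {
    if num_bits == 0 {
        return 0;
    }
    let mask = if num_bits == 64 { !0u64 } else { (1u64 << num_bits) - 1 };
    let addr_in_bits = idx as usize * num_bits as usize;
    let addr = addr_in_bits >> 3; // byte offset of the value's first bit
    let bit_shift = (addr_in_bits & 7) as u32; // 0..8
    // Thanks to the 7-byte padding, `addr + 8 <= data.len()` always holds.
    let bytes: [u8; 8] = data[addr..addr + 8].try_into().unwrap();
    (u64::from_le_bytes(bytes) >> bit_shift) & mask
}

fn main() {
    // Two 3-bit values, 5 (0b101) then 3 (0b011), packed LSB-first into one
    // byte (0b0001_1101), followed by the 7 padding bytes.
    let mut data = vec![0b0001_1101u8];
    data.extend_from_slice(&[0u8; 7]);
    assert_eq!(get(3, 0, &data), 5);
    assert_eq!(get(3, 1, &data), 3);
}
```
This also suggests why widths 57..=63 are rejected: once shifted, such a value may straddle 9 bytes, which a single 8-byte read cannot cover, while 64-bit values are always byte-aligned (shift 0).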

View File

@@ -5,23 +5,24 @@ edition = "2021"
license = "MIT"
[dependencies]
itertools = "0.10.5"
log = "0.4.17"
fnv = "1.0.7"
fastdivide = "0.4.0"
rand = { version = "0.8.5", optional = true }
measure_time = { version = "0.8.2", optional = true }
prettytable-rs = { version = "0.10.0", optional = true }
stacker = { path = "../stacker", package="tantivy-stacker"}
serde_json = "1"
thiserror = "1"
fnv = "1"
sstable = { path = "../sstable", package = "tantivy-sstable" }
common = { path = "../common", package = "tantivy-common" }
itertools = "0.10"
log = "0.4"
tantivy-bitpacker = { version= "0.3", path = "../bitpacker/" }
prettytable-rs = {version="0.10.0", optional= true}
rand = {version="0.8.3", optional= true}
fastdivide = "0.4"
measure_time = { version="0.8.2", optional=true}
[dev-dependencies]
proptest = "1.0.0"
more-asserts = "0.3.1"
rand = "0.8.5"
proptest = "1"
more-asserts = "0.3.0"
rand = "0.8.3"
[features]
unstable = []

View File

@@ -26,6 +26,7 @@ Add alignment?
Consider another codec to bridge the gap between few and 5k elements
# Cleanup and rationalization
remove the 6 bit limitation of columntype. use 4 + 4 bits instead.
in benchmark, unify percent vs ratio, f32 vs f64.
investigate whether we should have better errors; io::Error is overused at the moment.
rename rank/select in unit tests

View File

@@ -5,16 +5,10 @@ use std::sync::Arc;
use sstable::{Dictionary, VoidSSTable};
use crate::column::Column;
use crate::column_index::ColumnIndex;
use crate::RowId;
/// Dictionary encoded column.
///
/// The column simply gives access to a regular u64 column in
/// which the values are term ordinals.
///
/// These ordinals are ids that uniquely identify the bytes stored in
/// the column. These ordinals are small, and sorted in the same order
/// as the terms.
#[derive(Clone)]
pub struct BytesColumn {
pub(crate) dictionary: Arc<Dictionary<VoidSSTable>>,
@@ -22,57 +16,30 @@ pub struct BytesColumn {
}
impl BytesColumn {
/// Fills the given `output` buffer with the term associated to the ordinal `ord`.
///
/// Returns `false` if the term does not exist (e.g. `term_ord` is greater than or equal to the
/// overall number of terms).
pub fn ord_to_bytes(&self, ord: u64, output: &mut Vec<u8>) -> io::Result<bool> {
self.dictionary.ord_to_term(ord, output)
pub fn term_ord_to_str(&self, term_ord: u64, output: &mut Vec<u8>) -> io::Result<bool> {
self.dictionary.ord_to_term(term_ord, output)
}
pub fn term_ords(&self) -> &Column<u64> {
&self.term_ord_column
}
/// Returns the number of rows in the column.
pub fn num_rows(&self) -> RowId {
self.term_ord_column.num_rows()
}
/// Returns the column of ordinals
pub fn ords(&self) -> &Column<u64> {
&self.term_ord_column
}
}
#[derive(Clone)]
pub struct StrColumn(BytesColumn);
impl From<BytesColumn> for StrColumn {
fn from(bytes_col: BytesColumn) -> Self {
StrColumn(bytes_col)
}
}
impl StrColumn {
/// Fills the given `output` string with the term associated to the ordinal `term_ord`.
pub fn ord_to_str(&self, term_ord: u64, output: &mut String) -> io::Result<bool> {
unsafe {
let buf = output.as_mut_vec();
self.0.dictionary.ord_to_term(term_ord, buf)?;
// TODO consider removing this check if it hurts performance.
if std::str::from_utf8(buf.as_slice()).is_err() {
buf.clear();
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"Not valid utf-8",
));
}
}
Ok(true)
}
}
impl Deref for StrColumn {
type Target = BytesColumn;
impl Deref for BytesColumn {
type Target = ColumnIndex<'static>;
fn deref(&self) -> &Self::Target {
&self.0
&**self.term_ords()
}
}
#[cfg(test)]
mod tests {
use crate::{ColumnarReader, ColumnarWriter};
}
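Note: for readers new to this file, the idea behind `BytesColumn` is dictionary encoding. A toy model, with an in-memory `Vec` dictionary standing in for the crate's sstable-backed `Dictionary`:
```rust
/// Toy dictionary-encoded column: each row stores a small term ordinal,
/// and the dictionary maps ordinals back to the actual bytes.
struct ToyBytesColumn {
    dictionary: Vec<Vec<u8>>, // sorted, deduplicated terms; index == ordinal
    term_ords: Vec<u64>,      // one ordinal per row
}

impl ToyBytesColumn {
    /// Mirrors `ord_to_bytes`: fills `output`, returns false if the ordinal
    /// is out of range (>= the overall number of distinct terms).
    fn ord_to_bytes(&self, ord: u64, output: &mut Vec<u8>) -> bool {
        match self.dictionary.get(ord as usize) {
            Some(term) => {
                output.clear();
                output.extend_from_slice(term);
                true
            }
            None => false,
        }
    }
}

fn main() {
    let col = ToyBytesColumn {
        dictionary: vec![b"apple".to_vec(), b"pear".to_vec()],
        term_ords: vec![1, 0, 0], // rows: "pear", "apple", "apple"
    };
    let mut buf = Vec::new();
    assert!(col.ord_to_bytes(col.term_ords[0], &mut buf));
    assert_eq!(buf, b"pear");
}
```
Storing small, sorted ordinals instead of raw bytes is what lets the column itself remain a plain u64 column.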

View File

@@ -5,13 +5,10 @@ use std::ops::Deref;
use std::sync::Arc;
use common::BinarySerializable;
pub use dictionary_encoded::{BytesColumn, StrColumn};
pub use serialize::{
open_column_bytes, open_column_u128, open_column_u64, serialize_column_mappable_to_u128,
serialize_column_mappable_to_u64,
};
pub use dictionary_encoded::BytesColumn;
pub use serialize::{open_column_bytes, open_column_u64, serialize_column_u64};
use crate::column_index::ColumnIndex;
use crate::column_index::{ColumnIndex, Set};
use crate::column_values::ColumnValues;
use crate::{Cardinality, RowId};
@@ -24,32 +21,25 @@ pub struct Column<T> {
impl<T: PartialOrd> Column<T> {
pub fn num_rows(&self) -> RowId {
match &self.idx {
ColumnIndex::Full => self.values.num_vals() as u32,
ColumnIndex::Optional(optional_index) => optional_index.num_rows(),
ColumnIndex::Multivalued(col_index) => {
// The multivalued index contains the start row_id of each value,
// and one extra value at the end holding the overall number of rows.
col_index.num_vals() - 1
}
ColumnIndex::Full => self.values.num_vals(),
ColumnIndex::Optional(optional_idx) => optional_idx.num_rows(),
ColumnIndex::Multivalued(_) => todo!(),
}
}
pub fn min_value(&self) -> T {
self.values.min_value()
}
pub fn max_value(&self) -> T {
self.values.max_value()
}
}
impl<T: PartialOrd + Copy + Send + Sync + 'static> Column<T> {
pub fn first(&self, row_id: RowId) -> Option<T> {
self.values(row_id).next()
}
pub fn values(&self, row_id: RowId) -> impl Iterator<Item = T> + '_ {
self.value_row_ids(row_id)
.map(|value_row_id: RowId| self.values.get_val(value_row_id))
match &self.idx {
ColumnIndex::Full => Some(self.values.get_val(row_id)),
ColumnIndex::Optional(opt_idx) => {
let value_row_idx = opt_idx.rank_if_exists(row_id)?;
Some(self.values.get_val(value_row_idx))
}
ColumnIndex::Multivalued(_multivalued_index) => {
todo!();
}
}
}
pub fn first_or_default_col(self, default_value: T) -> Arc<dyn ColumnValues<T>> {

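Note: the new `first` resolves a row directly against the column index. A minimal model of the `Optional` branch, with `rank_if_exists` approximated by a binary search over a sorted list of non-null row ids (the real `OptionalIndex` is a compressed set; this simplification is an assumption for illustration):
```rust
/// Toy optional column: `non_null_rows` is the sorted set of rows that
/// carry a value; `values[k]` is the value of the k-th non-null row.
struct ToyOptionalColumn {
    non_null_rows: Vec<u32>,
    values: Vec<u64>,
}

impl ToyOptionalColumn {
    /// Mirrors `rank_if_exists`: position of `row_id` among non-null rows.
    fn rank_if_exists(&self, row_id: u32) -> Option<u32> {
        self.non_null_rows.binary_search(&row_id).ok().map(|k| k as u32)
    }

    /// Mirrors `Column::first` for the `Optional` cardinality.
    fn first(&self, row_id: u32) -> Option<u64> {
        let rank = self.rank_if_exists(row_id)?;
        Some(self.values[rank as usize])
    }
}

fn main() {
    let col = ToyOptionalColumn {
        non_null_rows: vec![0, 3, 7],
        values: vec![10, 30, 70],
    };
    assert_eq!(col.first(3), Some(30));
    assert_eq!(col.first(4), None);
}
```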
View File

@@ -2,51 +2,24 @@ use std::io;
use std::io::Write;
use std::sync::Arc;
use common::OwnedBytes;
use common::{CountingWriter, OwnedBytes};
use sstable::Dictionary;
use crate::column::{BytesColumn, Column};
use crate::column_index::{serialize_column_index, SerializableColumnIndex};
use crate::column_values::serialize::serialize_column_values_u128;
use crate::column_values::{
serialize_column_values, ColumnValues, FastFieldCodecType, MonotonicallyMappableToU128,
MonotonicallyMappableToU64,
serialize_column_values, ColumnValues, MonotonicallyMappableToU64, ALL_CODEC_TYPES,
};
pub fn serialize_column_mappable_to_u128<
F: Fn() -> I,
I: Iterator<Item = T>,
T: MonotonicallyMappableToU128,
>(
column_index: SerializableColumnIndex<'_>,
column_values: F,
num_vals: u32,
output: &mut impl Write,
) -> io::Result<()> {
let column_index_num_bytes = serialize_column_index(column_index, output)?;
serialize_column_values_u128(
|| column_values().map(|val| val.to_u128()),
num_vals,
output,
)?;
output.write_all(&column_index_num_bytes.to_le_bytes())?;
Ok(())
}
pub fn serialize_column_mappable_to_u64<T: MonotonicallyMappableToU64>(
pub fn serialize_column_u64<T: MonotonicallyMappableToU64>(
column_index: SerializableColumnIndex<'_>,
column_values: &impl ColumnValues<T>,
output: &mut impl Write,
) -> io::Result<()> {
let column_index_num_bytes = serialize_column_index(column_index, output)?;
serialize_column_values(
column_values,
&[
FastFieldCodecType::Bitpacked,
FastFieldCodecType::BlockwiseLinear,
],
output,
)?;
let mut counting_writer = CountingWriter::wrap(output);
serialize_column_index(column_index, &mut counting_writer)?;
let column_index_num_bytes = counting_writer.written_bytes() as u32;
let output = counting_writer.finish();
serialize_column_values(column_values, &ALL_CODEC_TYPES[..], output)?;
output.write_all(&column_index_num_bytes.to_le_bytes())?;
Ok(())
}
@@ -68,34 +41,14 @@ pub fn open_column_u64<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::
})
}
pub fn open_column_u128<T: MonotonicallyMappableToU128>(
bytes: OwnedBytes,
) -> io::Result<Column<T>> {
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
let column_index_num_bytes = u32::from_le_bytes(
column_index_num_bytes_payload
.as_slice()
.try_into()
.unwrap(),
);
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
let column_index = crate::column_index::open_column_index(column_index_data)?;
let column_values = crate::column_values::open_u128_mapped(column_values_data)?;
Ok(Column {
idx: column_index,
values: column_values,
})
}
pub fn open_column_bytes<T: From<BytesColumn>>(data: OwnedBytes) -> io::Result<T> {
pub fn open_column_bytes(data: OwnedBytes) -> io::Result<BytesColumn> {
let (body, dictionary_len_bytes) = data.rsplit(4);
let dictionary_len = u32::from_le_bytes(dictionary_len_bytes.as_slice().try_into().unwrap());
let (dictionary_bytes, column_bytes) = body.split(dictionary_len as usize);
let dictionary = Arc::new(Dictionary::from_bytes(dictionary_bytes)?);
let term_ord_column = crate::column::open_column_u64::<u64>(column_bytes)?;
let bytes_column = BytesColumn {
Ok(BytesColumn {
dictionary,
term_ord_column,
};
Ok(bytes_column.into())
})
}
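Note: the new `serialize_column_u64` obtains the index length with a `CountingWriter` instead of a return value, but the framing is unchanged: `[column index bytes][column values bytes][index length as u32 LE]`. A sketch of that framing, with `Vec` length bookkeeping standing in for `CountingWriter` and opaque byte strings standing in for the real index/values serializers:
```rust
use std::io::{self, Write};

/// Writes `[index][values][index_len: u32 LE]`, mirroring the framing of
/// `serialize_column_u64` (toy payloads instead of real serializers).
fn serialize_column(index: &[u8], values: &[u8], output: &mut Vec<u8>) -> io::Result<()> {
    let before = output.len();
    output.write_all(index)?;
    let index_num_bytes = (output.len() - before) as u32;
    output.write_all(values)?;
    output.write_all(&index_num_bytes.to_le_bytes())?;
    Ok(())
}

/// Mirrors `open_column_u64`: split the 4-byte footer off the end,
/// then split the body into index and values.
fn open_column(bytes: &[u8]) -> (&[u8], &[u8]) {
    let (body, footer) = bytes.split_at(bytes.len() - 4);
    let index_num_bytes = u32::from_le_bytes(footer.try_into().unwrap()) as usize;
    body.split_at(index_num_bytes)
}

fn main() -> io::Result<()> {
    let mut buf = Vec::new();
    serialize_column(b"IDX", b"VALUES", &mut buf)?;
    let (index, values) = open_column(&buf);
    assert_eq!((index, values), (&b"IDX"[..], &b"VALUES"[..]));
    Ok(())
}
```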

View File

@@ -2,7 +2,6 @@ mod multivalued_index;
mod optional_index;
mod serialize;
use std::ops::Range;
use std::sync::Arc;
pub use optional_index::{OptionalIndex, SerializableOptionalIndex, Set};
@@ -15,12 +14,8 @@ use crate::{Cardinality, RowId};
pub enum ColumnIndex<'a> {
Full,
Optional(OptionalIndex),
// TODO Remove the static by fixing the codec if possible.
/// The enclosed column values contain, for each row_id,
/// the start index of its values.
///
/// In addition, at index num_rows, an extra value is added
/// containing the overall number of values.
// TODO remove the Arc<dyn>: apart from serialization, this is not
// dynamic at all.
Multivalued(Arc<dyn ColumnValues<RowId> + 'a>),
}
@@ -32,23 +27,4 @@ impl<'a> ColumnIndex<'a> {
ColumnIndex::Multivalued(_) => Cardinality::Multivalued,
}
}
pub fn value_row_ids(&self, row_id: RowId) -> Range<RowId> {
match self {
ColumnIndex::Full => row_id..row_id + 1,
ColumnIndex::Optional(optional_index) => {
if let Some(val) = optional_index.rank_if_exists(row_id) {
val..val + 1
} else {
0..0
}
}
ColumnIndex::Multivalued(multivalued_index) => {
let multivalued_index_ref = &**multivalued_index;
let start: u32 = multivalued_index_ref.get_val(row_id);
let end: u32 = multivalued_index_ref.get_val(row_id + 1);
start..end
}
}
}
}
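Note: `value_row_ids` (removed above) is the piece that maps a row to the range of value positions it owns; `Column::values` was built on it. A compact model of the three cardinalities:
```rust
use std::ops::Range;

/// Toy column index: which value positions belong to a given row?
enum ToyColumnIndex {
    Full,                  // exactly one value per row
    Optional(Vec<u32>),    // sorted non-null row ids
    Multivalued(Vec<u32>), // start offsets, plus the total value count at the end
}

fn value_row_ids(idx: &ToyColumnIndex, row_id: u32) -> Range<u32> {
    match idx {
        ToyColumnIndex::Full => row_id..row_id + 1,
        ToyColumnIndex::Optional(rows) => match rows.binary_search(&row_id) {
            Ok(rank) => rank as u32..rank as u32 + 1,
            Err(_) => 0..0, // no value for this row
        },
        ToyColumnIndex::Multivalued(starts) => {
            starts[row_id as usize]..starts[row_id as usize + 1]
        }
    }
}

fn main() {
    assert_eq!(value_row_ids(&ToyColumnIndex::Full, 7), 7..8);
    let opt = ToyColumnIndex::Optional(vec![2, 5]);
    assert_eq!(value_row_ids(&opt, 5), 1..2);
    assert_eq!(value_row_ids(&opt, 3), 0..0);
    // Rows 0..3 own values [0..2), [2..2) (empty), [2..5).
    let multi = ToyColumnIndex::Multivalued(vec![0, 2, 2, 5]);
    assert_eq!(value_row_ids(&multi, 2), 2..5);
}
```
The trailing offset in the multivalued case is also why `Column::num_rows` uses `col_index.num_vals() - 1`.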

View File

@@ -11,11 +11,11 @@ use crate::RowId;
pub struct MultivaluedIndex(Arc<dyn ColumnValues<RowId>>);
pub fn serialize_multivalued_index(
multivalued_index: &dyn ColumnValues<RowId>,
multivalued_index: MultivaluedIndex,
output: &mut impl Write,
) -> io::Result<()> {
crate::column_values::serialize_column_values(
&*multivalued_index,
&*multivalued_index.0,
&[FastFieldCodecType::Bitpacked, FastFieldCodecType::Linear],
output,
)?;
@@ -23,7 +23,5 @@ pub fn serialize_multivalued_index(
}
pub fn open_multivalued_index(bytes: OwnedBytes) -> io::Result<Arc<dyn ColumnValues<RowId>>> {
let start_index_column: Arc<dyn ColumnValues<RowId>> =
crate::column_values::open_u64_mapped(bytes)?;
Ok(start_index_column)
todo!();
}

View File

@@ -1,7 +1,7 @@
mod dense;
mod set_block;
mod sparse;
pub use dense::{DenseBlock, DenseBlockCodec, DENSE_BLOCK_NUM_BYTES};
pub use set_block::{DenseBlock, DenseBlockCodec, DENSE_BLOCK_NUM_BYTES};
pub use sparse::{SparseBlock, SparseBlockCodec};
#[cfg(test)]

View File

@@ -1,8 +1,7 @@
use std::collections::HashMap;
use crate::column_index::optional_index::set_block::{
DenseBlockCodec, SparseBlockCodec, DENSE_BLOCK_NUM_BYTES,
};
use crate::column_index::optional_index::set_block::set_block::DENSE_BLOCK_NUM_BYTES;
use crate::column_index::optional_index::set_block::{DenseBlockCodec, SparseBlockCodec};
use crate::column_index::optional_index::{Set, SetCodec};
fn test_set_helper<C: SetCodec<Item = u16>>(vals: &[u16]) -> usize {
@@ -52,7 +51,6 @@ fn test_sparse_block_set_u16_max() {
use proptest::prelude::*;
proptest! {
#![proptest_config(ProptestConfig::with_cases(1))]
#[test]
fn test_prop_test_dense(els in proptest::collection::btree_set(0..=u16::MAX, 0..=u16::MAX as usize)) {
let vals: Vec<u16> = els.into_iter().collect();

View File

@@ -1,20 +1,19 @@
use std::io;
use std::io::Write;
use common::{CountingWriter, OwnedBytes};
use common::OwnedBytes;
use crate::column_index::multivalued_index::serialize_multivalued_index;
use crate::column_index::multivalued_index::{serialize_multivalued_index, MultivaluedIndex};
use crate::column_index::optional_index::serialize_optional_index;
use crate::column_index::{ColumnIndex, SerializableOptionalIndex};
use crate::column_values::ColumnValues;
use crate::{Cardinality, RowId};
use crate::Cardinality;
pub enum SerializableColumnIndex<'a> {
Full,
Optional(Box<dyn SerializableOptionalIndex<'a> + 'a>),
// TODO remove the Arc<dyn>: apart from serialization, this is not
// dynamic at all.
Multivalued(Box<dyn ColumnValues<RowId> + 'a>),
Multivalued(MultivaluedIndex),
}
impl<'a> SerializableColumnIndex<'a> {
@@ -30,21 +29,19 @@ impl<'a> SerializableColumnIndex<'a> {
pub fn serialize_column_index(
column_index: SerializableColumnIndex,
output: &mut impl Write,
) -> io::Result<u32> {
let mut output = CountingWriter::wrap(output);
) -> io::Result<()> {
let cardinality = column_index.get_cardinality().to_code();
output.write_all(&[cardinality])?;
match column_index {
SerializableColumnIndex::Full => {}
SerializableColumnIndex::Optional(optional_index) => {
serialize_optional_index(&*optional_index, &mut output)?
serialize_optional_index(&*optional_index, output)?
}
SerializableColumnIndex::Multivalued(multivalued_index) => {
serialize_multivalued_index(&*multivalued_index, &mut output)?
serialize_multivalued_index(multivalued_index, output)?
}
}
let column_index_num_bytes = output.written_bytes() as u32;
Ok(column_index_num_bytes)
Ok(())
}
pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex<'static>> {

View File

@@ -78,32 +78,6 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync {
}
}
impl<T: Copy + PartialOrd> ColumnValues<T> for std::sync::Arc<dyn ColumnValues<T>> {
fn get_val(&self, idx: u32) -> T {
self.as_ref().get_val(idx)
}
fn min_value(&self) -> T {
self.as_ref().min_value()
}
fn max_value(&self) -> T {
self.as_ref().max_value()
}
fn num_vals(&self) -> u32 {
self.as_ref().num_vals()
}
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = T> + 'b> {
self.as_ref().iter()
}
fn get_range(&self, start: u64, output: &mut [T]) {
self.as_ref().get_range(start, output)
}
}
impl<'a, C: ColumnValues<T> + ?Sized, T: Copy + PartialOrd> ColumnValues<T> for &'a C {
fn get_val(&self, idx: u32) -> T {
(*self).get_val(idx)

View File

@@ -28,7 +28,7 @@ mod compact_space;
mod line;
mod linear;
pub(crate) mod monotonic_mapping;
pub(crate) mod monotonic_mapping_u128;
// mod monotonic_mapping_u128;
mod column;
mod column_with_cardinality;
@@ -37,10 +37,8 @@ pub mod serialize;
pub use self::column::{monotonic_map_column, ColumnValues, IterColumn, VecColumn};
pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128;
#[cfg(test)]
pub use self::serialize::tests::serialize_and_load;
pub use self::serialize::{serialize_column_values, NormalizedHeader};
// pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128;
pub use self::serialize::{serialize_and_load, serialize_column_values, NormalizedHeader};
use crate::column_values::bitpacked::BitpackedCodec;
use crate::column_values::blockwise_linear::BlockwiseLinearCodec;
use crate::column_values::linear::LinearCodec;
@@ -124,17 +122,19 @@ impl U128FastFieldCodecType {
}
/// Returns the correct codec reader wrapped in the `Arc` for the data.
pub fn open_u128_mapped<T: MonotonicallyMappableToU128>(
mut bytes: OwnedBytes,
) -> io::Result<Arc<dyn ColumnValues<T>>> {
let header = U128Header::deserialize(&mut bytes)?;
assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace);
let reader = CompactSpaceDecompressor::open(bytes)?;
let inverted: StrictlyMonotonicMappingInverter<StrictlyMonotonicMappingToInternal<T>> =
StrictlyMonotonicMappingToInternal::<T>::new().into();
Ok(Arc::new(monotonic_map_column(reader, inverted)))
}
// pub fn open_u128<Item: MonotonicallyMappableToU128>(
// bytes: OwnedBytes,
// ) -> io::Result<Arc<dyn Column<Item>>> {
// todo!();
// // let (bytes, _format_version) = read_format_version(bytes)?;
// // let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
// // let header = U128Header::deserialize(&mut bytes)?;
// // assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace);
// // let reader = CompactSpaceDecompressor::open(bytes)?;
// // let inverted: StrictlyMonotonicMappingInverter<StrictlyMonotonicMappingToInternal<Item>> =
// // StrictlyMonotonicMappingToInternal::<Item>::new().into();
// // Ok(Arc::new(monotonic_map_column(reader, inverted)))
// }
/// Returns the correct codec reader wrapped in the `Arc` for the data.
pub fn open_u64_mapped<T: MonotonicallyMappableToU64>(
@@ -198,6 +198,13 @@ pub(crate) trait FastFieldCodec: 'static {
fn estimate(column: &dyn ColumnValues) -> Option<f32>;
}
/// The list of all available codecs for u64 convertible data.
pub const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [
FastFieldCodecType::Bitpacked,
FastFieldCodecType::BlockwiseLinear,
FastFieldCodecType::Linear,
];
#[cfg(all(test, feature = "unstable"))]
mod bench {
use std::sync::Arc;

View File

@@ -2,7 +2,6 @@ use std::marker::PhantomData;
use fastdivide::DividerU64;
use super::MonotonicallyMappableToU128;
use crate::RowId;
/// Monotonic maps a value to u64 value space.
@@ -81,20 +80,21 @@ impl<T> StrictlyMonotonicMappingToInternal<T> {
}
}
impl<External: MonotonicallyMappableToU128, T: MonotonicallyMappableToU128>
StrictlyMonotonicFn<External, u128> for StrictlyMonotonicMappingToInternal<T>
where T: MonotonicallyMappableToU128
{
#[inline(always)]
fn mapping(&self, inp: External) -> u128 {
External::to_u128(inp)
}
// TODO
// impl<External: MonotonicallyMappableToU128, T: MonotonicallyMappableToU128>
// StrictlyMonotonicFn<External, u128> for StrictlyMonotonicMappingToInternal<T>
// where T: MonotonicallyMappableToU128
// {
// #[inline(always)]
// fn mapping(&self, inp: External) -> u128 {
// External::to_u128(inp)
// }
#[inline(always)]
fn inverse(&self, out: u128) -> External {
External::from_u128(out)
}
}
// #[inline(always)]
// fn inverse(&self, out: u128) -> External {
// External::from_u128(out)
// }
// }
impl<External: MonotonicallyMappableToU64, T: MonotonicallyMappableToU64>
StrictlyMonotonicFn<External, u64> for StrictlyMonotonicMappingToInternal<T>

View File

@@ -19,8 +19,9 @@
use std::io;
use std::num::NonZeroU64;
use std::sync::Arc;
use common::{BinarySerializable, VInt};
use common::{BinarySerializable, OwnedBytes, VInt};
use log::warn;
use super::bitpacked::BitpackedCodec;
@@ -32,9 +33,8 @@ use super::monotonic_mapping::{
};
use super::{
monotonic_map_column, ColumnValues, FastFieldCodec, FastFieldCodecType,
MonotonicallyMappableToU64, U128FastFieldCodecType,
MonotonicallyMappableToU64, U128FastFieldCodecType, VecColumn, ALL_CODEC_TYPES,
};
use crate::column_values::compact_space::CompactSpaceCompressor;
/// The normalized header gives some parameters after applying the following
/// normalization of the vector:
@@ -160,23 +160,55 @@ impl BinarySerializable for Header {
}
}
/// Serializes u128 values with the compact space codec.
pub fn serialize_column_values_u128<F: Fn() -> I, I: Iterator<Item = u128>>(
iter_gen: F,
num_vals: u32,
output: &mut impl io::Write,
) -> io::Result<()> {
let header = U128Header {
num_vals,
codec_type: U128FastFieldCodecType::CompactSpace,
};
header.serialize(output)?;
let compressor = CompactSpaceCompressor::train_from(iter_gen(), num_vals);
compressor.compress_into(iter_gen(), output)?;
Ok(())
/// Returns the estimated compression for the given codec, as a ratio in [0.0..1.0], where 1.0 means
/// no compression.
pub(crate) fn estimate<T: MonotonicallyMappableToU64>(
typed_column: impl ColumnValues<T>,
codec_type: FastFieldCodecType,
) -> Option<f32> {
let column = monotonic_map_column(typed_column, StrictlyMonotonicMappingToInternal::<T>::new());
let min_value = column.min_value();
let gcd = super::gcd::find_gcd(column.iter().map(|val| val - min_value))
.filter(|gcd| gcd.get() > 1u64);
let mapping = StrictlyMonotonicMappingToInternalGCDBaseval::new(
gcd.map(|gcd| gcd.get()).unwrap_or(1u64),
min_value,
);
let normalized_column = monotonic_map_column(&column, mapping);
match codec_type {
FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&normalized_column),
FastFieldCodecType::Linear => LinearCodec::estimate(&normalized_column),
FastFieldCodecType::BlockwiseLinear => BlockwiseLinearCodec::estimate(&normalized_column),
}
}
// TODO
/// Serializes u128 values with the compact space codec.
// pub fn serialize_u128_new<F: Fn() -> I, I: Iterator<Item = u128>>(
// value_index: ColumnIndex,
// iter_gen: F,
// num_vals: u32,
// output: &mut impl io::Write,
// ) -> io::Result<()> {
// let header = U128Header {
// num_vals,
// codec_type: U128FastFieldCodecType::CompactSpace,
// };
// header.serialize(output)?;
// let compressor = CompactSpaceCompressor::train_from(iter_gen(), num_vals);
// compressor.compress_into(iter_gen(), output).unwrap();
// let null_index_footer = ColumnFooter {
// cardinality: value_index.get_cardinality(),
// null_index_codec: NullIndexCodec::Full,
// null_index_byte_range: 0..0,
// };
// append_null_index_footer(output, null_index_footer)?;
// append_format_version(output)?;
// Ok(())
// }
/// Serializes the column with the codec with the best estimate on the data.
pub fn serialize_column_values<T: MonotonicallyMappableToU64>(
typed_column: impl ColumnValues<T>,
@@ -247,29 +279,20 @@ pub(crate) fn serialize_given_codec(
Ok(())
}
/// Helper function to serialize a column (autodetect from all codecs) and then open it
pub fn serialize_and_load<T: MonotonicallyMappableToU64 + Ord + Default>(
column: &[T],
) -> Arc<dyn ColumnValues<T>> {
let mut buffer = Vec::new();
super::serialize_column_values(&VecColumn::from(&column), &ALL_CODEC_TYPES, &mut buffer)
.unwrap();
super::open_u64_mapped(OwnedBytes::new(buffer)).unwrap()
}
#[cfg(test)]
pub mod tests {
use std::sync::Arc;
use common::OwnedBytes;
mod tests {
use super::*;
use crate::column_values::{open_u64_mapped, VecColumn};
const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [
FastFieldCodecType::Bitpacked,
FastFieldCodecType::Linear,
FastFieldCodecType::BlockwiseLinear,
];
/// Helper function to serialize a column (autodetect from all codecs) and then open it
pub fn serialize_and_load<T: MonotonicallyMappableToU64 + Ord + Default>(
column: &[T],
) -> Arc<dyn ColumnValues<T>> {
let mut buffer = Vec::new();
serialize_column_values(&VecColumn::from(&column), &ALL_CODEC_TYPES, &mut buffer).unwrap();
open_u64_mapped(OwnedBytes::new(buffer)).unwrap()
}
#[test]
fn test_serialize_deserialize_u128_header() {
let original = U128Header {
@@ -296,7 +319,7 @@ pub mod tests {
serialize_column_values(&col, &ALL_CODEC_TYPES, &mut buffer).unwrap();
// TODO put the header as a footer so that it serves as padding.
// 5 bytes of header, 1 byte of value, 7 bytes of padding.
assert_eq!(buffer.len(), 5 + 1);
assert_eq!(buffer.len(), 5 + 1 + 7);
}
#[test]
@@ -305,7 +328,7 @@ pub mod tests {
let col = VecColumn::from(&[true][..]);
serialize_column_values(&col, &ALL_CODEC_TYPES, &mut buffer).unwrap();
// 5 bytes of header, 0 bytes of value, 7 bytes of padding.
assert_eq!(buffer.len(), 5);
assert_eq!(buffer.len(), 5 + 7);
}
#[test]
@@ -315,6 +338,6 @@ pub mod tests {
let col = VecColumn::from(&vals[..]);
serialize_column_values(&col, &[FastFieldCodecType::Bitpacked], &mut buffer).unwrap();
// Values are stored over 3 bits.
assert_eq!(buffer.len(), 7 + (3 * 80 / 8));
assert_eq!(buffer.len(), 7 + (3 * 80 / 8) + 7);
}
}
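Note: the `estimate` function introduced above normalizes the column (subtract the minimum, then divide out any common GCD greater than 1) before asking each codec for a size estimate. A sketch of just that normalization step, with a plain Euclidean GCD standing in for the crate's `find_gcd`:
```rust
/// Normalize values the way `estimate` does before codec selection:
/// shift by the minimum, then divide out a common divisor if one exists.
fn normalize(vals: &[u64]) -> Vec<u64> {
    let min = *vals.iter().min().expect("non-empty column");
    let gcd = vals.iter().map(|&v| v - min).fold(0u64, gcd_u64);
    let divisor = if gcd > 1 { gcd } else { 1 };
    vals.iter().map(|&v| (v - min) / divisor).collect()
}

fn gcd_u64(mut a: u64, mut b: u64) -> u64 {
    while b != 0 {
        let t = a % b;
        a = b;
        b = t;
    }
    a
}

fn main() {
    // 1010, 1020, 1030 -> min 1010, gcd 10 -> 0, 1, 2: bitpacks in 2 bits.
    assert_eq!(normalize(&[1010, 1020, 1030]), vec![0, 1, 2]);
}
```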

View File

@@ -1,103 +1,98 @@
use std::net::Ipv6Addr;
use crate::utils::{place_bits, select_bits};
use crate::value::NumericalType;
use crate::InvalidData;
/// Identifies the type of a column.
/// Any changes need to be propagated to `COLUMN_TYPES`.
#[derive(Hash, Eq, PartialEq, Debug, Clone, Copy, Ord, PartialOrd)]
#[repr(u8)]
/// The column type identifies the type of a column and fits in 6 bits.
///
/// - bits[0..3]: Column category type.
/// - bits[3..6]: Numerical type if necessary.
#[derive(Hash, Eq, PartialEq, Debug, Clone, Copy)]
pub enum ColumnType {
I64 = 0u8,
U64 = 1u8,
F64 = 2u8,
Bytes = 3u8,
Str = 4u8,
Bool = 5u8,
IpAddr = 6u8,
DateTime = 7u8,
Str,
Numerical(NumericalType),
Bool,
DateTime,
}
// The order needs to match _exactly_ the order in the enum
const COLUMN_TYPES: [ColumnType; 8] = [
ColumnType::I64,
ColumnType::U64,
ColumnType::F64,
ColumnType::Bytes,
ColumnType::Str,
ColumnType::Bool,
ColumnType::IpAddr,
ColumnType::DateTime,
];
impl ColumnType {
pub fn to_code(self) -> u8 {
self as u8
/// Encoded over 6 bits.
pub(crate) fn to_code(self) -> u8 {
let column_type_category;
let numerical_type_code: u8;
match self {
ColumnType::Str => {
column_type_category = ColumnTypeCategory::Str;
numerical_type_code = 0u8;
}
ColumnType::Numerical(numerical_type) => {
column_type_category = ColumnTypeCategory::Numerical;
numerical_type_code = numerical_type.to_code();
}
ColumnType::Bool => {
column_type_category = ColumnTypeCategory::Bool;
numerical_type_code = 0u8;
}
ColumnType::DateTime => {
column_type_category = ColumnTypeCategory::DateTime;
numerical_type_code = 0u8;
}
}
place_bits::<0, 3>(column_type_category.to_code()) | place_bits::<3, 6>(numerical_type_code)
}
pub(crate) fn try_from_code(code: u8) -> Result<ColumnType, InvalidData> {
COLUMN_TYPES.get(code as usize).copied().ok_or(InvalidData)
}
}
impl From<NumericalType> for ColumnType {
fn from(numerical_type: NumericalType) -> Self {
match numerical_type {
NumericalType::I64 => ColumnType::I64,
NumericalType::U64 => ColumnType::U64,
NumericalType::F64 => ColumnType::F64,
if select_bits::<6, 8>(code) != 0u8 {
return Err(InvalidData);
}
let column_type_category_code = select_bits::<0, 3>(code);
let numerical_type_code = select_bits::<3, 6>(code);
let column_type_category = ColumnTypeCategory::try_from_code(column_type_category_code)?;
match column_type_category {
ColumnTypeCategory::Bool => {
if numerical_type_code != 0u8 {
return Err(InvalidData);
}
Ok(ColumnType::Bool)
}
ColumnTypeCategory::Str => {
if numerical_type_code != 0u8 {
return Err(InvalidData);
}
Ok(ColumnType::Str)
}
ColumnTypeCategory::Numerical => {
let numerical_type = NumericalType::try_from_code(numerical_type_code)?;
Ok(ColumnType::Numerical(numerical_type))
}
ColumnTypeCategory::DateTime => {
if numerical_type_code != 0u8 {
return Err(InvalidData);
}
Ok(ColumnType::DateTime)
}
}
}
}
impl ColumnType {
pub fn numerical_type(&self) -> Option<NumericalType> {
match self {
ColumnType::I64 => Some(NumericalType::I64),
ColumnType::U64 => Some(NumericalType::U64),
ColumnType::F64 => Some(NumericalType::F64),
ColumnType::Bytes
| ColumnType::Str
| ColumnType::Bool
| ColumnType::IpAddr
| ColumnType::DateTime => None,
}
}
}
// TODO remove if possible
pub trait HasAssociatedColumnType: 'static + Send + Sync + Copy + PartialOrd {
fn column_type() -> ColumnType;
fn default_value() -> Self;
}
impl HasAssociatedColumnType for u64 {
fn column_type() -> ColumnType {
ColumnType::U64
}
fn default_value() -> Self {
0u64
ColumnType::Numerical(NumericalType::U64)
}
}
impl HasAssociatedColumnType for i64 {
fn column_type() -> ColumnType {
ColumnType::I64
}
fn default_value() -> Self {
0i64
ColumnType::Numerical(NumericalType::I64)
}
}
impl HasAssociatedColumnType for f64 {
fn column_type() -> ColumnType {
ColumnType::F64
}
fn default_value() -> Self {
Default::default()
ColumnType::Numerical(NumericalType::F64)
}
}
@@ -105,45 +100,63 @@ impl HasAssociatedColumnType for bool {
fn column_type() -> ColumnType {
ColumnType::Bool
}
fn default_value() -> Self {
Default::default()
}
}
impl HasAssociatedColumnType for crate::DateTime {
fn column_type() -> ColumnType {
ColumnType::DateTime
}
fn default_value() -> Self {
Default::default()
}
}
impl HasAssociatedColumnType for Ipv6Addr {
fn column_type() -> ColumnType {
ColumnType::IpAddr
/// Column types are grouped into different categories that
/// correspond to the different `JsonValue` types.
///
/// The columnar writer will apply coercion rules to make sure that
/// at most one column exists per `ColumnTypeCategory`.
///
/// See also [README.md].
#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Debug)]
#[repr(u8)]
pub(crate) enum ColumnTypeCategory {
Bool = 0u8,
Str = 1u8,
Numerical = 2u8,
DateTime = 3u8,
}
impl ColumnTypeCategory {
pub fn to_code(self) -> u8 {
self as u8
}
fn default_value() -> Self {
Ipv6Addr::from([0u8; 16])
pub fn try_from_code(code: u8) -> Result<Self, InvalidData> {
match code {
0u8 => Ok(Self::Bool),
1u8 => Ok(Self::Str),
2u8 => Ok(Self::Numerical),
3u8 => Ok(Self::DateTime),
_ => Err(InvalidData),
}
}
}
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use super::*;
use crate::Cardinality;
#[test]
fn test_column_type_to_code() {
for (code, expected_column_type) in super::COLUMN_TYPES.iter().copied().enumerate() {
if let Ok(column_type) = ColumnType::try_from_code(code as u8) {
assert_eq!(column_type, expected_column_type);
let mut column_type_set: HashSet<ColumnType> = HashSet::new();
for code in u8::MIN..=u8::MAX {
if let Ok(column_type) = ColumnType::try_from_code(code) {
assert_eq!(column_type.to_code(), code);
assert!(column_type_set.insert(column_type));
}
}
for code in COLUMN_TYPES.len() as u8..=u8::MAX {
assert!(ColumnType::try_from_code(code as u8).is_err());
}
assert_eq!(column_type_set.len(), 3 + 3);
}
#[test]

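Note: the new `to_code`/`try_from_code` pack a 3-bit category and a 3-bit numerical type into a single byte. A sketch with assumed implementations of `place_bits`/`select_bits` (the crate's actual helpers live in `crate::utils` and may differ) and an assumed code of 2 for `NumericalType::F64`:
```rust
/// Assumed semantics: place `val` into bits [START..END) of a byte.
fn place_bits<const START: u8, const END: u8>(val: u8) -> u8 {
    debug_assert!(val < (1u8 << (END - START)));
    val << START
}

/// Assumed semantics: extract bits [START..END) of `code`.
fn select_bits<const START: u8, const END: u8>(code: u8) -> u8 {
    (code >> START) & ((1u8 << (END - START)) - 1)
}

fn main() {
    // Category "Numerical" = 2 in bits [0..3); assumed F64 code = 2 in bits [3..6).
    let code = place_bits::<0, 3>(2) | place_bits::<3, 6>(2);
    assert_eq!(code, 0b010_010);
    assert_eq!(select_bits::<0, 3>(code), 2); // category
    assert_eq!(select_bits::<3, 6>(code), 2); // numerical type
    assert_eq!(select_bits::<6, 8>(code), 0); // upper two bits stay zero
}
```
The `select_bits::<6, 8>` check mirrors the guard at the top of `try_from_code`: any byte with either of the two high bits set is invalid.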
View File

@@ -1,208 +0,0 @@
use std::collections::HashMap;
use std::io;
use crate::columnar::ColumnarReader;
use crate::dynamic_column::DynamicColumn;
use crate::ColumnType;
pub enum MergeDocOrder {
/// Columnar tables are simply stacked one above the other.
/// If the i-th columnar_reader has n_rows_i rows, then,
/// in the resulting columnar,
/// rows [0..n_rows_0) contain the rows of columnar_readers[0], in order,
/// rows [n_rows_0..n_rows_0 + n_rows_1) contain the rows of columnar_readers[1], in order.
/// ..
Stack,
/// Some more complex mapping that can interleave rows from the different readers and
/// possibly drop rows.
Complex(()),
}
pub fn merge_columnar(
_columnar_readers: &[ColumnarReader],
mapping: MergeDocOrder,
_output: &mut impl io::Write,
) -> io::Result<()> {
match mapping {
MergeDocOrder::Stack => {
// implement me :)
todo!();
}
MergeDocOrder::Complex(_) => {
// for later
todo!();
}
}
}
/// Column types are grouped into different categories.
/// After merge, all columns belonging to the same category are coerced to
/// the same column type.
///
/// In practice, only Numerical columns are coerced into one type today.
///
/// See also [README.md].
#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)]
#[repr(u8)]
enum ColumnTypeCategory {
Bool,
Str,
Numerical,
DateTime,
Bytes,
IpAddr,
}
impl From<ColumnType> for ColumnTypeCategory {
fn from(column_type: ColumnType) -> Self {
match column_type {
ColumnType::I64 => ColumnTypeCategory::Numerical,
ColumnType::U64 => ColumnTypeCategory::Numerical,
ColumnType::F64 => ColumnTypeCategory::Numerical,
ColumnType::Bytes => ColumnTypeCategory::Bytes,
ColumnType::Str => ColumnTypeCategory::Str,
ColumnType::Bool => ColumnTypeCategory::Bool,
ColumnType::IpAddr => ColumnTypeCategory::IpAddr,
ColumnType::DateTime => ColumnTypeCategory::DateTime,
}
}
}
fn collect_columns(
columnar_readers: &[&ColumnarReader],
) -> io::Result<HashMap<String, HashMap<ColumnTypeCategory, Vec<DynamicColumn>>>> {
// Each column name may have several column types associated with it.
// For merging, we group by column type category, since columns in the same category can be merged.
let mut field_name_to_group: HashMap<String, HashMap<ColumnTypeCategory, Vec<DynamicColumn>>> =
HashMap::new();
for columnar_reader in columnar_readers {
let column_name_and_handle = columnar_reader.list_columns()?;
for (column_name, handle) in column_name_and_handle {
let column_type_to_handles = field_name_to_group
.entry(column_name.to_string())
.or_default();
let columns = column_type_to_handles
.entry(handle.column_type().into())
.or_default();
columns.push(handle.open()?);
}
}
normalize_columns(&mut field_name_to_group);
Ok(field_name_to_group)
}
/// Coerce numerical type columns to the same type
/// TODO rename to `coerce_columns`
fn normalize_columns(map: &mut HashMap<String, HashMap<ColumnTypeCategory, Vec<DynamicColumn>>>) {
for (_field_name, type_category_to_columns) in map.iter_mut() {
for (type_category, columns) in type_category_to_columns {
if type_category == &ColumnTypeCategory::Numerical {
let casted_columns = cast_to_common_numerical_column(&columns);
*columns = casted_columns;
}
}
}
}
/// Receives a list of columns of numerical types (u64, i64, f64)
///
/// Returns a list of `DynamicColumn` which are all of the same numerical type
fn cast_to_common_numerical_column(columns: &[DynamicColumn]) -> Vec<DynamicColumn> {
assert!(columns
.iter()
.all(|column| column.column_type().numerical_type().is_some()));
let coerce_to_i64: Vec<_> = columns
.iter()
.map(|column| column.clone().coerce_to_i64())
.collect();
if coerce_to_i64.iter().all(|column| column.is_some()) {
return coerce_to_i64
.into_iter()
.map(|column| column.unwrap())
.collect();
}
let coerce_to_u64: Vec<_> = columns
.iter()
.map(|column| column.clone().coerce_to_u64())
.collect();
if coerce_to_u64.iter().all(|column| column.is_some()) {
return coerce_to_u64
.into_iter()
.map(|column| column.unwrap())
.collect();
}
columns
.iter()
.map(|column| {
column
.clone()
.coerce_to_f64()
.expect("couldn't cast column to f64")
})
.collect()
}
#[cfg(test)]
mod tests {
use super::*;
use crate::ColumnarWriter;
#[test]
fn test_column_coercion() {
// i64 type
let columnar1 = {
let mut dataframe_writer = ColumnarWriter::default();
dataframe_writer.record_numerical(1u32, "numbers", 1i64);
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer.serialize(2, &mut buffer).unwrap();
ColumnarReader::open(buffer).unwrap()
};
// u64 type
let columnar2 = {
let mut dataframe_writer = ColumnarWriter::default();
dataframe_writer.record_numerical(1u32, "numbers", u64::MAX - 100);
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer.serialize(2, &mut buffer).unwrap();
ColumnarReader::open(buffer).unwrap()
};
// f64 type
let columnar3 = {
let mut dataframe_writer = ColumnarWriter::default();
dataframe_writer.record_numerical(1u32, "numbers", 30.5);
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer.serialize(2, &mut buffer).unwrap();
ColumnarReader::open(buffer).unwrap()
};
let column_map = collect_columns(&[&columnar1, &columnar2, &columnar3]).unwrap();
assert_eq!(column_map.len(), 1);
let cat_to_columns = column_map.get("numbers").unwrap();
assert_eq!(cat_to_columns.len(), 1);
let numerical = cat_to_columns.get(&ColumnTypeCategory::Numerical).unwrap();
assert!(numerical.iter().all(|column| column.is_f64()));
let column_map = collect_columns(&[&columnar1, &columnar1]).unwrap();
assert_eq!(column_map.len(), 1);
let cat_to_columns = column_map.get("numbers").unwrap();
assert_eq!(cat_to_columns.len(), 1);
let numerical = cat_to_columns.get(&ColumnTypeCategory::Numerical).unwrap();
assert!(numerical.iter().all(|column| column.is_i64()));
let column_map = collect_columns(&[&columnar2, &columnar2]).unwrap();
assert_eq!(column_map.len(), 1);
let cat_to_columns = column_map.get("numbers").unwrap();
assert_eq!(cat_to_columns.len(), 1);
let numerical = cat_to_columns.get(&ColumnTypeCategory::Numerical).unwrap();
assert!(numerical.iter().all(|column| column.is_u64()));
}
}

View File

@@ -1,10 +1,28 @@
// Copyright (C) 2022 Quickwit, Inc.
//
// Quickwit is offered under the AGPL v3.0 and as commercial software.
// For commercial licensing, contact us at hello@quickwit.io.
//
// AGPL:
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
mod column_type;
mod format_version;
mod merge;
mod reader;
mod writer;
pub use column_type::{ColumnType, HasAssociatedColumnType};
pub use merge::{merge_columnar, MergeDocOrder};
pub use reader::ColumnarReader;
pub use writer::ColumnarWriter;

View File

@@ -1,5 +1,3 @@
use std::net::Ipv6Addr;
use crate::dictionary::UnorderedId;
use crate::utils::{place_bits, pop_first_byte, select_bits};
use crate::value::NumericalValue;
@@ -27,12 +25,12 @@ struct ColumnOperationMetadata {
impl ColumnOperationMetadata {
fn to_code(self) -> u8 {
place_bits::<0, 6>(self.len) | place_bits::<6, 8>(self.op_type.to_code())
place_bits::<0, 4>(self.len) | place_bits::<4, 8>(self.op_type.to_code())
}
fn try_from_code(code: u8) -> Result<Self, InvalidData> {
let len = select_bits::<0, 6>(code);
let typ_code = select_bits::<6, 8>(code);
let len = select_bits::<0, 4>(code);
let typ_code = select_bits::<4, 8>(code);
let column_type = ColumnOperationType::try_from_code(typ_code)?;
Ok(ColumnOperationMetadata {
op_type: column_type,
@@ -144,21 +142,9 @@ impl SymbolValue for bool {
}
}
impl SymbolValue for Ipv6Addr {
fn serialize(self, buffer: &mut [u8]) -> u8 {
buffer[0..16].copy_from_slice(&self.octets());
16
}
fn deserialize(bytes: &[u8]) -> Self {
let octets: [u8; 16] = bytes[0..16].try_into().unwrap();
Ipv6Addr::from(octets)
}
}
#[derive(Default)]
struct MiniBuffer {
pub bytes: [u8; 17],
pub bytes: [u8; 10],
pub len: u8,
}
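Note: the metadata byte now packs the payload length into bits [0..4) and the operation type into bits [4..8), so an inline payload must fit in 4 bits; dropping the 16-byte `Ipv6Addr` payload is what makes that (and the smaller `MiniBuffer`) possible. A round-trip sketch under the same assumed `place_bits`/`select_bits` semantics as earlier:
```rust
fn place_bits<const START: u8, const END: u8>(val: u8) -> u8 {
    debug_assert!(val < (1u8 << (END - START)));
    val << START
}

fn select_bits<const START: u8, const END: u8>(code: u8) -> u8 {
    (code >> START) & ((1u8 << (END - START)) - 1)
}

fn main() {
    // len in bits [0..4) (must be < 16), op code in bits [4..8).
    let (len, op_type) = (9u8, 3u8);
    let code = place_bits::<0, 4>(len) | place_bits::<4, 8>(op_type);
    assert_eq!(select_bits::<0, 4>(code), len);
    assert_eq!(select_bits::<4, 8>(code), op_type);
}
```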

View File

@@ -184,12 +184,10 @@ impl CompatibleNumericalTypes {
}
impl NumericalColumnWriter {
pub fn numerical_type(&self) -> NumericalType {
self.compatible_numerical_types.to_numerical_type()
}
pub fn cardinality(&self, num_docs: RowId) -> Cardinality {
self.column_writer.get_cardinality(num_docs)
pub fn column_type_and_cardinality(&self, num_docs: RowId) -> (NumericalType, Cardinality) {
let numerical_type = self.compatible_numerical_types.to_numerical_type();
let cardinality = self.column_writer.get_cardinality(num_docs);
(numerical_type, cardinality)
}
pub fn record_numerical_value(
@@ -211,15 +209,15 @@ impl NumericalColumnWriter {
}
}
#[derive(Copy, Clone)]
pub(crate) struct StrOrBytesColumnWriter {
#[derive(Copy, Clone, Default)]
pub(crate) struct StrColumnWriter {
pub(crate) dictionary_id: u32,
pub(crate) column_writer: ColumnWriter,
}
impl StrOrBytesColumnWriter {
pub(crate) fn with_dictionary_id(dictionary_id: u32) -> StrOrBytesColumnWriter {
StrOrBytesColumnWriter {
impl StrColumnWriter {
pub(crate) fn with_dictionary_id(dictionary_id: u32) -> StrColumnWriter {
StrColumnWriter {
dictionary_id,
column_writer: Default::default(),
}

View File

@@ -4,7 +4,6 @@ mod serializer;
mod value_index;
use std::io;
use std::net::Ipv6Addr;
use column_operation::ColumnOperation;
use common::CountingWriter;
@@ -12,12 +11,10 @@ use serializer::ColumnarSerializer;
use stacker::{Addr, ArenaHashMap, MemoryArena};
use crate::column_index::SerializableColumnIndex;
use crate::column_values::{
ColumnValues, MonotonicallyMappableToU128, MonotonicallyMappableToU64, VecColumn,
};
use crate::columnar::column_type::ColumnType;
use crate::column_values::{ColumnValues, MonotonicallyMappableToU64, VecColumn};
use crate::columnar::column_type::{ColumnType, ColumnTypeCategory};
use crate::columnar::writer::column_writers::{
ColumnWriter, NumericalColumnWriter, StrOrBytesColumnWriter,
ColumnWriter, NumericalColumnWriter, StrColumnWriter,
};
use crate::columnar::writer::value_index::{IndexBuilder, PreallocatedIndexBuilders};
use crate::dictionary::{DictionaryBuilder, TermIdMapping, UnorderedId};
@@ -33,7 +30,6 @@ struct SpareBuffers {
u64_values: Vec<u64>,
f64_values: Vec<f64>,
bool_values: Vec<bool>,
ip_addr_values: Vec<Ipv6Addr>,
}
/// Makes it possible to create a new columnar.
@@ -53,9 +49,7 @@ pub struct ColumnarWriter {
numerical_field_hash_map: ArenaHashMap,
datetime_field_hash_map: ArenaHashMap,
bool_field_hash_map: ArenaHashMap,
ip_addr_field_hash_map: ArenaHashMap,
bytes_field_hash_map: ArenaHashMap,
str_field_hash_map: ArenaHashMap,
arena: MemoryArena,
// Dictionaries used to store dictionary-encoded values.
dictionaries: Vec<DictionaryBuilder>,
@@ -67,9 +61,7 @@ impl Default for ColumnarWriter {
ColumnarWriter {
numerical_field_hash_map: ArenaHashMap::new(10_000),
bool_field_hash_map: ArenaHashMap::new(10_000),
ip_addr_field_hash_map: ArenaHashMap::new(10_000),
bytes_field_hash_map: ArenaHashMap::new(10_000),
str_field_hash_map: ArenaHashMap::new(10_000),
datetime_field_hash_map: ArenaHashMap::new(10_000),
dictionaries: Vec::new(),
arena: MemoryArena::default(),
@@ -101,34 +93,15 @@ impl ColumnarWriter {
+ self.numerical_field_hash_map.mem_usage()
+ self.bool_field_hash_map.mem_usage()
+ self.bytes_field_hash_map.mem_usage()
+ self.str_field_hash_map.mem_usage()
+ self.ip_addr_field_hash_map.mem_usage()
+ self.datetime_field_hash_map.mem_usage()
}
pub fn record_column_type(&mut self, column_name: &str, column_type: ColumnType) {
match column_type {
ColumnType::Str | ColumnType::Bytes => {
let (hash_map, dictionaries) = (
if column_type == ColumnType::Str {
&mut self.str_field_hash_map
} else {
&mut self.bytes_field_hash_map
},
&mut self.dictionaries,
);
ColumnType::Str => {
mutate_or_create_column(
hash_map,
&mut self.bytes_field_hash_map,
column_name,
|column_opt: Option<StrOrBytesColumnWriter>| {
if let Some(column_writer) = column_opt {
column_writer
} else {
let dictionary_id = dictionaries.len() as u32;
dictionaries.push(DictionaryBuilder::default());
StrOrBytesColumnWriter::with_dictionary_id(dictionary_id)
}
},
|column_opt: Option<StrColumnWriter>| column_opt.unwrap_or_default(),
);
}
ColumnType::Bool => {
@@ -145,8 +118,7 @@ impl ColumnarWriter {
|column_opt: Option<ColumnWriter>| column_opt.unwrap_or_default(),
);
}
ColumnType::I64 | ColumnType::F64 | ColumnType::U64 => {
let numerical_type = column_type.numerical_type().unwrap();
ColumnType::Numerical(numerical_type) => {
mutate_or_create_column(
&mut self.numerical_field_hash_map,
column_name,
@@ -157,11 +129,6 @@ impl ColumnarWriter {
},
);
}
ColumnType::IpAddr => mutate_or_create_column(
&mut self.ip_addr_field_hash_map,
column_name,
|column_opt: Option<ColumnWriter>| column_opt.unwrap_or_default(),
),
}
}
@@ -195,22 +162,6 @@ impl ColumnarWriter {
);
}
pub fn record_ip_addr(&mut self, doc: RowId, column_name: &str, ip_addr: Ipv6Addr) {
assert!(
!column_name.as_bytes().contains(&0u8),
"key may not contain the 0 byte"
);
let (hash_map, arena) = (&mut self.ip_addr_field_hash_map, &mut self.arena);
hash_map.mutate_or_create(
column_name.as_bytes(),
|column_opt: Option<ColumnWriter>| {
let mut column: ColumnWriter = column_opt.unwrap_or_default();
column.record(doc, ip_addr, arena);
column
},
);
}
pub fn record_bool(&mut self, doc: RowId, column_name: &str, val: bool) {
let (hash_map, arena) = (&mut self.bool_field_hash_map, &mut self.arena);
mutate_or_create_column(hash_map, column_name, |column_opt: Option<ColumnWriter>| {
@@ -231,18 +182,19 @@ impl ColumnarWriter {
pub fn record_str(&mut self, doc: RowId, column_name: &str, value: &str) {
let (hash_map, arena, dictionaries) = (
&mut self.str_field_hash_map,
&mut self.bytes_field_hash_map,
&mut self.arena,
&mut self.dictionaries,
);
hash_map.mutate_or_create(
column_name.as_bytes(),
|column_opt: Option<StrOrBytesColumnWriter>| {
let mut column: StrOrBytesColumnWriter = column_opt.unwrap_or_else(|| {
mutate_or_create_column(
hash_map,
column_name,
|column_opt: Option<StrColumnWriter>| {
let mut column: StrColumnWriter = column_opt.unwrap_or_else(|| {
// Each column has its own dictionary
let dictionary_id = dictionaries.len() as u32;
dictionaries.push(DictionaryBuilder::default());
StrOrBytesColumnWriter::with_dictionary_id(dictionary_id)
StrColumnWriter::with_dictionary_id(dictionary_id)
});
column.record_bytes(doc, value.as_bytes(), dictionaries, arena);
column
@@ -250,79 +202,35 @@ impl ColumnarWriter {
);
}
pub fn record_bytes(&mut self, doc: RowId, column_name: &str, value: &[u8]) {
assert!(
!column_name.as_bytes().contains(&0u8),
"key may not contain the 0 byte"
);
let (hash_map, arena, dictionaries) = (
&mut self.bytes_field_hash_map,
&mut self.arena,
&mut self.dictionaries,
);
hash_map.mutate_or_create(
column_name.as_bytes(),
|column_opt: Option<StrOrBytesColumnWriter>| {
let mut column: StrOrBytesColumnWriter = column_opt.unwrap_or_else(|| {
// Each column has its own dictionary
let dictionary_id = dictionaries.len() as u32;
dictionaries.push(DictionaryBuilder::default());
StrOrBytesColumnWriter::with_dictionary_id(dictionary_id)
});
column.record_bytes(doc, value, dictionaries, arena);
column
},
);
}
pub fn serialize(&mut self, num_docs: RowId, wrt: &mut dyn io::Write) -> io::Result<()> {
let mut serializer = ColumnarSerializer::new(wrt);
let mut columns: Vec<(&[u8], ColumnType, Addr)> = self
let mut columns: Vec<(&[u8], ColumnTypeCategory, Addr)> = self
.numerical_field_hash_map
.iter()
.map(|(column_name, addr, _)| {
let numerical_column_writer: NumericalColumnWriter =
self.numerical_field_hash_map.read(addr);
let column_type = numerical_column_writer.numerical_type().into();
(column_name, column_type, addr)
})
.map(|(column_name, addr, _)| (column_name, ColumnTypeCategory::Numerical, addr))
.collect();
columns.extend(
self.bytes_field_hash_map
.iter()
.map(|(term, addr, _)| (term, ColumnType::Bytes, addr)),
);
columns.extend(
self.str_field_hash_map
.iter()
.map(|(column_name, addr, _)| (column_name, ColumnType::Str, addr)),
.map(|(column_name, addr, _)| (column_name, ColumnTypeCategory::Str, addr)),
);
columns.extend(
self.bool_field_hash_map
.iter()
.map(|(column_name, addr, _)| (column_name, ColumnType::Bool, addr)),
);
columns.extend(
self.ip_addr_field_hash_map
.iter()
.map(|(column_name, addr, _)| (column_name, ColumnType::IpAddr, addr)),
.map(|(column_name, addr, _)| (column_name, ColumnTypeCategory::Bool, addr)),
);
columns.extend(
self.datetime_field_hash_map
.iter()
.map(|(column_name, addr, _)| (column_name, ColumnType::DateTime, addr)),
.map(|(column_name, addr, _)| (column_name, ColumnTypeCategory::DateTime, addr)),
);
columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type));
let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries);
let mut symbol_byte_buffer: Vec<u8> = Vec::new();
for (column_name, column_type, addr) in columns {
match column_type {
ColumnType::Bool | ColumnType::DateTime => {
let column_writer: ColumnWriter = if column_type == ColumnType::Bool {
self.bool_field_hash_map.read(addr)
} else {
self.datetime_field_hash_map.read(addr)
};
for (column_name, bytes_or_numerical, addr) in columns {
match bytes_or_numerical {
ColumnTypeCategory::Bool => {
let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr);
let cardinality = column_writer.get_cardinality(num_docs);
let mut column_serializer =
serializer.serialize_column(column_name, ColumnType::Bool);
@@ -334,50 +242,29 @@ impl ColumnarWriter {
&mut column_serializer,
)?;
}
ColumnType::IpAddr => {
let column_writer: ColumnWriter = self.ip_addr_field_hash_map.read(addr);
let cardinality = column_writer.get_cardinality(num_docs);
let mut column_serializer =
serializer.serialize_column(column_name, ColumnType::IpAddr);
serialize_ip_addr_column(
cardinality,
num_docs,
column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
buffers,
&mut column_serializer,
)?;
}
ColumnType::Bytes | ColumnType::Str => {
let str_or_bytes_column_writer: StrOrBytesColumnWriter =
if column_type == ColumnType::Bytes {
self.bytes_field_hash_map.read(addr)
} else {
self.str_field_hash_map.read(addr)
};
ColumnTypeCategory::Str => {
let str_column_writer: StrColumnWriter = self.bytes_field_hash_map.read(addr);
let dictionary_builder =
&dictionaries[str_or_bytes_column_writer.dictionary_id as usize];
let cardinality = str_or_bytes_column_writer
.column_writer
.get_cardinality(num_docs);
&dictionaries[str_column_writer.dictionary_id as usize];
let cardinality = str_column_writer.column_writer.get_cardinality(num_docs);
let mut column_serializer =
serializer.serialize_column(column_name, column_type);
serialize_bytes_or_str_column(
serializer.serialize_column(column_name, ColumnType::Str);
serialize_bytes_column(
cardinality,
num_docs,
dictionary_builder,
str_or_bytes_column_writer
.operation_iterator(arena, &mut symbol_byte_buffer),
str_column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
buffers,
&mut column_serializer,
)?;
}
ColumnType::I64 | ColumnType::F64 | ColumnType::U64 => {
ColumnTypeCategory::Numerical => {
let numerical_column_writer: NumericalColumnWriter =
self.numerical_field_hash_map.read(addr);
let numerical_type = column_type.numerical_type().unwrap();
let cardinality = numerical_column_writer.cardinality(num_docs);
let mut column_serializer =
serializer.serialize_column(column_name, ColumnType::from(numerical_type));
let (numerical_type, cardinality) =
numerical_column_writer.column_type_and_cardinality(num_docs);
let mut column_serializer = serializer
.serialize_column(column_name, ColumnType::Numerical(numerical_type));
serialize_numerical_column(
cardinality,
num_docs,
@@ -387,6 +274,20 @@ impl ColumnarWriter {
&mut column_serializer,
)?;
}
ColumnTypeCategory::DateTime => {
let column_writer: ColumnWriter = self.datetime_field_hash_map.read(addr);
let cardinality = column_writer.get_cardinality(num_docs);
let mut column_serializer =
serializer.serialize_column(column_name, ColumnType::DateTime);
serialize_numerical_column(
cardinality,
num_docs,
NumericalType::I64,
column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
buffers,
&mut column_serializer,
)?;
}
};
}
serializer.finalize()?;
@@ -394,7 +295,7 @@ impl ColumnarWriter {
}
}
fn serialize_bytes_or_str_column(
fn serialize_bytes_column(
cardinality: Cardinality,
num_docs: RowId,
dictionary_builder: &DictionaryBuilder,
@@ -421,7 +322,7 @@ fn serialize_bytes_or_str_column(
ColumnOperation::NewDoc(doc) => ColumnOperation::NewDoc(doc),
}
});
send_to_serialize_column_mappable_to_u64(
serialize_column(
operation_iterator,
cardinality,
num_docs,
@@ -450,7 +351,7 @@ fn serialize_numerical_column(
} = buffers;
match numerical_type {
NumericalType::I64 => {
send_to_serialize_column_mappable_to_u64(
serialize_column(
coerce_numerical_symbol::<i64>(op_iterator),
cardinality,
num_docs,
@@ -460,7 +361,7 @@ fn serialize_numerical_column(
)?;
}
NumericalType::U64 => {
send_to_serialize_column_mappable_to_u64(
serialize_column(
coerce_numerical_symbol::<u64>(op_iterator),
cardinality,
num_docs,
@@ -470,7 +371,7 @@ fn serialize_numerical_column(
)?;
}
NumericalType::F64 => {
send_to_serialize_column_mappable_to_u64(
serialize_column(
coerce_numerical_symbol::<f64>(op_iterator),
cardinality,
num_docs,
@@ -495,7 +396,7 @@ fn serialize_bool_column(
bool_values,
..
} = buffers;
send_to_serialize_column_mappable_to_u64(
serialize_column(
column_operations_it,
cardinality,
num_docs,
@@ -506,76 +407,7 @@ fn serialize_bool_column(
Ok(())
}
fn serialize_ip_addr_column(
cardinality: Cardinality,
num_docs: RowId,
column_operations_it: impl Iterator<Item = ColumnOperation<Ipv6Addr>>,
buffers: &mut SpareBuffers,
wrt: &mut impl io::Write,
) -> io::Result<()> {
let SpareBuffers {
value_index_builders,
ip_addr_values,
..
} = buffers;
send_to_serialize_column_mappable_to_u128(
column_operations_it,
cardinality,
num_docs,
value_index_builders,
ip_addr_values,
wrt,
)?;
Ok(())
}
fn send_to_serialize_column_mappable_to_u128<
T: Copy + std::fmt::Debug + Send + Sync + MonotonicallyMappableToU128 + PartialOrd,
>(
op_iterator: impl Iterator<Item = ColumnOperation<T>>,
cardinality: Cardinality,
num_docs: RowId,
value_index_builders: &mut PreallocatedIndexBuilders,
values: &mut Vec<T>,
mut wrt: impl io::Write,
) -> io::Result<()>
where
for<'a> VecColumn<'a, T>: ColumnValues<T>,
{
values.clear();
// TODO: split index and values
let serializable_column_index = match cardinality {
Cardinality::Full => {
consume_operation_iterator(
op_iterator,
value_index_builders.borrow_required_index_builder(),
values,
);
SerializableColumnIndex::Full
}
Cardinality::Optional => {
let optional_index_builder = value_index_builders.borrow_optional_index_builder();
consume_operation_iterator(op_iterator, optional_index_builder, values);
let optional_index = optional_index_builder.finish(num_docs);
SerializableColumnIndex::Optional(Box::new(optional_index))
}
Cardinality::Multivalued => {
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
consume_operation_iterator(op_iterator, multivalued_index_builder, values);
let multivalued_index = multivalued_index_builder.finish(num_docs);
SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
}
};
crate::column::serialize_column_mappable_to_u128(
serializable_column_index,
|| values.iter().cloned(),
values.len() as u32,
&mut wrt,
)?;
Ok(())
}
fn send_to_serialize_column_mappable_to_u64<
fn serialize_column<
T: Copy + Default + std::fmt::Debug + Send + Sync + MonotonicallyMappableToU64 + PartialOrd,
>(
op_iterator: impl Iterator<Item = ColumnOperation<T>>,
@@ -608,10 +440,11 @@ where
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
consume_operation_iterator(op_iterator, multivalued_index_builder, values);
let multivalued_index = multivalued_index_builder.finish(num_docs);
SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
todo!();
// SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
}
};
crate::column::serialize_column_mappable_to_u64(
crate::column::serialize_column_u64(
serializable_column_index,
&VecColumn::from(&values[..]),
&mut wrt,
@@ -649,12 +482,59 @@ fn consume_operation_iterator<T: std::fmt::Debug, TIndexBuilder: IndexBuilder>(
}
}
// /// Serializes the column with the codec with the best estimate on the data.
// fn serialize_numerical<T: MonotonicallyMappableToU64>(
// value_index: ValueIndexInfo,
// typed_column: impl Column<T>,
// output: &mut impl io::Write,
// codecs: &[FastFieldCodecType],
// ) -> io::Result<()> {
// let counting_writer = CountingWriter::wrap(output);
// serialize_value_index(value_index, output)?;
// let value_index_len = counting_writer.written_bytes();
// let output = counting_writer.finish();
// serialize_column(value_index, output)?;
// let column = monotonic_map_column(
// typed_column,
// crate::column::monotonic_mapping::StrictlyMonotonicMappingToInternal::<T>::new(),
// );
// let header = Header::compute_header(&column, codecs).ok_or_else(|| {
// io::Error::new(
// io::ErrorKind::InvalidInput,
// format!(
// "Data cannot be serialized with this list of codec. {:?}",
// codecs
// ),
// )
// })?;
// header.serialize(output)?;
// let normalized_column = header.normalize_column(column);
// assert_eq!(normalized_column.min_value(), 0u64);
// serialize_given_codec(normalized_column, header.codec_type, output)?;
// let column_header = ColumnFooter {
// value_index_len: todo!(),
// cardinality: todo!(),
// };
// let null_index_footer = NullIndexFooter {
// cardinality: value_index.get_cardinality(),
// null_index_codec: NullIndexCodec::Full,
// null_index_byte_range: 0..0,
// };
// append_null_index_footer(output, null_index_footer)?;
// Ok(())
// }
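Before the tests, a minimal end-to-end sketch of how this writer is driven, mirroring the dataframe tests elsewhere in this diff (ColumnarWriter, record_numerical, record_str, serialize and ColumnarReader::open all appear there; the row ids, field names and io::Result error plumbing are assumptions for illustration):
use columnar::{ColumnarReader, ColumnarWriter};
fn columnar_round_trip() -> std::io::Result<()> {
    let mut writer = ColumnarWriter::default();
    // Rows can be recorded sparsely; rows that are never recorded become nulls.
    writer.record_numerical(2u32, "score", 10u64);
    writer.record_str(4u32, "city", "tokyo");
    let mut buffer: Vec<u8> = Vec::new();
    // 5 is the total number of rows covered by this columnar block.
    writer.serialize(5, &mut buffer)?;
    let reader = ColumnarReader::open(buffer)?;
    assert_eq!(reader.num_columns(), 2);
    Ok(())
}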
#[cfg(test)]
mod tests {
use column_operation::ColumnOperation;
use stacker::MemoryArena;
use crate::columnar::writer::column_operation::ColumnOperation;
use crate::{Cardinality, NumericalValue};
use super::*;
use crate::value::NumericalValue;
#[test]
fn test_column_writer_required_simple() {

View File

@@ -45,6 +45,16 @@ impl<'a> SerializableOptionalIndex<'a> for SingleValueArrayIndex<'a> {
}
}
impl OptionalIndexBuilder {
fn num_non_nulls(&self) -> u32 {
self.docs.len() as u32
}
fn iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
Box::new(self.docs.iter().copied())
}
}
impl OptionalIndexBuilder {
pub fn finish<'a>(&'a mut self, num_rows: RowId) -> impl SerializableOptionalIndex + 'a {
debug_assert!(self
@@ -86,7 +96,7 @@ pub struct MultivaluedIndexBuilder {
impl MultivaluedIndexBuilder {
pub fn finish(&mut self, num_docs: RowId) -> impl ColumnValues<u32> + '_ {
self.start_offsets
.resize(num_docs as usize + 1, self.total_num_vals_seen);
.resize(num_docs as usize, self.total_num_vals_seen);
VecColumn {
values: &&self.start_offsets[..],
min_value: 0,
@@ -178,7 +188,7 @@ mod tests {
.finish(4u32)
.iter()
.collect::<Vec<u32>>(),
vec![0, 0, 2, 3, 3]
vec![0, 0, 2, 3]
);
multivalued_value_index_builder.reset();
multivalued_value_index_builder.record_row(2u32);
@@ -189,7 +199,7 @@ mod tests {
.finish(4u32)
.iter()
.collect::<Vec<u32>>(),
vec![0, 0, 0, 2, 2]
vec![0, 0, 0, 2]
);
}
}
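The changed expectations above come down to whether start_offsets carries a trailing sentinel: with it, 4 docs need 5 offsets and row r's values occupy values[start_offsets[r]..start_offsets[r + 1]]; without it, the final row's end is implied by the total value count. A small sketch of the sentinel layout (the data is made up for illustration):
fn sentinel_layout_demo() {
    // doc0 -> [], doc1 -> [], doc2 -> ["a", "b"], doc3 -> ["c"]
    let start_offsets: Vec<u32> = vec![0, 0, 0, 2, 3]; // num_docs + 1 entries
    let values = ["a", "b", "c"];
    let row = 2usize;
    let range = start_offsets[row] as usize..start_offsets[row + 1] as usize;
    assert_eq!(&values[range], &["a", "b"][..]);
}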

View File

@@ -1,14 +1,11 @@
use std::io;
use std::net::Ipv6Addr;
use std::sync::Arc;
use std::net::IpAddr;
use common::file_slice::FileSlice;
use common::{HasLen, OwnedBytes};
use crate::column::{BytesColumn, Column, StrColumn};
use crate::column_values::{monotonic_map_column, StrictlyMonotonicFn};
use crate::column::{BytesColumn, Column};
use crate::columnar::ColumnType;
use crate::{DateTime, NumericalType};
#[derive(Clone)]
pub enum DynamicColumn {
@@ -16,139 +13,16 @@ pub enum DynamicColumn {
I64(Column<i64>),
U64(Column<u64>),
F64(Column<f64>),
IpAddr(Column<Ipv6Addr>),
DateTime(Column<DateTime>),
Bytes(BytesColumn),
Str(StrColumn),
}
impl DynamicColumn {
pub fn column_type(&self) -> ColumnType {
match self {
DynamicColumn::Bool(_) => ColumnType::Bool,
DynamicColumn::I64(_) => ColumnType::I64,
DynamicColumn::U64(_) => ColumnType::U64,
DynamicColumn::F64(_) => ColumnType::F64,
DynamicColumn::IpAddr(_) => ColumnType::IpAddr,
DynamicColumn::DateTime(_) => ColumnType::DateTime,
DynamicColumn::Bytes(_) => ColumnType::Bytes,
DynamicColumn::Str(_) => ColumnType::Str,
}
}
pub fn is_numerical(&self) -> bool {
self.column_type().numerical_type().is_some()
}
pub fn is_f64(&self) -> bool {
self.column_type().numerical_type() == Some(NumericalType::F64)
}
pub fn is_i64(&self) -> bool {
self.column_type().numerical_type() == Some(NumericalType::I64)
}
pub fn is_u64(&self) -> bool {
self.column_type().numerical_type() == Some(NumericalType::U64)
}
pub fn coerce_to_f64(self) -> Option<DynamicColumn> {
match self {
DynamicColumn::I64(column) => Some(DynamicColumn::F64(Column {
idx: column.idx,
values: Arc::new(monotonic_map_column(column.values, MapI64ToF64)),
})),
DynamicColumn::U64(column) => Some(DynamicColumn::F64(Column {
idx: column.idx,
values: Arc::new(monotonic_map_column(column.values, MapU64ToF64)),
})),
DynamicColumn::F64(_) => Some(self),
_ => None,
}
}
pub fn coerce_to_i64(self) -> Option<DynamicColumn> {
match self {
DynamicColumn::U64(column) => {
if column.max_value() > i64::MAX as u64 {
return None;
}
Some(DynamicColumn::I64(Column {
idx: column.idx,
values: Arc::new(monotonic_map_column(column.values, MapU64ToI64)),
}))
}
DynamicColumn::I64(_) => Some(self),
_ => None,
}
}
pub fn coerce_to_u64(self) -> Option<DynamicColumn> {
match self {
DynamicColumn::I64(column) => {
if column.min_value() < 0 {
return None;
}
Some(DynamicColumn::U64(Column {
idx: column.idx,
values: Arc::new(monotonic_map_column(column.values, MapI64ToU64)),
}))
}
DynamicColumn::U64(_) => Some(self),
_ => None,
}
}
}
struct MapI64ToF64;
impl StrictlyMonotonicFn<i64, f64> for MapI64ToF64 {
#[inline(always)]
fn mapping(&self, inp: i64) -> f64 {
inp as f64
}
#[inline(always)]
fn inverse(&self, out: f64) -> i64 {
out as i64
}
}
struct MapU64ToF64;
impl StrictlyMonotonicFn<u64, f64> for MapU64ToF64 {
#[inline(always)]
fn mapping(&self, inp: u64) -> f64 {
inp as f64
}
#[inline(always)]
fn inverse(&self, out: f64) -> u64 {
out as u64
}
}
struct MapU64ToI64;
impl StrictlyMonotonicFn<u64, i64> for MapU64ToI64 {
#[inline(always)]
fn mapping(&self, inp: u64) -> i64 {
inp as i64
}
#[inline(always)]
fn inverse(&self, out: i64) -> u64 {
out as u64
}
}
struct MapI64ToU64;
impl StrictlyMonotonicFn<i64, u64> for MapI64ToU64 {
#[inline(always)]
fn mapping(&self, inp: i64) -> u64 {
inp as u64
}
#[inline(always)]
fn inverse(&self, out: u64) -> i64 {
out as i64
}
IpAddr(Column<IpAddr>),
Str(BytesColumn),
DateTime(Column<crate::DateTime>),
}
macro_rules! static_dynamic_conversions {
($typ:ty, $enum_name:ident) => {
impl Into<Option<$typ>> for DynamicColumn {
fn into(self) -> Option<$typ> {
if let DynamicColumn::$enum_name(col) = self {
impl Into<Option<Column<$typ>>> for DynamicColumn {
fn into(self) -> Option<Column<$typ>> {
if let Self::$enum_name(col) = self {
Some(col)
} else {
None
@@ -156,22 +30,25 @@ macro_rules! static_dynamic_conversions {
}
}
impl From<$typ> for DynamicColumn {
fn from(typed_column: $typ) -> Self {
impl From<Column<$typ>> for DynamicColumn {
fn from(typed_column: Column<$typ>) -> Self {
DynamicColumn::$enum_name(typed_column)
}
}
};
}
static_dynamic_conversions!(Column<bool>, Bool);
static_dynamic_conversions!(Column<u64>, U64);
static_dynamic_conversions!(Column<i64>, I64);
static_dynamic_conversions!(Column<f64>, F64);
static_dynamic_conversions!(Column<crate::DateTime>, DateTime);
static_dynamic_conversions!(StrColumn, Str);
static_dynamic_conversions!(BytesColumn, Bytes);
static_dynamic_conversions!(Column<Ipv6Addr>, IpAddr);
static_dynamic_conversions!(bool, Bool);
static_dynamic_conversions!(u64, U64);
static_dynamic_conversions!(i64, I64);
static_dynamic_conversions!(f64, F64);
static_dynamic_conversions!(crate::DateTime, DateTime);
impl From<BytesColumn> for DynamicColumn {
fn from(dictionary_encoded_col: BytesColumn) -> Self {
DynamicColumn::Str(dictionary_encoded_col)
}
}
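The macro above stamps out matching Into<Option<…>>/From pairs for each variant; a sketch of how a call site uses the generated conversion (whether the target is Column<i64> or a bare typed value depends on which side of this diff you read — the sketch assumes the Column<i64> form):
use columnar::{Column, DynamicColumn};
fn as_i64(dyn_col: DynamicColumn) -> Option<Column<i64>> {
    // Some(..) only when the dynamic column actually is the I64 variant.
    dyn_col.into()
}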
#[derive(Clone)]
pub struct DynamicColumnHandle {
@@ -200,13 +77,12 @@ impl DynamicColumnHandle {
pub fn open_u64_lenient(&self) -> io::Result<Option<Column<u64>>> {
let column_bytes = self.file_slice.read_bytes()?;
match self.column_type {
ColumnType::Str | ColumnType::Bytes => {
let column: BytesColumn = crate::column::open_column_bytes(column_bytes)?;
ColumnType::Str => {
let column = crate::column::open_column_bytes(column_bytes)?;
Ok(Some(column.term_ord_column))
}
ColumnType::Bool => Ok(None),
ColumnType::IpAddr => Ok(None),
ColumnType::I64 | ColumnType::U64 | ColumnType::F64 | ColumnType::DateTime => {
ColumnType::Numerical(_) | ColumnType::DateTime => {
let column = crate::column::open_column_u64::<u64>(column_bytes)?;
Ok(Some(column))
}
@@ -215,15 +91,19 @@ impl DynamicColumnHandle {
fn open_internal(&self, column_bytes: OwnedBytes) -> io::Result<DynamicColumn> {
let dynamic_column: DynamicColumn = match self.column_type {
ColumnType::Bytes => {
crate::column::open_column_bytes::<BytesColumn>(column_bytes)?.into()
}
ColumnType::Str => crate::column::open_column_bytes::<StrColumn>(column_bytes)?.into(),
ColumnType::I64 => crate::column::open_column_u64::<i64>(column_bytes)?.into(),
ColumnType::U64 => crate::column::open_column_u64::<u64>(column_bytes)?.into(),
ColumnType::F64 => crate::column::open_column_u64::<f64>(column_bytes)?.into(),
ColumnType::Str => crate::column::open_column_bytes(column_bytes)?.into(),
ColumnType::Numerical(numerical_type) => match numerical_type {
crate::NumericalType::I64 => {
crate::column::open_column_u64::<i64>(column_bytes)?.into()
}
crate::NumericalType::U64 => {
crate::column::open_column_u64::<u64>(column_bytes)?.into()
}
crate::NumericalType::F64 => {
crate::column::open_column_u64::<f64>(column_bytes)?.into()
}
},
ColumnType::Bool => crate::column::open_column_u64::<bool>(column_bytes)?.into(),
ColumnType::IpAddr => crate::column::open_column_u128::<Ipv6Addr>(column_bytes)?.into(),
ColumnType::DateTime => {
crate::column::open_column_u64::<crate::DateTime>(column_bytes)?.into()
}
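Tying the handle to the conversions above, a hedged sketch of the read path (read_columns, open and first all appear in the tests in this diff; the column name and the io::Result error type are assumptions):
use columnar::{ColumnarReader, DynamicColumn, DynamicColumnHandle};
fn print_first_price(reader: &ColumnarReader) -> std::io::Result<()> {
    let handles: Vec<DynamicColumnHandle> = reader.read_columns("price")?;
    if let DynamicColumn::F64(col) = handles[0].open()? {
        // first(row_id) yields Option<f64>: None for rows without a value.
        println!("{:?}", col.first(0));
    }
    Ok(())
}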

View File

@@ -18,12 +18,9 @@ mod dynamic_column;
pub(crate) mod utils;
mod value;
pub use column::{BytesColumn, Column, StrColumn};
pub use column::Column;
pub use column_values::ColumnValues;
pub use columnar::{
merge_columnar, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType,
MergeDocOrder,
};
pub use columnar::{ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType};
pub use value::{NumericalType, NumericalValue};
pub use self::dynamic_column::{DynamicColumn, DynamicColumnHandle};

View File

@@ -1,13 +1,10 @@
use std::net::Ipv6Addr;
use crate::column_values::MonotonicallyMappableToU128;
use crate::columnar::ColumnType;
use crate::dynamic_column::{DynamicColumn, DynamicColumnHandle};
use crate::value::NumericalValue;
use crate::{Cardinality, ColumnarReader, ColumnarWriter};
#[test]
fn test_dataframe_writer_str() {
fn test_dataframe_writer_bytes() {
let mut dataframe_writer = ColumnarWriter::default();
dataframe_writer.record_str(1u32, "my_string", "hello");
dataframe_writer.record_str(3u32, "my_string", "helloeee");
@@ -17,21 +14,7 @@ fn test_dataframe_writer_str() {
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 158);
}
#[test]
fn test_dataframe_writer_bytes() {
let mut dataframe_writer = ColumnarWriter::default();
dataframe_writer.record_bytes(1u32, "my_string", b"hello");
dataframe_writer.record_bytes(3u32, "my_string", b"helloeee");
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer.serialize(5, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 158);
assert_eq!(cols[0].num_bytes(), 165);
}
#[test]
@@ -45,7 +28,7 @@ fn test_dataframe_writer_bool() {
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("bool.value").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 22);
assert_eq!(cols[0].num_bytes(), 29);
assert_eq!(cols[0].column_type(), ColumnType::Bool);
let dyn_bool_col = cols[0].open().unwrap();
let DynamicColumn::Bool(bool_col) = dyn_bool_col else { panic!(); };
@@ -53,59 +36,6 @@ fn test_dataframe_writer_bool() {
assert_eq!(&vals, &[None, Some(false), None, Some(true), None,]);
}
#[test]
fn test_dataframe_writer_u64_multivalued() {
let mut dataframe_writer = ColumnarWriter::default();
dataframe_writer.record_numerical(2u32, "divisor", 2u64);
dataframe_writer.record_numerical(3u32, "divisor", 3u64);
dataframe_writer.record_numerical(4u32, "divisor", 2u64);
dataframe_writer.record_numerical(5u32, "divisor", 5u64);
dataframe_writer.record_numerical(6u32, "divisor", 2u64);
dataframe_writer.record_numerical(6u32, "divisor", 3u64);
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer.serialize(7, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("divisor").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 29);
let dyn_i64_col = cols[0].open().unwrap();
let DynamicColumn::I64(divisor_col) = dyn_i64_col else { panic!(); };
assert_eq!(
divisor_col.get_cardinality(),
crate::Cardinality::Multivalued
);
assert_eq!(divisor_col.num_rows(), 7);
}
#[test]
fn test_dataframe_writer_ip_addr() {
let mut dataframe_writer = ColumnarWriter::default();
dataframe_writer.record_ip_addr(1, "ip_addr", Ipv6Addr::from_u128(1001));
dataframe_writer.record_ip_addr(3, "ip_addr", Ipv6Addr::from_u128(1050));
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer.serialize(5, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("ip_addr").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 42);
assert_eq!(cols[0].column_type(), ColumnType::IpAddr);
let dyn_bool_col = cols[0].open().unwrap();
let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else { panic!(); };
let vals: Vec<Option<Ipv6Addr>> = (0..5).map(|row_id| ip_col.first(row_id)).collect();
assert_eq!(
&vals,
&[
None,
Some(Ipv6Addr::from_u128(1001)),
None,
Some(Ipv6Addr::from_u128(1050)),
None,
]
);
}
#[test]
fn test_dataframe_writer_numerical() {
let mut dataframe_writer = ColumnarWriter::default();
@@ -123,7 +53,7 @@ fn test_dataframe_writer_numerical() {
// - header 14 bytes
// - vals 8 //< due to padding? could have been 1byte?.
// - null footer 6 bytes
assert_eq!(cols[0].num_bytes(), 33);
assert_eq!(cols[0].num_bytes(), 40);
let column = cols[0].open().unwrap();
let DynamicColumn::I64(column_i64) = column else { panic!(); };
assert_eq!(column_i64.idx.get_cardinality(), Cardinality::Optional);
@@ -137,76 +67,18 @@ fn test_dataframe_writer_numerical() {
}
#[test]
fn test_dictionary_encoded_str() {
fn test_dictionary_encoded() {
let mut buffer = Vec::new();
let mut columnar_writer = ColumnarWriter::default();
columnar_writer.record_str(1, "my.column", "a");
columnar_writer.record_str(3, "my.column", "c");
columnar_writer.record_str(1, "my.column", "my.key");
columnar_writer.record_str(3, "my.column", "my.key2");
columnar_writer.record_str(3, "my.column2", "different_column!");
columnar_writer.record_str(4, "my.column", "b");
columnar_writer.serialize(5, &mut buffer).unwrap();
let columnar_reader = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar_reader.num_columns(), 2);
let col_handles = columnar_reader.read_columns("my.column").unwrap();
assert_eq!(col_handles.len(), 1);
let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else { panic!(); };
let index: Vec<Option<u64>> = (0..5).map(|row_id| str_col.ords().first(row_id)).collect();
assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
assert_eq!(str_col.num_rows(), 5);
let mut term_buffer = String::new();
let term_ords = str_col.ords();
assert_eq!(term_ords.first(0), None);
assert_eq!(term_ords.first(1), Some(0));
str_col.ord_to_str(0u64, &mut term_buffer).unwrap();
assert_eq!(term_buffer, "a");
assert_eq!(term_ords.first(2), None);
assert_eq!(term_ords.first(3), Some(2));
str_col.ord_to_str(2u64, &mut term_buffer).unwrap();
assert_eq!(term_buffer, "c");
assert_eq!(term_ords.first(4), Some(1));
str_col.ord_to_str(1u64, &mut term_buffer).unwrap();
assert_eq!(term_buffer, "b");
}
#[test]
fn test_dictionary_encoded_bytes() {
let mut buffer = Vec::new();
let mut columnar_writer = ColumnarWriter::default();
columnar_writer.record_bytes(1, "my.column", b"a");
columnar_writer.record_bytes(3, "my.column", b"c");
columnar_writer.record_bytes(3, "my.column2", b"different_column!");
columnar_writer.record_bytes(4, "my.column", b"b");
columnar_writer.serialize(5, &mut buffer).unwrap();
let columnar_reader = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar_reader.num_columns(), 2);
let col_handles = columnar_reader.read_columns("my.column").unwrap();
assert_eq!(col_handles.len(), 1);
let DynamicColumn::Bytes(bytes_col) = col_handles[0].open().unwrap() else { panic!(); };
let index: Vec<Option<u64>> = (0..5)
.map(|row_id| bytes_col.ords().first(row_id))
.collect();
assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
assert_eq!(bytes_col.num_rows(), 5);
let mut term_buffer = Vec::new();
let term_ords = bytes_col.ords();
assert_eq!(term_ords.first(0), None);
assert_eq!(term_ords.first(1), Some(0));
bytes_col
.dictionary
.ord_to_term(0u64, &mut term_buffer)
.unwrap();
assert_eq!(term_buffer, b"a");
assert_eq!(term_ords.first(2), None);
assert_eq!(term_ords.first(3), Some(2));
bytes_col
.dictionary
.ord_to_term(2u64, &mut term_buffer)
.unwrap();
assert_eq!(term_buffer, b"c");
assert_eq!(term_ords.first(4), Some(1));
bytes_col
.dictionary
.ord_to_term(1u64, &mut term_buffer)
.unwrap();
assert_eq!(term_buffer, b"b");
// let term_ords = (0..)
}

View File

@@ -1,22 +1,12 @@
use crate::InvalidData;
use crate::{Column, ColumnType, InvalidData};
#[derive(Copy, Clone, PartialEq, Debug)]
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum NumericalValue {
I64(i64),
U64(u64),
F64(f64),
}
impl NumericalValue {
pub fn numerical_type(&self) -> NumericalType {
match self {
NumericalValue::I64(_) => NumericalType::I64,
NumericalValue::U64(_) => NumericalType::U64,
NumericalValue::F64(_) => NumericalType::F64,
}
}
}
impl From<u64> for NumericalValue {
fn from(val: u64) -> NumericalValue {
NumericalValue::U64(val)
@@ -35,6 +25,18 @@ impl From<f64> for NumericalValue {
}
}
impl NumericalValue {
pub fn numerical_type(&self) -> NumericalType {
match self {
NumericalValue::F64(_) => NumericalType::F64,
NumericalValue::I64(_) => NumericalType::I64,
NumericalValue::U64(_) => NumericalType::U64,
}
}
}
impl Eq for NumericalValue {}
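A two-line illustration of the mapping (From<u64> and From<f64> are defined above):
use columnar::{NumericalType, NumericalValue};
fn numerical_type_demo() {
    assert_eq!(NumericalValue::from(3u64).numerical_type(), NumericalType::U64);
    assert_eq!(NumericalValue::from(2.5f64).numerical_type(), NumericalType::F64);
}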
#[derive(Clone, Copy, Debug, Default, Hash, Eq, PartialEq)]
#[repr(u8)]
pub enum NumericalType {

View File

@@ -13,7 +13,7 @@ use tantivy::aggregation::agg_result::AggregationResults;
use tantivy::aggregation::metric::AverageAggregation;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::TermQuery;
use tantivy::schema::{self, Cardinality, IndexRecordOption, Schema, TextFieldIndexing};
use tantivy::schema::{self, IndexRecordOption, Schema, TextFieldIndexing};
use tantivy::{doc, Index, Term};
fn main() -> tantivy::Result<()> {
@@ -25,7 +25,7 @@ fn main() -> tantivy::Result<()> {
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype =
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
crate::schema::NumericOptions::default().set_fast();
let highscore_field = schema_builder.add_f64_field("highscore", score_fieldtype.clone());
let price_field = schema_builder.add_f64_field("price", score_fieldtype);
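The example diffs here and below boil down to one API change: fast-field options no longer take a Cardinality argument. A before/after sketch:
use tantivy::schema::NumericOptions;
fn fast_numeric_options() -> NumericOptions {
    // before: NumericOptions::default().set_fast(Cardinality::SingleValue)
    NumericOptions::default().set_fast()
}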

View File

@@ -4,7 +4,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Cardinality, DateOptions, Schema, Value, INDEXED, STORED, STRING};
use tantivy::schema::{DateOptions, Schema, Value, INDEXED, STORED, STRING};
use tantivy::Index;
fn main() -> tantivy::Result<()> {
@@ -12,7 +12,7 @@ fn main() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder();
let opts = DateOptions::from(INDEXED)
.set_stored()
.set_fast(Cardinality::SingleValue)
.set_fast()
.set_precision(tantivy::DatePrecision::Seconds);
let occurred_at = schema_builder.add_date_field("occurred_at", opts);
let event_type = schema_builder.add_text_field("event", STRING | STORED);
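A hedged sketch of feeding the date field defined above (doc!, DateTime::from_utc and the re-exported time crate are existing tantivy items; the writer heap budget and the timestamp are illustrative):
use tantivy::schema::Field;
use tantivy::time::macros::datetime;
use tantivy::{doc, DateTime, Index};
fn index_event(index: &Index, occurred_at: Field, event_type: Field) -> tantivy::Result<()> {
    let mut writer = index.writer(15_000_000)?;
    writer.add_document(doc!(
        occurred_at => DateTime::from_utc(datetime!(2022-06-22 12:53:50 UTC)),
        event_type => "pull-request"
    ))?;
    writer.commit()?;
    Ok(())
}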

View File

@@ -14,6 +14,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
[dependencies]
common = { version = "0.5", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.3", path = "../bitpacker/" }
columnar = { version= "0.1", path="../columnar", package="tantivy-columnar" }
prettytable-rs = {version="0.10.0", optional= true}
rand = {version="0.8.3", optional= true}
fastdivide = "0.4"

View File

@@ -2,81 +2,11 @@ use std::fmt::{self, Debug};
use std::marker::PhantomData;
use std::ops::{Range, RangeInclusive};
pub use columnar::ColumnValues as Column;
use tantivy_bitpacker::minmax;
use crate::monotonic_mapping::StrictlyMonotonicFn;
/// `Column` provides columnar access on a field.
pub trait Column<T: PartialOrd + Debug = u64>: Send + Sync {
/// Return the value associated with the given idx.
///
/// This accessor should return as fast as possible.
///
/// # Panics
///
/// May panic if `idx` is greater than or equal to the column length.
fn get_val(&self, idx: u32) -> T;
/// Fills an output buffer with the fast field values
/// associated with the `DocId` going from
/// `start` to `start + output.len()`.
///
/// # Panics
///
/// Must panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
#[inline]
fn get_range(&self, start: u64, output: &mut [T]) {
for (out, idx) in output.iter_mut().zip(start..) {
*out = self.get_val(idx as u32);
}
}
/// Get the positions of values which are in the provided value range.
///
/// Note that position == docid for single value fast fields
#[inline]
fn get_docids_for_value_range(
&self,
value_range: RangeInclusive<T>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
let doc_id_range = doc_id_range.start..doc_id_range.end.min(self.num_vals());
for idx in doc_id_range.start..doc_id_range.end {
let val = self.get_val(idx);
if value_range.contains(&val) {
positions.push(idx);
}
}
}
/// Returns the minimum value for this fast field.
///
/// This min_value may not be exact.
/// For instance, it does not take possibly deleted documents into
/// account. All values are however guaranteed to be greater than or
/// equal to `.min_value()`.
fn min_value(&self) -> T;
/// Returns the maximum value for this fast field.
///
/// This max_value may not be exact.
/// For instance, it does not take possibly deleted documents into
/// account. All values are however guaranteed to be less than or
/// equal to `.max_value()`.
fn max_value(&self) -> T;
/// The number of values in the column.
fn num_vals(&self) -> u32;
/// Returns an iterator over the data.
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = T> + 'a> {
Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
}
}
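Before its removal in favour of columnar::ColumnValues, the trait was typically exercised through the VecColumn wrapper defined just below; a small sketch of the range scan:
fn positions_with_value_in_10_to_20(vals: &[u64]) -> Vec<u32> {
    let col = VecColumn::from(vals);
    let mut positions = Vec::new();
    // Scan every row and keep the positions whose value falls in [10, 20].
    col.get_docids_for_value_range(10..=20, 0..col.num_vals(), &mut positions);
    positions
}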
/// VecColumn provides `Column` over a slice.
pub struct VecColumn<'a, T = u64> {
values: &'a [T],
@@ -84,32 +14,6 @@ pub struct VecColumn<'a, T = u64> {
max_value: T,
}
impl<'a, C: Column<T>, T: Copy + PartialOrd + fmt::Debug> Column<T> for &'a C {
fn get_val(&self, idx: u32) -> T {
(*self).get_val(idx)
}
fn min_value(&self) -> T {
(*self).min_value()
}
fn max_value(&self) -> T {
(*self).max_value()
}
fn num_vals(&self) -> u32 {
(*self).num_vals()
}
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = T> + 'b> {
(*self).iter()
}
fn get_range(&self, start: u64, output: &mut [T]) {
(*self).get_range(start, output)
}
}
impl<'a, T: Copy + PartialOrd + Send + Sync + Debug> Column<T> for VecColumn<'a, T> {
fn get_val(&self, position: u32) -> T {
self.values[position as usize]

View File

@@ -402,8 +402,8 @@ mod tests {
let mut buffer = Vec::new();
let col = VecColumn::from(&[false, true][..]);
serialize(col, &mut buffer, &ALL_CODEC_TYPES).unwrap();
// 5 bytes of header, 1 byte of value
assert_eq!(buffer.len(), 3 + 5 + 1 + 4 + 2);
// 5 bytes of header, 1 byte of value, 7 bytes of padding.
assert_eq!(buffer.len(), 3 + 5 + 8 + 4 + 2);
}
#[test]
@@ -411,8 +411,8 @@ mod tests {
let mut buffer = Vec::new();
let col = VecColumn::from(&[true][..]);
serialize(col, &mut buffer, &ALL_CODEC_TYPES).unwrap();
// 5 bytes of header, 0 bytes of value
assert_eq!(buffer.len(), 3 + 5 + 4 + 2);
// 5 bytes of header, 0 bytes of value, 7 bytes of padding.
assert_eq!(buffer.len(), 3 + 5 + 7 + 4 + 2);
}
#[test]
@@ -422,6 +422,6 @@ mod tests {
let col = VecColumn::from(&vals[..]);
serialize(col, &mut buffer, &[FastFieldCodecType::Bitpacked]).unwrap();
// Values are stored over 3 bits.
assert_eq!(buffer.len(), 3 + 7 + (3 * 80 / 8) + 4 + 2);
assert_eq!(buffer.len(), 3 + 7 + (3 * 80 / 8) + 7 + 4 + 2);
}
}

run-tests.sh (new executable file)
View File

@@ -0,0 +1,2 @@
#!/bin/bash
cargo test

View File

@@ -15,7 +15,7 @@ use super::metric::{
use super::segment_agg_result::BucketCount;
use super::VecWithNames;
use crate::fastfield::{type_and_cardinality, MultiValuedFastFieldReader};
use crate::schema::{Cardinality, Type};
use crate::schema::Type;
use crate::{InvertedIndexReader, SegmentReader, TantivyError};
#[derive(Clone, Default)]

View File

@@ -362,19 +362,13 @@ impl SegmentTermCollector {
let mut entries: Vec<(u32, TermBucketEntry)> =
self.term_buckets.entries.into_iter().collect();
let order_by_key = self.req.order.target == OrderTarget::Key;
let order_by_sub_aggregation =
matches!(self.req.order.target, OrderTarget::SubAggregation(_));
match self.req.order.target {
OrderTarget::Key => {
// We rely on the fact, that term ordinals match the order of the strings
// TODO: We could have a special collector, that keeps only TOP n results at any
// time.
if self.req.order.order == Order::Desc {
entries.sort_unstable_by_key(|bucket| std::cmp::Reverse(bucket.0));
} else {
entries.sort_unstable_by_key(|bucket| bucket.0);
}
// defer order and cut_off after loading the texts from the dictionary
}
OrderTarget::SubAggregation(_name) => {
// don't sort and cut off since it's hard to make assumptions on the quality of the
@@ -390,11 +384,12 @@ impl SegmentTermCollector {
}
}
let (term_doc_count_before_cutoff, sum_other_doc_count) = if order_by_sub_aggregation {
(0, 0)
} else {
cut_off_buckets(&mut entries, self.req.segment_size as usize)
};
let (term_doc_count_before_cutoff, mut sum_other_doc_count) =
if order_by_key || order_by_sub_aggregation {
(0, 0)
} else {
cut_off_buckets(&mut entries, self.req.segment_size as usize)
};
let inverted_index = agg_with_accessor
.inverted_index
@@ -417,10 +412,6 @@ impl SegmentTermCollector {
if self.req.min_doc_count == 0 {
let mut stream = term_dict.stream()?;
while let Some((key, _ord)) = stream.next() {
if dict.len() >= self.req.segment_size as usize {
break;
}
let key = std::str::from_utf8(key)
.map_err(|utf8_err| DataCorruption::comment_only(utf8_err.to_string()))?;
if !dict.contains_key(key) {
@@ -429,6 +420,20 @@ impl SegmentTermCollector {
}
}
if order_by_key {
let mut dict_entries = dict.into_iter().collect_vec();
if self.req.order.order == Order::Desc {
dict_entries.sort_unstable_by(|(key1, _), (key2, _)| key1.cmp(key2));
} else {
dict_entries.sort_unstable_by(|(key1, _), (key2, _)| key2.cmp(key1));
}
let (_, sum_other_docs) =
cut_off_buckets(&mut dict_entries, self.req.segment_size as usize);
sum_other_doc_count += sum_other_docs;
dict = dict_entries.into_iter().collect();
}
Ok(IntermediateBucketResult::Terms(
IntermediateTermBucketResult {
entries: dict,
@@ -918,14 +923,14 @@ mod tests {
];
let index = get_test_index_from_values_and_terms(merge_segments, &segment_and_terms)?;
// key asc
// key desc
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "string_id".to_string(),
order: Some(CustomOrder {
order: Order::Asc,
order: Order::Desc,
target: OrderTarget::Key,
}),
..Default::default()
@@ -952,7 +957,7 @@ mod tests {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "string_id".to_string(),
order: Some(CustomOrder {
order: Order::Asc,
order: Order::Desc,
target: OrderTarget::Key,
}),
size: Some(2),
@@ -976,14 +981,14 @@ mod tests {
assert_eq!(res["my_texts"]["sum_other_doc_count"], 3);
// key asc and segment_size cut_off
// key desc and segment_size cut_off
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "string_id".to_string(),
order: Some(CustomOrder {
order: Order::Asc,
order: Order::Desc,
target: OrderTarget::Key,
}),
size: Some(2),
@@ -1006,14 +1011,14 @@ mod tests {
serde_json::Value::Null
);
// key desc
// key asc
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "string_id".to_string(),
order: Some(CustomOrder {
order: Order::Desc,
order: Order::Asc,
target: OrderTarget::Key,
}),
..Default::default()
@@ -1033,14 +1038,14 @@ mod tests {
assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 5);
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
// key desc, size cut_off
// key asc, size cut_off
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "string_id".to_string(),
order: Some(CustomOrder {
order: Order::Desc,
order: Order::Asc,
target: OrderTarget::Key,
}),
size: Some(2),
@@ -1063,14 +1068,14 @@ mod tests {
);
assert_eq!(res["my_texts"]["sum_other_doc_count"], 5);
// key desc, segment_size cut_off
// key asc, segment_size cut_off
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "string_id".to_string(),
order: Some(CustomOrder {
order: Order::Desc,
order: Order::Asc,
target: OrderTarget::Key,
}),
size: Some(2),
@@ -1347,3 +1352,68 @@ mod tests {
Ok(())
}
}
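The ordering tests above map one-to-one onto the request structs; a sketch of building a terms aggregation ordered by key, ascending, with a size cut-off (field name and sizes taken from the tests; BucketAggregation's sub_aggregation is assumed to default):
use tantivy::aggregation::agg_req::{
    Aggregation, Aggregations, BucketAggregation, BucketAggregationType,
};
use tantivy::aggregation::bucket::{CustomOrder, Order, OrderTarget, TermsAggregation};
fn terms_by_key_asc() -> Aggregations {
    vec![(
        "my_texts".to_string(),
        Aggregation::Bucket(BucketAggregation {
            bucket_agg: BucketAggregationType::Terms(TermsAggregation {
                field: "string_id".to_string(),
                order: Some(CustomOrder {
                    order: Order::Asc,
                    target: OrderTarget::Key,
                }),
                size: Some(2),
                ..Default::default()
            }),
            sub_aggregation: Default::default(),
        }),
    )]
    .into_iter()
    .collect()
}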
#[cfg(all(test, feature = "unstable"))]
mod bench {
use itertools::Itertools;
use rand::seq::SliceRandom;
use rand::thread_rng;
use super::*;
fn get_collector_with_buckets(num_docs: u64) -> TermBuckets {
TermBuckets::from_req_and_validate(&Default::default(), num_docs as usize).unwrap()
}
fn get_rand_terms(total_terms: u64, num_terms_returned: u64) -> Vec<u64> {
let mut rng = thread_rng();
let all_terms = (0..total_terms - 1).collect_vec();
let mut vals = vec![];
for _ in 0..num_terms_returned {
let val = all_terms.as_slice().choose(&mut rng).unwrap();
vals.push(*val);
}
vals
}
fn bench_term_buckets(b: &mut test::Bencher, num_terms: u64, total_terms: u64) {
let mut collector = get_collector_with_buckets(total_terms);
let vals = get_rand_terms(total_terms, num_terms);
let aggregations_with_accessor: AggregationsWithAccessor = Default::default();
let bucket_count: BucketCount = BucketCount {
bucket_count: Default::default(),
max_bucket_count: 1_000_001u32,
};
b.iter(|| {
for &val in &vals {
collector
.increment_bucket(&[val], 0, &aggregations_with_accessor, &bucket_count, &None)
.unwrap();
}
})
}
#[bench]
fn bench_term_buckets_500_of_1_000_000(b: &mut test::Bencher) {
bench_term_buckets(b, 500u64, 1_000_000u64)
}
#[bench]
fn bench_term_buckets_1_000_000_of_50_000(b: &mut test::Bencher) {
bench_term_buckets(b, 1_000_000u64, 50_000u64)
}
#[bench]
fn bench_term_buckets_1_000_000_of_50(b: &mut test::Bencher) {
bench_term_buckets(b, 1_000_000u64, 50u64)
}
#[bench]
fn bench_term_buckets_1_000_000_of_1_000_000(b: &mut test::Bencher) {
bench_term_buckets(b, 1_000_000u64, 1_000_000u64)
}
}

View File

@@ -499,7 +499,7 @@ impl IntermediateTermBucketResult {
match req.order.target {
OrderTarget::Key => {
buckets.sort_by(|left, right| {
if req.order.order == Order::Asc {
if req.order.order == Order::Desc {
left.key.partial_cmp(&right.key)
} else {
right.key.partial_cmp(&left.key)

View File

@@ -6,13 +6,14 @@ use super::{IntermediateStats, SegmentStatsCollector};
/// A single-value metric aggregation that computes the average of numeric values that are
/// extracted from the aggregated documents.
/// Supported field types are u64, i64, and f64.
/// See [super::SingleMetricResult] for return value.
///
/// # JSON Format
/// ```json
/// {
/// "avg": {
/// "field": "score"
/// "field": "score",
/// }
/// }
/// ```
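Because requests follow the elasticsearch JSON shape (the module docs below make the same point), the snippet above can be deserialized straight into the request tree; a hedged sketch:
use tantivy::aggregation::agg_req::Aggregations;
fn parse_avg_request() -> serde_json::Result<Aggregations> {
    serde_json::from_str(r#"{ "avg_score": { "avg": { "field": "score" } } }"#)
}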

View File

@@ -6,13 +6,14 @@ use super::{IntermediateStats, SegmentStatsCollector};
/// A single-value metric aggregation that counts the number of values that are
/// extracted from the aggregated documents.
/// Supported field types are u64, i64, and f64.
/// See [super::SingleMetricResult] for return value.
///
/// # JSON Format
/// ```json
/// {
/// "value_count": {
/// "field": "score"
/// "field": "score",
/// }
/// }
/// ```

View File

@@ -6,13 +6,14 @@ use super::{IntermediateStats, SegmentStatsCollector};
/// A single-value metric aggregation that computes the maximum of numeric values that are
/// extracted from the aggregated documents.
/// Supported field types are u64, i64, and f64.
/// See [super::SingleMetricResult] for return value.
///
/// # JSON Format
/// ```json
/// {
/// "max": {
/// "field": "score"
/// "field": "score",
/// }
/// }
/// ```

View File

@@ -6,13 +6,14 @@ use super::{IntermediateStats, SegmentStatsCollector};
/// A single-value metric aggregation that computes the minimum of numeric values that are
/// extracted from the aggregated documents.
/// Supported field types are u64, i64, and f64.
/// See [super::SingleMetricResult] for return value.
///
/// # JSON Format
/// ```json
/// {
/// "min": {
/// "field": "score"
/// "field": "score",
/// }
/// }
/// ```

View File

@@ -43,13 +43,13 @@ mod tests {
use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::AggregationCollector;
use crate::query::AllQuery;
use crate::schema::{Cardinality, NumericOptions, Schema};
use crate::schema::{NumericOptions, Schema};
use crate::Index;
#[test]
fn test_metric_aggregations() {
let mut schema_builder = Schema::builder();
let field_options = NumericOptions::default().set_fast(Cardinality::SingleValue);
let field_options = NumericOptions::default().set_fast();
let field = schema_builder.add_f64_field("price", field_options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();

View File

@@ -7,13 +7,14 @@ use crate::{DocId, TantivyError};
/// A multi-value metric aggregation that computes a collection of statistics on numeric values that
/// are extracted from the aggregated documents.
/// Supported field types are `u64`, `i64`, and `f64`.
/// See [`Stats`] for returned statistics.
///
/// # JSON Format
/// ```json
/// {
/// "stats": {
/// "field": "score"
/// "field": "score",
/// }
/// }
/// ```

View File

@@ -6,13 +6,14 @@ use super::{IntermediateStats, SegmentStatsCollector};
/// A single-value metric aggregation that sums up numeric values that are
/// extracted from the aggregated documents.
/// Supported field types are u64, i64, and f64.
/// See [super::SingleMetricResult] for return value.
///
/// # JSON Format
/// ```json
/// {
/// "sum": {
/// "field": "score"
/// "field": "score",
/// }
/// }
/// ```

View File

@@ -1,5 +1,6 @@
//! # Aggregations
//!
//!
//! An aggregation summarizes your data as statistics on buckets or metrics.
//!
//! Aggregations can provide answers to questions like:
@@ -40,10 +41,6 @@
//! - [Metric](metric)
//! - [Average](metric::AverageAggregation)
//! - [Stats](metric::StatsAggregation)
//! - [Min](metric::MinAggregation)
//! - [Max](metric::MaxAggregation)
//! - [Sum](metric::SumAggregation)
//! - [Count](metric::CountAggregation)
//!
//! # Example
//! Compute the average metric, by building [`agg_req::Aggregations`], which is built from an
@@ -78,7 +75,7 @@
//! }
//! ```
//! # Example JSON
//! Requests are compatible with the elasticsearch JSON request format.
//! Requests are compatible with the elasticsearch json request format.
//!
//! ```
//! use tantivy::aggregation::agg_req::Aggregations;
@@ -433,13 +430,13 @@ mod tests {
let text_field_id = schema_builder.add_text_field("text_id", text_fieldtype);
let string_field_id = schema_builder.add_text_field("string_id", STRING | FAST);
let score_fieldtype =
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
crate::schema::NumericOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
let fraction_field = schema_builder.add_f64_field(
"fraction_f64",
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue),
crate::schema::NumericOptions::default().set_fast(),
);
let index = Index::create_in_ram(schema_builder.build());
{
@@ -657,12 +654,12 @@ mod tests {
let date_field = schema_builder.add_date_field("date", FAST);
schema_builder.add_text_field("dummy_text", STRING);
let score_fieldtype =
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
crate::schema::NumericOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
let multivalue =
crate::schema::NumericOptions::default().set_fast(Cardinality::MultiValues);
crate::schema::NumericOptions::default().set_fast();
let scores_field_i64 = schema_builder.add_i64_field("scores_i64", multivalue);
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
@@ -1156,6 +1153,12 @@ mod tests {
r#"FieldNotFound("not_exist_field")"#
);
let agg_res = avg_on_field("scores_i64");
assert_eq!(
format!("{:?}", agg_res),
r#"InvalidArgument("Invalid field cardinality on field scores_i64 expected SingleValue, but got MultiValues")"#
);
Ok(())
}
@@ -1184,7 +1187,7 @@ mod tests {
let text_field_few_terms =
schema_builder.add_text_field("text_few_terms", STRING | FAST);
let score_fieldtype =
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
crate::schema::NumericOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
let score_field_f64 =
schema_builder.add_f64_field("score_f64", score_fieldtype.clone());

View File

@@ -12,10 +12,10 @@
use std::marker::PhantomData;
use std::sync::Arc;
use columnar::{DynamicColumn, HasAssociatedColumnType};
use fastfield_codecs::Column;
use crate::collector::{Collector, SegmentCollector};
use crate::fastfield::FastValue;
use crate::schema::Field;
use crate::{Score, SegmentReader, TantivyError};
@@ -61,7 +61,7 @@ use crate::{Score, SegmentReader, TantivyError};
/// # Ok(())
/// # }
/// ```
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: FastValue>
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: Default>
where TPredicate: 'static + Clone
{
field: Field,
@@ -70,7 +70,7 @@ where TPredicate: 'static + Clone
t_predicate_value: PhantomData<TPredicateValue>,
}
impl<TCollector, TPredicate, TPredicateValue: FastValue>
impl<TCollector, TPredicate, TPredicateValue: Default>
FilterCollector<TCollector, TPredicate, TPredicateValue>
where
TCollector: Collector + Send + Sync,
@@ -91,12 +91,13 @@ where
}
}
impl<TCollector, TPredicate, TPredicateValue: FastValue> Collector
impl<TCollector, TPredicate, TPredicateValue: Default> Collector
for FilterCollector<TCollector, TPredicate, TPredicateValue>
where
TCollector: Collector + Send + Sync,
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync + Clone,
TPredicateValue: FastValue,
TPredicateValue: HasAssociatedColumnType,
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
{
// That's the type of our result.
// Our standard deviation will be a float.
@@ -117,20 +118,10 @@ where
field_entry.name()
)));
}
let requested_type = TPredicateValue::to_type();
let field_schema_type = field_entry.field_type().value_type();
if requested_type != field_schema_type {
return Err(TantivyError::SchemaError(format!(
"Field {:?} is of type {:?}!={:?}",
field_entry.name(),
requested_type,
field_schema_type
)));
}
let fast_field_reader = segment_reader
.fast_fields()
.typed_fast_field_reader(schema.get_field_name(self.field))?;
.typed_column_first_or_default(schema.get_field_name(self.field))?;
let segment_collector = self
.collector
@@ -159,7 +150,7 @@ where
pub struct FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TPredicate: 'static,
TPredicateValue: FastValue,
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
{
fast_field_reader: Arc<dyn Column<TPredicateValue>>,
segment_collector: TSegmentCollector,
@@ -171,8 +162,9 @@ impl<TSegmentCollector, TPredicate, TPredicateValue> SegmentCollector
for FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TSegmentCollector: SegmentCollector,
TPredicateValue: HasAssociatedColumnType,
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync,
TPredicateValue: FastValue,
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
{
type Fruit = TSegmentCollector::Fruit;
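With the schema-type check deleted, type agreement is now enforced by the column lookup itself. A usage sketch, assuming a constructor of the shape FilterCollector::new(field, predicate, collector) as in the doc example elided above:
use tantivy::collector::{FilterCollector, TopDocs};
use tantivy::query::Query;
use tantivy::schema::Field;
use tantivy::Searcher;
fn expensive_hits(searcher: &Searcher, query: &dyn Query, price: Field) -> tantivy::Result<usize> {
    // Keep only documents whose fast-field value passes the predicate.
    let collector = FilterCollector::new(price, |v: u64| v > 20_000, TopDocs::with_limit(2));
    Ok(searcher.search(query, &collector)?.len())
}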

View File

@@ -4,7 +4,7 @@ use fastdivide::DividerU64;
use fastfield_codecs::Column;
use crate::collector::{Collector, SegmentCollector};
use crate::fastfield::FastValue;
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
use crate::schema::Type;
use crate::{DocId, Score};
@@ -87,14 +87,14 @@ impl HistogramComputer {
}
pub struct SegmentHistogramCollector {
histogram_computer: HistogramComputer,
ff_reader: Arc<dyn Column<u64>>,
column_u64: Arc<dyn Column<u64>>,
}
impl SegmentCollector for SegmentHistogramCollector {
type Fruit = Vec<u64>;
fn collect(&mut self, doc: DocId, _score: Score) {
let value = self.ff_reader.get_val(doc);
let value = self.column_u64.get_val(doc);
self.histogram_computer.add_value(value);
}
@@ -112,14 +112,18 @@ impl Collector for HistogramCollector {
_segment_local_id: crate::SegmentOrdinal,
segment: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
let ff_reader = segment.fast_fields().u64_lenient(&self.field)?;
let column_opt = segment.fast_fields().u64_lenient(&self.field)?;
let column = column_opt.ok_or_else(|| FastFieldNotAvailableError {
field_name: self.field.clone(),
})?;
let column_u64 = column.first_or_default_col(0u64);
Ok(SegmentHistogramCollector {
histogram_computer: HistogramComputer {
counts: vec![0; self.num_buckets],
min_value: self.min_value,
divider: self.divider,
},
ff_reader,
column_u64,
})
}
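For reference, the per-value arithmetic the HistogramComputer applies; a sketch assuming divider was built with DividerU64::divide_by(bucket_width) (how out-of-range values are handled is not visible in this hunk, so the clamping here is illustrative):
use fastdivide::DividerU64;
fn bucket_of(value: u64, min_value: u64, divider: &DividerU64) -> usize {
    let offset = value.saturating_sub(min_value);
    divider.divide(offset) as usize
}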

View File

@@ -104,9 +104,8 @@ pub use self::custom_score_top_collector::{CustomScorer, CustomSegmentScorer};
mod tweak_score_top_collector;
pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker};
mod facet_collector;
pub use self::facet_collector::{FacetCollector, FacetCounts};
// mod facet_collector;
// pub use self::facet_collector::{FacetCollector, FacetCounts};
use crate::query::Weight;
mod docset_collector;

View File

@@ -5,7 +5,6 @@ use fastfield_codecs::Column;
use super::*;
use crate::collector::{Count, FilterCollector, TopDocs};
use crate::core::SegmentReader;
use crate::fastfield::BytesFastFieldReader;
use crate::query::{AllQuery, QueryParser};
use crate::schema::{Field, Schema, FAST, TEXT};
use crate::time::format_description::well_known::Rfc3339;
@@ -58,9 +57,10 @@ pub fn test_filter_collector() -> crate::Result<()> {
assert_eq!(filtered_top_docs.len(), 0);
fn date_filter(value: DateTime) -> bool {
(value.into_utc() - OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())
.whole_weeks()
fn date_filter(value: columnar::DateTime) -> bool {
(crate::DateTime::from(value).into_utc()
- OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())
.whole_weeks()
> 0
}
@@ -164,8 +164,10 @@ pub struct FastFieldSegmentCollector {
}
impl FastFieldTestCollector {
pub fn for_field(field: String) -> FastFieldTestCollector {
FastFieldTestCollector { field }
pub fn for_field(field: impl ToString) -> FastFieldTestCollector {
FastFieldTestCollector {
field: field.to_string(),
}
}
}
@@ -210,64 +212,62 @@ impl SegmentCollector for FastFieldSegmentCollector {
}
}
/// Collects in order all of the fast field bytes for all of the
/// docs in the `DocSet`
///
/// This collector is mainly useful for tests.
pub struct BytesFastFieldTestCollector {
field: Field,
}
// /// Collects in order all of the fast field bytes for all of the
// /// docs in the `DocSet`
// ///
// /// This collector is mainly useful for tests.
// pub struct BytesFastFieldTestCollector {
// field: Field,
// }
pub struct BytesFastFieldSegmentCollector {
vals: Vec<u8>,
reader: BytesFastFieldReader,
}
// pub struct BytesFastFieldSegmentCollector {
// vals: Vec<u8>,
// reader: BytesFastFieldReader,
// }
impl BytesFastFieldTestCollector {
pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
BytesFastFieldTestCollector { field }
}
}
// impl BytesFastFieldTestCollector {
// pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
// BytesFastFieldTestCollector { field }
// }
// }
impl Collector for BytesFastFieldTestCollector {
type Fruit = Vec<u8>;
type Child = BytesFastFieldSegmentCollector;
// impl Collector for BytesFastFieldTestCollector {
// type Fruit = Vec<u8>;
// type Child = BytesFastFieldSegmentCollector;
fn for_segment(
&self,
_segment_local_id: u32,
segment_reader: &SegmentReader,
) -> crate::Result<BytesFastFieldSegmentCollector> {
let reader = segment_reader
.fast_fields()
.bytes(segment_reader.schema().get_field_name(self.field))?;
Ok(BytesFastFieldSegmentCollector {
vals: Vec::new(),
reader,
})
}
// fn for_segment(
// &self,
// _segment_local_id: u32,
// segment_reader: &SegmentReader,
// ) -> crate::Result<BytesFastFieldSegmentCollector> {
// let reader = segment_reader.fast_fields().bytes(self.field)?;
// Ok(BytesFastFieldSegmentCollector {
// vals: Vec::new(),
// reader,
// })
// }
fn requires_scoring(&self) -> bool {
false
}
// fn requires_scoring(&self) -> bool {
// false
// }
fn merge_fruits(&self, children: Vec<Vec<u8>>) -> crate::Result<Vec<u8>> {
Ok(children.into_iter().flat_map(|c| c.into_iter()).collect())
}
}
// fn merge_fruits(&self, children: Vec<Vec<u8>>) -> crate::Result<Vec<u8>> {
// Ok(children.into_iter().flat_map(|c| c.into_iter()).collect())
// }
// }
impl SegmentCollector for BytesFastFieldSegmentCollector {
type Fruit = Vec<u8>;
// impl SegmentCollector for BytesFastFieldSegmentCollector {
// type Fruit = Vec<u8>;
fn collect(&mut self, doc: u32, _score: Score) {
let data = self.reader.get_bytes(doc);
self.vals.extend(data);
}
// fn collect(&mut self, doc: u32, _score: Score) {
// let data = self.reader.get_bytes(doc);
// self.vals.extend(data);
// }
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
self.vals
}
}
// fn harvest(self) -> <Self as SegmentCollector>::Fruit {
// self.vals
// }
// }
fn make_test_searcher() -> crate::Result<Searcher> {
let schema = Schema::builder().build();

View File

@@ -12,7 +12,7 @@ use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
use crate::collector::{
CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
};
use crate::fastfield::FastValue;
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
use crate::query::Weight;
use crate::schema::Field;
use crate::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};
@@ -22,7 +22,7 @@ struct FastFieldConvertCollector<
TFastValue: FastValue,
> {
pub collector: TCollector,
pub field: Field,
pub field: String,
pub fast_value: std::marker::PhantomData<TFastValue>,
}
@@ -41,7 +41,8 @@ where
segment: &SegmentReader,
) -> crate::Result<Self::Child> {
let schema = segment.schema();
let field_entry = schema.get_field_entry(self.field);
let field = schema.get_field(&self.field)?;
let field_entry = schema.get_field_entry(field);
if !field_entry.is_fast() {
return Err(TantivyError::SchemaError(format!(
"Field {:?} is not a fast field.",
@@ -132,17 +133,17 @@ impl fmt::Debug for TopDocs {
}
struct ScorerByFastFieldReader {
ff_reader: Arc<dyn Column<u64>>,
sort_column: Arc<dyn Column<u64>>,
}
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
fn score(&mut self, doc: DocId) -> u64 {
self.ff_reader.get_val(doc)
self.sort_column.get_val(doc)
}
}
struct ScorerByField {
field: Field,
field: String,
}
impl CustomScorer<u64> for ScorerByField {
@@ -154,10 +155,13 @@ impl CustomScorer<u64> for ScorerByField {
// mapping is monotonic, so it is sufficient to compute our top-K docs.
//
// The conversion will then happen only on the top-K docs.
let ff_reader = segment_reader
.fast_fields()
.typed_fast_field_reader(segment_reader.schema().get_field_name(self.field))?;
Ok(ScorerByFastFieldReader { ff_reader })
let sort_column_opt = segment_reader.fast_fields().u64_lenient(&self.field)?;
let sort_column = sort_column_opt
.ok_or_else(|| FastFieldNotAvailableError {
field_name: self.field.clone(),
})?
.first_or_default_col(0u64);
Ok(ScorerByFastFieldReader { sort_column })
}
}
@@ -290,9 +294,14 @@ impl TopDocs {
/// the [.order_by_fast_field(...)](TopDocs::order_by_fast_field) method.
pub fn order_by_u64_field(
self,
field: Field,
field: impl ToString,
) -> impl Collector<Fruit = Vec<(u64, DocAddress)>> {
CustomScoreTopCollector::new(ScorerByField { field }, self.0.into_tscore())
CustomScoreTopCollector::new(
ScorerByField {
field: field.to_string(),
},
self.0.into_tscore(),
)
}
/// Set top-K to rank documents by a given fast field.
@@ -367,15 +376,15 @@ impl TopDocs {
/// ```
pub fn order_by_fast_field<TFastValue>(
self,
fast_field: Field,
fast_field: impl ToString,
) -> impl Collector<Fruit = Vec<(TFastValue, DocAddress)>>
where
TFastValue: FastValue,
{
let u64_collector = self.order_by_u64_field(fast_field);
let u64_collector = self.order_by_u64_field(fast_field.to_string());
FastFieldConvertCollector {
collector: u64_collector,
field: fast_field,
field: fast_field.to_string(),
fast_value: PhantomData,
}
}
@@ -877,7 +886,7 @@ mod tests {
});
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE);
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector)?;
assert_eq!(
&top_docs[..],
@@ -916,7 +925,7 @@ mod tests {
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(birthday);
let top_collector = TopDocs::with_limit(3).order_by_fast_field("birthday");
let top_docs: Vec<(DateTime, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
assert_eq!(
&top_docs[..],
@@ -946,7 +955,7 @@ mod tests {
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude");
let top_docs: Vec<(i64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
assert_eq!(
&top_docs[..],
@@ -976,7 +985,7 @@ mod tests {
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude");
let top_docs: Vec<(f64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
assert_eq!(
&top_docs[..],
@@ -1004,7 +1013,7 @@ mod tests {
.unwrap();
});
let searcher = index.reader().unwrap().searcher();
let top_collector = TopDocs::with_limit(4).order_by_u64_field(Field::from_field_id(2));
let top_collector = TopDocs::with_limit(4).order_by_u64_field("missing_field");
let segment_reader = searcher.segment_reader(0u32);
top_collector
.for_segment(0, segment_reader)
@@ -1022,7 +1031,7 @@ mod tests {
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment = searcher.segment_reader(0);
let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE);
let err = top_collector.for_segment(0, segment).err().unwrap();
assert!(matches!(err, crate::TantivyError::SchemaError(_)));
Ok(())
@@ -1039,7 +1048,7 @@ mod tests {
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment = searcher.segment_reader(0);
let top_collector = TopDocs::with_limit(4).order_by_fast_field::<i64>(size);
let top_collector = TopDocs::with_limit(4).order_by_fast_field::<i64>(SIZE);
let err = top_collector.for_segment(0, segment).err().unwrap();
assert!(
matches!(err, crate::TantivyError::SchemaError(msg) if msg == "Field \"size\" is not a fast field.")
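A hedged sketch of the name-based API introduced here, mirroring the tests above; the "size" field name, `Index`, and `AllQuery` are assumed to be in scope.

fn top_by_size(index: &Index) -> crate::Result<Vec<(u64, DocAddress)>> {
    let searcher = index.reader()?.searcher();
    // Fields are now addressed by name instead of by `Field` handle.
    let collector = TopDocs::with_limit(4).order_by_u64_field("size");
    searcher.search(&AllQuery, &collector)
}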

View File

@@ -19,7 +19,7 @@ use crate::error::{DataCorruption, TantivyError};
use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_ARENA_NUM_BYTES_MIN};
use crate::indexer::segment_updater::save_metas;
use crate::reader::{IndexReader, IndexReaderBuilder};
use crate::schema::{Cardinality, Field, FieldType, Schema};
use crate::schema::{Field, FieldType, Schema};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::IndexWriter;
@@ -93,7 +93,7 @@ fn save_new_metas(
/// let body_field = schema_builder.add_text_field("body", TEXT);
/// let number_field = schema_builder.add_u64_field(
/// "number",
/// NumericOptions::default().set_fast(Cardinality::SingleValue),
/// NumericOptions::default().set_fast(),
/// );
///
/// let schema = schema_builder.build();
@@ -245,12 +245,6 @@ impl IndexBuilder {
sort_by_field.field
)));
}
if entry.field_type().fastfield_cardinality() != Some(Cardinality::SingleValue) {
return Err(TantivyError::InvalidArgument(format!(
"Only single value fast field Cardinality supported for sorting index {}",
sort_by_field.field
)));
}
}
Ok(())
} else {

View File

@@ -135,8 +135,6 @@ impl InvertedIndexReader {
term_info: &TermInfo,
option: IndexRecordOption,
) -> io::Result<SegmentPostings> {
let option = option.downgrade(self.record_option);
let block_postings = self.read_block_postings_from_terminfo(term_info, option)?;
let position_reader = {
if option.has_positions() {

View File

@@ -249,7 +249,7 @@ impl SearcherInner {
index: Index,
segment_readers: Vec<SegmentReader>,
generation: TrackedObject<SearcherGeneration>,
doc_store_cache_num_blocks: usize,
doc_store_cache_size: usize,
) -> io::Result<SearcherInner> {
assert_eq!(
&segment_readers
@@ -261,7 +261,7 @@ impl SearcherInner {
);
let store_readers: Vec<StoreReader> = segment_readers
.iter()
.map(|segment_reader| segment_reader.get_store_reader(doc_store_cache_num_blocks))
.map(|segment_reader| segment_reader.get_store_reader(doc_store_cache_size))
.collect::<io::Result<Vec<_>>>()?;
Ok(SearcherInner {

View File

@@ -7,7 +7,7 @@ use fail::fail_point;
use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
use crate::directory::{CompositeFile, FileSlice};
use crate::error::DataCorruption;
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders};
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FastFieldReaders};
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
use crate::schema::{Field, FieldType, IndexRecordOption, Schema};
use crate::space_usage::SegmentSpaceUsage;
@@ -90,25 +90,8 @@ impl SegmentReader {
}
/// Accessor to the `FacetReader` associated with a given `Field`.
pub fn facet_reader(&self, field: Field) -> crate::Result<FacetReader> {
let field_entry = self.schema.get_field_entry(field);
match field_entry.field_type() {
FieldType::Facet(_) => {
let term_ords_reader =
self.fast_fields().u64s(self.schema.get_field_name(field))?;
let termdict = self
.termdict_composite
.open_read(field)
.map(TermDictionary::open)
.unwrap_or_else(|| Ok(TermDictionary::empty()))?;
Ok(FacetReader::new(term_ords_reader, termdict))
}
_ => Err(crate::TantivyError::InvalidArgument(format!(
"Field {:?} is not a facet field.",
field_entry.name()
))),
}
pub fn facet_reader(&self, field: Field) -> crate::Result<()> {
todo!();
}
/// Accessor to the segment's `Field norms`'s reader.
@@ -134,12 +117,9 @@ impl SegmentReader {
&self.fieldnorm_readers
}
/// Accessor to the segment's [`StoreReader`](crate::store::StoreReader).
///
/// `cache_num_blocks` sets the number of decompressed blocks to be cached in an LRU.
/// The size of blocks is configurable; this should be reflected in the
pub fn get_store_reader(&self, cache_num_blocks: usize) -> io::Result<StoreReader> {
StoreReader::open(self.store_file.clone(), cache_num_blocks)
/// Accessor to the segment's `StoreReader`.
pub fn get_store_reader(&self, cache_size: usize) -> io::Result<StoreReader> {
StoreReader::open(self.store_file.clone(), cache_size)
}
/// Open a new segment for reading.
@@ -173,9 +153,7 @@ impl SegmentReader {
let schema = segment.schema();
let fast_fields_data = segment.open_read(SegmentComponent::FastFields)?;
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
let fast_fields_readers =
Arc::new(FastFieldReaders::new(schema.clone(), fast_fields_composite));
let fast_fields_readers = Arc::new(FastFieldReaders::open(fast_fields_data)?);
let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;

View File

@@ -8,7 +8,7 @@ use crate::schema::FieldEntry;
#[derive(Debug, Error)]
#[error("Fast field not available: '{field_name:?}'")]
pub struct FastFieldNotAvailableError {
field_name: String,
pub(crate) field_name: String,
}
impl FastFieldNotAvailableError {

File diff suppressed because it is too large

View File

@@ -38,7 +38,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field(
"multifield",
NumericOptions::default().set_fast(Cardinality::MultiValues),
NumericOptions::default().set_fast(),
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -74,7 +74,7 @@ mod tests {
let date_field = schema_builder.add_date_field(
"multi_date_field",
DateOptions::default()
.set_fast(Cardinality::MultiValues)
.set_fast()
.set_indexed()
.set_fieldnorm()
.set_stored(),
@@ -215,7 +215,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_i64_field(
"multifield",
NumericOptions::default().set_fast(Cardinality::MultiValues),
NumericOptions::default().set_fast(),
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -246,7 +246,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let bool_field = schema_builder.add_bool_field(
"multifield",
NumericOptions::default().set_fast(Cardinality::MultiValues),
NumericOptions::default().set_fast(),
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -278,7 +278,7 @@ mod tests {
let field = schema_builder.add_u64_field(
"multifield",
NumericOptions::default()
.set_fast(Cardinality::MultiValues)
.set_fast()
.set_indexed(),
);
let schema = schema_builder.build();
@@ -424,7 +424,7 @@ mod bench {
let mut builder = crate::schema::SchemaBuilder::new();
let fast_multi =
crate::schema::NumericOptions::default().set_fast(Cardinality::MultiValues);
crate::schema::NumericOptions::default().set_fast();
let multi_field = builder.add_f64_field("f64s", fast_multi);
let index = crate::Index::create_in_ram(builder.build());
@@ -504,7 +504,7 @@ mod bench {
let path = Path::new("test");
let directory: RamDirectory = RamDirectory::create();
let field = {
let options = NumericOptions::default().set_fast(Cardinality::MultiValues);
let options = NumericOptions::default().set_fast();
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("field", options);
let schema = schema_builder.build();
@@ -562,7 +562,7 @@ mod bench {
b.iter(|| {
let directory: RamDirectory = RamDirectory::create();
let options = NumericOptions::default().set_fast(Cardinality::MultiValues);
let options = NumericOptions::default().set_fast();
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("field", options);
let schema = schema_builder.build();
@@ -595,7 +595,7 @@ mod bench {
b.iter(|| {
let directory: RamDirectory = RamDirectory::create();
let options = NumericOptions::default().set_fast(Cardinality::MultiValues);
let options = NumericOptions::default().set_fast();
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("field", options);
let schema = schema_builder.build();
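With the `Cardinality` argument gone, single- and multi-valued fast fields are declared identically; a hedged sketch with illustrative field names (cardinality is presumably derived from the recorded values when the column is serialized).

fn build_fast_field_schema() -> Schema {
    let mut schema_builder = Schema::builder();
    // The same `set_fast()` call now covers both cardinalities.
    schema_builder.add_u64_field("score", NumericOptions::default().set_fast());
    schema_builder.add_u64_field("tags", NumericOptions::default().set_fast());
    schema_builder.build()
}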

View File

@@ -137,7 +137,7 @@ mod tests {
let date_field = schema_builder.add_date_field(
"multi_date_field",
DateOptions::default()
.set_fast(Cardinality::MultiValues)
.set_fast()
.set_indexed()
.set_fieldnorm()
.set_precision(DatePrecision::Microseconds)
@@ -188,7 +188,7 @@ mod tests {
let date_field = schema_builder.add_date_field(
"multi_date_field",
DateOptions::default()
.set_fast(Cardinality::MultiValues)
.set_fast()
// TODO: Test different precision after fixing https://github.com/quickwit-oss/tantivy/issues/1783
.set_precision(DatePrecision::Microseconds)
.set_indexed()
@@ -307,7 +307,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let field_options = NumericOptions::default()
.set_indexed()
.set_fast(Cardinality::MultiValues);
.set_fast();
let item_field = schema_builder.add_i64_field("items", field_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
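The same pattern applies to date fields; a sketch mirroring the tests above, with the precision still declared per field.

fn build_date_schema() -> Schema {
    let mut schema_builder = Schema::builder();
    schema_builder.add_date_field(
        "date_field",
        DateOptions::default()
            .set_fast()
            // Values are truncated to this precision when recorded.
            .set_precision(DatePrecision::Microseconds)
            .set_indexed(),
    );
    schema_builder.build()
}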

View File

@@ -1,12 +1,16 @@
use std::io;
use std::net::Ipv6Addr;
use std::sync::Arc;
use columnar::{
ColumnType, ColumnValues, ColumnarReader, DynamicColumn, DynamicColumnHandle,
HasAssociatedColumnType, NumericalType,
};
use fastfield_codecs::{open, open_u128, Column};
use super::multivalued::MultiValuedFastFieldReader;
use crate::directory::{CompositeFile, FileSlice};
use crate::fastfield::{BytesFastFieldReader, FastFieldNotAvailableError, FastValue};
use crate::schema::{Cardinality, Field, FieldType, Schema};
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
use crate::schema::{Field, FieldType, Schema};
use crate::space_usage::PerFieldSpaceUsage;
use crate::{DateTime, TantivyError};
@@ -16,315 +20,152 @@ use crate::{DateTime, TantivyError};
/// and just wraps several `HashMap`.
#[derive(Clone)]
pub struct FastFieldReaders {
schema: Schema,
fast_fields_composite: CompositeFile,
}
#[derive(Eq, PartialEq, Debug)]
pub(crate) enum FastType {
I64,
U64,
U128,
F64,
Bool,
Date,
}
pub(crate) fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality)> {
match field_type {
FieldType::U64(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::U64, cardinality)),
FieldType::I64(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::I64, cardinality)),
FieldType::F64(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::F64, cardinality)),
FieldType::Bool(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::Bool, cardinality)),
FieldType::Date(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::Date, cardinality)),
FieldType::Facet(_) => Some((FastType::U64, Cardinality::MultiValues)),
FieldType::Str(options) if options.is_fast() => {
Some((FastType::U64, Cardinality::MultiValues))
}
FieldType::IpAddr(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::U128, cardinality)),
_ => None,
}
columnar: Arc<ColumnarReader>,
}
impl FastFieldReaders {
pub(crate) fn new(schema: Schema, fast_fields_composite: CompositeFile) -> FastFieldReaders {
FastFieldReaders {
schema,
fast_fields_composite,
}
pub(crate) fn open(fast_field_file: FileSlice) -> io::Result<FastFieldReaders> {
let columnar = Arc::new(ColumnarReader::open(fast_field_file)?);
Ok(FastFieldReaders { columnar })
}
pub(crate) fn space_usage(&self) -> PerFieldSpaceUsage {
self.fast_fields_composite.space_usage()
todo!()
}
#[doc(hidden)]
pub fn fast_field_data(&self, field: Field, idx: usize) -> crate::Result<FileSlice> {
self.fast_fields_composite
.open_read_with_idx(field, idx)
.ok_or_else(|| {
let field_name = self.schema.get_field_entry(field).name();
TantivyError::SchemaError(format!("Field({}) data was not found", field_name))
})
}
fn check_type(
pub fn typed_column_opt<T>(
&self,
field: Field,
expected_fast_type: FastType,
expected_cardinality: Cardinality,
) -> crate::Result<()> {
let field_entry = self.schema.get_field_entry(field);
let (fast_type, cardinality) =
type_and_cardinality(field_entry.field_type()).ok_or_else(|| {
crate::TantivyError::SchemaError(format!(
"Field {:?} is not a fast field.",
field_entry.name()
))
})?;
if fast_type != expected_fast_type {
return Err(crate::TantivyError::SchemaError(format!(
"Field {:?} is of type {:?}, expected {:?}.",
field_entry.name(),
fast_type,
expected_fast_type
)));
field_name: &str,
) -> crate::Result<Option<columnar::Column<T>>>
where
T: PartialOrd + Copy + HasAssociatedColumnType + Send + Sync + Default + 'static,
DynamicColumn: Into<Option<columnar::Column<T>>>,
{
let column_type = T::column_type();
let Some(dynamic_column_handle) = self.column_handle(field_name, column_type)?
else {
return Ok(None);
};
let dynamic_column = dynamic_column_handle.open()?;
Ok(dynamic_column.into())
}
pub fn column_num_bytes(&self, field: &str) -> crate::Result<usize> {
Ok(self
.columnar
.read_columns(field)?
.into_iter()
.map(|column_handle| column_handle.num_bytes())
.sum())
}
pub fn typed_column_first_or_default<T>(&self, field: &str) -> crate::Result<Arc<dyn Column<T>>>
where
T: PartialOrd + Copy + HasAssociatedColumnType + Send + Sync + Default + 'static,
DynamicColumn: Into<Option<columnar::Column<T>>>,
{
let col_opt = self.typed_column_opt(field)?;
if let Some(col) = col_opt {
Ok(col.first_or_default_col(T::default()))
} else {
todo!();
}
if cardinality != expected_cardinality {
return Err(crate::TantivyError::SchemaError(format!(
"Field {:?} is of cardinality {:?}, expected {:?}.",
field_entry.name(),
cardinality,
expected_cardinality
)));
}
Ok(())
}
pub(crate) fn typed_fast_field_reader_with_idx<TFastValue: FastValue>(
&self,
field_name: &str,
index: usize,
) -> crate::Result<Arc<dyn Column<TFastValue>>> {
let field = self.schema.get_field(field_name)?;
let fast_field_slice = self.fast_field_data(field, index)?;
let bytes = fast_field_slice.read_bytes()?;
let column = fastfield_codecs::open(bytes)?;
Ok(column)
}
pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
&self,
field_name: &str,
) -> crate::Result<Arc<dyn Column<TFastValue>>> {
self.typed_fast_field_reader_with_idx(field_name, 0)
}
pub(crate) fn typed_fast_field_multi_reader<TFastValue: FastValue>(
&self,
field_name: &str,
) -> crate::Result<MultiValuedFastFieldReader<TFastValue>> {
let idx_reader = self.typed_fast_field_reader(field_name)?;
let vals_reader = self.typed_fast_field_reader_with_idx(field_name, 1)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
}
/// Returns the `u64` fast field reader associated with `field`.
///
/// If `field` is not a u64 fast field, this method returns an Error.
pub fn u64(&self, field_name: &str) -> crate::Result<Arc<dyn Column<u64>>> {
self.check_type(
self.schema.get_field(field_name)?,
FastType::U64,
Cardinality::SingleValue,
)?;
self.typed_fast_field_reader(field_name)
pub fn u64(&self, field: &str) -> crate::Result<Arc<dyn ColumnValues<u64>>> {
self.typed_column_first_or_default(field)
}
/// Returns the `date` fast field reader associated with `field`.
///
/// If `field` is not a date fast field, this method returns an Error.
pub fn date(&self, field: &str) -> crate::Result<Arc<dyn ColumnValues<columnar::DateTime>>> {
self.typed_column_first_or_default(field)
}
/// Returns the `ip` fast field reader associated with `field`.
///
/// If `field` is not a u128 fast field, this method returns an Error.
pub fn ip_addr(&self, field_name: &str) -> crate::Result<Arc<dyn Column<Ipv6Addr>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::U128, Cardinality::SingleValue)?;
let bytes = self.fast_field_data(field, 0)?.read_bytes()?;
Ok(open_u128::<Ipv6Addr>(bytes)?)
}
/// Returns the `ip` fast field reader associated with `field`.
///
/// If `field` is not a u128 fast field, this method returns an Error.
pub fn ip_addrs(
&self,
field_name: &str,
) -> crate::Result<MultiValuedFastFieldReader<Ipv6Addr>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::U128, Cardinality::MultiValues)?;
let idx_reader: Arc<dyn Column<u64>> = self.typed_fast_field_reader(field_name)?;
let bytes = self.fast_field_data(field, 1)?.read_bytes()?;
let vals_reader = open_u128::<Ipv6Addr>(bytes)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
pub fn ip_addr(&self, field: &str) -> crate::Result<Arc<dyn Column<Ipv6Addr>>> {
todo!();
// self.check_type(field, FastType::U128)?;
// let bytes = self.fast_field_data(field, 0)?.read_bytes()?;
// Ok(open_u128::<Ipv6Addr>(bytes)?)
}
/// Returns the `u128` fast field reader associated with `field`.
///
/// If `field` is not a u128 fast field, this method returns an Error.
pub(crate) fn u128(&self, field_name: &str) -> crate::Result<Arc<dyn Column<u128>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::U128, Cardinality::SingleValue)?;
let bytes = self.fast_field_data(field, 0)?.read_bytes()?;
Ok(open_u128::<u128>(bytes)?)
pub(crate) fn u128(&self, field: &str) -> crate::Result<Arc<dyn Column<u128>>> {
todo!();
}
/// Returns the `u128` multi-valued fast field reader associated with `field`.
///
/// If `field` is not a u128 multi-valued fast field, this method returns an Error.
pub fn u128s(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<u128>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::U128, Cardinality::MultiValues)?;
let idx_reader: Arc<dyn Column<u64>> =
self.typed_fast_field_reader(self.schema.get_field_name(field))?;
let bytes = self.fast_field_data(field, 1)?.read_bytes()?;
let vals_reader = open_u128::<u128>(bytes)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
pub fn column_handle(
&self,
field_name: &str,
column_type: ColumnType,
) -> crate::Result<Option<DynamicColumnHandle>> {
let dynamic_column_handle_opt = self
.columnar
.read_columns(field_name)?
.into_iter()
.filter(|column| column.column_type() == column_type)
.next();
Ok(dynamic_column_handle_opt)
}
/// Returns the `u64` fast field reader associated with `field`, regardless of
/// whether the given field is actually of type `u64`.
///
/// If it is not, the fast field reader will return the u64 value associated
/// with the original `FastValue`.
pub fn u64_lenient(&self, field_name: &str) -> crate::Result<Arc<dyn Column<u64>>> {
self.typed_fast_field_reader(field_name)
pub fn u64_lenient(&self, field_name: &str) -> crate::Result<Option<columnar::Column<u64>>> {
for col in self.columnar.read_columns(field_name)? {
if let Some(col_u64) = col.open_u64_lenient()? {
return Ok(Some(col_u64));
}
}
Ok(None)
}
/// Returns the `i64` fast field reader associated with `field`.
///
/// If `field` is not an i64 fast field, this method returns an Error.
pub fn i64(&self, field_name: &str) -> crate::Result<Arc<dyn Column<i64>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::I64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(self.schema.get_field_name(field))
}
/// Returns the `date` fast field reader associated with `field`.
///
/// If `field` is not a date fast field, this method returns an Error.
pub fn date(&self, field_name: &str) -> crate::Result<Arc<dyn Column<DateTime>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::Date, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field_name)
self.typed_column_first_or_default(field_name)
}
/// Returns the `f64` fast field reader associated with `field`.
///
/// If `field` is not a f64 fast field, this method returns an Error.
pub fn f64(&self, field_name: &str) -> crate::Result<Arc<dyn Column<f64>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::F64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field_name)
self.typed_column_first_or_default(field_name)
}
/// Returns the `bool` fast field reader associated with `field`.
///
/// If `field` is not a bool fast field, this method returns an Error.
pub fn bool(&self, field_name: &str) -> crate::Result<Arc<dyn Column<bool>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::Bool, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field_name)
self.typed_column_first_or_default(field_name)
}
/// Returns a `u64s` multi-valued fast field reader associated with `field`.
///
/// If `field` is not a u64 multi-valued fast field, this method returns an Error.
pub fn u64s(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<u64>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::U64, Cardinality::MultiValues)?;
self.typed_fast_field_multi_reader(field_name)
}
/// Returns a `u64s` multi-valued fast field reader associated with `field`,
/// regardless of whether the given field is actually of type `u64`.
///
/// If `field` is not a u64 multi-valued fast field, this method returns an Error.
pub fn u64s_lenient(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<u64>> {
self.typed_fast_field_multi_reader(field_name)
}
/// Returns an `i64s` multi-valued fast field reader associated with `field`.
///
/// If `field` is not an i64 multi-valued fast field, this method returns an Error.
pub fn i64s(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<i64>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::I64, Cardinality::MultiValues)?;
self.typed_fast_field_multi_reader(self.schema.get_field_name(field))
}
/// Returns a `f64s` multi-valued fast field reader associated with `field`.
///
/// If `field` is not a f64 multi-valued fast field, this method returns an Error.
pub fn f64s(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<f64>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::F64, Cardinality::MultiValues)?;
self.typed_fast_field_multi_reader(self.schema.get_field_name(field))
}
/// Returns a `bools` multi-valued fast field reader associated with `field`.
///
/// If `field` is not a bool multi-valued fast field, this method returns an Error.
pub fn bools(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<bool>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::Bool, Cardinality::MultiValues)?;
self.typed_fast_field_multi_reader(self.schema.get_field_name(field))
}
/// Returns a `time::OffsetDateTime` multi-valued fast field reader associated with
/// `field`.
///
/// If `field` is not a `time::OffsetDateTime` multi-valued fast field, this method returns an
/// Error.
pub fn dates(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<DateTime>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::Date, Cardinality::MultiValues)?;
self.typed_fast_field_multi_reader(self.schema.get_field_name(field))
}
/// Returns the `bytes` fast field reader associated with `field`.
///
/// If `field` is not a bytes fast field, returns an Error.
pub fn bytes(&self, field_name: &str) -> crate::Result<BytesFastFieldReader> {
let field = self.schema.get_field(field_name)?;
let field_entry = self.schema.get_field_entry(field);
if let FieldType::Bytes(bytes_option) = field_entry.field_type() {
if !bytes_option.is_fast() {
return Err(crate::TantivyError::SchemaError(format!(
"Field {:?} is not a fast field.",
field_entry.name()
)));
}
let fast_field_idx_file = self.fast_field_data(field, 0)?;
let fast_field_idx_bytes = fast_field_idx_file.read_bytes()?;
let idx_reader = open(fast_field_idx_bytes)?;
let data = self.fast_field_data(field, 1)?;
BytesFastFieldReader::open(idx_reader, data)
} else {
Err(FastFieldNotAvailableError::new(field_entry).into())
}
}
// Returns the `bytes` fast field reader associated with `field`.
//
// If `field` is not a bytes fast field, returns an Error.
// pub fn bytes(&self, field: Field) -> crate::Result<BytesFastFieldReader> {
// let field_entry = self.schema.get_field_entry(field);
// if let FieldType::Bytes(bytes_option) = field_entry.field_type() {
// if !bytes_option.is_fast() {
// return Err(crate::TantivyError::SchemaError(format!(
// "Field {:?} is not a fast field.",
// field_entry.name()
// )));
// }
// let fast_field_idx_file = self.fast_field_data(field, 0)?;
// let fast_field_idx_bytes = fast_field_idx_file.read_bytes()?;
// let idx_reader = open(fast_field_idx_bytes)?;
// let data = self.fast_field_data(field, 1)?;
// BytesFastFieldReader::open(idx_reader, data)
// } else {
// Err(FastFieldNotAvailableError::new(field_entry).into())
// }
// }
}
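A sketch of the new lenient accessor in use, mirroring `ScorerByField` in this diff; "popularity" is an assumed field name, and the `?` on the error relies on the existing `From<FastFieldNotAvailableError>` conversion.

fn doc_sort_key(segment_reader: &SegmentReader, doc_id: DocId) -> crate::Result<u64> {
    // Any numerical column is coerced to its u64 representation.
    let sort_column = segment_reader
        .fast_fields()
        .u64_lenient("popularity")?
        .ok_or_else(|| FastFieldNotAvailableError {
            field_name: "popularity".to_string(),
        })?
        // Collapse the column to one value per doc, defaulting to 0.
        .first_or_default_col(0u64);
    Ok(sort_column.get_val(doc_id))
}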

View File

@@ -1,558 +1,141 @@
use std::collections::HashMap;
use std::io;
use columnar::{ColumnType, ColumnarWriter, NumericalType, NumericalValue};
use common;
use fastfield_codecs::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
use rustc_hash::FxHashMap;
use tantivy_bitpacker::BlockedBitpacker;
use super::multivalued::{MultiValueU128FastFieldWriter, MultiValuedFastFieldWriter};
use super::FastFieldType;
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
use crate::fastfield::CompositeFastFieldSerializer;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema, Value};
use crate::schema::{Document, Field, FieldEntry, FieldType, Schema, Type, Value};
use crate::termdict::TermOrdinal;
use crate::DatePrecision;
use crate::{DatePrecision, DocId};
/// The `FastFieldsWriter` groups all of the fast field writers.
pub struct FastFieldsWriter {
term_id_writers: Vec<MultiValuedFastFieldWriter>,
single_value_writers: Vec<IntFastFieldWriter>,
u128_value_writers: Vec<U128FastFieldWriter>,
u128_multi_value_writers: Vec<MultiValueU128FastFieldWriter>,
multi_values_writers: Vec<MultiValuedFastFieldWriter>,
bytes_value_writers: Vec<BytesFastFieldWriter>,
}
pub(crate) fn unexpected_value(expected: &str, actual: &Value) -> crate::TantivyError {
crate::TantivyError::SchemaError(format!(
"Expected a {:?} in fast field, but got {:?}",
expected, actual
))
}
fn fast_field_default_value(field_entry: &FieldEntry) -> u64 {
match *field_entry.field_type() {
FieldType::I64(_) | FieldType::Date(_) => common::i64_to_u64(0i64),
FieldType::F64(_) => common::f64_to_u64(0.0f64),
_ => 0u64,
}
columnar_writer: ColumnarWriter,
fast_field_names: Vec<Option<String>>, //< TODO see if we can cache the field name hash too.
date_precisions: Vec<DatePrecision>,
num_docs: DocId,
}
impl FastFieldsWriter {
/// Create all `FastFieldWriter` required by the schema.
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
let mut u128_value_writers = Vec::new();
let mut u128_multi_value_writers = Vec::new();
let mut single_value_writers = Vec::new();
let mut term_id_writers = Vec::new();
let mut multi_values_writers = Vec::new();
let mut bytes_value_writers = Vec::new();
for (field, field_entry) in schema.fields() {
match field_entry.field_type() {
FieldType::I64(ref int_options)
| FieldType::U64(ref int_options)
| FieldType::F64(ref int_options)
| FieldType::Bool(ref int_options) => {
match int_options.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
let mut fast_field_writer = IntFastFieldWriter::new(field, None);
let default_value = fast_field_default_value(field_entry);
fast_field_writer.set_val_if_missing(default_value);
single_value_writers.push(fast_field_writer);
}
Some(Cardinality::MultiValues) => {
let fast_field_writer = MultiValuedFastFieldWriter::new(
field,
FastFieldType::Numeric,
None,
);
multi_values_writers.push(fast_field_writer);
}
None => {}
}
}
FieldType::Date(ref options) => match options.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
let mut fast_field_writer =
IntFastFieldWriter::new(field, Some(options.get_precision()));
let default_value = fast_field_default_value(field_entry);
fast_field_writer.set_val_if_missing(default_value);
single_value_writers.push(fast_field_writer);
}
Some(Cardinality::MultiValues) => {
let fast_field_writer = MultiValuedFastFieldWriter::new(
field,
FastFieldType::Numeric,
Some(options.get_precision()),
);
multi_values_writers.push(fast_field_writer);
}
None => {}
},
FieldType::Facet(_) => {
let fast_field_writer =
MultiValuedFastFieldWriter::new(field, FastFieldType::Facet, None);
term_id_writers.push(fast_field_writer);
}
FieldType::Str(_) if field_entry.is_fast() => {
let fast_field_writer =
MultiValuedFastFieldWriter::new(field, FastFieldType::String, None);
term_id_writers.push(fast_field_writer);
}
FieldType::Bytes(bytes_option) => {
if bytes_option.is_fast() {
let fast_field_writer = BytesFastFieldWriter::new(field);
bytes_value_writers.push(fast_field_writer);
}
}
FieldType::IpAddr(opt) => {
if opt.is_fast() {
match opt.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
let fast_field_writer = U128FastFieldWriter::new(field);
u128_value_writers.push(fast_field_writer);
}
Some(Cardinality::MultiValues) => {
let fast_field_writer = MultiValueU128FastFieldWriter::new(field);
u128_multi_value_writers.push(fast_field_writer);
}
None => {}
}
}
}
FieldType::Str(_) | FieldType::JsonObject(_) => {}
let mut columnar_writer = ColumnarWriter::default();
let mut fast_fields: Vec<Option<String>> = vec![None; schema.num_fields()];
let mut date_precisions: Vec<DatePrecision> =
std::iter::repeat_with(DatePrecision::default)
.take(schema.num_fields())
.collect();
// TODO see other types
for (field_id, field_entry) in schema.fields() {
if !field_entry.field_type().is_fast() {
continue;
}
fast_fields[field_id.field_id() as usize] = Some(field_entry.name().to_string());
let column_type = match field_entry.field_type().value_type() {
Type::Str => ColumnType::Str,
Type::U64 => ColumnType::Numerical(NumericalType::U64),
Type::I64 => ColumnType::Numerical(NumericalType::I64),
Type::F64 => ColumnType::Numerical(NumericalType::F64),
Type::Bool => ColumnType::Bool,
Type::Date => ColumnType::DateTime,
Type::Facet => ColumnType::Str,
Type::Bytes => todo!(),
Type::Json => {
continue;
}
Type::IpAddr => todo!(),
};
if let FieldType::Date(date_options) = field_entry.field_type() {
date_precisions[field_id.field_id() as usize] = date_options.get_precision();
}
columnar_writer.record_column_type(field_entry.name(), column_type);
}
FastFieldsWriter {
u128_value_writers,
u128_multi_value_writers,
term_id_writers,
single_value_writers,
multi_values_writers,
bytes_value_writers,
columnar_writer,
fast_field_names: fast_fields,
num_docs: 0u32,
date_precisions,
}
}
/// The memory used (including children)
pub fn mem_usage(&self) -> usize {
self.term_id_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.single_value_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.multi_values_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.bytes_value_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.u128_value_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.u128_multi_value_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
self.columnar_writer.mem_usage()
}
/// Get the `FastFieldWriter` associated with a field.
pub fn get_term_id_writer(&self, field: Field) -> Option<&MultiValuedFastFieldWriter> {
// TODO optimize
self.term_id_writers
.iter()
.find(|field_writer| field_writer.field() == field)
}
/// Get the `FastFieldWriter` associated with a field.
pub fn get_field_writer(&self, field: Field) -> Option<&IntFastFieldWriter> {
// TODO optimize
self.single_value_writers
.iter()
.find(|field_writer| field_writer.field() == field)
}
/// Get the `FastFieldWriter` associated with a field.
pub fn get_field_writer_mut(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
// TODO optimize
self.single_value_writers
.iter_mut()
.find(|field_writer| field_writer.field() == field)
}
/// Get the `FastFieldWriter` associated with a field.
pub fn get_term_id_writer_mut(
&mut self,
field: Field,
) -> Option<&mut MultiValuedFastFieldWriter> {
// TODO optimize
self.term_id_writers
.iter_mut()
.find(|field_writer| field_writer.field() == field)
}
/// Returns the fast field multi-value writer for the given field.
///
/// Returns `None` if the field does not exist, or is not
/// configured as a multivalued fastfield in the schema.
pub fn get_multivalue_writer_mut(
&mut self,
field: Field,
) -> Option<&mut MultiValuedFastFieldWriter> {
// TODO optimize
self.multi_values_writers
.iter_mut()
.find(|multivalue_writer| multivalue_writer.field() == field)
}
/// Returns the bytes fast field writer for the given field.
///
/// Returns `None` if the field does not exist, or is not
/// configured as a bytes fastfield in the schema.
pub fn get_bytes_writer_mut(&mut self, field: Field) -> Option<&mut BytesFastFieldWriter> {
// TODO optimize
self.bytes_value_writers
.iter_mut()
.find(|field_writer| field_writer.field() == field)
}
/// Indexes all of the fastfields of a new document.
pub fn add_document(&mut self, doc: &Document) -> crate::Result<()> {
for field_writer in &mut self.term_id_writers {
field_writer.add_document(doc)?;
}
for field_writer in &mut self.single_value_writers {
field_writer.add_document(doc)?;
}
for field_writer in &mut self.multi_values_writers {
field_writer.add_document(doc)?;
}
for field_writer in &mut self.bytes_value_writers {
field_writer.add_document(doc)?;
}
for field_writer in &mut self.u128_value_writers {
field_writer.add_document(doc)?;
}
for field_writer in &mut self.u128_multi_value_writers {
field_writer.add_document(doc)?;
let doc_id = self.num_docs;
for field_value in doc.field_values() {
if let Some(field_name) =
self.fast_field_names[field_value.field().field_id() as usize].as_ref()
{
match &field_value.value {
Value::U64(u64_val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name.as_str(),
NumericalValue::from(*u64_val),
);
}
Value::I64(i64_val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name.as_str(),
NumericalValue::from(*i64_val),
);
}
Value::F64(f64_val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name.as_str(),
NumericalValue::from(*f64_val),
);
}
Value::Str(_) => todo!(),
Value::PreTokStr(_) => todo!(),
Value::Bool(bool_val) => {
self.columnar_writer
.record_bool(doc_id, field_name.as_str(), *bool_val);
}
Value::Date(datetime) => {
let date_precision =
self.date_precisions[field_value.field().field_id() as usize];
let truncated_datetime = datetime.truncate(date_precision);
self.columnar_writer.record_datetime(
doc_id,
field_name.as_str(),
truncated_datetime.into(),
);
}
Value::Facet(_) => todo!(),
Value::Bytes(_) => todo!(),
Value::JsonObject(_) => todo!(),
Value::IpAddr(_) => todo!(),
}
}
}
self.num_docs += 1;
Ok(())
}
/// Serializes all of the `FastFieldWriter`s by pushing them in
/// order to the fast field serializer.
pub fn serialize(
self,
serializer: &mut CompositeFastFieldSerializer,
mapping: &HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>>,
mut self,
wrt: &mut dyn io::Write,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
for field_writer in self.term_id_writers {
let field = field_writer.field();
field_writer.serialize(serializer, mapping.get(&field), doc_id_map)?;
}
for field_writer in &self.single_value_writers {
field_writer.serialize(serializer, doc_id_map)?;
}
for field_writer in self.multi_values_writers {
let field = field_writer.field();
field_writer.serialize(serializer, mapping.get(&field), doc_id_map)?;
}
for field_writer in self.bytes_value_writers {
field_writer.serialize(serializer, doc_id_map)?;
}
for field_writer in self.u128_value_writers {
field_writer.serialize(serializer, doc_id_map)?;
}
for field_writer in self.u128_multi_value_writers {
field_writer.serialize(serializer, doc_id_map)?;
}
assert!(doc_id_map.is_none()); // TODO handle doc id map
let num_docs = self.num_docs;
self.columnar_writer.serialize(num_docs, wrt)?;
Ok(())
}
}
/// Fast field writer for u128 values.
/// The fast field writer just keeps the values in memory.
///
/// Only once the segment writer is closed and
/// persisted on disk is the fast field writer
/// sent to a `FastFieldSerializer` via the `.serialize(...)`
/// method.
///
/// We cannot serialize earlier as the values are
/// compressed to a compact number space and the number of
/// bits required for bitpacking can only be known once
/// we have seen all of the values.
pub struct U128FastFieldWriter {
field: Field,
vals: Vec<u128>,
val_count: u32,
}
impl U128FastFieldWriter {
/// Creates a new `U128FastFieldWriter`
pub fn new(field: Field) -> Self {
Self {
field,
vals: vec![],
val_count: 0,
}
}
/// The memory used (including children)
pub fn mem_usage(&self) -> usize {
self.vals.len() * 16
}
/// Records a new value.
///
/// The n-th value being recorded is implicitly
/// associated with the document with the `DocId` n.
/// (Well, `n-1` actually because of 0-indexing)
pub fn add_val(&mut self, val: u128) {
self.vals.push(val);
}
/// Extracts the fast field value from the document
/// (or uses the default value) and records it.
///
/// Extracts the value associated with the fast field for
/// this document.
pub fn add_document(&mut self, doc: &Document) -> crate::Result<()> {
match doc.get_first(self.field) {
Some(v) => {
let ip_addr = v.as_ip_addr().ok_or_else(|| unexpected_value("ip", v))?;
let value = ip_addr.to_u128();
self.add_val(value);
}
None => {
self.add_val(0); // TODO fix null handling
}
};
self.val_count += 1;
Ok(())
}
/// Pushes the fast field values to the `CompositeFastFieldSerializer`.
pub fn serialize(
&self,
serializer: &mut CompositeFastFieldSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
if let Some(doc_id_map) = doc_id_map {
let iter_gen = || {
doc_id_map
.iter_old_doc_ids()
.map(|idx| self.vals[idx as usize])
};
serializer.create_u128_fast_field_with_idx(self.field, iter_gen, self.val_count, 0)?;
} else {
let iter_gen = || self.vals.iter().cloned();
serializer.create_u128_fast_field_with_idx(self.field, iter_gen, self.val_count, 0)?;
}
Ok(())
}
}
/// Fast field writer for ints.
/// The fast field writer just keeps the values in memory.
///
/// Only once the segment writer is closed and
/// persisted on disk is the fast field writer
/// sent to a `FastFieldSerializer` via the `.serialize(...)`
/// method.
///
/// We cannot serialize earlier as the values are
/// bitpacked and the number of bits required for bitpacking
/// can only be known once we have seen all of the values.
///
/// Both u64, i64 and f64 use the same writer.
/// i64 and f64 are just remapped to the `0..2^64 - 1`
/// using `common::i64_to_u64` and `common::f64_to_u64`.
pub struct IntFastFieldWriter {
field: Field,
precision_opt: Option<DatePrecision>,
vals: BlockedBitpacker,
val_count: usize,
val_if_missing: u64,
val_min: u64,
val_max: u64,
}
impl IntFastFieldWriter {
/// Creates a new `IntFastFieldWriter`
pub fn new(field: Field, precision_opt: Option<DatePrecision>) -> IntFastFieldWriter {
IntFastFieldWriter {
field,
precision_opt,
vals: BlockedBitpacker::new(),
val_count: 0,
val_if_missing: 0u64,
val_min: u64::MAX,
val_max: 0,
}
}
/// The memory used (including children)
pub fn mem_usage(&self) -> usize {
self.vals.mem_usage()
}
/// Returns the field that this writer is targeting.
pub fn field(&self) -> Field {
self.field
}
/// Sets the default value.
///
/// This default value is recorded for a document
/// if it does not have any value.
fn set_val_if_missing(&mut self, val_if_missing: u64) {
self.val_if_missing = val_if_missing;
}
/// Records a new value.
///
/// The n-th value being recorded is implicitly
/// associated with the document with the `DocId` n.
/// (Well, `n-1` actually because of 0-indexing)
pub fn add_val(&mut self, val: u64) {
self.vals.add(val);
if val > self.val_max {
self.val_max = val;
}
if val < self.val_min {
self.val_min = val;
}
self.val_count += 1;
}
/// Extracts the fast field value from the document
/// (or uses the default value) and records it.
///
/// Extracts the value associated with the fast field for
/// this document.
///
/// i64 and f64 are remapped to u64 using the logic
/// in `common::i64_to_u64` and `common::f64_to_u64`.
///
/// If the value is missing, then the default value is used
/// instead.
/// If the document has more than one value for the given field,
/// only the first one is taken into account.
///
/// Values on text fast fields are skipped.
pub fn add_document(&mut self, doc: &Document) -> crate::Result<()> {
match doc.get_first(self.field) {
Some(v) => {
let value = match (self.precision_opt, v) {
(Some(precision), Value::Date(date_val)) => {
date_val.truncate(precision).to_u64()
}
_ => super::value_to_u64(v)?,
};
self.add_val(value);
}
None => {
self.add_val(self.val_if_missing);
}
};
Ok(())
}
/// Returns an iterator over the data.
pub(crate) fn iter(&self) -> impl Iterator<Item = u64> + '_ {
self.vals.iter()
}
/// Pushes the fast field values to the `CompositeFastFieldSerializer`.
pub fn serialize(
&self,
serializer: &mut CompositeFastFieldSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
let (min, max) = if self.val_min > self.val_max {
(0, 0)
} else {
(self.val_min, self.val_max)
};
let fastfield_accessor = WriterFastFieldAccessProvider {
doc_id_map,
vals: &self.vals,
min_value: min,
max_value: max,
num_vals: self.val_count as u32,
};
serializer.create_auto_detect_u64_fast_field(self.field, fastfield_accessor)?;
Ok(())
}
}
#[derive(Clone)]
struct WriterFastFieldAccessProvider<'map, 'bitp> {
doc_id_map: Option<&'map DocIdMapping>,
vals: &'bitp BlockedBitpacker,
min_value: u64,
max_value: u64,
num_vals: u32,
}
impl<'map, 'bitp> Column for WriterFastFieldAccessProvider<'map, 'bitp> {
/// Return the value associated with the given doc.
///
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance
/// reasons.
///
/// # Panics
///
/// May panic if `doc` is out of bounds.
fn get_val(&self, _doc: u32) -> u64 {
unimplemented!()
}
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
if let Some(doc_id_map) = self.doc_id_map {
Box::new(
doc_id_map
.iter_old_doc_ids()
.map(|doc_id| self.vals.get(doc_id as usize)),
)
} else {
Box::new(self.vals.iter())
}
}
fn min_value(&self) -> u64 {
self.min_value
}
fn max_value(&self) -> u64 {
self.max_value
}
fn num_vals(&self) -> u32 {
self.num_vals
}
}
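A condensed, hedged sketch of the new write path built on `ColumnarWriter`; the "price" column and the two documents are illustrative.

fn write_columnar() -> io::Result<Vec<u8>> {
    let mut columnar_writer = ColumnarWriter::default();
    // Force the column type up front, as `from_schema` does above.
    columnar_writer.record_column_type("price", ColumnType::Numerical(NumericalType::U64));
    columnar_writer.record_numerical(0u32, "price", NumericalValue::from(12u64));
    // Recording two values for the same doc makes the column multi-valued.
    columnar_writer.record_numerical(1u32, "price", NumericalValue::from(7u64));
    columnar_writer.record_numerical(1u32, "price", NumericalValue::from(9u64));
    let mut buffer: Vec<u8> = Vec::new();
    columnar_writer.serialize(2u32, &mut buffer)?;
    Ok(buffer)
}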

View File

@@ -113,34 +113,35 @@ pub(crate) fn get_doc_id_mapping_from_field(
sort_by_field: IndexSortByField,
segment_writer: &SegmentWriter,
) -> crate::Result<DocIdMapping> {
let schema = segment_writer.segment_serializer.segment().schema();
let field_id = expect_field_id_for_sort_field(&schema, &sort_by_field)?; // for now expect fastfield, but not strictly required
let fast_field = segment_writer
.fast_field_writers
.get_field_writer(field_id)
.ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"sort index by field is required to be a fast field {:?}",
sort_by_field.field
))
})?;
todo!()
// let schema = segment_writer.segment_serializer.segment().schema();
// // for now expect fastfield, but not strictly required
// let field_id = expect_field_id_for_sort_field(&schema, &sort_by_field)?;
// let fast_field = segment_writer
// .fast_field_writers
// .get_field_writer(field_id)
// .ok_or_else(|| {
// TantivyError::InvalidArgument(format!(
// "sort index by field is required to be a fast field {:?}",
// sort_by_field.field
// ))
// })?;
// create new doc_id to old doc_id index (used in fast_field_writers)
let mut doc_id_and_data = fast_field
.iter()
.enumerate()
.map(|el| (el.0 as DocId, el.1))
.collect::<Vec<_>>();
if sort_by_field.order == Order::Desc {
doc_id_and_data.sort_by_key(|k| Reverse(k.1));
} else {
doc_id_and_data.sort_by_key(|k| k.1);
}
let new_doc_id_to_old = doc_id_and_data
.into_iter()
.map(|el| el.0)
.collect::<Vec<_>>();
Ok(DocIdMapping::from_new_id_to_old_id(new_doc_id_to_old))
// // create new doc_id to old doc_id index (used in fast_field_writers)
// let mut doc_id_and_data = fast_field
// .iter()
// .enumerate()
// .map(|el| (el.0 as DocId, el.1))
// .collect::<Vec<_>>();
// if sort_by_field.order == Order::Desc {
// doc_id_and_data.sort_by_key(|k| Reverse(k.1));
// } else {
// doc_id_and_data.sort_by_key(|k| k.1);
// }
// let new_doc_id_to_old = doc_id_and_data
// .into_iter()
// .map(|el| el.0)
// .collect::<Vec<_>>();
// Ok(DocIdMapping::from_new_id_to_old_id(new_doc_id_to_old))
}
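Until the commented-out body is reconnected, the intended logic is a sort of doc ids by their fast field value; a standalone sketch of that computation, assuming `std::cmp::Reverse` and tantivy's `Order` are in scope.

fn new_to_old_doc_ids(values: &[u64], order: Order) -> Vec<DocId> {
    let mut doc_id_and_data: Vec<(DocId, u64)> = values
        .iter()
        .enumerate()
        .map(|(doc_id, &val)| (doc_id as DocId, val))
        .collect();
    match order {
        // `Reverse` flips the comparison for descending sorts.
        Order::Desc => doc_id_and_data.sort_by_key(|k| Reverse(k.1)),
        Order::Asc => doc_id_and_data.sort_by_key(|k| k.1),
    }
    doc_id_and_data.into_iter().map(|(doc_id, _)| doc_id).collect()
}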
#[cfg(test)]
@@ -159,15 +160,11 @@ mod tests_indexsorting {
let my_text_field = schema_builder.add_text_field("text_field", text_field_options);
let my_string_field = schema_builder.add_text_field("string_field", STRING | STORED);
let my_number = schema_builder.add_u64_field(
"my_number",
NumericOptions::default().set_fast(Cardinality::SingleValue),
);
let my_number =
schema_builder.add_u64_field("my_number", NumericOptions::default().set_fast());
let multi_numbers = schema_builder.add_u64_field(
"multi_numbers",
NumericOptions::default().set_fast(Cardinality::MultiValues),
);
let multi_numbers =
schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast());
let schema = schema_builder.build();
let mut index_builder = Index::builder().schema(schema);
@@ -441,47 +438,48 @@ mod tests_indexsorting {
Ok(())
}
#[test]
fn test_sort_index_fast_field() -> crate::Result<()> {
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "my_number".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
get_text_options(),
)?;
assert_eq!(
index.settings().sort_by_field.as_ref().unwrap().field,
"my_number".to_string()
);
// #[test]
// fn test_sort_index_fast_field() -> crate::Result<()> {
// let index = create_test_index(
// Some(IndexSettings {
// sort_by_field: Some(IndexSortByField {
// field: "my_number".to_string(),
// order: Order::Asc,
// }),
// ..Default::default()
// }),
// get_text_options(),
// )?;
// assert_eq!(
// index.settings().sort_by_field.as_ref().unwrap().field,
// "my_number".to_string()
// );
let searcher = index.reader()?.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
let fast_fields = segment_reader.fast_fields();
index.schema().get_field("my_number").unwrap();
// let searcher = index.reader()?.searcher();
// assert_eq!(searcher.segment_readers().len(), 1);
// let segment_reader = searcher.segment_reader(0);
// let fast_fields = segment_reader.fast_fields();
// let my_number = index.schema().get_field("my_number").unwrap();
let fast_field = fast_fields.u64("my_number").unwrap();
assert_eq!(fast_field.get_val(0), 10u64);
assert_eq!(fast_field.get_val(1), 20u64);
assert_eq!(fast_field.get_val(2), 30u64);
// let fast_field = fast_fields.u64(my_number).unwrap();
// assert_eq!(fast_field.get_val(0), 10u64);
// assert_eq!(fast_field.get_val(1), 20u64);
// assert_eq!(fast_field.get_val(2), 30u64);
let multifield = fast_fields.u64s("multi_numbers").unwrap();
let mut vals = vec![];
multifield.get_vals(0u32, &mut vals);
assert_eq!(vals, &[] as &[u64]);
let mut vals = vec![];
multifield.get_vals(1u32, &mut vals);
assert_eq!(vals, &[5, 6]);
// let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
// let multifield = fast_fields.u64s(multi_numbers).unwrap();
// let mut vals = vec![];
// multifield.get_vals(0u32, &mut vals);
// assert_eq!(vals, &[] as &[u64]);
// let mut vals = vec![];
// multifield.get_vals(1u32, &mut vals);
// assert_eq!(vals, &[5, 6]);
let mut vals = vec![];
multifield.get_vals(2u32, &mut vals);
assert_eq!(vals, &[3]);
Ok(())
}
// let mut vals = vec![];
// multifield.get_vals(2u32, &mut vals);
// assert_eq!(vals, &[3]);
// Ok(())
// }
#[test]
fn test_doc_mapping() {

File diff suppressed because it is too large

View File

@@ -150,7 +150,6 @@ fn index_json_value(
json_term_writer.term_buffer,
ctx,
indexing_position,
None,
);
}
TextOrDateTime::DateTime(dt) => {

File diff suppressed because it is too large

View File

@@ -2,19 +2,17 @@
mod tests {
use crate::collector::TopDocs;
use crate::core::Index;
use crate::fastfield::{AliveBitSet, MultiValuedFastFieldReader};
use crate::fastfield::AliveBitSet;
use crate::query::QueryParser;
use crate::schema::{
self, BytesOptions, Cardinality, Facet, FacetOptions, IndexRecordOption, NumericOptions,
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
TextFieldIndexing, TextOptions,
};
use crate::{DocAddress, DocSet, IndexSettings, IndexSortByField, Order, Postings, Term};
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
let mut schema_builder = schema::Schema::builder();
let int_options = NumericOptions::default()
.set_fast(Cardinality::SingleValue)
.set_indexed();
let int_options = NumericOptions::default().set_fast().set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options);
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
@@ -62,7 +60,7 @@ mod tests {
) -> crate::Result<Index> {
let mut schema_builder = schema::Schema::builder();
let int_options = NumericOptions::default()
.set_fast(Cardinality::SingleValue)
.set_fast()
.set_stored()
.set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options);
@@ -71,10 +69,8 @@ mod tests {
let bytes_field = schema_builder.add_bytes_field("bytes", bytes_options);
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let multi_numbers = schema_builder.add_u64_field(
"multi_numbers",
NumericOptions::default().set_fast(Cardinality::MultiValues),
);
let multi_numbers =
schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast());
let text_field_options = TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
@@ -349,128 +345,130 @@ mod tests {
}
}
#[test]
fn test_merge_sorted_index_asc() {
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
false,
)
.unwrap();
// #[test]
// fn test_merge_sorted_index_asc() {
// let index = create_test_index(
// Some(IndexSettings {
// sort_by_field: Some(IndexSortByField {
// field: "intval".to_string(),
// order: Order::Asc,
// }),
// ..Default::default()
// }),
// false,
// )
// .unwrap();
let int_field = index.schema().get_field("intval").unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_readers().last().unwrap();
// let int_field = index.schema().get_field("intval").unwrap();
// let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
// let bytes_field = index.schema().get_field("bytes").unwrap();
// let reader = index.reader().unwrap();
// let searcher = reader.searcher();
// assert_eq!(searcher.segment_readers().len(), 1);
// let segment_reader = searcher.segment_readers().last().unwrap();
let fast_fields = segment_reader.fast_fields();
let fast_field = fast_fields.u64("intval").unwrap();
assert_eq!(fast_field.get_val(0), 1u64);
assert_eq!(fast_field.get_val(1), 2u64);
assert_eq!(fast_field.get_val(2), 3u64);
assert_eq!(fast_field.get_val(3), 10u64);
assert_eq!(fast_field.get_val(4), 20u64);
assert_eq!(fast_field.get_val(5), 1_000u64);
// let fast_fields = segment_reader.fast_fields();
// let fast_field = fast_fields.u64(int_field).unwrap();
// assert_eq!(fast_field.get_val(0), 1u64);
// assert_eq!(fast_field.get_val(1), 2u64);
// assert_eq!(fast_field.get_val(2), 3u64);
// assert_eq!(fast_field.get_val(3), 10u64);
// assert_eq!(fast_field.get_val(4), 20u64);
// assert_eq!(fast_field.get_val(5), 1_000u64);
let get_vals = |fast_field: &MultiValuedFastFieldReader<u64>, doc_id: u32| -> Vec<u64> {
let mut vals = vec![];
fast_field.get_vals(doc_id, &mut vals);
vals
};
let fast_fields = segment_reader.fast_fields();
let fast_field = fast_fields.u64s("multi_numbers").unwrap();
assert_eq!(&get_vals(&fast_field, 0), &[] as &[u64]);
assert_eq!(&get_vals(&fast_field, 1), &[2, 3]);
assert_eq!(&get_vals(&fast_field, 2), &[3, 4]);
assert_eq!(&get_vals(&fast_field, 3), &[10, 11]);
assert_eq!(&get_vals(&fast_field, 4), &[20]);
assert_eq!(&get_vals(&fast_field, 5), &[1001, 1002]);
// let get_vals = |fast_field: &MultiValuedFastFieldReader<u64>, doc_id: u32| -> Vec<u64> {
// let mut vals = vec![];
// fast_field.get_vals(doc_id, &mut vals);
// vals
// };
// let fast_fields = segment_reader.fast_fields();
// let fast_field = fast_fields.u64s(multi_numbers).unwrap();
// assert_eq!(&get_vals(&fast_field, 0), &[] as &[u64]);
// assert_eq!(&get_vals(&fast_field, 1), &[2, 3]);
// assert_eq!(&get_vals(&fast_field, 2), &[3, 4]);
// assert_eq!(&get_vals(&fast_field, 3), &[10, 11]);
// assert_eq!(&get_vals(&fast_field, 4), &[20]);
// assert_eq!(&get_vals(&fast_field, 5), &[1001, 1002]);
let fast_field = fast_fields.bytes("bytes").unwrap();
assert_eq!(fast_field.get_bytes(0), &[] as &[u8]);
assert_eq!(fast_field.get_bytes(2), &[1, 2, 3]);
assert_eq!(fast_field.get_bytes(5), &[5, 5]);
// let fast_field = fast_fields.bytes(bytes_field).unwrap();
// assert_eq!(fast_field.get_bytes(0), &[] as &[u8]);
// assert_eq!(fast_field.get_bytes(2), &[1, 2, 3]);
// assert_eq!(fast_field.get_bytes(5), &[5, 5]);
// test new field norm mapping
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
assert_eq!(fieldnorm_reader.fieldnorm(1), 4);
assert_eq!(fieldnorm_reader.fieldnorm(2), 2); // some text
assert_eq!(fieldnorm_reader.fieldnorm(3), 1);
assert_eq!(fieldnorm_reader.fieldnorm(5), 3); // the biggest num
}
// // test new field norm mapping
// {
// let my_text_field = index.schema().get_field("text_field").unwrap();
// let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
// assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
// assert_eq!(fieldnorm_reader.fieldnorm(1), 4);
// assert_eq!(fieldnorm_reader.fieldnorm(2), 2); // some text
// assert_eq!(fieldnorm_reader.fieldnorm(3), 1);
// assert_eq!(fieldnorm_reader.fieldnorm(5), 3); // the biggest num
// }
let searcher = index.reader().unwrap().searcher();
{
let my_text_field = index.schema().get_field("text_field").unwrap();
// let searcher = index.reader().unwrap().searcher();
// {
// let my_text_field = index.schema().get_field("text_field").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![my_text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
// let do_search = |term: &str| {
// let query = QueryParser::for_index(&index, vec![my_text_field])
// .parse_query(term)
// .unwrap();
// let top_docs: Vec<(f32, DocAddress)> =
// searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
// top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
// };
assert_eq!(do_search("some"), vec![2]);
assert_eq!(do_search("blubber"), vec![3]);
assert_eq!(do_search("biggest"), vec![5]);
}
// assert_eq!(do_search("some"), vec![2]);
// assert_eq!(do_search("blubber"), vec![3]);
// assert_eq!(do_search("biggest"), vec![5]);
// }
// postings file
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
// // postings file
// {
// let my_text_field = index.schema().get_field("text_field").unwrap();
// let term_a = Term::from_field_text(my_text_field, "text");
// let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
// let mut postings = inverted_index
// .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
// .unwrap()
// .unwrap();
assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
),
2
);
// assert_eq!(postings.doc_freq(), 2);
// let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
// assert_eq!(
// postings.doc_freq_given_deletes(
// segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
// ),
// 2
// );
let mut output = vec![];
postings.positions(&mut output);
assert_eq!(output, vec![1, 3]);
postings.advance();
// let mut output = vec![];
// postings.positions(&mut output);
// assert_eq!(output, vec![1, 3]);
// postings.advance();
postings.positions(&mut output);
assert_eq!(output, vec![1]);
}
// postings.positions(&mut output);
// assert_eq!(output, vec![1]);
// }
// access doc store
{
let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1));
let doc = searcher.doc(DocAddress::new(0, 1)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(2));
let doc = searcher.doc(DocAddress::new(0, 2)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(3));
let doc = searcher.doc(DocAddress::new(0, 3)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(10));
let doc = searcher.doc(DocAddress::new(0, 4)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(20));
let doc = searcher.doc(DocAddress::new(0, 5)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1_000));
}
}
// // access doc store
// {
// let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1));
// let doc = searcher.doc(DocAddress::new(0, 1)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(2));
// let doc = searcher.doc(DocAddress::new(0, 2)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(3));
// let doc = searcher.doc(DocAddress::new(0, 3)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(10));
// let doc = searcher.doc(DocAddress::new(0, 4)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(20));
// let doc = searcher.doc(DocAddress::new(0, 5)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1_000));
// }
// }
}
#[cfg(all(test, feature = "unstable"))]
@@ -487,9 +485,7 @@ mod bench_sorted_index_merge {
use crate::{IndexSettings, IndexSortByField, IndexWriter, Order};
fn create_index(sort_by_field: Option<IndexSortByField>) -> Index {
let mut schema_builder = Schema::builder();
let int_options = NumericOptions::default()
.set_fast(Cardinality::SingleValue)
.set_indexed();
let int_options = NumericOptions::default().set_fast().set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options);
let schema = schema_builder.build();

View File

@@ -19,8 +19,8 @@ mod segment_register;
pub mod segment_serializer;
pub mod segment_updater;
mod segment_writer;
mod sorted_doc_id_column;
mod sorted_doc_id_multivalue_column;
// mod sorted_doc_id_column;
// mod sorted_doc_id_multivalue_column;
mod stamper;
use crossbeam_channel as channel;
@@ -58,7 +58,7 @@ type AddBatchReceiver = channel::Receiver<AddBatch>;
#[cfg(test)]
mod tests_mmap {
use crate::collector::Count;
use crate::query::QueryParser;
// use crate::query::QueryParser;
use crate::schema::{JsonObjectOptions, Schema, TEXT};
use crate::{Index, Term};
@@ -79,45 +79,45 @@ mod tests_mmap {
Ok(())
}
#[test]
fn test_json_field_expand_dots_disabled_dot_escaped_required() {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
index_writer.add_document(doc!(json_field=>json)).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 1);
let parse_query = QueryParser::for_index(&index, Vec::new());
let query = parse_query
.parse_query(r#"json.k8s\.container\.name:prometheus"#)
.unwrap();
let num_docs = searcher.search(&query, &Count).unwrap();
assert_eq!(num_docs, 1);
}
// #[test]
// fn test_json_field_expand_dots_disabled_dot_escaped_required() {
// let mut schema_builder = Schema::builder();
// let json_field = schema_builder.add_json_field("json", TEXT);
// let index = Index::create_in_ram(schema_builder.build());
// let mut index_writer = index.writer_for_tests().unwrap();
// let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
// index_writer.add_document(doc!(json_field=>json)).unwrap();
// index_writer.commit().unwrap();
// let reader = index.reader().unwrap();
// let searcher = reader.searcher();
// assert_eq!(searcher.num_docs(), 1);
// let parse_query = QueryParser::for_index(&index, Vec::new());
// let query = parse_query
// .parse_query(r#"json.k8s\.container\.name:prometheus"#)
// .unwrap();
// let num_docs = searcher.search(&query, &Count).unwrap();
// assert_eq!(num_docs, 1);
// }
#[test]
fn test_json_field_expand_dots_enabled_dot_escape_not_required() {
let mut schema_builder = Schema::builder();
let json_options: JsonObjectOptions =
JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
let json_field = schema_builder.add_json_field("json", json_options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
index_writer.add_document(doc!(json_field=>json)).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 1);
let parse_query = QueryParser::for_index(&index, Vec::new());
let query = parse_query
.parse_query(r#"json.k8s.container.name:prometheus"#)
.unwrap();
let num_docs = searcher.search(&query, &Count).unwrap();
assert_eq!(num_docs, 1);
}
// #[test]
// fn test_json_field_expand_dots_enabled_dot_escape_not_required() {
// let mut schema_builder = Schema::builder();
// let json_options: JsonObjectOptions =
// JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
// let json_field = schema_builder.add_json_field("json", json_options);
// let index = Index::create_in_ram(schema_builder.build());
// let mut index_writer = index.writer_for_tests().unwrap();
// let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
// index_writer.add_document(doc!(json_field=>json)).unwrap();
// index_writer.commit().unwrap();
// let reader = index.reader().unwrap();
// let searcher = reader.searcher();
// assert_eq!(searcher.num_docs(), 1);
// let parse_query = QueryParser::for_index(&index, Vec::new());
// let query = parse_query
// .parse_query(r#"json.k8s.container.name:prometheus"#)
// .unwrap();
// let num_docs = searcher.search(&query, &Count).unwrap();
// assert_eq!(num_docs, 1);
// }
}

View File

@@ -1,4 +1,7 @@
use common::TerminatingWrite;
use crate::core::{Segment, SegmentComponent};
use crate::directory::WritePtr;
use crate::fastfield::CompositeFastFieldSerializer;
use crate::fieldnorm::FieldNormsSerializer;
use crate::postings::InvertedIndexSerializer;
@@ -9,7 +12,7 @@ use crate::store::StoreWriter;
pub struct SegmentSerializer {
segment: Segment,
pub(crate) store_writer: StoreWriter,
fast_field_serializer: CompositeFastFieldSerializer,
fast_field_write: WritePtr,
fieldnorms_serializer: Option<FieldNormsSerializer>,
postings_serializer: InvertedIndexSerializer,
}
@@ -47,7 +50,6 @@ impl SegmentSerializer {
};
let fast_field_write = segment.open_write(SegmentComponent::FastFields)?;
let fast_field_serializer = CompositeFastFieldSerializer::from_write(fast_field_write)?;
let fieldnorms_write = segment.open_write(SegmentComponent::FieldNorms)?;
let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;
@@ -56,7 +58,7 @@ impl SegmentSerializer {
Ok(SegmentSerializer {
segment,
store_writer,
fast_field_serializer,
fast_field_write,
fieldnorms_serializer: Some(fieldnorms_serializer),
postings_serializer,
})
@@ -81,8 +83,8 @@ impl SegmentSerializer {
}
/// Accessor to the fast field `WritePtr`.
pub fn get_fast_field_serializer(&mut self) -> &mut CompositeFastFieldSerializer {
&mut self.fast_field_serializer
pub fn get_fast_field_write(&mut self) -> &mut WritePtr {
&mut self.fast_field_write
}
/// Extract the field norm serializer.
@@ -102,7 +104,7 @@ impl SegmentSerializer {
if let Some(fieldnorms_serializer) = self.extract_fieldnorms_serializer() {
fieldnorms_serializer.close()?;
}
self.fast_field_serializer.close()?;
self.fast_field_write.terminate()?;
self.postings_serializer.close()?;
self.store_writer.close()?;
Ok(())
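The serializer now holds the raw fast-field `WritePtr` and terminates it directly instead of delegating to a `CompositeFastFieldSerializer`. A minimal sketch of the `TerminatingWrite` contract used above, assuming it keeps the `terminate(self) -> io::Result<()>` shape:

    use std::io::Write;
    use common::TerminatingWrite;

    // Write one segment component, then terminate it. `terminate()` takes the
    // writer by value, so a terminated component cannot be written to again.
    fn write_component<W: TerminatingWrite>(mut wrt: W, bytes: &[u8]) -> std::io::Result<()> {
        wrt.write_all(bytes)?;
        wrt.terminate()
    }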

View File

@@ -139,7 +139,6 @@ impl SegmentWriter {
self.ctx,
self.fast_field_writers,
&self.fieldnorms_writer,
&self.schema,
self.segment_serializer,
mapping.as_ref(),
)?;
@@ -185,22 +184,15 @@ impl SegmentWriter {
for value in values {
let facet = value.as_facet().ok_or_else(make_schema_error)?;
let facet_str = facet.encoded_str();
let mut unordered_term_id_opt = None;
FacetTokenizer
.token_stream(facet_str)
.process(&mut |token| {
term_buffer.set_text(&token.text);
let unordered_term_id =
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
// TODO pass indexing context directly in subscribe function
unordered_term_id_opt = Some(unordered_term_id);
});
if let Some(unordered_term_id) = unordered_term_id_opt {
self.fast_field_writers
.get_term_id_writer_mut(field)
.expect("writer for facet missing")
.add_val(unordered_term_id);
}
let mut facet_tokenizer = FacetTokenizer.token_stream(facet_str);
let mut indexing_position = IndexingPosition::default();
postings_writer.index_text(
doc_id,
&mut *facet_tokenizer,
term_buffer,
ctx,
&mut indexing_position,
);
}
}
FieldType::Str(_) => {
@@ -227,7 +219,6 @@ impl SegmentWriter {
term_buffer,
ctx,
&mut indexing_position,
self.fast_field_writers.get_term_id_writer_mut(field),
);
}
if field_entry.has_fieldnorms() {
@@ -383,7 +374,6 @@ fn remap_and_write(
ctx: IndexingContext,
fast_field_writers: FastFieldsWriter,
fieldnorms_writer: &FieldNormsWriter,
schema: &Schema,
mut serializer: SegmentSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> crate::Result<()> {
@@ -395,20 +385,15 @@ fn remap_and_write(
.segment()
.open_read(SegmentComponent::FieldNorms)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
let term_ord_map = serialize_postings(
serialize_postings(
ctx,
per_field_postings_writers,
fieldnorm_readers,
doc_id_map,
schema,
serializer.get_postings_serializer(),
)?;
debug!("fastfield-serialize");
fast_field_writers.serialize(
serializer.get_fast_field_serializer(),
&term_ord_map,
doc_id_map,
)?;
fast_field_writers.serialize(serializer.get_fast_field_write(), doc_id_map)?;
// finalize temp docstore and create version, which reflects the doc_id_map
if let Some(doc_id_map) = doc_id_map {
@@ -834,23 +819,20 @@ mod tests {
// This is a bit of a contrived example.
let tokens = PreTokenizedString {
text: "contrived-example".to_string(), //< I can't think of a use case where this corner case happens in real life.
tokens: vec![
Token {
// Not the last token, yet ends after the last token.
offset_from: 0,
offset_to: 14,
position: 0,
text: "long_token".to_string(),
position_length: 3,
},
Token {
offset_from: 0,
offset_to: 14,
position: 1,
text: "short".to_string(),
position_length: 1,
},
],
tokens: vec![Token { // Not the last token, yet ends after the last token.
offset_from: 0,
offset_to: 14,
position: 0,
text: "long_token".to_string(),
position_length: 3,
},
Token {
offset_from: 0,
offset_to: 14,
position: 1,
text: "short".to_string(),
position_length: 1,
}],
};
doc.add_pre_tokenized_text(text, tokens);
doc.add_text(text, "hello");
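The corner case above hinges on `position_length`: a token occupies positions `position .. position + position_length`, so the token that starts last (`short`, at position 1) is not the one that ends last (`long_token` covers positions 0 through 2). A tiny sketch of that arithmetic:

    // Positions covered by each token: position .. position + position_length.
    let covers = |position: u32, position_length: u32| position..(position + position_length);
    assert_eq!(covers(0, 3), 0..3); // long_token spans three positions
    assert_eq!(covers(1, 1), 1..2); // short starts later but ends earlier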

View File

@@ -147,6 +147,22 @@ pub struct DateTime {
pub(crate) timestamp_micros: i64,
}
impl From<columnar::DateTime> for DateTime {
fn from(columnar_datetime: columnar::DateTime) -> Self {
DateTime {
timestamp_micros: columnar_datetime.timestamp_micros,
}
}
}
impl From<DateTime> for columnar::DateTime {
fn from(datetime: crate::DateTime) -> Self {
columnar::DateTime {
timestamp_micros: datetime.timestamp_micros,
}
}
}
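Together, these impls make the conversion between the crate-local `DateTime` and `columnar::DateTime` lossless in both directions, since each side stores the same `timestamp_micros`. A round-trip sketch, assuming `DateTime` keeps its derived equality:

    let dt = DateTime::from_timestamp_secs(1_674_000_000);
    let col: columnar::DateTime = dt.into();
    let back: DateTime = col.into();
    assert_eq!(back, dt);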
impl DateTime {
/// Create new from UNIX timestamp in seconds
pub const fn from_timestamp_secs(seconds: i64) -> Self {
@@ -263,7 +279,7 @@ mod indexer;
pub mod error;
pub mod tokenizer;
pub mod aggregation;
// pub mod aggregation;
pub mod collector;
pub mod directory;
pub mod fastfield;

View File

@@ -2,13 +2,10 @@ use std::io;
use stacker::Addr;
use crate::fastfield::MultiValuedFastFieldWriter;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::postings_writer::SpecializedPostingsWriter;
use crate::postings::recorder::{BufferLender, DocIdRecorder, Recorder};
use crate::postings::{
FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter, UnorderedTermId,
};
use crate::postings::{FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter};
use crate::schema::term::as_json_path_type_value_bytes;
use crate::schema::Type;
use crate::tokenizer::TokenStream;
@@ -33,8 +30,8 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
pos: u32,
term: &crate::Term,
ctx: &mut IndexingContext,
) -> UnorderedTermId {
self.non_str_posting_writer.subscribe(doc, pos, term, ctx)
) {
self.non_str_posting_writer.subscribe(doc, pos, term, ctx);
}
fn index_text(
@@ -44,7 +41,6 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
term_buffer: &mut Term,
ctx: &mut IndexingContext,
indexing_position: &mut IndexingPosition,
_fast_field_writer: Option<&mut MultiValuedFastFieldWriter>,
) {
self.str_posting_writer.index_text(
doc_id,
@@ -52,20 +48,19 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
term_buffer,
ctx,
indexing_position,
None,
);
}
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(
&self,
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
term_addrs: &[(Term<&[u8]>, Addr)],
doc_id_map: Option<&DocIdMapping>,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
) -> io::Result<()> {
let mut buffer_lender = BufferLender::default();
for (term, addr, _) in term_addrs {
for (term, addr) in term_addrs {
// TODO optimization opportunity here.
if let Some((_, typ, _)) = as_json_path_type_value_bytes(term.value_bytes()) {
if typ == Type::Str {

View File

@@ -6,7 +6,6 @@ use std::ops::Range;
use rustc_hash::FxHashMap;
use stacker::Addr;
use crate::fastfield::MultiValuedFastFieldWriter;
use crate::fieldnorm::FieldNormReaders;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::recorder::{BufferLender, Recorder};
@@ -21,12 +20,10 @@ use crate::DocId;
const POSITION_GAP: u32 = 1;
fn make_field_partition(
term_offsets: &[(Term<&[u8]>, Addr, UnorderedTermId)],
) -> Vec<(Field, Range<usize>)> {
fn make_field_partition(term_offsets: &[(Term<&[u8]>, Addr)]) -> Vec<(Field, Range<usize>)> {
let term_offsets_it = term_offsets
.iter()
.map(|(term, _, _)| term.field())
.map(|(term, _)| term.field())
.enumerate();
let mut prev_field_opt = None;
let mut fields = vec![];
@@ -54,48 +51,18 @@ pub(crate) fn serialize_postings(
per_field_postings_writers: &PerFieldPostingsWriter,
fieldnorm_readers: FieldNormReaders,
doc_id_map: Option<&DocIdMapping>,
schema: &Schema,
serializer: &mut InvertedIndexSerializer,
) -> crate::Result<HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>>> {
let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> =
Vec::with_capacity(ctx.term_index.len());
) -> crate::Result<()> {
let mut term_offsets: Vec<(Term<&[u8]>, Addr)> = Vec::with_capacity(ctx.term_index.len());
term_offsets.extend(
ctx.term_index
.iter()
.map(|(bytes, addr, unordered_id)| (Term::wrap(bytes), addr, unordered_id)),
.map(|(bytes, addr, _unordered_id)| (Term::wrap(bytes), addr)),
);
term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone());
let mut unordered_term_mappings: HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>> =
HashMap::new();
term_offsets.sort_unstable_by_key(|(k, _)| k.clone());
let field_offsets = make_field_partition(&term_offsets);
for (field, byte_offsets) in field_offsets {
let field_entry = schema.get_field_entry(field);
match *field_entry.field_type() {
FieldType::Str(_) | FieldType::Facet(_) => {
// populating the (unordered term ord) -> (ordered term ord) mapping
// for the field.
let unordered_term_ids = term_offsets[byte_offsets.clone()]
.iter()
.map(|&(_, _, bucket)| bucket);
let mapping: FxHashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
.enumerate()
.map(|(term_ord, unord_term_id)| {
(unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
})
.collect();
unordered_term_mappings.insert(field, mapping);
}
FieldType::U64(_)
| FieldType::I64(_)
| FieldType::F64(_)
| FieldType::Date(_)
| FieldType::Bool(_) => {}
FieldType::Bytes(_) => {}
FieldType::JsonObject(_) => {}
FieldType::IpAddr(_) => {}
}
let postings_writer = per_field_postings_writers.get_for_field(field);
let fieldnorm_reader = fieldnorm_readers.get_field(field)?;
let mut field_serializer =
@@ -108,7 +75,7 @@ pub(crate) fn serialize_postings(
)?;
field_serializer.close()?;
}
Ok(unordered_term_mappings)
Ok(())
}
#[derive(Default)]
@@ -129,19 +96,13 @@ pub(crate) trait PostingsWriter: Send + Sync {
/// * term - the term
/// * ctx - Contains a term hashmap and a memory arena to store all necessary posting list
/// information.
fn subscribe(
&mut self,
doc: DocId,
pos: u32,
term: &Term,
ctx: &mut IndexingContext,
) -> UnorderedTermId;
fn subscribe(&mut self, doc: DocId, pos: u32, term: &Term, ctx: &mut IndexingContext);
/// Serializes the postings on disk.
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(
&self,
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
term_addrs: &[(Term<&[u8]>, Addr)],
doc_id_map: Option<&DocIdMapping>,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
@@ -155,7 +116,6 @@ pub(crate) trait PostingsWriter: Send + Sync {
term_buffer: &mut Term,
ctx: &mut IndexingContext,
indexing_position: &mut IndexingPosition,
mut term_id_fast_field_writer_opt: Option<&mut MultiValuedFastFieldWriter>,
) {
let end_of_path_idx = term_buffer.len_bytes();
let mut num_tokens = 0;
@@ -175,11 +135,7 @@ pub(crate) trait PostingsWriter: Send + Sync {
term_buffer.append_bytes(token.text.as_bytes());
let start_position = indexing_position.end_position + token.position as u32;
end_position = end_position.max(start_position + token.position_length as u32);
let unordered_term_id = self.subscribe(doc_id, start_position, term_buffer, ctx);
if let Some(term_id_fast_field_writer) = term_id_fast_field_writer_opt.as_mut() {
term_id_fast_field_writer.add_val(unordered_term_id);
}
self.subscribe(doc_id, start_position, term_buffer, ctx);
num_tokens += 1;
});
@@ -227,13 +183,7 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
}
impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
fn subscribe(
&mut self,
doc: DocId,
position: u32,
term: &Term,
ctx: &mut IndexingContext,
) -> UnorderedTermId {
fn subscribe(&mut self, doc: DocId, position: u32, term: &Term, ctx: &mut IndexingContext) {
debug_assert!(term.as_slice().len() >= 4);
self.total_num_tokens += 1;
let (term_index, arena) = (&mut ctx.term_index, &mut ctx.arena);
@@ -252,18 +202,18 @@ impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
recorder.record_position(position, arena);
recorder
}
}) as UnorderedTermId
});
}
fn serialize(
&self,
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
term_addrs: &[(Term<&[u8]>, Addr)],
doc_id_map: Option<&DocIdMapping>,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
) -> io::Result<()> {
let mut buffer_lender = BufferLender::default();
for (term, addr, _) in term_addrs {
for (term, addr) in term_addrs {
Self::serialize_one_term(term, *addr, doc_id_map, &mut buffer_lender, ctx, serializer)?;
}
Ok(())

View File

@@ -15,7 +15,7 @@ mod more_like_this;
mod phrase_query;
mod query;
mod query_parser;
mod range_query;
// mod range_query;
mod regex_query;
mod reqopt_scorer;
mod scorer;
@@ -50,7 +50,7 @@ pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
pub use self::phrase_query::PhraseQuery;
pub use self::query::{EnableScoring, Query, QueryClone};
pub use self::query_parser::{QueryParser, QueryParserError};
pub use self::range_query::RangeQuery;
// pub use self::range_query::RangeQuery;
pub use self::regex_query::RegexQuery;
pub use self::reqopt_scorer::RequiredOptionalScorer;
pub use self::score_combiner::{

View File

@@ -13,10 +13,19 @@ use crate::core::Index;
use crate::indexer::{
convert_to_fast_value_and_get_term, set_string_and_get_terms, JsonTermWriter,
};
use crate::query::range_query::is_type_valid_for_fastfield_range_query;
// use crate::query::range_query::is_type_valid_for_fastfield_range_query;
use crate::query::{
AllQuery, BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhraseQuery, Query,
RangeQuery, TermQuery, TermSetQuery,
AllQuery,
BooleanQuery,
BoostQuery,
EmptyQuery,
FuzzyTermQuery,
Occur,
PhraseQuery,
Query,
// RangeQuery,
TermQuery,
TermSetQuery,
};
use crate::schema::{
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
@@ -334,89 +343,90 @@ impl QueryParser {
json_path: &str,
phrase: &str,
) -> Result<Term, QueryParserError> {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
let field_supports_ff_range_queries = field_type.is_fast()
&& is_type_valid_for_fastfield_range_query(field_type.value_type());
if !field_type.is_indexed() && !field_supports_ff_range_queries {
return Err(QueryParserError::FieldNotIndexed(
field_entry.name().to_string(),
));
}
if !json_path.is_empty() && field_type.value_type() != Type::Json {
return Err(QueryParserError::UnsupportedQuery(format!(
"Json path is not supported for field {:?}",
field_entry.name()
)));
}
match *field_type {
FieldType::U64(_) => {
let val: u64 = u64::from_str(phrase)?;
Ok(Term::from_field_u64(field, val))
}
FieldType::I64(_) => {
let val: i64 = i64::from_str(phrase)?;
Ok(Term::from_field_i64(field, val))
}
FieldType::F64(_) => {
let val: f64 = f64::from_str(phrase)?;
Ok(Term::from_field_f64(field, val))
}
FieldType::Bool(_) => {
let val: bool = bool::from_str(phrase)?;
Ok(Term::from_field_bool(field, val))
}
FieldType::Date(_) => {
let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
Ok(Term::from_field_date(field, DateTime::from_utc(dt)))
}
FieldType::Str(ref str_options) => {
let option = str_options.get_indexing_options().ok_or_else(|| {
// This should have been seen earlier really.
QueryParserError::FieldNotIndexed(field_entry.name().to_string())
})?;
let text_analyzer =
self.tokenizer_manager
.get(option.tokenizer())
.ok_or_else(|| QueryParserError::UnknownTokenizer {
field: field_entry.name().to_string(),
tokenizer: option.tokenizer().to_string(),
})?;
let mut terms: Vec<Term> = Vec::new();
let mut token_stream = text_analyzer.token_stream(phrase);
token_stream.process(&mut |token| {
let term = Term::from_field_text(field, &token.text);
terms.push(term);
});
if terms.len() != 1 {
return Err(QueryParserError::UnsupportedQuery(format!(
"Range query boundary cannot have multiple tokens: {phrase:?}."
)));
}
Ok(terms.into_iter().next().unwrap())
}
FieldType::JsonObject(_) => {
// Json ranges are not supported.
Err(QueryParserError::UnsupportedQuery(
"Range query are not supported on json field.".to_string(),
))
}
FieldType::Facet(_) => match Facet::from_text(phrase) {
Ok(facet) => Ok(Term::from_facet(field, &facet)),
Err(e) => Err(QueryParserError::from(e)),
},
FieldType::Bytes(_) => {
let bytes = BASE64
.decode(phrase)
.map_err(QueryParserError::ExpectedBase64)?;
Ok(Term::from_field_bytes(field, &bytes))
}
FieldType::IpAddr(_) => {
let ip_v6 = IpAddr::from_str(phrase)?.into_ipv6_addr();
Ok(Term::from_field_ip_addr(field, ip_v6))
}
}
todo!();
// let field_entry = self.schema.get_field_entry(field);
// let field_type = field_entry.field_type();
// let field_supports_ff_range_queries = field_type.is_fast()
// && is_type_valid_for_fastfield_range_query(field_type.value_type());
//
// if !field_type.is_indexed() && !field_supports_ff_range_queries {
// return Err(QueryParserError::FieldNotIndexed(
// field_entry.name().to_string(),
// ));
// }
// if !json_path.is_empty() && field_type.value_type() != Type::Json {
// return Err(QueryParserError::UnsupportedQuery(format!(
// "Json path is not supported for field {:?}",
// field_entry.name()
// )));
// }
// match *field_type {
// FieldType::U64(_) => {
// let val: u64 = u64::from_str(phrase)?;
// Ok(Term::from_field_u64(field, val))
// }
// FieldType::I64(_) => {
// let val: i64 = i64::from_str(phrase)?;
// Ok(Term::from_field_i64(field, val))
// }
// FieldType::F64(_) => {
// let val: f64 = f64::from_str(phrase)?;
// Ok(Term::from_field_f64(field, val))
// }
// FieldType::Bool(_) => {
// let val: bool = bool::from_str(phrase)?;
// Ok(Term::from_field_bool(field, val))
// }
// FieldType::Date(_) => {
// let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
// Ok(Term::from_field_date(field, DateTime::from_utc(dt)))
// }
// FieldType::Str(ref str_options) => {
// let option = str_options.get_indexing_options().ok_or_else(|| {
// // This should have been seen earlier really.
// QueryParserError::FieldNotIndexed(field_entry.name().to_string())
// })?;
// let text_analyzer =
// self.tokenizer_manager
// .get(option.tokenizer())
// .ok_or_else(|| QueryParserError::UnknownTokenizer {
// field: field_entry.name().to_string(),
// tokenizer: option.tokenizer().to_string(),
// })?;
// let mut terms: Vec<Term> = Vec::new();
// let mut token_stream = text_analyzer.token_stream(phrase);
// token_stream.process(&mut |token| {
// let term = Term::from_field_text(field, &token.text);
// terms.push(term);
// });
// if terms.len() != 1 {
// return Err(QueryParserError::UnsupportedQuery(format!(
// "Range query boundary cannot have multiple tokens: {phrase:?}."
// )));
// }
// Ok(terms.into_iter().next().unwrap())
// }
// FieldType::JsonObject(_) => {
// // Json ranges are not supported.
// Err(QueryParserError::UnsupportedQuery(
// "Range query are not supported on json field.".to_string(),
// ))
// }
// FieldType::Facet(_) => match Facet::from_text(phrase) {
// Ok(facet) => Ok(Term::from_facet(field, &facet)),
// Err(e) => Err(QueryParserError::from(e)),
// },
// FieldType::Bytes(_) => {
// let bytes = BASE64
// .decode(phrase)
// .map_err(QueryParserError::ExpectedBase64)?;
// Ok(Term::from_field_bytes(field, &bytes))
// }
// FieldType::IpAddr(_) => {
// let ip_v6 = IpAddr::from_str(phrase)?.into_ipv6_addr();
// Ok(Term::from_field_ip_addr(field, ip_v6))
// }
// }
}
fn compute_logical_ast_for_leaf(
@@ -740,9 +750,12 @@ fn convert_literal_to_query(
value_type,
lower,
upper,
} => Box::new(RangeQuery::new_term_bounds(
field, value_type, &lower, &upper,
)),
} => {
todo!();
// Box::new(RangeQuery::new_term_bounds(
// field, value_type, &lower, &upper,
// ))
}
LogicalLiteral::Set { elements, .. } => Box::new(TermSetQuery::new(elements)),
LogicalLiteral::All => Box::new(AllQuery),
}

View File

@@ -4,7 +4,7 @@ use std::sync::Arc;
use fastfield_codecs::Column;
use crate::fastfield::{MakeZero, MultiValuedFastFieldReader};
use crate::fastfield::MakeZero;
use crate::{DocId, DocSet, TERMINATED};
/// Helper to have a cursor over a vec of docids

View File

@@ -8,10 +8,13 @@ use std::ops::{Bound, RangeInclusive};
use common::BinarySerializable;
use fastfield_codecs::MonotonicallyMappableToU128;
use super::fast_field_range_query::{FastFieldCardinality, RangeDocSet};
use super::range_query::map_bound;
use crate::query::{ConstScorer, Explanation, Scorer, Weight};
<<<<<<< HEAD
use crate::schema::Cardinality;
=======
use crate::schema::Field;
>>>>>>> fd1deefd12 (Disconnected facet / fast field merges / examples)
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
/// `IPFastFieldRangeWeight` uses the ip address fast field to execute range queries.
@@ -40,6 +43,7 @@ impl IPFastFieldRangeWeight {
impl Weight for IPFastFieldRangeWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
<<<<<<< HEAD
let field_type = reader
.schema()
.get_field_entry(reader.schema().get_field(&self.field)?)
@@ -74,6 +78,40 @@ impl Weight for IPFastFieldRangeWeight {
Ok(Box::new(ConstScorer::new(docset, boost)))
}
}
=======
todo!();
// let field_type = reader.schema().get_field_entry(self.field).field_type();
// match field_type.fastfield_cardinality().unwrap() {
// Cardinality::SingleValue => {
// let ip_addr_fast_field = reader.fast_fields().ip_addr(self.field)?;
// let value_range = bound_to_value_range(
// &self.left_bound,
// &self.right_bound,
// ip_addr_fast_field.min_value(),
// ip_addr_fast_field.max_value(),
// );
// let docset = RangeDocSet::new(
// value_range,
// FastFieldCardinality::SingleValue(ip_addr_fast_field),
// );
// Ok(Box::new(ConstScorer::new(docset, boost)))
// }
// Cardinality::MultiValues => {
// let ip_addr_fast_field = reader.fast_fields().ip_addrs(self.field)?;
// let value_range = bound_to_value_range(
// &self.left_bound,
// &self.right_bound,
// ip_addr_fast_field.min_value(),
// ip_addr_fast_field.max_value(),
// );
// let docset = RangeDocSet::new(
// value_range,
// FastFieldCardinality::MultiValue(ip_addr_fast_field),
// );
// Ok(Box::new(ConstScorer::new(docset, boost)))
// }
// }
>>>>>>> fd1deefd12 (Disconnected facet / fast field merges / examples)
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
@@ -190,7 +228,7 @@ mod tests {
let ips_field = schema_builder.add_ip_addr_field(
"ips",
IpAddrOptions::default()
.set_fast(Cardinality::MultiValues)
.set_fast()
.set_indexed(),
);
let text_field = schema_builder.add_text_field("id", STRING | STORED);

View File

@@ -6,10 +6,14 @@ use std::ops::{Bound, RangeInclusive};
use fastfield_codecs::MonotonicallyMappableToU64;
use super::fast_field_range_query::{FastFieldCardinality, RangeDocSet};
use super::fast_field_range_query::RangeDocSet;
use super::range_query::map_bound;
use crate::query::{ConstScorer, Explanation, Scorer, Weight};
<<<<<<< HEAD
use crate::schema::Cardinality;
=======
use crate::schema::Field;
>>>>>>> fd1deefd12 (Disconnected facet / fast field merges / examples)
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
/// `FastFieldRangeWeight` uses the fast field to execute range queries.
@@ -33,6 +37,7 @@ impl FastFieldRangeWeight {
impl Weight for FastFieldRangeWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
<<<<<<< HEAD
let field_type = reader
.schema()
.get_field_entry(reader.schema().get_field(&self.field)?)
@@ -63,6 +68,36 @@ impl Weight for FastFieldRangeWeight {
Ok(Box::new(ConstScorer::new(docset, boost)))
}
}
=======
todo!();
// let field_type = reader.schema().get_field_entry(self.field).field_type();
// match field_type.fastfield_cardinality().unwrap() {
// Cardinality::SingleValue => {
// let fast_field = reader.fast_fields().u64_lenient(self.field)?;
// let value_range = bound_to_value_range(
// &self.left_bound,
// &self.right_bound,
// fast_field.min_value(),
// fast_field.max_value(),
// );
// let docset =
// RangeDocSet::new(value_range, FastFieldCardinality::SingleValue(fast_field));
// Ok(Box::new(ConstScorer::new(docset, boost)))
// }
// Cardinality::MultiValues => {
// let fast_field = reader.fast_fields().u64s_lenient(self.field)?;
// let value_range = bound_to_value_range(
// &self.left_bound,
// &self.right_bound,
// fast_field.min_value(),
// fast_field.max_value(),
// );
// let docset =
// RangeDocSet::new(value_range, FastFieldCardinality::MultiValue(fast_field));
// Ok(Box::new(ConstScorer::new(docset, boost)))
// }
// }
>>>>>>> fd1deefd12 (Disconnected facet / fast field merges / examples)
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
@@ -186,7 +221,7 @@ mod tests {
let ids_u64_field = schema_builder.add_u64_field(
"ids",
NumericOptions::default()
.set_fast(Cardinality::MultiValues)
.set_fast()
.set_indexed(),
);
@@ -194,7 +229,7 @@ mod tests {
let ids_f64_field = schema_builder.add_f64_field(
"ids_f64",
NumericOptions::default()
.set_fast(Cardinality::MultiValues)
.set_fast()
.set_indexed(),
);
@@ -202,7 +237,7 @@ mod tests {
let ids_i64_field = schema_builder.add_i64_field(
"ids_i64",
NumericOptions::default()
.set_fast(Cardinality::MultiValues)
.set_fast()
.set_indexed(),
);

View File

@@ -109,7 +109,6 @@ impl TermQuery {
} else {
IndexRecordOption::Basic
};
Ok(TermWeight::new(
self.term.clone(),
index_record_option,

View File

@@ -44,7 +44,7 @@ pub struct IndexReaderBuilder {
index: Index,
warmers: Vec<Weak<dyn Warmer>>,
num_warming_threads: usize,
doc_store_cache_num_blocks: usize,
doc_store_cache_size: usize,
}
impl IndexReaderBuilder {
@@ -55,7 +55,7 @@ impl IndexReaderBuilder {
index,
warmers: Vec::new(),
num_warming_threads: 1,
doc_store_cache_num_blocks: DOCSTORE_CACHE_CAPACITY,
doc_store_cache_size: DOCSTORE_CACHE_CAPACITY,
}
}
@@ -72,7 +72,7 @@ impl IndexReaderBuilder {
searcher_generation_inventory.clone(),
)?;
let inner_reader = InnerIndexReader::new(
self.doc_store_cache_num_blocks,
self.doc_store_cache_size,
self.index,
warming_state,
searcher_generation_inventory,
@@ -119,11 +119,8 @@ impl IndexReaderBuilder {
///
/// By default, the doc store readers cache up to DOCSTORE_CACHE_CAPACITY (100) decompressed blocks.
#[must_use]
pub fn doc_store_cache_num_blocks(
mut self,
doc_store_cache_num_blocks: usize,
) -> IndexReaderBuilder {
self.doc_store_cache_num_blocks = doc_store_cache_num_blocks;
pub fn doc_store_cache_size(mut self, doc_store_cache_size: usize) -> IndexReaderBuilder {
self.doc_store_cache_size = doc_store_cache_size;
self
}
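Usage stays a one-liner on the builder; note that the unit is a number of decompressed doc store blocks, not bytes. A sketch relying on the `TryInto<IndexReader>` impl below:

    use tantivy::{Index, IndexReader};

    fn open_reader(index: &Index) -> tantivy::Result<IndexReader> {
        index
            .reader_builder()
            .doc_store_cache_size(50) // cache up to 50 decompressed blocks
            .try_into()
    }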
@@ -154,7 +151,7 @@ impl TryInto<IndexReader> for IndexReaderBuilder {
}
struct InnerIndexReader {
doc_store_cache_num_blocks: usize,
doc_store_cache_size: usize,
index: Index,
warming_state: WarmingState,
searcher: arc_swap::ArcSwap<SearcherInner>,
@@ -164,7 +161,7 @@ struct InnerIndexReader {
impl InnerIndexReader {
fn new(
doc_store_cache_num_blocks: usize,
doc_store_cache_size: usize,
index: Index,
warming_state: WarmingState,
// The searcher_generation_inventory is not used as source, but as target to track the
@@ -175,13 +172,13 @@ impl InnerIndexReader {
let searcher = Self::create_searcher(
&index,
doc_store_cache_num_blocks,
doc_store_cache_size,
&warming_state,
&searcher_generation_counter,
&searcher_generation_inventory,
)?;
Ok(InnerIndexReader {
doc_store_cache_num_blocks,
doc_store_cache_size,
index,
warming_state,
searcher: ArcSwap::from(searcher),
@@ -217,7 +214,7 @@ impl InnerIndexReader {
fn create_searcher(
index: &Index,
doc_store_cache_num_blocks: usize,
doc_store_cache_size: usize,
warming_state: &WarmingState,
searcher_generation_counter: &Arc<AtomicU64>,
searcher_generation_inventory: &Inventory<SearcherGeneration>,
@@ -235,7 +232,7 @@ impl InnerIndexReader {
index.clone(),
segment_readers,
searcher_generation,
doc_store_cache_num_blocks,
doc_store_cache_size,
)?);
warming_state.warm_new_searcher_generation(&searcher.clone().into())?;
@@ -245,7 +242,7 @@ impl InnerIndexReader {
fn reload(&self) -> crate::Result<()> {
let searcher = Self::create_searcher(
&self.index,
self.doc_store_cache_num_blocks,
self.doc_store_cache_size,
&self.warming_state,
&self.searcher_generation_counter,
&self.searcher_generation_inventory,

View File

@@ -2,14 +2,16 @@ use std::ops::BitOr;
use serde::{Deserialize, Serialize};
use super::Cardinality;
use crate::schema::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag};
/// DateTime Precision
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
#[derive(
Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize, Default,
)]
#[serde(rename_all = "lowercase")]
pub enum DatePrecision {
/// Seconds precision
#[default]
Seconds,
/// Milli-seconds precision.
Milliseconds,
@@ -17,20 +19,13 @@ pub enum DatePrecision {
Microseconds,
}
impl Default for DatePrecision {
fn default() -> Self {
DatePrecision::Seconds
}
}
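The hand-written impl above is replaced by `#[derive(Default)]` together with the `#[default]` variant attribute (stable since Rust 1.62); both spellings produce the same value:

    // Same behavior as the removed manual impl.
    assert_eq!(DatePrecision::default(), DatePrecision::Seconds);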
/// Defines how a DateTime field should be handled by tantivy.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct DateOptions {
indexed: bool,
// This boolean has no effect if the field is not marked as indexed true.
fieldnorms: bool,
#[serde(skip_serializing_if = "Option::is_none")]
fast: Option<Cardinality>,
fast: bool,
stored: bool,
// Internal storage precision, used to optimize storage
// compression on fast fields.
@@ -54,18 +49,9 @@ impl DateOptions {
self.fieldnorms && self.indexed
}
/// Returns true iff the value is a fast field and multivalue.
pub fn is_multivalue_fast(&self) -> bool {
if let Some(cardinality) = self.fast {
cardinality == Cardinality::MultiValues
} else {
false
}
}
/// Returns true iff the value is a fast field.
pub fn is_fast(&self) -> bool {
self.fast.is_some()
self.fast
}
/// Set the field as stored.
@@ -107,19 +93,11 @@ impl DateOptions {
/// If more than one value is associated with a fast field, only the last one is
/// kept.
#[must_use]
pub fn set_fast(mut self, cardinality: Cardinality) -> DateOptions {
self.fast = Some(cardinality);
pub fn set_fast(mut self) -> DateOptions {
self.fast = true;
self
}
/// Returns the cardinality of the fastfield.
///
/// If the field has not been declared as a fastfield, then
/// the method returns `None`.
pub fn get_fastfield_cardinality(&self) -> Option<Cardinality> {
self.fast
}
/// Sets the precision for this DateTime field.
///
/// Internal storage precision, used to optimize storage
@@ -147,10 +125,7 @@ impl From<()> for DateOptions {
impl From<FastFlag> for DateOptions {
fn from(_: FastFlag) -> Self {
DateOptions {
indexed: false,
fieldnorms: false,
stored: false,
fast: Some(Cardinality::SingleValue),
fast: true,
..Default::default()
}
}
@@ -159,10 +134,7 @@ impl From<FastFlag> for DateOptions {
impl From<StoredFlag> for DateOptions {
fn from(_: StoredFlag) -> Self {
DateOptions {
indexed: false,
fieldnorms: false,
stored: true,
fast: None,
..Default::default()
}
}
@@ -173,8 +145,6 @@ impl From<IndexedFlag> for DateOptions {
DateOptions {
indexed: true,
fieldnorms: true,
stored: false,
fast: None,
..Default::default()
}
}
@@ -189,7 +159,7 @@ impl<T: Into<DateOptions>> BitOr<T> for DateOptions {
indexed: self.indexed | other.indexed,
fieldnorms: self.fieldnorms | other.fieldnorms,
stored: self.stored | other.stored,
fast: self.fast.or(other.fast),
fast: self.fast | other.fast,
precision: self.precision,
}
}
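With `fast` now a plain `bool`, merging two option sets is an OR over every flag, where the old code had to `Option::or` the cardinalities. A sketch of the flag-combination style this supports, assuming the usual `FAST`/`STORED`/`INDEXED` schema flags and accessors:

    use tantivy::schema::{DateOptions, FAST, INDEXED, STORED};

    let opts: DateOptions = DateOptions::from(FAST) | STORED | INDEXED;
    assert!(opts.is_fast() && opts.is_stored() && opts.is_indexed());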

View File

@@ -8,7 +8,7 @@ use serde_json::Value as JsonValue;
use thiserror::Error;
use super::ip_options::IpAddrOptions;
use super::{Cardinality, IntoIpv6Addr};
use super::IntoIpv6Addr;
use crate::schema::bytes_options::BytesOptions;
use crate::schema::facet_options::FacetOptions;
use crate::schema::{
@@ -241,26 +241,6 @@ impl FieldType {
}
}
/// returns true if the field is fast.
pub fn fastfield_cardinality(&self) -> Option<Cardinality> {
match *self {
FieldType::Bytes(ref bytes_options) => {
bytes_options.is_fast().then_some(Cardinality::SingleValue)
}
FieldType::Str(ref text_options) => {
text_options.is_fast().then_some(Cardinality::MultiValues)
}
FieldType::U64(ref int_options)
| FieldType::I64(ref int_options)
| FieldType::F64(ref int_options)
| FieldType::Bool(ref int_options) => int_options.get_fastfield_cardinality(),
FieldType::Date(ref date_options) => date_options.get_fastfield_cardinality(),
FieldType::Facet(_) => Some(Cardinality::MultiValues),
FieldType::JsonObject(_) => None,
FieldType::IpAddr(ref ip_addr_options) => ip_addr_options.get_fastfield_cardinality(),
}
}
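Call sites that branched on the returned `Cardinality` are left with a schema-level yes/no: whether a column is actually multivalued only becomes known once the segment's fast field is opened. A hedged sketch of the remaining check (`supports_fast_path` is a hypothetical helper, not part of this diff):

    use tantivy::schema::FieldType;

    // The schema can still say *that* a field is fast, but no longer
    // *how many* values per document it stores.
    fn supports_fast_path(field_type: &FieldType) -> bool {
        field_type.is_fast()
    }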
/// returns true if the field is normed (see [fieldnorms](crate::fieldnorm)).
pub fn has_fieldnorms(&self) -> bool {
match *self {

Some files were not shown because too many files have changed in this diff.