mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-31 14:32:54 +00:00
Compare commits
20 Commits
typed-colu
...
use_column
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
375d1f9dac | ||
|
|
2874554ee4 | ||
|
|
cbc70a9eae | ||
|
|
226d0f88bc | ||
|
|
9548570e88 | ||
|
|
9a296b29b7 | ||
|
|
b31fd389d8 | ||
|
|
89cec79813 | ||
|
|
d09d91a856 | ||
|
|
50d8a8bc32 | ||
|
|
08919a2900 | ||
|
|
8ba333f1b4 | ||
|
|
a2ca12995e | ||
|
|
e3d504d833 | ||
|
|
5a42c5aae9 | ||
|
|
a86b104a40 | ||
|
|
f9abd256b7 | ||
|
|
9f42b6440a | ||
|
|
c723ed3f0b | ||
|
|
d72ea7d353 |
@@ -55,11 +55,11 @@ measure_time = "0.8.2"
|
||||
async-trait = "0.1.53"
|
||||
arc-swap = "1.5.0"
|
||||
|
||||
columnar = { version="0.1", path="./columnar", package ="tantivy-columnar" }
|
||||
sstable = { version="0.1", path="./sstable", package ="tantivy-sstable", optional = true }
|
||||
stacker = { version="0.1", path="./stacker", package ="tantivy-stacker" }
|
||||
tantivy-query-grammar = { version= "0.19.0", path="./query-grammar" }
|
||||
tantivy-bitpacker = { version= "0.3", path="./bitpacker" }
|
||||
columnar = { version= "0.1", path="./columnar", package="tantivy-columnar" }
|
||||
common = { version= "0.5", path = "./common/", package = "tantivy-common" }
|
||||
fastfield_codecs = { version= "0.3", path="./fastfield_codecs", default-features = false }
|
||||
tokenizer-api = { version="0.1", path="./tokenizer-api", package="tantivy-tokenizer-api" }
|
||||
|
||||
45
README.md
45
README.md
@@ -41,7 +41,7 @@ Your mileage WILL vary depending on the nature of queries and their load.
|
||||
- SIMD integer compression when the platform/CPU includes the SSE2 instruction set
|
||||
- Single valued and multivalued u64, i64, and f64 fast fields (equivalent of doc values in Lucene)
|
||||
- `&[u8]` fast fields
|
||||
- Text, i64, u64, f64, dates, and hierarchical facet fields
|
||||
- Text, i64, u64, f64, dates, ip, bool, and hierarchical facet fields
|
||||
- Compressed document store (LZ4, Zstd, None, Brotli, Snap)
|
||||
- Range queries
|
||||
- Faceted search
|
||||
@@ -80,56 +80,21 @@ There are many ways to support this project.
|
||||
# Contributing code
|
||||
|
||||
We use the GitHub Pull Request workflow: reference a GitHub ticket and/or include a comprehensive commit message when opening a PR.
|
||||
Feel free to update CHANGELOG.md with your contribution.
|
||||
|
||||
## Tokenizer
|
||||
|
||||
When implementing a tokenizer for tantivy depend on the `tantivy-tokenizer-api` crate.
|
||||
|
||||
## Minimum supported Rust version
|
||||
|
||||
Tantivy currently requires at least Rust 1.62 or later to compile.
|
||||
|
||||
## Clone and build locally
|
||||
|
||||
Tantivy compiles on stable Rust.
|
||||
To check out and run tests, you can simply run:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/quickwit-oss/tantivy.git
|
||||
cd tantivy
|
||||
cargo build
|
||||
```
|
||||
|
||||
## Run tests
|
||||
|
||||
Some tests will not run with just `cargo test` because of `fail-rs`.
|
||||
To run the tests exhaustively, run `./run-tests.sh`.
|
||||
|
||||
## Debug
|
||||
|
||||
You might find it useful to step through the programme with a debugger.
|
||||
|
||||
### A failing test
|
||||
|
||||
Make sure you haven't run `cargo clean` after the most recent `cargo test` or `cargo build` to guarantee that the `target/` directory exists. Use this bash script to find the name of the most recent debug build of Tantivy and run it under `rust-gdb`:
|
||||
|
||||
```bash
|
||||
find target/debug/ -maxdepth 1 -executable -type f -name "tantivy*" -printf '%TY-%Tm-%Td %TT %p\n' | sort -r | cut -d " " -f 3 | xargs -I RECENT_DBG_TANTIVY rust-gdb RECENT_DBG_TANTIVY
|
||||
```
|
||||
|
||||
Now that you are in `rust-gdb`, you can set breakpoints on lines and methods that match your source code and run the debug executable with flags that you normally pass to `cargo test` like this:
|
||||
|
||||
```bash
|
||||
$gdb run --test-threads 1 --test $NAME_OF_TEST
|
||||
```
|
||||
|
||||
### An example
|
||||
|
||||
By default, `rustc` compiles everything in the `examples/` directory in debug mode. This makes it easy for you to make examples to reproduce bugs:
|
||||
|
||||
```bash
|
||||
rust-gdb target/debug/examples/$EXAMPLE_NAME
|
||||
$ gdb run
|
||||
git clone https://github.com/quickwit-oss/tantivy.git
|
||||
cd tantivy
|
||||
cargo test
|
||||
```
|
||||
|
||||
# Companies Using Tantivy
|
||||
|
||||
18
TODO.txt
18
TODO.txt
@@ -1,18 +0,0 @@
|
||||
Make schema_builder API fluent.
|
||||
fix doc serialization and prevent compression problems
|
||||
|
||||
u64, etc. should return Result<Option> now that we support optional; missing a column is really not an error
|
||||
remove fastfield codecs
|
||||
ditch the first_or_default trick. if it is still useful, improve its implementation.
|
||||
rename FastFieldReaders::open to load
|
||||
|
||||
|
||||
remove fast field reader
|
||||
|
||||
find a way to unify the two DateTime.
|
||||
readd type check in the filter wrapper
|
||||
|
||||
add unit test on columnar list columns.
|
||||
|
||||
make sure sort works
|
||||
|
||||
@@ -15,3 +15,7 @@ homepage = "https://github.com/quickwit-oss/tantivy"
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.8"
|
||||
proptest = "1"
|
||||
|
||||
@@ -4,9 +4,39 @@ extern crate test;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tantivy_bitpacker::BlockedBitpacker;
|
||||
use rand::seq::IteratorRandom;
|
||||
use rand::thread_rng;
|
||||
use tantivy_bitpacker::{BitPacker, BitUnpacker, BlockedBitpacker};
|
||||
use test::Bencher;
|
||||
|
||||
#[inline(never)]
|
||||
fn create_bitpacked_data(bit_width: u8, num_els: u32) -> Vec<u8> {
|
||||
let mut bitpacker = BitPacker::new();
|
||||
let mut buffer = Vec::new();
|
||||
for _ in 0..num_els {
|
||||
// the values do not matter.
|
||||
bitpacker.write(0u64, bit_width, &mut buffer).unwrap();
|
||||
bitpacker.flush(&mut buffer).unwrap();
|
||||
}
|
||||
buffer
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_bitpacking_read(b: &mut Bencher) {
|
||||
let bit_width = 3;
|
||||
let num_els = 1_000_000u32;
|
||||
let bit_unpacker = BitUnpacker::new(bit_width);
|
||||
let data = create_bitpacked_data(bit_width, num_els);
|
||||
let idxs: Vec<u32> = (0..num_els).choose_multiple(&mut thread_rng(), 100_000);
|
||||
b.iter(|| {
|
||||
let mut out = 0u64;
|
||||
for &idx in &idxs {
|
||||
out = out.wrapping_add(bit_unpacker.get(idx, &data[..]));
|
||||
}
|
||||
out
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_blockedbitp_read(b: &mut Bencher) {
|
||||
let mut blocked_bitpacker = BlockedBitpacker::new();
|
||||
@@ -14,9 +44,9 @@ mod tests {
|
||||
blocked_bitpacker.add(val * val);
|
||||
}
|
||||
b.iter(|| {
|
||||
let mut out = 0;
|
||||
let mut out = 0u64;
|
||||
for val in 0..=21500 {
|
||||
out = blocked_bitpacker.get(val);
|
||||
out = out.wrapping_add(blocked_bitpacker.get(val));
|
||||
}
|
||||
out
|
||||
});
|
||||
|
||||
@@ -56,27 +56,31 @@ impl BitPacker {
|
||||
|
||||
pub fn close<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
self.flush(output)?;
|
||||
// Padding the write file to simplify reads.
|
||||
output.write_all(&[0u8; 7])?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
#[derive(Clone, Debug, Default, Copy)]
|
||||
pub struct BitUnpacker {
|
||||
num_bits: u64,
|
||||
num_bits: u32,
|
||||
mask: u64,
|
||||
}
|
||||
|
||||
impl BitUnpacker {
|
||||
/// Creates a bit unpacker, that assumes the same bitwidth for all values.
|
||||
///
|
||||
/// The bitunpacker works by doing an unaligned read of 8 bytes.
|
||||
/// For this reason, values of `num_bits` between
|
||||
/// [57..63] are forbidden.
|
||||
pub fn new(num_bits: u8) -> BitUnpacker {
|
||||
assert!(num_bits <= 7 * 8 || num_bits == 64);
|
||||
let mask: u64 = if num_bits == 64 {
|
||||
!0u64
|
||||
} else {
|
||||
(1u64 << num_bits) - 1u64
|
||||
};
|
||||
BitUnpacker {
|
||||
num_bits: u64::from(num_bits),
|
||||
num_bits: u32::from(num_bits),
|
||||
mask,
|
||||
}
|
||||
}
|
||||
@@ -87,28 +91,40 @@ impl BitUnpacker {
|
||||
|
||||
#[inline]
|
||||
pub fn get(&self, idx: u32, data: &[u8]) -> u64 {
|
||||
if self.num_bits == 0 {
|
||||
return 0u64;
|
||||
}
|
||||
let addr_in_bits = idx * self.num_bits as u32;
|
||||
let addr_in_bits = idx * self.num_bits;
|
||||
let addr = (addr_in_bits >> 3) as usize;
|
||||
if addr + 8 > data.len() {
|
||||
if self.num_bits == 0 {
|
||||
return 0;
|
||||
}
|
||||
let bit_shift = addr_in_bits & 7;
|
||||
return self.get_slow_path(addr, bit_shift, data);
|
||||
}
|
||||
let bit_shift = addr_in_bits & 7;
|
||||
debug_assert!(
|
||||
addr + 8 <= data.len(),
|
||||
"The fast field field should have been padded with 7 bytes."
|
||||
);
|
||||
let bytes: [u8; 8] = (&data[addr..addr + 8]).try_into().unwrap();
|
||||
let val_unshifted_unmasked: u64 = u64::from_le_bytes(bytes);
|
||||
let val_shifted = val_unshifted_unmasked >> bit_shift;
|
||||
val_shifted & self.mask
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
fn get_slow_path(&self, addr: usize, bit_shift: u32, data: &[u8]) -> u64 {
|
||||
let mut bytes: [u8; 8] = [0u8; 8];
|
||||
let available_bytes = data.len() - addr;
|
||||
// This function is meant to only be called if we did not have 8 bytes to load.
|
||||
debug_assert!(available_bytes < 8);
|
||||
bytes[..available_bytes].copy_from_slice(&data[addr..]);
|
||||
let val_unshifted_unmasked: u64 = u64::from_le_bytes(bytes);
|
||||
let val_shifted = val_unshifted_unmasked >> bit_shift;
|
||||
val_shifted & self.mask
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::{BitPacker, BitUnpacker};
|
||||
|
||||
fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker, Vec<u64>, Vec<u8>) {
|
||||
fn create_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker, Vec<u64>, Vec<u8>) {
|
||||
let mut data = Vec::new();
|
||||
let mut bitpacker = BitPacker::new();
|
||||
let max_val: u64 = (1u64 << num_bits as u64) - 1u64;
|
||||
@@ -119,13 +135,13 @@ mod test {
|
||||
bitpacker.write(val, num_bits, &mut data).unwrap();
|
||||
}
|
||||
bitpacker.close(&mut data).unwrap();
|
||||
assert_eq!(data.len(), ((num_bits as usize) * len + 7) / 8 + 7);
|
||||
assert_eq!(data.len(), ((num_bits as usize) * len + 7) / 8);
|
||||
let bitunpacker = BitUnpacker::new(num_bits);
|
||||
(bitunpacker, vals, data)
|
||||
}
|
||||
|
||||
fn test_bitpacker_util(len: usize, num_bits: u8) {
|
||||
let (bitunpacker, vals, data) = create_fastfield_bitpacker(len, num_bits);
|
||||
let (bitunpacker, vals, data) = create_bitpacker(len, num_bits);
|
||||
for (i, val) in vals.iter().enumerate() {
|
||||
assert_eq!(bitunpacker.get(i as u32, &data), *val);
|
||||
}
|
||||
@@ -139,4 +155,49 @@ mod test {
|
||||
test_bitpacker_util(6, 14);
|
||||
test_bitpacker_util(1000, 14);
|
||||
}
|
||||
|
||||
use proptest::prelude::*;
|
||||
|
||||
fn num_bits_strategy() -> impl Strategy<Value = u8> {
|
||||
prop_oneof!(Just(0), Just(1), 2u8..56u8, Just(56), Just(64),)
|
||||
}
|
||||
|
||||
fn vals_strategy() -> impl Strategy<Value = (u8, Vec<u64>)> {
|
||||
(num_bits_strategy(), 0usize..100usize).prop_flat_map(|(num_bits, len)| {
|
||||
let max_val = if num_bits == 64 {
|
||||
u64::MAX
|
||||
} else {
|
||||
(1u64 << num_bits as u32) - 1
|
||||
};
|
||||
let vals = proptest::collection::vec(0..=max_val, len);
|
||||
vals.prop_map(move |vals| (num_bits, vals))
|
||||
})
|
||||
}
|
||||
|
||||
fn test_bitpacker_aux(num_bits: u8, vals: &[u64]) {
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
let mut bitpacker = BitPacker::new();
|
||||
for &val in vals {
|
||||
bitpacker.write(val, num_bits, &mut buffer).unwrap();
|
||||
}
|
||||
bitpacker.flush(&mut buffer).unwrap();
|
||||
assert_eq!(buffer.len(), (vals.len() * num_bits as usize + 7) / 8);
|
||||
let bitunpacker = BitUnpacker::new(num_bits);
|
||||
let max_val = if num_bits == 64 {
|
||||
u64::MAX
|
||||
} else {
|
||||
(1u64 << num_bits) - 1
|
||||
};
|
||||
for (i, val) in vals.iter().copied().enumerate() {
|
||||
assert!(val <= max_val);
|
||||
assert_eq!(bitunpacker.get(i as u32, &buffer), val);
|
||||
}
|
||||
}
|
||||
|
||||
proptest::proptest! {
|
||||
#[test]
|
||||
fn test_bitpacker_proptest((num_bits, vals) in vals_strategy()) {
|
||||
test_bitpacker_aux(num_bits, &vals);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,24 +5,23 @@ edition = "2021"
|
||||
license = "MIT"
|
||||
|
||||
[dependencies]
|
||||
itertools = "0.10.5"
|
||||
log = "0.4.17"
|
||||
fnv = "1.0.7"
|
||||
fastdivide = "0.4.0"
|
||||
rand = { version = "0.8.5", optional = true }
|
||||
measure_time = { version = "0.8.2", optional = true }
|
||||
prettytable-rs = { version = "0.10.0", optional = true }
|
||||
|
||||
stacker = { path = "../stacker", package="tantivy-stacker"}
|
||||
serde_json = "1"
|
||||
thiserror = "1"
|
||||
fnv = "1"
|
||||
sstable = { path = "../sstable", package = "tantivy-sstable" }
|
||||
common = { path = "../common", package = "tantivy-common" }
|
||||
itertools = "0.10"
|
||||
log = "0.4"
|
||||
tantivy-bitpacker = { version= "0.3", path = "../bitpacker/" }
|
||||
prettytable-rs = {version="0.10.0", optional= true}
|
||||
rand = {version="0.8.3", optional= true}
|
||||
fastdivide = "0.4"
|
||||
measure_time = { version="0.8.2", optional=true}
|
||||
|
||||
[dev-dependencies]
|
||||
proptest = "1"
|
||||
more-asserts = "0.3.0"
|
||||
rand = "0.8.3"
|
||||
proptest = "1.0.0"
|
||||
more-asserts = "0.3.1"
|
||||
rand = "0.8.5"
|
||||
|
||||
[features]
|
||||
unstable = []
|
||||
|
||||
@@ -26,7 +26,6 @@ Add alignment?
|
||||
Consider another codec to bridge the gap between few and 5k elements
|
||||
|
||||
# Cleanup and rationalization
|
||||
remove the 6 bit limitation of columntype. use 4 + 4 bits instead.
|
||||
in benchmark, unify percent vs ratio, f32 vs f64.
|
||||
investigate if should have better errors? io::Error is overused at the moment.
|
||||
rename rank/select in unit tests
|
||||
|
||||
@@ -5,10 +5,16 @@ use std::sync::Arc;
|
||||
use sstable::{Dictionary, VoidSSTable};
|
||||
|
||||
use crate::column::Column;
|
||||
use crate::column_index::ColumnIndex;
|
||||
use crate::RowId;
|
||||
|
||||
/// Dictionary encoded column.
|
||||
///
|
||||
/// The column simply gives access to a regular u64-column in
|
||||
/// which the values are term-ordinals.
|
||||
///
|
||||
/// These ordinals are ids that uniquely identify the bytes that are stored in
|
||||
/// the column. These ordinals are small, and sorted in the same order
|
||||
/// as the term_ord_column.
|
||||
#[derive(Clone)]
|
||||
pub struct BytesColumn {
|
||||
pub(crate) dictionary: Arc<Dictionary<VoidSSTable>>,
|
||||
@@ -16,30 +22,57 @@ pub struct BytesColumn {
|
||||
}
|
||||
|
||||
impl BytesColumn {
|
||||
/// Fills the given `output` buffer with the term associated to the ordinal `ord`.
|
||||
///
|
||||
/// Returns `false` if the term does not exist (e.g. `term_ord` is greater or equal to the
|
||||
/// overall number of terms).
|
||||
pub fn term_ord_to_str(&self, term_ord: u64, output: &mut Vec<u8>) -> io::Result<bool> {
|
||||
self.dictionary.ord_to_term(term_ord, output)
|
||||
}
|
||||
|
||||
pub fn term_ords(&self) -> &Column<u64> {
|
||||
&self.term_ord_column
|
||||
pub fn ord_to_bytes(&self, ord: u64, output: &mut Vec<u8>) -> io::Result<bool> {
|
||||
self.dictionary.ord_to_term(ord, output)
|
||||
}
|
||||
|
||||
/// Returns the number of rows in the column.
|
||||
pub fn num_rows(&self) -> RowId {
|
||||
self.term_ord_column.num_rows()
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for BytesColumn {
|
||||
type Target = ColumnIndex<'static>;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&**self.term_ords()
|
||||
/// Returns the column of ordinals
|
||||
pub fn ords(&self) -> &Column<u64> {
|
||||
&self.term_ord_column
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::{ColumnarReader, ColumnarWriter};
|
||||
#[derive(Clone)]
|
||||
pub struct StrColumn(BytesColumn);
|
||||
|
||||
impl From<BytesColumn> for StrColumn {
|
||||
fn from(bytes_col: BytesColumn) -> Self {
|
||||
StrColumn(bytes_col)
|
||||
}
|
||||
}
|
||||
|
||||
impl StrColumn {
|
||||
/// Fills the buffer
|
||||
pub fn ord_to_str(&self, term_ord: u64, output: &mut String) -> io::Result<bool> {
|
||||
unsafe {
|
||||
let buf = output.as_mut_vec();
|
||||
self.0.dictionary.ord_to_term(term_ord, buf)?;
|
||||
// TODO consider remove checks if it hurts performance.
|
||||
if std::str::from_utf8(buf.as_slice()).is_err() {
|
||||
buf.clear();
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"Not valid utf-8",
|
||||
));
|
||||
}
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for StrColumn {
|
||||
type Target = BytesColumn;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,10 +5,13 @@ use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::BinarySerializable;
|
||||
pub use dictionary_encoded::BytesColumn;
|
||||
pub use serialize::{open_column_bytes, open_column_u64, serialize_column_u64};
|
||||
pub use dictionary_encoded::{BytesColumn, StrColumn};
|
||||
pub use serialize::{
|
||||
open_column_bytes, open_column_u128, open_column_u64, serialize_column_mappable_to_u128,
|
||||
serialize_column_mappable_to_u64,
|
||||
};
|
||||
|
||||
use crate::column_index::{ColumnIndex, Set};
|
||||
use crate::column_index::ColumnIndex;
|
||||
use crate::column_values::ColumnValues;
|
||||
use crate::{Cardinality, RowId};
|
||||
|
||||
@@ -19,27 +22,37 @@ pub struct Column<T> {
|
||||
}
|
||||
|
||||
impl<T: PartialOrd> Column<T> {
|
||||
pub fn get_cardinality(&self) -> Cardinality {
|
||||
self.idx.get_cardinality()
|
||||
}
|
||||
pub fn num_rows(&self) -> RowId {
|
||||
match &self.idx {
|
||||
ColumnIndex::Full => self.values.num_vals(),
|
||||
ColumnIndex::Optional(optional_idx) => optional_idx.num_rows(),
|
||||
ColumnIndex::Multivalued(_) => todo!(),
|
||||
ColumnIndex::Full => self.values.num_vals() as u32,
|
||||
ColumnIndex::Optional(optional_index) => optional_index.num_rows(),
|
||||
ColumnIndex::Multivalued(col_index) => {
|
||||
// The multivalued index contains all value start row_id,
|
||||
// and one extra value at the end with the overall number of rows.
|
||||
col_index.num_vals() - 1
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn min_value(&self) -> T {
|
||||
self.values.min_value()
|
||||
}
|
||||
pub fn max_value(&self) -> T {
|
||||
self.values.max_value()
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: PartialOrd + Copy + Send + Sync + 'static> Column<T> {
|
||||
pub fn first(&self, row_id: RowId) -> Option<T> {
|
||||
match &self.idx {
|
||||
ColumnIndex::Full => Some(self.values.get_val(row_id)),
|
||||
ColumnIndex::Optional(opt_idx) => {
|
||||
let value_row_idx = opt_idx.rank_if_exists(row_id)?;
|
||||
Some(self.values.get_val(value_row_idx))
|
||||
}
|
||||
ColumnIndex::Multivalued(_multivalued_index) => {
|
||||
todo!();
|
||||
}
|
||||
}
|
||||
self.values(row_id).next()
|
||||
}
|
||||
|
||||
pub fn values(&self, row_id: RowId) -> impl Iterator<Item = T> + '_ {
|
||||
self.value_row_ids(row_id)
|
||||
.map(|value_row_id: RowId| self.values.get_val(value_row_id))
|
||||
}
|
||||
|
||||
pub fn first_or_default_col(self, default_value: T) -> Arc<dyn ColumnValues<T>> {
|
||||
|
||||
@@ -2,24 +2,51 @@ use std::io;
|
||||
use std::io::Write;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::{CountingWriter, OwnedBytes};
|
||||
use common::OwnedBytes;
|
||||
use sstable::Dictionary;
|
||||
|
||||
use crate::column::{BytesColumn, Column};
|
||||
use crate::column_index::{serialize_column_index, SerializableColumnIndex};
|
||||
use crate::column_values::serialize::serialize_column_values_u128;
|
||||
use crate::column_values::{
|
||||
serialize_column_values, ColumnValues, MonotonicallyMappableToU64, ALL_CODEC_TYPES,
|
||||
serialize_column_values, ColumnValues, FastFieldCodecType, MonotonicallyMappableToU128,
|
||||
MonotonicallyMappableToU64,
|
||||
};
|
||||
pub fn serialize_column_u64<T: MonotonicallyMappableToU64>(
|
||||
|
||||
pub fn serialize_column_mappable_to_u128<
|
||||
F: Fn() -> I,
|
||||
I: Iterator<Item = T>,
|
||||
T: MonotonicallyMappableToU128,
|
||||
>(
|
||||
column_index: SerializableColumnIndex<'_>,
|
||||
column_values: F,
|
||||
num_vals: u32,
|
||||
output: &mut impl Write,
|
||||
) -> io::Result<()> {
|
||||
let column_index_num_bytes = serialize_column_index(column_index, output)?;
|
||||
serialize_column_values_u128(
|
||||
|| column_values().map(|val| val.to_u128()),
|
||||
num_vals,
|
||||
output,
|
||||
)?;
|
||||
output.write_all(&column_index_num_bytes.to_le_bytes())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn serialize_column_mappable_to_u64<T: MonotonicallyMappableToU64>(
|
||||
column_index: SerializableColumnIndex<'_>,
|
||||
column_values: &impl ColumnValues<T>,
|
||||
output: &mut impl Write,
|
||||
) -> io::Result<()> {
|
||||
let mut counting_writer = CountingWriter::wrap(output);
|
||||
serialize_column_index(column_index, &mut counting_writer)?;
|
||||
let column_index_num_bytes = counting_writer.written_bytes() as u32;
|
||||
let output = counting_writer.finish();
|
||||
serialize_column_values(column_values, &ALL_CODEC_TYPES[..], output)?;
|
||||
let column_index_num_bytes = serialize_column_index(column_index, output)?;
|
||||
serialize_column_values(
|
||||
column_values,
|
||||
&[
|
||||
FastFieldCodecType::Bitpacked,
|
||||
FastFieldCodecType::BlockwiseLinear,
|
||||
],
|
||||
output,
|
||||
)?;
|
||||
output.write_all(&column_index_num_bytes.to_le_bytes())?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -41,14 +68,34 @@ pub fn open_column_u64<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::
|
||||
})
|
||||
}
|
||||
|
||||
pub fn open_column_bytes(data: OwnedBytes) -> io::Result<BytesColumn> {
|
||||
pub fn open_column_u128<T: MonotonicallyMappableToU128>(
|
||||
bytes: OwnedBytes,
|
||||
) -> io::Result<Column<T>> {
|
||||
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
|
||||
let column_index_num_bytes = u32::from_le_bytes(
|
||||
column_index_num_bytes_payload
|
||||
.as_slice()
|
||||
.try_into()
|
||||
.unwrap(),
|
||||
);
|
||||
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
|
||||
let column_index = crate::column_index::open_column_index(column_index_data)?;
|
||||
let column_values = crate::column_values::open_u128_mapped(column_values_data)?;
|
||||
Ok(Column {
|
||||
idx: column_index,
|
||||
values: column_values,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn open_column_bytes<T: From<BytesColumn>>(data: OwnedBytes) -> io::Result<T> {
|
||||
let (body, dictionary_len_bytes) = data.rsplit(4);
|
||||
let dictionary_len = u32::from_le_bytes(dictionary_len_bytes.as_slice().try_into().unwrap());
|
||||
let (dictionary_bytes, column_bytes) = body.split(dictionary_len as usize);
|
||||
let dictionary = Arc::new(Dictionary::from_bytes(dictionary_bytes)?);
|
||||
let term_ord_column = crate::column::open_column_u64::<u64>(column_bytes)?;
|
||||
Ok(BytesColumn {
|
||||
let bytes_column = BytesColumn {
|
||||
dictionary,
|
||||
term_ord_column,
|
||||
})
|
||||
};
|
||||
Ok(bytes_column.into())
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ mod multivalued_index;
|
||||
mod optional_index;
|
||||
mod serialize;
|
||||
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub use optional_index::{OptionalIndex, SerializableOptionalIndex, Set};
|
||||
@@ -14,8 +15,12 @@ use crate::{Cardinality, RowId};
|
||||
pub enum ColumnIndex<'a> {
|
||||
Full,
|
||||
Optional(OptionalIndex),
|
||||
// TODO remove the Arc<dyn> apart from serialization this is not
|
||||
// dynamic at all.
|
||||
// TODO Remove the static by fixing the codec if possible.
|
||||
/// The column values enclosed contains for all row_id,
|
||||
/// the value start_index.
|
||||
///
|
||||
/// In addition, at index num_rows, an extra value is added
|
||||
/// containing the overall number of values.
|
||||
Multivalued(Arc<dyn ColumnValues<RowId> + 'a>),
|
||||
}
|
||||
|
||||
@@ -27,4 +32,23 @@ impl<'a> ColumnIndex<'a> {
|
||||
ColumnIndex::Multivalued(_) => Cardinality::Multivalued,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn value_row_ids(&self, row_id: RowId) -> Range<RowId> {
|
||||
match self {
|
||||
ColumnIndex::Full => row_id..row_id + 1,
|
||||
ColumnIndex::Optional(optional_index) => {
|
||||
if let Some(val) = optional_index.rank_if_exists(row_id) {
|
||||
val..val + 1
|
||||
} else {
|
||||
0..0
|
||||
}
|
||||
}
|
||||
ColumnIndex::Multivalued(multivalued_index) => {
|
||||
let multivalued_index_ref = &**multivalued_index;
|
||||
let start: u32 = multivalued_index_ref.get_val(row_id);
|
||||
let end: u32 = multivalued_index_ref.get_val(row_id + 1);
|
||||
start..end
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,11 +11,11 @@ use crate::RowId;
|
||||
pub struct MultivaluedIndex(Arc<dyn ColumnValues<RowId>>);
|
||||
|
||||
pub fn serialize_multivalued_index(
|
||||
multivalued_index: MultivaluedIndex,
|
||||
multivalued_index: &dyn ColumnValues<RowId>,
|
||||
output: &mut impl Write,
|
||||
) -> io::Result<()> {
|
||||
crate::column_values::serialize_column_values(
|
||||
&*multivalued_index.0,
|
||||
&*multivalued_index,
|
||||
&[FastFieldCodecType::Bitpacked, FastFieldCodecType::Linear],
|
||||
output,
|
||||
)?;
|
||||
@@ -23,5 +23,7 @@ pub fn serialize_multivalued_index(
|
||||
}
|
||||
|
||||
pub fn open_multivalued_index(bytes: OwnedBytes) -> io::Result<Arc<dyn ColumnValues<RowId>>> {
|
||||
todo!();
|
||||
let start_index_column: Arc<dyn ColumnValues<RowId>> =
|
||||
crate::column_values::open_u64_mapped(bytes)?;
|
||||
Ok(start_index_column)
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
mod set_block;
|
||||
mod dense;
|
||||
mod sparse;
|
||||
|
||||
pub use set_block::{DenseBlock, DenseBlockCodec, DENSE_BLOCK_NUM_BYTES};
|
||||
pub use dense::{DenseBlock, DenseBlockCodec, DENSE_BLOCK_NUM_BYTES};
|
||||
pub use sparse::{SparseBlock, SparseBlockCodec};
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::column_index::optional_index::set_block::set_block::DENSE_BLOCK_NUM_BYTES;
|
||||
use crate::column_index::optional_index::set_block::{DenseBlockCodec, SparseBlockCodec};
|
||||
use crate::column_index::optional_index::set_block::{
|
||||
DenseBlockCodec, SparseBlockCodec, DENSE_BLOCK_NUM_BYTES,
|
||||
};
|
||||
use crate::column_index::optional_index::{Set, SetCodec};
|
||||
|
||||
fn test_set_helper<C: SetCodec<Item = u16>>(vals: &[u16]) -> usize {
|
||||
@@ -51,6 +52,7 @@ fn test_sparse_block_set_u16_max() {
|
||||
use proptest::prelude::*;
|
||||
|
||||
proptest! {
|
||||
#![proptest_config(ProptestConfig::with_cases(1))]
|
||||
#[test]
|
||||
fn test_prop_test_dense(els in proptest::collection::btree_set(0..=u16::MAX, 0..=u16::MAX as usize)) {
|
||||
let vals: Vec<u16> = els.into_iter().collect();
|
||||
|
||||
@@ -1,19 +1,20 @@
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
use common::OwnedBytes;
|
||||
use common::{CountingWriter, OwnedBytes};
|
||||
|
||||
use crate::column_index::multivalued_index::{serialize_multivalued_index, MultivaluedIndex};
|
||||
use crate::column_index::multivalued_index::serialize_multivalued_index;
|
||||
use crate::column_index::optional_index::serialize_optional_index;
|
||||
use crate::column_index::{ColumnIndex, SerializableOptionalIndex};
|
||||
use crate::Cardinality;
|
||||
use crate::column_values::ColumnValues;
|
||||
use crate::{Cardinality, RowId};
|
||||
|
||||
pub enum SerializableColumnIndex<'a> {
|
||||
Full,
|
||||
Optional(Box<dyn SerializableOptionalIndex<'a> + 'a>),
|
||||
// TODO remove the Arc<dyn> apart from serialization this is not
|
||||
// dynamic at all.
|
||||
Multivalued(MultivaluedIndex),
|
||||
Multivalued(Box<dyn ColumnValues<RowId> + 'a>),
|
||||
}
|
||||
|
||||
impl<'a> SerializableColumnIndex<'a> {
|
||||
@@ -29,19 +30,21 @@ impl<'a> SerializableColumnIndex<'a> {
|
||||
pub fn serialize_column_index(
|
||||
column_index: SerializableColumnIndex,
|
||||
output: &mut impl Write,
|
||||
) -> io::Result<()> {
|
||||
) -> io::Result<u32> {
|
||||
let mut output = CountingWriter::wrap(output);
|
||||
let cardinality = column_index.get_cardinality().to_code();
|
||||
output.write_all(&[cardinality])?;
|
||||
match column_index {
|
||||
SerializableColumnIndex::Full => {}
|
||||
SerializableColumnIndex::Optional(optional_index) => {
|
||||
serialize_optional_index(&*optional_index, output)?
|
||||
serialize_optional_index(&*optional_index, &mut output)?
|
||||
}
|
||||
SerializableColumnIndex::Multivalued(multivalued_index) => {
|
||||
serialize_multivalued_index(multivalued_index, output)?
|
||||
serialize_multivalued_index(&*multivalued_index, &mut output)?
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
let column_index_num_bytes = output.written_bytes() as u32;
|
||||
Ok(column_index_num_bytes)
|
||||
}
|
||||
|
||||
pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex<'static>> {
|
||||
|
||||
@@ -78,6 +78,32 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync {
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Copy + PartialOrd> ColumnValues<T> for std::sync::Arc<dyn ColumnValues<T>> {
|
||||
fn get_val(&self, idx: u32) -> T {
|
||||
self.as_ref().get_val(idx)
|
||||
}
|
||||
|
||||
fn min_value(&self) -> T {
|
||||
self.as_ref().min_value()
|
||||
}
|
||||
|
||||
fn max_value(&self) -> T {
|
||||
self.as_ref().max_value()
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u32 {
|
||||
self.as_ref().num_vals()
|
||||
}
|
||||
|
||||
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = T> + 'b> {
|
||||
self.as_ref().iter()
|
||||
}
|
||||
|
||||
fn get_range(&self, start: u64, output: &mut [T]) {
|
||||
self.as_ref().get_range(start, output)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, C: ColumnValues<T> + ?Sized, T: Copy + PartialOrd> ColumnValues<T> for &'a C {
|
||||
fn get_val(&self, idx: u32) -> T {
|
||||
(*self).get_val(idx)
|
||||
|
||||
@@ -28,7 +28,7 @@ mod compact_space;
|
||||
mod line;
|
||||
mod linear;
|
||||
pub(crate) mod monotonic_mapping;
|
||||
// mod monotonic_mapping_u128;
|
||||
pub(crate) mod monotonic_mapping_u128;
|
||||
|
||||
mod column;
|
||||
mod column_with_cardinality;
|
||||
@@ -37,8 +37,10 @@ pub mod serialize;
|
||||
|
||||
pub use self::column::{monotonic_map_column, ColumnValues, IterColumn, VecColumn};
|
||||
pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
|
||||
// pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128;
|
||||
pub use self::serialize::{serialize_and_load, serialize_column_values, NormalizedHeader};
|
||||
pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128;
|
||||
#[cfg(test)]
|
||||
pub use self::serialize::tests::serialize_and_load;
|
||||
pub use self::serialize::{serialize_column_values, NormalizedHeader};
|
||||
use crate::column_values::bitpacked::BitpackedCodec;
|
||||
use crate::column_values::blockwise_linear::BlockwiseLinearCodec;
|
||||
use crate::column_values::linear::LinearCodec;
|
||||
@@ -122,19 +124,17 @@ impl U128FastFieldCodecType {
|
||||
}
|
||||
|
||||
/// Returns the correct codec reader wrapped in the `Arc` for the data.
|
||||
// pub fn open_u128<Item: MonotonicallyMappableToU128>(
|
||||
// bytes: OwnedBytes,
|
||||
// ) -> io::Result<Arc<dyn Column<Item>>> {
|
||||
// todo!();
|
||||
// // let (bytes, _format_version) = read_format_version(bytes)?;
|
||||
// // let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
|
||||
// // let header = U128Header::deserialize(&mut bytes)?;
|
||||
// // assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace);
|
||||
// // let reader = CompactSpaceDecompressor::open(bytes)?;
|
||||
// // let inverted: StrictlyMonotonicMappingInverter<StrictlyMonotonicMappingToInternal<Item>> =
|
||||
// // StrictlyMonotonicMappingToInternal::<Item>::new().into();
|
||||
// // Ok(Arc::new(monotonic_map_column(reader, inverted)))
|
||||
// }
|
||||
pub fn open_u128_mapped<T: MonotonicallyMappableToU128>(
|
||||
mut bytes: OwnedBytes,
|
||||
) -> io::Result<Arc<dyn ColumnValues<T>>> {
|
||||
let header = U128Header::deserialize(&mut bytes)?;
|
||||
assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace);
|
||||
let reader = CompactSpaceDecompressor::open(bytes)?;
|
||||
|
||||
let inverted: StrictlyMonotonicMappingInverter<StrictlyMonotonicMappingToInternal<T>> =
|
||||
StrictlyMonotonicMappingToInternal::<T>::new().into();
|
||||
Ok(Arc::new(monotonic_map_column(reader, inverted)))
|
||||
}
|
||||
|
||||
/// Returns the correct codec reader wrapped in the `Arc` for the data.
|
||||
pub fn open_u64_mapped<T: MonotonicallyMappableToU64>(
|
||||
@@ -198,13 +198,6 @@ pub(crate) trait FastFieldCodec: 'static {
|
||||
fn estimate(column: &dyn ColumnValues) -> Option<f32>;
|
||||
}
|
||||
|
||||
/// The list of all available codecs for u64 convertible data.
|
||||
pub const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [
|
||||
FastFieldCodecType::Bitpacked,
|
||||
FastFieldCodecType::BlockwiseLinear,
|
||||
FastFieldCodecType::Linear,
|
||||
];
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use std::sync::Arc;
|
||||
|
||||
@@ -2,6 +2,7 @@ use std::marker::PhantomData;
|
||||
|
||||
use fastdivide::DividerU64;
|
||||
|
||||
use super::MonotonicallyMappableToU128;
|
||||
use crate::RowId;
|
||||
|
||||
/// Monotonic maps a value to u64 value space.
|
||||
@@ -80,21 +81,20 @@ impl<T> StrictlyMonotonicMappingToInternal<T> {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO
|
||||
// impl<External: MonotonicallyMappableToU128, T: MonotonicallyMappableToU128>
|
||||
// StrictlyMonotonicFn<External, u128> for StrictlyMonotonicMappingToInternal<T>
|
||||
// where T: MonotonicallyMappableToU128
|
||||
// {
|
||||
// #[inline(always)]
|
||||
// fn mapping(&self, inp: External) -> u128 {
|
||||
// External::to_u128(inp)
|
||||
// }
|
||||
impl<External: MonotonicallyMappableToU128, T: MonotonicallyMappableToU128>
|
||||
StrictlyMonotonicFn<External, u128> for StrictlyMonotonicMappingToInternal<T>
|
||||
where T: MonotonicallyMappableToU128
|
||||
{
|
||||
#[inline(always)]
|
||||
fn mapping(&self, inp: External) -> u128 {
|
||||
External::to_u128(inp)
|
||||
}
|
||||
|
||||
// #[inline(always)]
|
||||
// fn inverse(&self, out: u128) -> External {
|
||||
// External::from_u128(out)
|
||||
// }
|
||||
// }
|
||||
#[inline(always)]
|
||||
fn inverse(&self, out: u128) -> External {
|
||||
External::from_u128(out)
|
||||
}
|
||||
}
|
||||
|
||||
impl<External: MonotonicallyMappableToU64, T: MonotonicallyMappableToU64>
|
||||
StrictlyMonotonicFn<External, u64> for StrictlyMonotonicMappingToInternal<T>
|
||||
|
||||
@@ -19,9 +19,8 @@
|
||||
|
||||
use std::io;
|
||||
use std::num::NonZeroU64;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::{BinarySerializable, OwnedBytes, VInt};
|
||||
use common::{BinarySerializable, VInt};
|
||||
use log::warn;
|
||||
|
||||
use super::bitpacked::BitpackedCodec;
|
||||
@@ -33,8 +32,9 @@ use super::monotonic_mapping::{
|
||||
};
|
||||
use super::{
|
||||
monotonic_map_column, ColumnValues, FastFieldCodec, FastFieldCodecType,
|
||||
MonotonicallyMappableToU64, U128FastFieldCodecType, VecColumn, ALL_CODEC_TYPES,
|
||||
MonotonicallyMappableToU64, U128FastFieldCodecType,
|
||||
};
|
||||
use crate::column_values::compact_space::CompactSpaceCompressor;
|
||||
|
||||
/// The normalized header gives some parameters after applying the following
|
||||
/// normalization of the vector:
|
||||
@@ -160,54 +160,22 @@ impl BinarySerializable for Header {
|
||||
}
|
||||
}
|
||||
|
||||
/// Return estimated compression for given codec in the value range [0.0..1.0], where 1.0 means no
|
||||
/// compression.
|
||||
pub(crate) fn estimate<T: MonotonicallyMappableToU64>(
|
||||
typed_column: impl ColumnValues<T>,
|
||||
codec_type: FastFieldCodecType,
|
||||
) -> Option<f32> {
|
||||
let column = monotonic_map_column(typed_column, StrictlyMonotonicMappingToInternal::<T>::new());
|
||||
let min_value = column.min_value();
|
||||
let gcd = super::gcd::find_gcd(column.iter().map(|val| val - min_value))
|
||||
.filter(|gcd| gcd.get() > 1u64);
|
||||
let mapping = StrictlyMonotonicMappingToInternalGCDBaseval::new(
|
||||
gcd.map(|gcd| gcd.get()).unwrap_or(1u64),
|
||||
min_value,
|
||||
);
|
||||
let normalized_column = monotonic_map_column(&column, mapping);
|
||||
match codec_type {
|
||||
FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&normalized_column),
|
||||
FastFieldCodecType::Linear => LinearCodec::estimate(&normalized_column),
|
||||
FastFieldCodecType::BlockwiseLinear => BlockwiseLinearCodec::estimate(&normalized_column),
|
||||
}
|
||||
}
|
||||
|
||||
// TODO
|
||||
/// Serializes u128 values with the compact space codec.
|
||||
// pub fn serialize_u128_new<F: Fn() -> I, I: Iterator<Item = u128>>(
|
||||
// value_index: ColumnIndex,
|
||||
// iter_gen: F,
|
||||
// num_vals: u32,
|
||||
// output: &mut impl io::Write,
|
||||
// ) -> io::Result<()> {
|
||||
// let header = U128Header {
|
||||
// num_vals,
|
||||
// codec_type: U128FastFieldCodecType::CompactSpace,
|
||||
// };
|
||||
// header.serialize(output)?;
|
||||
// let compressor = CompactSpaceCompressor::train_from(iter_gen(), num_vals);
|
||||
// compressor.compress_into(iter_gen(), output).unwrap();
|
||||
pub fn serialize_column_values_u128<F: Fn() -> I, I: Iterator<Item = u128>>(
|
||||
iter_gen: F,
|
||||
num_vals: u32,
|
||||
output: &mut impl io::Write,
|
||||
) -> io::Result<()> {
|
||||
let header = U128Header {
|
||||
num_vals,
|
||||
codec_type: U128FastFieldCodecType::CompactSpace,
|
||||
};
|
||||
header.serialize(output)?;
|
||||
let compressor = CompactSpaceCompressor::train_from(iter_gen(), num_vals);
|
||||
compressor.compress_into(iter_gen(), output)?;
|
||||
|
||||
// let null_index_footer = ColumnFooter {
|
||||
// cardinality: value_index.get_cardinality(),
|
||||
// null_index_codec: NullIndexCodec::Full,
|
||||
// null_index_byte_range: 0..0,
|
||||
// };
|
||||
// append_null_index_footer(output, null_index_footer)?;
|
||||
// append_format_version(output)?;
|
||||
|
||||
// Ok(())
|
||||
// }
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Serializes the column with the codec with the best estimate on the data.
|
||||
pub fn serialize_column_values<T: MonotonicallyMappableToU64>(
|
||||
@@ -279,20 +247,29 @@ pub(crate) fn serialize_given_codec(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Helper function to serialize a column (autodetect from all codecs) and then open it
|
||||
pub fn serialize_and_load<T: MonotonicallyMappableToU64 + Ord + Default>(
|
||||
column: &[T],
|
||||
) -> Arc<dyn ColumnValues<T>> {
|
||||
let mut buffer = Vec::new();
|
||||
super::serialize_column_values(&VecColumn::from(&column), &ALL_CODEC_TYPES, &mut buffer)
|
||||
.unwrap();
|
||||
super::open_u64_mapped(OwnedBytes::new(buffer)).unwrap()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
pub mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::OwnedBytes;
|
||||
|
||||
use super::*;
|
||||
use crate::column_values::{open_u64_mapped, VecColumn};
|
||||
|
||||
const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [
|
||||
FastFieldCodecType::Bitpacked,
|
||||
FastFieldCodecType::Linear,
|
||||
FastFieldCodecType::BlockwiseLinear,
|
||||
];
|
||||
|
||||
/// Helper function to serialize a column (autodetect from all codecs) and then open it
|
||||
pub fn serialize_and_load<T: MonotonicallyMappableToU64 + Ord + Default>(
|
||||
column: &[T],
|
||||
) -> Arc<dyn ColumnValues<T>> {
|
||||
let mut buffer = Vec::new();
|
||||
serialize_column_values(&VecColumn::from(&column), &ALL_CODEC_TYPES, &mut buffer).unwrap();
|
||||
open_u64_mapped(OwnedBytes::new(buffer)).unwrap()
|
||||
}
|
||||
#[test]
|
||||
fn test_serialize_deserialize_u128_header() {
|
||||
let original = U128Header {
|
||||
@@ -319,7 +296,7 @@ mod tests {
|
||||
serialize_column_values(&col, &ALL_CODEC_TYPES, &mut buffer).unwrap();
|
||||
// TODO put the header as a footer so that it serves as a padding.
|
||||
// 5 bytes of header, 1 byte of value, 7 bytes of padding.
|
||||
assert_eq!(buffer.len(), 5 + 1 + 7);
|
||||
assert_eq!(buffer.len(), 5 + 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -328,7 +305,7 @@ mod tests {
|
||||
let col = VecColumn::from(&[true][..]);
|
||||
serialize_column_values(&col, &ALL_CODEC_TYPES, &mut buffer).unwrap();
|
||||
// 5 bytes of header, 0 bytes of value, 7 bytes of padding.
|
||||
assert_eq!(buffer.len(), 5 + 7);
|
||||
assert_eq!(buffer.len(), 5);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -338,6 +315,6 @@ mod tests {
|
||||
let col = VecColumn::from(&vals[..]);
|
||||
serialize_column_values(&col, &[FastFieldCodecType::Bitpacked], &mut buffer).unwrap();
|
||||
// Values are stored over 3 bits.
|
||||
assert_eq!(buffer.len(), 7 + (3 * 80 / 8) + 7);
|
||||
assert_eq!(buffer.len(), 7 + (3 * 80 / 8));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,98 +1,103 @@
|
||||
use crate::utils::{place_bits, select_bits};
|
||||
use std::net::Ipv6Addr;
|
||||
|
||||
use crate::value::NumericalType;
|
||||
use crate::InvalidData;
|
||||
|
||||
/// The column type represents the column type and can fit on 6-bits.
|
||||
///
|
||||
/// - bits[0..3]: Column category type.
|
||||
/// - bits[3..6]: Numerical type if necessary.
|
||||
#[derive(Hash, Eq, PartialEq, Debug, Clone, Copy)]
|
||||
/// The column type represents the column type.
|
||||
/// Any changes need to be propagated to `COLUMN_TYPES`.
|
||||
#[derive(Hash, Eq, PartialEq, Debug, Clone, Copy, Ord, PartialOrd)]
|
||||
#[repr(u8)]
|
||||
pub enum ColumnType {
|
||||
Str,
|
||||
Numerical(NumericalType),
|
||||
Bool,
|
||||
DateTime,
|
||||
I64 = 0u8,
|
||||
U64 = 1u8,
|
||||
F64 = 2u8,
|
||||
Bytes = 3u8,
|
||||
Str = 4u8,
|
||||
Bool = 5u8,
|
||||
IpAddr = 6u8,
|
||||
DateTime = 7u8,
|
||||
}
|
||||
|
||||
// The order needs to match _exactly_ the order in the enum
|
||||
const COLUMN_TYPES: [ColumnType; 8] = [
|
||||
ColumnType::I64,
|
||||
ColumnType::U64,
|
||||
ColumnType::F64,
|
||||
ColumnType::Bytes,
|
||||
ColumnType::Str,
|
||||
ColumnType::Bool,
|
||||
ColumnType::IpAddr,
|
||||
ColumnType::DateTime,
|
||||
];
|
||||
|
||||
impl ColumnType {
|
||||
/// Encoded over 6 bits.
|
||||
pub(crate) fn to_code(self) -> u8 {
|
||||
let column_type_category;
|
||||
let numerical_type_code: u8;
|
||||
match self {
|
||||
ColumnType::Str => {
|
||||
column_type_category = ColumnTypeCategory::Str;
|
||||
numerical_type_code = 0u8;
|
||||
}
|
||||
ColumnType::Numerical(numerical_type) => {
|
||||
column_type_category = ColumnTypeCategory::Numerical;
|
||||
numerical_type_code = numerical_type.to_code();
|
||||
}
|
||||
ColumnType::Bool => {
|
||||
column_type_category = ColumnTypeCategory::Bool;
|
||||
numerical_type_code = 0u8;
|
||||
}
|
||||
ColumnType::DateTime => {
|
||||
column_type_category = ColumnTypeCategory::DateTime;
|
||||
numerical_type_code = 0u8;
|
||||
}
|
||||
}
|
||||
place_bits::<0, 3>(column_type_category.to_code()) | place_bits::<3, 6>(numerical_type_code)
|
||||
pub fn to_code(self) -> u8 {
|
||||
self as u8
|
||||
}
|
||||
|
||||
pub(crate) fn try_from_code(code: u8) -> Result<ColumnType, InvalidData> {
|
||||
if select_bits::<6, 8>(code) != 0u8 {
|
||||
return Err(InvalidData);
|
||||
}
|
||||
let column_type_category_code = select_bits::<0, 3>(code);
|
||||
let numerical_type_code = select_bits::<3, 6>(code);
|
||||
let column_type_category = ColumnTypeCategory::try_from_code(column_type_category_code)?;
|
||||
match column_type_category {
|
||||
ColumnTypeCategory::Bool => {
|
||||
if numerical_type_code != 0u8 {
|
||||
return Err(InvalidData);
|
||||
}
|
||||
Ok(ColumnType::Bool)
|
||||
}
|
||||
ColumnTypeCategory::Str => {
|
||||
if numerical_type_code != 0u8 {
|
||||
return Err(InvalidData);
|
||||
}
|
||||
Ok(ColumnType::Str)
|
||||
}
|
||||
ColumnTypeCategory::Numerical => {
|
||||
let numerical_type = NumericalType::try_from_code(numerical_type_code)?;
|
||||
Ok(ColumnType::Numerical(numerical_type))
|
||||
}
|
||||
ColumnTypeCategory::DateTime => {
|
||||
if numerical_type_code != 0u8 {
|
||||
return Err(InvalidData);
|
||||
}
|
||||
Ok(ColumnType::DateTime)
|
||||
}
|
||||
COLUMN_TYPES.get(code as usize).copied().ok_or(InvalidData)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<NumericalType> for ColumnType {
|
||||
fn from(numerical_type: NumericalType) -> Self {
|
||||
match numerical_type {
|
||||
NumericalType::I64 => ColumnType::I64,
|
||||
NumericalType::U64 => ColumnType::U64,
|
||||
NumericalType::F64 => ColumnType::F64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ColumnType {
|
||||
pub fn numerical_type(&self) -> Option<NumericalType> {
|
||||
match self {
|
||||
ColumnType::I64 => Some(NumericalType::I64),
|
||||
ColumnType::U64 => Some(NumericalType::U64),
|
||||
ColumnType::F64 => Some(NumericalType::F64),
|
||||
ColumnType::Bytes
|
||||
| ColumnType::Str
|
||||
| ColumnType::Bool
|
||||
| ColumnType::IpAddr
|
||||
| ColumnType::DateTime => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO remove if possible
|
||||
pub trait HasAssociatedColumnType: 'static + Send + Sync + Copy + PartialOrd {
|
||||
fn column_type() -> ColumnType;
|
||||
fn default_value() -> Self;
|
||||
}
|
||||
|
||||
impl HasAssociatedColumnType for u64 {
|
||||
fn column_type() -> ColumnType {
|
||||
ColumnType::Numerical(NumericalType::U64)
|
||||
ColumnType::U64
|
||||
}
|
||||
|
||||
fn default_value() -> Self {
|
||||
0u64
|
||||
}
|
||||
}
|
||||
|
||||
impl HasAssociatedColumnType for i64 {
|
||||
fn column_type() -> ColumnType {
|
||||
ColumnType::Numerical(NumericalType::I64)
|
||||
ColumnType::I64
|
||||
}
|
||||
|
||||
fn default_value() -> Self {
|
||||
0i64
|
||||
}
|
||||
}
|
||||
|
||||
impl HasAssociatedColumnType for f64 {
|
||||
fn column_type() -> ColumnType {
|
||||
ColumnType::Numerical(NumericalType::F64)
|
||||
ColumnType::F64
|
||||
}
|
||||
|
||||
fn default_value() -> Self {
|
||||
Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -100,63 +105,45 @@ impl HasAssociatedColumnType for bool {
|
||||
fn column_type() -> ColumnType {
|
||||
ColumnType::Bool
|
||||
}
|
||||
fn default_value() -> Self {
|
||||
Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
impl HasAssociatedColumnType for crate::DateTime {
|
||||
fn column_type() -> ColumnType {
|
||||
ColumnType::DateTime
|
||||
}
|
||||
fn default_value() -> Self {
|
||||
Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Column types are grouped into different categories that
|
||||
/// corresponds to the different types of `JsonValue` types.
|
||||
///
|
||||
/// The columnar writer will apply coercion rules to make sure that
|
||||
/// at most one column exist per `ColumnTypeCategory`.
|
||||
///
|
||||
/// See also [README.md].
|
||||
#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Debug)]
|
||||
#[repr(u8)]
|
||||
pub(crate) enum ColumnTypeCategory {
|
||||
Bool = 0u8,
|
||||
Str = 1u8,
|
||||
Numerical = 2u8,
|
||||
DateTime = 3u8,
|
||||
}
|
||||
|
||||
impl ColumnTypeCategory {
|
||||
pub fn to_code(self) -> u8 {
|
||||
self as u8
|
||||
impl HasAssociatedColumnType for Ipv6Addr {
|
||||
fn column_type() -> ColumnType {
|
||||
ColumnType::IpAddr
|
||||
}
|
||||
|
||||
pub fn try_from_code(code: u8) -> Result<Self, InvalidData> {
|
||||
match code {
|
||||
0u8 => Ok(Self::Bool),
|
||||
1u8 => Ok(Self::Str),
|
||||
2u8 => Ok(Self::Numerical),
|
||||
3u8 => Ok(Self::DateTime),
|
||||
_ => Err(InvalidData),
|
||||
}
|
||||
fn default_value() -> Self {
|
||||
Ipv6Addr::from([0u8; 16])
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashSet;
|
||||
|
||||
use super::*;
|
||||
use crate::Cardinality;
|
||||
|
||||
#[test]
|
||||
fn test_column_type_to_code() {
|
||||
let mut column_type_set: HashSet<ColumnType> = HashSet::new();
|
||||
for code in u8::MIN..=u8::MAX {
|
||||
if let Ok(column_type) = ColumnType::try_from_code(code) {
|
||||
assert_eq!(column_type.to_code(), code);
|
||||
assert!(column_type_set.insert(column_type));
|
||||
for (code, expected_column_type) in super::COLUMN_TYPES.iter().copied().enumerate() {
|
||||
if let Ok(column_type) = ColumnType::try_from_code(code as u8) {
|
||||
assert_eq!(column_type, expected_column_type);
|
||||
}
|
||||
}
|
||||
assert_eq!(column_type_set.len(), 3 + 3);
|
||||
for code in COLUMN_TYPES.len() as u8..=u8::MAX {
|
||||
assert!(ColumnType::try_from_code(code as u8).is_err());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
248
columnar/src/columnar/merge.rs
Normal file
248
columnar/src/columnar/merge.rs
Normal file
@@ -0,0 +1,248 @@
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
|
||||
use super::writer::ColumnarSerializer;
|
||||
use crate::columnar::ColumnarReader;
|
||||
use crate::dynamic_column::DynamicColumn;
|
||||
use crate::{Cardinality, ColumnType};
|
||||
|
||||
pub enum MergeDocOrder {
|
||||
/// Columnar tables are simply stacked one above the other.
|
||||
/// If the i-th columnar_readers has n_rows_i rows, then
|
||||
/// in the resulting columnar,
|
||||
/// rows [r0..n_row_0) contains the row of columnar_readers[0], in ordder
|
||||
/// rows [n_row_0..n_row_0 + n_row_1 contains the row of columnar_readers[1], in order.
|
||||
/// ..
|
||||
Stack,
|
||||
/// Some more complex mapping, that can interleaves rows from the different readers and
|
||||
/// possibly drop rows.
|
||||
Complex(()),
|
||||
}
|
||||
|
||||
pub fn merge_columnar(
|
||||
columnar_readers: &[ColumnarReader],
|
||||
mapping: MergeDocOrder,
|
||||
output: &mut impl io::Write,
|
||||
) -> io::Result<()> {
|
||||
let mut serializer = ColumnarSerializer::new(output);
|
||||
|
||||
// TODO handle dictionary merge for Str/Bytes column
|
||||
let field_name_to_group = group_columns_for_merge(columnar_readers)?;
|
||||
for (column_name, category_to_columns) in field_name_to_group {
|
||||
for (_category, columns_to_merge) in category_to_columns {
|
||||
let column_type = columns_to_merge[0].column_type();
|
||||
let mut column_serialzier =
|
||||
serializer.serialize_column(column_name.as_bytes(), column_type);
|
||||
merge_columns(
|
||||
column_type,
|
||||
&columns_to_merge,
|
||||
&mapping,
|
||||
&mut column_serialzier,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
serializer.finalize()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Column types are grouped into different categories.
|
||||
/// After merge, all columns belonging to the same category are coerced to
|
||||
/// the same column type.
|
||||
///
|
||||
/// In practise, today, only Numerical colummns are coerced into one type today.
|
||||
///
|
||||
/// See also [README.md].
|
||||
#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)]
|
||||
#[repr(u8)]
|
||||
pub enum ColumnTypeCategory {
|
||||
Bool,
|
||||
Str,
|
||||
Numerical,
|
||||
DateTime,
|
||||
Bytes,
|
||||
IpAddr,
|
||||
}
|
||||
|
||||
impl From<ColumnType> for ColumnTypeCategory {
|
||||
fn from(column_type: ColumnType) -> Self {
|
||||
match column_type {
|
||||
ColumnType::I64 => ColumnTypeCategory::Numerical,
|
||||
ColumnType::U64 => ColumnTypeCategory::Numerical,
|
||||
ColumnType::F64 => ColumnTypeCategory::Numerical,
|
||||
ColumnType::Bytes => ColumnTypeCategory::Bytes,
|
||||
ColumnType::Str => ColumnTypeCategory::Str,
|
||||
ColumnType::Bool => ColumnTypeCategory::Bool,
|
||||
ColumnType::IpAddr => ColumnTypeCategory::IpAddr,
|
||||
ColumnType::DateTime => ColumnTypeCategory::DateTime,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn detect_cardinality(columns: &[DynamicColumn]) -> Cardinality {
|
||||
if columns
|
||||
.iter()
|
||||
.any(|column| column.get_cardinality().is_multivalue())
|
||||
{
|
||||
return Cardinality::Multivalued;
|
||||
}
|
||||
if columns
|
||||
.iter()
|
||||
.any(|column| column.get_cardinality().is_optional())
|
||||
{
|
||||
return Cardinality::Optional;
|
||||
}
|
||||
Cardinality::Full
|
||||
}
|
||||
|
||||
pub fn compute_num_docs(columns: &[DynamicColumn], mapping: &MergeDocOrder) -> usize {
|
||||
// TODO handle deletes
|
||||
|
||||
0
|
||||
}
|
||||
|
||||
pub fn merge_columns(
|
||||
column_type: ColumnType,
|
||||
columns: &[DynamicColumn],
|
||||
mapping: &MergeDocOrder,
|
||||
column_serializer: &mut impl io::Write,
|
||||
) -> io::Result<()> {
|
||||
let cardinality = detect_cardinality(columns);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn group_columns_for_merge(
|
||||
columnar_readers: &[ColumnarReader],
|
||||
) -> io::Result<HashMap<String, HashMap<ColumnTypeCategory, Vec<DynamicColumn>>>> {
|
||||
// Each column name may have multiple types of column associated.
|
||||
// For merging we are interested in the same column type category since they can be merged.
|
||||
let mut field_name_to_group: HashMap<String, HashMap<ColumnTypeCategory, Vec<DynamicColumn>>> =
|
||||
HashMap::new();
|
||||
|
||||
for columnar_reader in columnar_readers {
|
||||
let column_name_and_handle = columnar_reader.list_columns()?;
|
||||
for (column_name, handle) in column_name_and_handle {
|
||||
let column_type_to_handles = field_name_to_group
|
||||
.entry(column_name.to_string())
|
||||
.or_default();
|
||||
|
||||
let columns = column_type_to_handles
|
||||
.entry(handle.column_type().into())
|
||||
.or_default();
|
||||
columns.push(handle.open()?);
|
||||
}
|
||||
}
|
||||
|
||||
normalize_columns(&mut field_name_to_group);
|
||||
|
||||
Ok(field_name_to_group)
|
||||
}
|
||||
|
||||
/// Coerce numerical type columns to the same type
|
||||
/// TODO rename to `coerce_columns`
|
||||
fn normalize_columns(map: &mut HashMap<String, HashMap<ColumnTypeCategory, Vec<DynamicColumn>>>) {
|
||||
for (_field_name, type_category_to_columns) in map.iter_mut() {
|
||||
for (type_category, columns) in type_category_to_columns {
|
||||
if type_category == &ColumnTypeCategory::Numerical {
|
||||
let casted_columns = cast_to_common_numerical_column(&columns);
|
||||
*columns = casted_columns;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Receives a list of columns of numerical types (u64, i64, f64)
|
||||
///
|
||||
/// Returns a list of `DynamicColumn` which are all of the same numerical type
|
||||
fn cast_to_common_numerical_column(columns: &[DynamicColumn]) -> Vec<DynamicColumn> {
|
||||
assert!(columns
|
||||
.iter()
|
||||
.all(|column| column.column_type().numerical_type().is_some()));
|
||||
let coerce_to_i64: Vec<_> = columns
|
||||
.iter()
|
||||
.filter_map(|column| column.clone().coerce_to_i64())
|
||||
.collect();
|
||||
|
||||
if coerce_to_i64.len() == columns.len() {
|
||||
return coerce_to_i64;
|
||||
}
|
||||
|
||||
let coerce_to_u64: Vec<_> = columns
|
||||
.iter()
|
||||
.filter_map(|column| column.clone().coerce_to_u64())
|
||||
.collect();
|
||||
|
||||
if coerce_to_u64.len() == columns.len() {
|
||||
return coerce_to_u64;
|
||||
}
|
||||
|
||||
columns
|
||||
.iter()
|
||||
.map(|column| {
|
||||
column
|
||||
.clone()
|
||||
.coerce_to_f64()
|
||||
.expect("couldn't cast column to f64")
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::ColumnarWriter;
|
||||
|
||||
#[test]
|
||||
fn test_column_coercion() {
|
||||
// i64 type
|
||||
let columnar1 = {
|
||||
let mut dataframe_writer = ColumnarWriter::default();
|
||||
dataframe_writer.record_numerical(1u32, "numbers", 1i64);
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer.serialize(2, &mut buffer).unwrap();
|
||||
ColumnarReader::open(buffer).unwrap()
|
||||
};
|
||||
// u64 type
|
||||
let columnar2 = {
|
||||
let mut dataframe_writer = ColumnarWriter::default();
|
||||
dataframe_writer.record_numerical(1u32, "numbers", u64::MAX - 100);
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer.serialize(2, &mut buffer).unwrap();
|
||||
ColumnarReader::open(buffer).unwrap()
|
||||
};
|
||||
|
||||
// f64 type
|
||||
let columnar3 = {
|
||||
let mut dataframe_writer = ColumnarWriter::default();
|
||||
dataframe_writer.record_numerical(1u32, "numbers", 30.5);
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer.serialize(2, &mut buffer).unwrap();
|
||||
ColumnarReader::open(buffer).unwrap()
|
||||
};
|
||||
|
||||
let column_map =
|
||||
group_columns_for_merge(&[columnar1.clone(), columnar2.clone(), columnar3.clone()])
|
||||
.unwrap();
|
||||
assert_eq!(column_map.len(), 1);
|
||||
let cat_to_columns = column_map.get("numbers").unwrap();
|
||||
assert_eq!(cat_to_columns.len(), 1);
|
||||
|
||||
let numerical = cat_to_columns.get(&ColumnTypeCategory::Numerical).unwrap();
|
||||
assert!(numerical.iter().all(|column| column.is_f64()));
|
||||
|
||||
let column_map = group_columns_for_merge(&[columnar1.clone(), columnar1.clone()]).unwrap();
|
||||
assert_eq!(column_map.len(), 1);
|
||||
let cat_to_columns = column_map.get("numbers").unwrap();
|
||||
assert_eq!(cat_to_columns.len(), 1);
|
||||
let numerical = cat_to_columns.get(&ColumnTypeCategory::Numerical).unwrap();
|
||||
assert!(numerical.iter().all(|column| column.is_i64()));
|
||||
|
||||
let column_map = group_columns_for_merge(&[columnar2.clone(), columnar2.clone()]).unwrap();
|
||||
assert_eq!(column_map.len(), 1);
|
||||
let cat_to_columns = column_map.get("numbers").unwrap();
|
||||
assert_eq!(cat_to_columns.len(), 1);
|
||||
let numerical = cat_to_columns.get(&ColumnTypeCategory::Numerical).unwrap();
|
||||
assert!(numerical.iter().all(|column| column.is_u64()));
|
||||
}
|
||||
}
|
||||
1
columnar/src/columnar/merge_index.rs
Normal file
1
columnar/src/columnar/merge_index.rs
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
@@ -1,28 +1,11 @@
|
||||
// Copyright (C) 2022 Quickwit, Inc.
|
||||
//
|
||||
// Quickwit is offered under the AGPL v3.0 and as commercial software.
|
||||
// For commercial licensing, contact us at hello@quickwit.io.
|
||||
//
|
||||
// AGPL:
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as
|
||||
// published by the Free Software Foundation, either version 3 of the
|
||||
// License, or (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
//
|
||||
|
||||
mod column_type;
|
||||
mod format_version;
|
||||
mod merge;
|
||||
mod merge_index;
|
||||
mod reader;
|
||||
mod writer;
|
||||
|
||||
pub use column_type::{ColumnType, HasAssociatedColumnType};
|
||||
pub use merge::{merge_columnar, MergeDocOrder};
|
||||
pub use reader::ColumnarReader;
|
||||
pub use writer::ColumnarWriter;
|
||||
|
||||
@@ -13,6 +13,7 @@ fn io_invalid_data(msg: String) -> io::Error {
|
||||
|
||||
/// The ColumnarReader makes it possible to access a set of columns
|
||||
/// associated to field names.
|
||||
#[derive(Clone)]
|
||||
pub struct ColumnarReader {
|
||||
column_dictionary: Dictionary<RangeSSTable>,
|
||||
column_data: FileSlice,
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
use std::net::Ipv6Addr;
|
||||
|
||||
use crate::dictionary::UnorderedId;
|
||||
use crate::utils::{place_bits, pop_first_byte, select_bits};
|
||||
use crate::value::NumericalValue;
|
||||
@@ -25,12 +27,12 @@ struct ColumnOperationMetadata {
|
||||
|
||||
impl ColumnOperationMetadata {
|
||||
fn to_code(self) -> u8 {
|
||||
place_bits::<0, 4>(self.len) | place_bits::<4, 8>(self.op_type.to_code())
|
||||
place_bits::<0, 6>(self.len) | place_bits::<6, 8>(self.op_type.to_code())
|
||||
}
|
||||
|
||||
fn try_from_code(code: u8) -> Result<Self, InvalidData> {
|
||||
let len = select_bits::<0, 4>(code);
|
||||
let typ_code = select_bits::<4, 8>(code);
|
||||
let len = select_bits::<0, 6>(code);
|
||||
let typ_code = select_bits::<6, 8>(code);
|
||||
let column_type = ColumnOperationType::try_from_code(typ_code)?;
|
||||
Ok(ColumnOperationMetadata {
|
||||
op_type: column_type,
|
||||
@@ -142,9 +144,21 @@ impl SymbolValue for bool {
|
||||
}
|
||||
}
|
||||
|
||||
impl SymbolValue for Ipv6Addr {
|
||||
fn serialize(self, buffer: &mut [u8]) -> u8 {
|
||||
buffer[0..16].copy_from_slice(&self.octets());
|
||||
16
|
||||
}
|
||||
|
||||
fn deserialize(bytes: &[u8]) -> Self {
|
||||
let octets: [u8; 16] = bytes[0..16].try_into().unwrap();
|
||||
Ipv6Addr::from(octets)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct MiniBuffer {
|
||||
pub bytes: [u8; 10],
|
||||
pub bytes: [u8; 17],
|
||||
pub len: u8,
|
||||
}
|
||||
|
||||
|
||||
@@ -184,10 +184,12 @@ impl CompatibleNumericalTypes {
|
||||
}
|
||||
|
||||
impl NumericalColumnWriter {
|
||||
pub fn column_type_and_cardinality(&self, num_docs: RowId) -> (NumericalType, Cardinality) {
|
||||
let numerical_type = self.compatible_numerical_types.to_numerical_type();
|
||||
let cardinality = self.column_writer.get_cardinality(num_docs);
|
||||
(numerical_type, cardinality)
|
||||
pub fn numerical_type(&self) -> NumericalType {
|
||||
self.compatible_numerical_types.to_numerical_type()
|
||||
}
|
||||
|
||||
pub fn cardinality(&self, num_docs: RowId) -> Cardinality {
|
||||
self.column_writer.get_cardinality(num_docs)
|
||||
}
|
||||
|
||||
pub fn record_numerical_value(
|
||||
@@ -209,15 +211,15 @@ impl NumericalColumnWriter {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Default)]
|
||||
pub(crate) struct StrColumnWriter {
|
||||
#[derive(Copy, Clone)]
|
||||
pub(crate) struct StrOrBytesColumnWriter {
|
||||
pub(crate) dictionary_id: u32,
|
||||
pub(crate) column_writer: ColumnWriter,
|
||||
}
|
||||
|
||||
impl StrColumnWriter {
|
||||
pub(crate) fn with_dictionary_id(dictionary_id: u32) -> StrColumnWriter {
|
||||
StrColumnWriter {
|
||||
impl StrOrBytesColumnWriter {
|
||||
pub(crate) fn with_dictionary_id(dictionary_id: u32) -> StrOrBytesColumnWriter {
|
||||
StrOrBytesColumnWriter {
|
||||
dictionary_id,
|
||||
column_writer: Default::default(),
|
||||
}
|
||||
|
||||
@@ -4,17 +4,20 @@ mod serializer;
|
||||
mod value_index;
|
||||
|
||||
use std::io;
|
||||
use std::net::Ipv6Addr;
|
||||
|
||||
use column_operation::ColumnOperation;
|
||||
use common::CountingWriter;
|
||||
use serializer::ColumnarSerializer;
|
||||
pub(crate) use serializer::ColumnarSerializer;
|
||||
use stacker::{Addr, ArenaHashMap, MemoryArena};
|
||||
|
||||
use crate::column_index::SerializableColumnIndex;
|
||||
use crate::column_values::{ColumnValues, MonotonicallyMappableToU64, VecColumn};
|
||||
use crate::columnar::column_type::{ColumnType, ColumnTypeCategory};
|
||||
use crate::column_values::{
|
||||
ColumnValues, MonotonicallyMappableToU128, MonotonicallyMappableToU64, VecColumn,
|
||||
};
|
||||
use crate::columnar::column_type::ColumnType;
|
||||
use crate::columnar::writer::column_writers::{
|
||||
ColumnWriter, NumericalColumnWriter, StrColumnWriter,
|
||||
ColumnWriter, NumericalColumnWriter, StrOrBytesColumnWriter,
|
||||
};
|
||||
use crate::columnar::writer::value_index::{IndexBuilder, PreallocatedIndexBuilders};
|
||||
use crate::dictionary::{DictionaryBuilder, TermIdMapping, UnorderedId};
|
||||
@@ -30,6 +33,7 @@ struct SpareBuffers {
|
||||
u64_values: Vec<u64>,
|
||||
f64_values: Vec<f64>,
|
||||
bool_values: Vec<bool>,
|
||||
ip_addr_values: Vec<Ipv6Addr>,
|
||||
}
|
||||
|
||||
/// Makes it possible to create a new columnar.
|
||||
@@ -49,7 +53,9 @@ pub struct ColumnarWriter {
|
||||
numerical_field_hash_map: ArenaHashMap,
|
||||
datetime_field_hash_map: ArenaHashMap,
|
||||
bool_field_hash_map: ArenaHashMap,
|
||||
ip_addr_field_hash_map: ArenaHashMap,
|
||||
bytes_field_hash_map: ArenaHashMap,
|
||||
str_field_hash_map: ArenaHashMap,
|
||||
arena: MemoryArena,
|
||||
// Dictionaries used to store dictionary-encoded values.
|
||||
dictionaries: Vec<DictionaryBuilder>,
|
||||
@@ -61,7 +67,9 @@ impl Default for ColumnarWriter {
|
||||
ColumnarWriter {
|
||||
numerical_field_hash_map: ArenaHashMap::new(10_000),
|
||||
bool_field_hash_map: ArenaHashMap::new(10_000),
|
||||
ip_addr_field_hash_map: ArenaHashMap::new(10_000),
|
||||
bytes_field_hash_map: ArenaHashMap::new(10_000),
|
||||
str_field_hash_map: ArenaHashMap::new(10_000),
|
||||
datetime_field_hash_map: ArenaHashMap::new(10_000),
|
||||
dictionaries: Vec::new(),
|
||||
arena: MemoryArena::default(),
|
||||
@@ -93,15 +101,34 @@ impl ColumnarWriter {
|
||||
+ self.numerical_field_hash_map.mem_usage()
|
||||
+ self.bool_field_hash_map.mem_usage()
|
||||
+ self.bytes_field_hash_map.mem_usage()
|
||||
+ self.str_field_hash_map.mem_usage()
|
||||
+ self.ip_addr_field_hash_map.mem_usage()
|
||||
+ self.datetime_field_hash_map.mem_usage()
|
||||
}
|
||||
|
||||
pub fn record_column_type(&mut self, column_name: &str, column_type: ColumnType) {
|
||||
match column_type {
|
||||
ColumnType::Str => {
|
||||
ColumnType::Str | ColumnType::Bytes => {
|
||||
let (hash_map, dictionaries) = (
|
||||
if column_type == ColumnType::Str {
|
||||
&mut self.str_field_hash_map
|
||||
} else {
|
||||
&mut self.bytes_field_hash_map
|
||||
},
|
||||
&mut self.dictionaries,
|
||||
);
|
||||
mutate_or_create_column(
|
||||
&mut self.bytes_field_hash_map,
|
||||
hash_map,
|
||||
column_name,
|
||||
|column_opt: Option<StrColumnWriter>| column_opt.unwrap_or_default(),
|
||||
|column_opt: Option<StrOrBytesColumnWriter>| {
|
||||
if let Some(column_writer) = column_opt {
|
||||
column_writer
|
||||
} else {
|
||||
let dictionary_id = dictionaries.len() as u32;
|
||||
dictionaries.push(DictionaryBuilder::default());
|
||||
StrOrBytesColumnWriter::with_dictionary_id(dictionary_id)
|
||||
}
|
||||
},
|
||||
);
|
||||
}
|
||||
ColumnType::Bool => {
|
||||
@@ -118,7 +145,8 @@ impl ColumnarWriter {
|
||||
|column_opt: Option<ColumnWriter>| column_opt.unwrap_or_default(),
|
||||
);
|
||||
}
|
||||
ColumnType::Numerical(numerical_type) => {
|
||||
ColumnType::I64 | ColumnType::F64 | ColumnType::U64 => {
|
||||
let numerical_type = column_type.numerical_type().unwrap();
|
||||
mutate_or_create_column(
|
||||
&mut self.numerical_field_hash_map,
|
||||
column_name,
|
||||
@@ -129,6 +157,11 @@ impl ColumnarWriter {
|
||||
},
|
||||
);
|
||||
}
|
||||
ColumnType::IpAddr => mutate_or_create_column(
|
||||
&mut self.ip_addr_field_hash_map,
|
||||
column_name,
|
||||
|column_opt: Option<ColumnWriter>| column_opt.unwrap_or_default(),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -162,6 +195,22 @@ impl ColumnarWriter {
|
||||
);
|
||||
}
|
||||
|
||||
pub fn record_ip_addr(&mut self, doc: RowId, column_name: &str, ip_addr: Ipv6Addr) {
|
||||
assert!(
|
||||
!column_name.as_bytes().contains(&0u8),
|
||||
"key may not contain the 0 byte"
|
||||
);
|
||||
let (hash_map, arena) = (&mut self.ip_addr_field_hash_map, &mut self.arena);
|
||||
hash_map.mutate_or_create(
|
||||
column_name.as_bytes(),
|
||||
|column_opt: Option<ColumnWriter>| {
|
||||
let mut column: ColumnWriter = column_opt.unwrap_or_default();
|
||||
column.record(doc, ip_addr, arena);
|
||||
column
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
pub fn record_bool(&mut self, doc: RowId, column_name: &str, val: bool) {
|
||||
let (hash_map, arena) = (&mut self.bool_field_hash_map, &mut self.arena);
|
||||
mutate_or_create_column(hash_map, column_name, |column_opt: Option<ColumnWriter>| {
|
||||
@@ -182,19 +231,18 @@ impl ColumnarWriter {
|
||||
|
||||
pub fn record_str(&mut self, doc: RowId, column_name: &str, value: &str) {
|
||||
let (hash_map, arena, dictionaries) = (
|
||||
&mut self.bytes_field_hash_map,
|
||||
&mut self.str_field_hash_map,
|
||||
&mut self.arena,
|
||||
&mut self.dictionaries,
|
||||
);
|
||||
mutate_or_create_column(
|
||||
hash_map,
|
||||
column_name,
|
||||
|column_opt: Option<StrColumnWriter>| {
|
||||
let mut column: StrColumnWriter = column_opt.unwrap_or_else(|| {
|
||||
hash_map.mutate_or_create(
|
||||
column_name.as_bytes(),
|
||||
|column_opt: Option<StrOrBytesColumnWriter>| {
|
||||
let mut column: StrOrBytesColumnWriter = column_opt.unwrap_or_else(|| {
|
||||
// Each column has its own dictionary
|
||||
let dictionary_id = dictionaries.len() as u32;
|
||||
dictionaries.push(DictionaryBuilder::default());
|
||||
StrColumnWriter::with_dictionary_id(dictionary_id)
|
||||
StrOrBytesColumnWriter::with_dictionary_id(dictionary_id)
|
||||
});
|
||||
column.record_bytes(doc, value.as_bytes(), dictionaries, arena);
|
||||
column
|
||||
@@ -202,35 +250,79 @@ impl ColumnarWriter {
|
||||
);
|
||||
}
|
||||
|
||||
pub fn record_bytes(&mut self, doc: RowId, column_name: &str, value: &[u8]) {
|
||||
assert!(
|
||||
!column_name.as_bytes().contains(&0u8),
|
||||
"key may not contain the 0 byte"
|
||||
);
|
||||
let (hash_map, arena, dictionaries) = (
|
||||
&mut self.bytes_field_hash_map,
|
||||
&mut self.arena,
|
||||
&mut self.dictionaries,
|
||||
);
|
||||
hash_map.mutate_or_create(
|
||||
column_name.as_bytes(),
|
||||
|column_opt: Option<StrOrBytesColumnWriter>| {
|
||||
let mut column: StrOrBytesColumnWriter = column_opt.unwrap_or_else(|| {
|
||||
// Each column has its own dictionary
|
||||
let dictionary_id = dictionaries.len() as u32;
|
||||
dictionaries.push(DictionaryBuilder::default());
|
||||
StrOrBytesColumnWriter::with_dictionary_id(dictionary_id)
|
||||
});
|
||||
column.record_bytes(doc, value, dictionaries, arena);
|
||||
column
|
||||
},
|
||||
);
|
||||
}
|
||||
pub fn serialize(&mut self, num_docs: RowId, wrt: &mut dyn io::Write) -> io::Result<()> {
|
||||
let mut serializer = ColumnarSerializer::new(wrt);
|
||||
let mut columns: Vec<(&[u8], ColumnTypeCategory, Addr)> = self
|
||||
let mut columns: Vec<(&[u8], ColumnType, Addr)> = self
|
||||
.numerical_field_hash_map
|
||||
.iter()
|
||||
.map(|(column_name, addr, _)| (column_name, ColumnTypeCategory::Numerical, addr))
|
||||
.map(|(column_name, addr, _)| {
|
||||
let numerical_column_writer: NumericalColumnWriter =
|
||||
self.numerical_field_hash_map.read(addr);
|
||||
let column_type = numerical_column_writer.numerical_type().into();
|
||||
(column_name, column_type, addr)
|
||||
})
|
||||
.collect();
|
||||
columns.extend(
|
||||
self.bytes_field_hash_map
|
||||
.iter()
|
||||
.map(|(column_name, addr, _)| (column_name, ColumnTypeCategory::Str, addr)),
|
||||
.map(|(term, addr, _)| (term, ColumnType::Bytes, addr)),
|
||||
);
|
||||
columns.extend(
|
||||
self.str_field_hash_map
|
||||
.iter()
|
||||
.map(|(column_name, addr, _)| (column_name, ColumnType::Str, addr)),
|
||||
);
|
||||
columns.extend(
|
||||
self.bool_field_hash_map
|
||||
.iter()
|
||||
.map(|(column_name, addr, _)| (column_name, ColumnTypeCategory::Bool, addr)),
|
||||
.map(|(column_name, addr, _)| (column_name, ColumnType::Bool, addr)),
|
||||
);
|
||||
columns.extend(
|
||||
self.ip_addr_field_hash_map
|
||||
.iter()
|
||||
.map(|(column_name, addr, _)| (column_name, ColumnType::IpAddr, addr)),
|
||||
);
|
||||
columns.extend(
|
||||
self.datetime_field_hash_map
|
||||
.iter()
|
||||
.map(|(column_name, addr, _)| (column_name, ColumnTypeCategory::DateTime, addr)),
|
||||
.map(|(column_name, addr, _)| (column_name, ColumnType::DateTime, addr)),
|
||||
);
|
||||
columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type));
|
||||
|
||||
let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries);
|
||||
let mut symbol_byte_buffer: Vec<u8> = Vec::new();
|
||||
for (column_name, bytes_or_numerical, addr) in columns {
|
||||
match bytes_or_numerical {
|
||||
ColumnTypeCategory::Bool => {
|
||||
let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr);
|
||||
for (column_name, column_type, addr) in columns {
|
||||
match column_type {
|
||||
ColumnType::Bool | ColumnType::DateTime => {
|
||||
let column_writer: ColumnWriter = if column_type == ColumnType::Bool {
|
||||
self.bool_field_hash_map.read(addr)
|
||||
} else {
|
||||
self.datetime_field_hash_map.read(addr)
|
||||
};
|
||||
let cardinality = column_writer.get_cardinality(num_docs);
|
||||
let mut column_serializer =
|
||||
serializer.serialize_column(column_name, ColumnType::Bool);
|
||||
@@ -242,29 +334,50 @@ impl ColumnarWriter {
|
||||
&mut column_serializer,
|
||||
)?;
|
||||
}
|
||||
ColumnTypeCategory::Str => {
|
||||
let str_column_writer: StrColumnWriter = self.bytes_field_hash_map.read(addr);
|
||||
let dictionary_builder =
|
||||
&dictionaries[str_column_writer.dictionary_id as usize];
|
||||
let cardinality = str_column_writer.column_writer.get_cardinality(num_docs);
|
||||
ColumnType::IpAddr => {
|
||||
let column_writer: ColumnWriter = self.ip_addr_field_hash_map.read(addr);
|
||||
let cardinality = column_writer.get_cardinality(num_docs);
|
||||
let mut column_serializer =
|
||||
serializer.serialize_column(column_name, ColumnType::Str);
|
||||
serialize_bytes_column(
|
||||
serializer.serialize_column(column_name, ColumnType::IpAddr);
|
||||
serialize_ip_addr_column(
|
||||
cardinality,
|
||||
num_docs,
|
||||
dictionary_builder,
|
||||
str_column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
|
||||
column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
|
||||
buffers,
|
||||
&mut column_serializer,
|
||||
)?;
|
||||
}
|
||||
ColumnTypeCategory::Numerical => {
|
||||
ColumnType::Bytes | ColumnType::Str => {
|
||||
let str_or_bytes_column_writer: StrOrBytesColumnWriter =
|
||||
if column_type == ColumnType::Bytes {
|
||||
self.bytes_field_hash_map.read(addr)
|
||||
} else {
|
||||
self.str_field_hash_map.read(addr)
|
||||
};
|
||||
let dictionary_builder =
|
||||
&dictionaries[str_or_bytes_column_writer.dictionary_id as usize];
|
||||
let cardinality = str_or_bytes_column_writer
|
||||
.column_writer
|
||||
.get_cardinality(num_docs);
|
||||
let mut column_serializer =
|
||||
serializer.serialize_column(column_name, column_type);
|
||||
serialize_bytes_or_str_column(
|
||||
cardinality,
|
||||
num_docs,
|
||||
dictionary_builder,
|
||||
str_or_bytes_column_writer
|
||||
.operation_iterator(arena, &mut symbol_byte_buffer),
|
||||
buffers,
|
||||
&mut column_serializer,
|
||||
)?;
|
||||
}
|
||||
ColumnType::I64 | ColumnType::F64 | ColumnType::U64 => {
|
||||
let numerical_column_writer: NumericalColumnWriter =
|
||||
self.numerical_field_hash_map.read(addr);
|
||||
let (numerical_type, cardinality) =
|
||||
numerical_column_writer.column_type_and_cardinality(num_docs);
|
||||
let mut column_serializer = serializer
|
||||
.serialize_column(column_name, ColumnType::Numerical(numerical_type));
|
||||
let numerical_type = column_type.numerical_type().unwrap();
|
||||
let cardinality = numerical_column_writer.cardinality(num_docs);
|
||||
let mut column_serializer =
|
||||
serializer.serialize_column(column_name, ColumnType::from(numerical_type));
|
||||
serialize_numerical_column(
|
||||
cardinality,
|
||||
num_docs,
|
||||
@@ -274,20 +387,6 @@ impl ColumnarWriter {
|
||||
&mut column_serializer,
|
||||
)?;
|
||||
}
|
||||
ColumnTypeCategory::DateTime => {
|
||||
let column_writer: ColumnWriter = self.datetime_field_hash_map.read(addr);
|
||||
let cardinality = column_writer.get_cardinality(num_docs);
|
||||
let mut column_serializer =
|
||||
serializer.serialize_column(column_name, ColumnType::DateTime);
|
||||
serialize_numerical_column(
|
||||
cardinality,
|
||||
num_docs,
|
||||
NumericalType::I64,
|
||||
column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
|
||||
buffers,
|
||||
&mut column_serializer,
|
||||
)?;
|
||||
}
|
||||
};
|
||||
}
|
||||
serializer.finalize()?;
|
||||
@@ -295,7 +394,7 @@ impl ColumnarWriter {
|
||||
}
|
||||
}
|
||||
|
||||
fn serialize_bytes_column(
|
||||
fn serialize_bytes_or_str_column(
|
||||
cardinality: Cardinality,
|
||||
num_docs: RowId,
|
||||
dictionary_builder: &DictionaryBuilder,
|
||||
@@ -322,7 +421,7 @@ fn serialize_bytes_column(
|
||||
ColumnOperation::NewDoc(doc) => ColumnOperation::NewDoc(doc),
|
||||
}
|
||||
});
|
||||
serialize_column(
|
||||
send_to_serialize_column_mappable_to_u64(
|
||||
operation_iterator,
|
||||
cardinality,
|
||||
num_docs,
|
||||
@@ -351,7 +450,7 @@ fn serialize_numerical_column(
|
||||
} = buffers;
|
||||
match numerical_type {
|
||||
NumericalType::I64 => {
|
||||
serialize_column(
|
||||
send_to_serialize_column_mappable_to_u64(
|
||||
coerce_numerical_symbol::<i64>(op_iterator),
|
||||
cardinality,
|
||||
num_docs,
|
||||
@@ -361,7 +460,7 @@ fn serialize_numerical_column(
|
||||
)?;
|
||||
}
|
||||
NumericalType::U64 => {
|
||||
serialize_column(
|
||||
send_to_serialize_column_mappable_to_u64(
|
||||
coerce_numerical_symbol::<u64>(op_iterator),
|
||||
cardinality,
|
||||
num_docs,
|
||||
@@ -371,7 +470,7 @@ fn serialize_numerical_column(
|
||||
)?;
|
||||
}
|
||||
NumericalType::F64 => {
|
||||
serialize_column(
|
||||
send_to_serialize_column_mappable_to_u64(
|
||||
coerce_numerical_symbol::<f64>(op_iterator),
|
||||
cardinality,
|
||||
num_docs,
|
||||
@@ -396,7 +495,7 @@ fn serialize_bool_column(
|
||||
bool_values,
|
||||
..
|
||||
} = buffers;
|
||||
serialize_column(
|
||||
send_to_serialize_column_mappable_to_u64(
|
||||
column_operations_it,
|
||||
cardinality,
|
||||
num_docs,
|
||||
@@ -407,7 +506,76 @@ fn serialize_bool_column(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_column<
|
||||
fn serialize_ip_addr_column(
|
||||
cardinality: Cardinality,
|
||||
num_docs: RowId,
|
||||
column_operations_it: impl Iterator<Item = ColumnOperation<Ipv6Addr>>,
|
||||
buffers: &mut SpareBuffers,
|
||||
wrt: &mut impl io::Write,
|
||||
) -> io::Result<()> {
|
||||
let SpareBuffers {
|
||||
value_index_builders,
|
||||
ip_addr_values,
|
||||
..
|
||||
} = buffers;
|
||||
send_to_serialize_column_mappable_to_u128(
|
||||
column_operations_it,
|
||||
cardinality,
|
||||
num_docs,
|
||||
value_index_builders,
|
||||
ip_addr_values,
|
||||
wrt,
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn send_to_serialize_column_mappable_to_u128<
|
||||
T: Copy + std::fmt::Debug + Send + Sync + MonotonicallyMappableToU128 + PartialOrd,
|
||||
>(
|
||||
op_iterator: impl Iterator<Item = ColumnOperation<T>>,
|
||||
cardinality: Cardinality,
|
||||
num_docs: RowId,
|
||||
value_index_builders: &mut PreallocatedIndexBuilders,
|
||||
values: &mut Vec<T>,
|
||||
mut wrt: impl io::Write,
|
||||
) -> io::Result<()>
|
||||
where
|
||||
for<'a> VecColumn<'a, T>: ColumnValues<T>,
|
||||
{
|
||||
values.clear();
|
||||
// TODO: split index and values
|
||||
let serializable_column_index = match cardinality {
|
||||
Cardinality::Full => {
|
||||
consume_operation_iterator(
|
||||
op_iterator,
|
||||
value_index_builders.borrow_required_index_builder(),
|
||||
values,
|
||||
);
|
||||
SerializableColumnIndex::Full
|
||||
}
|
||||
Cardinality::Optional => {
|
||||
let optional_index_builder = value_index_builders.borrow_optional_index_builder();
|
||||
consume_operation_iterator(op_iterator, optional_index_builder, values);
|
||||
let optional_index = optional_index_builder.finish(num_docs);
|
||||
SerializableColumnIndex::Optional(Box::new(optional_index))
|
||||
}
|
||||
Cardinality::Multivalued => {
|
||||
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
|
||||
consume_operation_iterator(op_iterator, multivalued_index_builder, values);
|
||||
let multivalued_index = multivalued_index_builder.finish(num_docs);
|
||||
SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
|
||||
}
|
||||
};
|
||||
crate::column::serialize_column_mappable_to_u128(
|
||||
serializable_column_index,
|
||||
|| values.iter().cloned(),
|
||||
values.len() as u32,
|
||||
&mut wrt,
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn send_to_serialize_column_mappable_to_u64<
|
||||
T: Copy + Default + std::fmt::Debug + Send + Sync + MonotonicallyMappableToU64 + PartialOrd,
|
||||
>(
|
||||
op_iterator: impl Iterator<Item = ColumnOperation<T>>,
|
||||
@@ -440,11 +608,10 @@ where
|
||||
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
|
||||
consume_operation_iterator(op_iterator, multivalued_index_builder, values);
|
||||
let multivalued_index = multivalued_index_builder.finish(num_docs);
|
||||
todo!();
|
||||
// SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
|
||||
SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
|
||||
}
|
||||
};
|
||||
crate::column::serialize_column_u64(
|
||||
crate::column::serialize_column_mappable_to_u64(
|
||||
serializable_column_index,
|
||||
&VecColumn::from(&values[..]),
|
||||
&mut wrt,
|
||||
@@ -482,59 +649,12 @@ fn consume_operation_iterator<T: std::fmt::Debug, TIndexBuilder: IndexBuilder>(
|
||||
}
|
||||
}
|
||||
|
||||
// /// Serializes the column with the codec with the best estimate on the data.
|
||||
// fn serialize_numerical<T: MonotonicallyMappableToU64>(
|
||||
// value_index: ValueIndexInfo,
|
||||
// typed_column: impl Column<T>,
|
||||
// output: &mut impl io::Write,
|
||||
// codecs: &[FastFieldCodecType],
|
||||
// ) -> io::Result<()> {
|
||||
|
||||
// let counting_writer = CountingWriter::wrap(output);
|
||||
// serialize_value_index(value_index, output)?;
|
||||
// let value_index_len = counting_writer.written_bytes();
|
||||
// let output = counting_writer.finish();
|
||||
|
||||
// serialize_column(value_index, output)?;
|
||||
// let column = monotonic_map_column(
|
||||
// typed_column,
|
||||
// crate::column::monotonic_mapping::StrictlyMonotonicMappingToInternal::<T>::new(),
|
||||
// );
|
||||
// let header = Header::compute_header(&column, codecs).ok_or_else(|| {
|
||||
// io::Error::new(
|
||||
// io::ErrorKind::InvalidInput,
|
||||
// format!(
|
||||
// "Data cannot be serialized with this list of codec. {:?}",
|
||||
// codecs
|
||||
// ),
|
||||
// )
|
||||
// })?;
|
||||
// header.serialize(output)?;
|
||||
// let normalized_column = header.normalize_column(column);
|
||||
// assert_eq!(normalized_column.min_value(), 0u64);
|
||||
// serialize_given_codec(normalized_column, header.codec_type, output)?;
|
||||
|
||||
// let column_header = ColumnFooter {
|
||||
// value_index_len: todo!(),
|
||||
// cardinality: todo!(),
|
||||
// };
|
||||
|
||||
// let null_index_footer = NullIndexFooter {
|
||||
// cardinality: value_index.get_cardinality(),
|
||||
// null_index_codec: NullIndexCodec::Full,
|
||||
// null_index_byte_range: 0..0,
|
||||
// };
|
||||
// append_null_index_footer(output, null_index_footer)?;
|
||||
// Ok(())
|
||||
// }
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use column_operation::ColumnOperation;
|
||||
use stacker::MemoryArena;
|
||||
|
||||
use super::*;
|
||||
use crate::value::NumericalValue;
|
||||
use crate::columnar::writer::column_operation::ColumnOperation;
|
||||
use crate::{Cardinality, NumericalValue};
|
||||
|
||||
#[test]
|
||||
fn test_column_writer_required_simple() {
|
||||
|
||||
@@ -45,16 +45,6 @@ impl<'a> SerializableOptionalIndex<'a> for SingleValueArrayIndex<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
impl OptionalIndexBuilder {
|
||||
fn num_non_nulls(&self) -> u32 {
|
||||
self.docs.len() as u32
|
||||
}
|
||||
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
||||
Box::new(self.docs.iter().copied())
|
||||
}
|
||||
}
|
||||
|
||||
impl OptionalIndexBuilder {
|
||||
pub fn finish<'a>(&'a mut self, num_rows: RowId) -> impl SerializableOptionalIndex + 'a {
|
||||
debug_assert!(self
|
||||
@@ -96,7 +86,7 @@ pub struct MultivaluedIndexBuilder {
|
||||
impl MultivaluedIndexBuilder {
|
||||
pub fn finish(&mut self, num_docs: RowId) -> impl ColumnValues<u32> + '_ {
|
||||
self.start_offsets
|
||||
.resize(num_docs as usize, self.total_num_vals_seen);
|
||||
.resize(num_docs as usize + 1, self.total_num_vals_seen);
|
||||
VecColumn {
|
||||
values: &&self.start_offsets[..],
|
||||
min_value: 0,
|
||||
@@ -188,7 +178,7 @@ mod tests {
|
||||
.finish(4u32)
|
||||
.iter()
|
||||
.collect::<Vec<u32>>(),
|
||||
vec![0, 0, 2, 3]
|
||||
vec![0, 0, 2, 3, 3]
|
||||
);
|
||||
multivalued_value_index_builder.reset();
|
||||
multivalued_value_index_builder.record_row(2u32);
|
||||
@@ -199,7 +189,7 @@ mod tests {
|
||||
.finish(4u32)
|
||||
.iter()
|
||||
.collect::<Vec<u32>>(),
|
||||
vec![0, 0, 0, 2]
|
||||
vec![0, 0, 0, 2, 2]
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
use std::io;
|
||||
use std::net::IpAddr;
|
||||
use std::net::Ipv6Addr;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::file_slice::FileSlice;
|
||||
use common::{HasLen, OwnedBytes};
|
||||
|
||||
use crate::column::{BytesColumn, Column};
|
||||
use crate::column::{BytesColumn, Column, StrColumn};
|
||||
use crate::column_values::{monotonic_map_column, StrictlyMonotonicFn};
|
||||
use crate::columnar::ColumnType;
|
||||
use crate::{Cardinality, DateTime, NumericalType};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub enum DynamicColumn {
|
||||
@@ -13,16 +16,151 @@ pub enum DynamicColumn {
|
||||
I64(Column<i64>),
|
||||
U64(Column<u64>),
|
||||
F64(Column<f64>),
|
||||
IpAddr(Column<IpAddr>),
|
||||
Str(BytesColumn),
|
||||
DateTime(Column<crate::DateTime>),
|
||||
IpAddr(Column<Ipv6Addr>),
|
||||
DateTime(Column<DateTime>),
|
||||
Bytes(BytesColumn),
|
||||
Str(StrColumn),
|
||||
}
|
||||
|
||||
impl DynamicColumn {
|
||||
pub fn get_cardinality(&self) -> Cardinality {
|
||||
match self {
|
||||
DynamicColumn::Bool(c) => c.get_cardinality(),
|
||||
DynamicColumn::I64(c) => c.get_cardinality(),
|
||||
DynamicColumn::U64(c) => c.get_cardinality(),
|
||||
DynamicColumn::F64(c) => c.get_cardinality(),
|
||||
DynamicColumn::IpAddr(c) => c.get_cardinality(),
|
||||
DynamicColumn::DateTime(c) => c.get_cardinality(),
|
||||
DynamicColumn::Bytes(c) => c.ords().get_cardinality(),
|
||||
DynamicColumn::Str(c) => c.ords().get_cardinality(),
|
||||
}
|
||||
}
|
||||
pub fn column_type(&self) -> ColumnType {
|
||||
match self {
|
||||
DynamicColumn::Bool(_) => ColumnType::Bool,
|
||||
DynamicColumn::I64(_) => ColumnType::I64,
|
||||
DynamicColumn::U64(_) => ColumnType::U64,
|
||||
DynamicColumn::F64(_) => ColumnType::F64,
|
||||
DynamicColumn::IpAddr(_) => ColumnType::IpAddr,
|
||||
DynamicColumn::DateTime(_) => ColumnType::DateTime,
|
||||
DynamicColumn::Bytes(_) => ColumnType::Bytes,
|
||||
DynamicColumn::Str(_) => ColumnType::Str,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_numerical(&self) -> bool {
|
||||
self.column_type().numerical_type().is_some()
|
||||
}
|
||||
|
||||
pub fn is_f64(&self) -> bool {
|
||||
self.column_type().numerical_type() == Some(NumericalType::F64)
|
||||
}
|
||||
pub fn is_i64(&self) -> bool {
|
||||
self.column_type().numerical_type() == Some(NumericalType::I64)
|
||||
}
|
||||
pub fn is_u64(&self) -> bool {
|
||||
self.column_type().numerical_type() == Some(NumericalType::U64)
|
||||
}
|
||||
|
||||
pub fn coerce_to_f64(self) -> Option<DynamicColumn> {
|
||||
match self {
|
||||
DynamicColumn::I64(column) => Some(DynamicColumn::F64(Column {
|
||||
idx: column.idx,
|
||||
values: Arc::new(monotonic_map_column(column.values, MapI64ToF64)),
|
||||
})),
|
||||
DynamicColumn::U64(column) => Some(DynamicColumn::F64(Column {
|
||||
idx: column.idx,
|
||||
values: Arc::new(monotonic_map_column(column.values, MapU64ToF64)),
|
||||
})),
|
||||
DynamicColumn::F64(_) => Some(self),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
pub fn coerce_to_i64(self) -> Option<DynamicColumn> {
|
||||
match self {
|
||||
DynamicColumn::U64(column) => {
|
||||
if column.max_value() > i64::MAX as u64 {
|
||||
return None;
|
||||
}
|
||||
Some(DynamicColumn::I64(Column {
|
||||
idx: column.idx,
|
||||
values: Arc::new(monotonic_map_column(column.values, MapU64ToI64)),
|
||||
}))
|
||||
}
|
||||
DynamicColumn::I64(_) => Some(self),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
pub fn coerce_to_u64(self) -> Option<DynamicColumn> {
|
||||
match self {
|
||||
DynamicColumn::I64(column) => {
|
||||
if column.min_value() < 0 {
|
||||
return None;
|
||||
}
|
||||
Some(DynamicColumn::U64(Column {
|
||||
idx: column.idx,
|
||||
values: Arc::new(monotonic_map_column(column.values, MapI64ToU64)),
|
||||
}))
|
||||
}
|
||||
DynamicColumn::U64(_) => Some(self),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct MapI64ToF64;
|
||||
impl StrictlyMonotonicFn<i64, f64> for MapI64ToF64 {
|
||||
#[inline(always)]
|
||||
fn mapping(&self, inp: i64) -> f64 {
|
||||
inp as f64
|
||||
}
|
||||
#[inline(always)]
|
||||
fn inverse(&self, out: f64) -> i64 {
|
||||
out as i64
|
||||
}
|
||||
}
|
||||
|
||||
struct MapU64ToF64;
|
||||
impl StrictlyMonotonicFn<u64, f64> for MapU64ToF64 {
|
||||
#[inline(always)]
|
||||
fn mapping(&self, inp: u64) -> f64 {
|
||||
inp as f64
|
||||
}
|
||||
#[inline(always)]
|
||||
fn inverse(&self, out: f64) -> u64 {
|
||||
out as u64
|
||||
}
|
||||
}
|
||||
|
||||
struct MapU64ToI64;
|
||||
impl StrictlyMonotonicFn<u64, i64> for MapU64ToI64 {
|
||||
#[inline(always)]
|
||||
fn mapping(&self, inp: u64) -> i64 {
|
||||
inp as i64
|
||||
}
|
||||
#[inline(always)]
|
||||
fn inverse(&self, out: i64) -> u64 {
|
||||
out as u64
|
||||
}
|
||||
}
|
||||
|
||||
struct MapI64ToU64;
|
||||
impl StrictlyMonotonicFn<i64, u64> for MapI64ToU64 {
|
||||
#[inline(always)]
|
||||
fn mapping(&self, inp: i64) -> u64 {
|
||||
inp as u64
|
||||
}
|
||||
#[inline(always)]
|
||||
fn inverse(&self, out: u64) -> i64 {
|
||||
out as i64
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! static_dynamic_conversions {
|
||||
($typ:ty, $enum_name:ident) => {
|
||||
impl Into<Option<Column<$typ>>> for DynamicColumn {
|
||||
fn into(self) -> Option<Column<$typ>> {
|
||||
if let Self::$enum_name(col) = self {
|
||||
impl Into<Option<$typ>> for DynamicColumn {
|
||||
fn into(self) -> Option<$typ> {
|
||||
if let DynamicColumn::$enum_name(col) = self {
|
||||
Some(col)
|
||||
} else {
|
||||
None
|
||||
@@ -30,25 +168,22 @@ macro_rules! static_dynamic_conversions {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Column<$typ>> for DynamicColumn {
|
||||
fn from(typed_column: Column<$typ>) -> Self {
|
||||
impl From<$typ> for DynamicColumn {
|
||||
fn from(typed_column: $typ) -> Self {
|
||||
DynamicColumn::$enum_name(typed_column)
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
static_dynamic_conversions!(bool, Bool);
|
||||
static_dynamic_conversions!(u64, U64);
|
||||
static_dynamic_conversions!(i64, I64);
|
||||
static_dynamic_conversions!(f64, F64);
|
||||
static_dynamic_conversions!(crate::DateTime, DateTime);
|
||||
|
||||
impl From<BytesColumn> for DynamicColumn {
|
||||
fn from(dictionary_encoded_col: BytesColumn) -> Self {
|
||||
DynamicColumn::Str(dictionary_encoded_col)
|
||||
}
|
||||
}
|
||||
static_dynamic_conversions!(Column<bool>, Bool);
|
||||
static_dynamic_conversions!(Column<u64>, U64);
|
||||
static_dynamic_conversions!(Column<i64>, I64);
|
||||
static_dynamic_conversions!(Column<f64>, F64);
|
||||
static_dynamic_conversions!(Column<crate::DateTime>, DateTime);
|
||||
static_dynamic_conversions!(StrColumn, Str);
|
||||
static_dynamic_conversions!(BytesColumn, Bytes);
|
||||
static_dynamic_conversions!(Column<Ipv6Addr>, IpAddr);
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct DynamicColumnHandle {
|
||||
@@ -77,12 +212,13 @@ impl DynamicColumnHandle {
|
||||
pub fn open_u64_lenient(&self) -> io::Result<Option<Column<u64>>> {
|
||||
let column_bytes = self.file_slice.read_bytes()?;
|
||||
match self.column_type {
|
||||
ColumnType::Str => {
|
||||
let column = crate::column::open_column_bytes(column_bytes)?;
|
||||
ColumnType::Str | ColumnType::Bytes => {
|
||||
let column: BytesColumn = crate::column::open_column_bytes(column_bytes)?;
|
||||
Ok(Some(column.term_ord_column))
|
||||
}
|
||||
ColumnType::Bool => Ok(None),
|
||||
ColumnType::Numerical(_) | ColumnType::DateTime => {
|
||||
ColumnType::IpAddr => Ok(None),
|
||||
ColumnType::I64 | ColumnType::U64 | ColumnType::F64 | ColumnType::DateTime => {
|
||||
let column = crate::column::open_column_u64::<u64>(column_bytes)?;
|
||||
Ok(Some(column))
|
||||
}
|
||||
@@ -91,19 +227,15 @@ impl DynamicColumnHandle {
|
||||
|
||||
fn open_internal(&self, column_bytes: OwnedBytes) -> io::Result<DynamicColumn> {
|
||||
let dynamic_column: DynamicColumn = match self.column_type {
|
||||
ColumnType::Str => crate::column::open_column_bytes(column_bytes)?.into(),
|
||||
ColumnType::Numerical(numerical_type) => match numerical_type {
|
||||
crate::NumericalType::I64 => {
|
||||
crate::column::open_column_u64::<i64>(column_bytes)?.into()
|
||||
}
|
||||
crate::NumericalType::U64 => {
|
||||
crate::column::open_column_u64::<u64>(column_bytes)?.into()
|
||||
}
|
||||
crate::NumericalType::F64 => {
|
||||
crate::column::open_column_u64::<f64>(column_bytes)?.into()
|
||||
}
|
||||
},
|
||||
ColumnType::Bytes => {
|
||||
crate::column::open_column_bytes::<BytesColumn>(column_bytes)?.into()
|
||||
}
|
||||
ColumnType::Str => crate::column::open_column_bytes::<StrColumn>(column_bytes)?.into(),
|
||||
ColumnType::I64 => crate::column::open_column_u64::<i64>(column_bytes)?.into(),
|
||||
ColumnType::U64 => crate::column::open_column_u64::<u64>(column_bytes)?.into(),
|
||||
ColumnType::F64 => crate::column::open_column_u64::<f64>(column_bytes)?.into(),
|
||||
ColumnType::Bool => crate::column::open_column_u64::<bool>(column_bytes)?.into(),
|
||||
ColumnType::IpAddr => crate::column::open_column_u128::<Ipv6Addr>(column_bytes)?.into(),
|
||||
ColumnType::DateTime => {
|
||||
crate::column::open_column_u64::<crate::DateTime>(column_bytes)?.into()
|
||||
}
|
||||
|
||||
@@ -18,9 +18,12 @@ mod dynamic_column;
|
||||
pub(crate) mod utils;
|
||||
mod value;
|
||||
|
||||
pub use column::Column;
|
||||
pub use column::{BytesColumn, Column, StrColumn};
|
||||
pub use column_values::ColumnValues;
|
||||
pub use columnar::{ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType};
|
||||
pub use columnar::{
|
||||
merge_columnar, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType,
|
||||
MergeDocOrder,
|
||||
};
|
||||
pub use value::{NumericalType, NumericalValue};
|
||||
|
||||
pub use self::dynamic_column::{DynamicColumn, DynamicColumnHandle};
|
||||
@@ -59,6 +62,12 @@ pub enum Cardinality {
|
||||
}
|
||||
|
||||
impl Cardinality {
|
||||
pub fn is_optional(&self) -> bool {
|
||||
matches!(self, Cardinality::Optional)
|
||||
}
|
||||
pub fn is_multivalue(&self) -> bool {
|
||||
matches!(self, Cardinality::Multivalued)
|
||||
}
|
||||
pub(crate) fn to_code(self) -> u8 {
|
||||
self as u8
|
||||
}
|
||||
|
||||
@@ -1,10 +1,13 @@
|
||||
use std::net::Ipv6Addr;
|
||||
|
||||
use crate::column_values::MonotonicallyMappableToU128;
|
||||
use crate::columnar::ColumnType;
|
||||
use crate::dynamic_column::{DynamicColumn, DynamicColumnHandle};
|
||||
use crate::value::NumericalValue;
|
||||
use crate::{Cardinality, ColumnarReader, ColumnarWriter};
|
||||
|
||||
#[test]
|
||||
fn test_dataframe_writer_bytes() {
|
||||
fn test_dataframe_writer_str() {
|
||||
let mut dataframe_writer = ColumnarWriter::default();
|
||||
dataframe_writer.record_str(1u32, "my_string", "hello");
|
||||
dataframe_writer.record_str(3u32, "my_string", "helloeee");
|
||||
@@ -14,7 +17,21 @@ fn test_dataframe_writer_bytes() {
|
||||
assert_eq!(columnar.num_columns(), 1);
|
||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
|
||||
assert_eq!(cols.len(), 1);
|
||||
assert_eq!(cols[0].num_bytes(), 165);
|
||||
assert_eq!(cols[0].num_bytes(), 158);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dataframe_writer_bytes() {
|
||||
let mut dataframe_writer = ColumnarWriter::default();
|
||||
dataframe_writer.record_bytes(1u32, "my_string", b"hello");
|
||||
dataframe_writer.record_bytes(3u32, "my_string", b"helloeee");
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer.serialize(5, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar.num_columns(), 1);
|
||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
|
||||
assert_eq!(cols.len(), 1);
|
||||
assert_eq!(cols[0].num_bytes(), 158);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -28,7 +45,7 @@ fn test_dataframe_writer_bool() {
|
||||
assert_eq!(columnar.num_columns(), 1);
|
||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("bool.value").unwrap();
|
||||
assert_eq!(cols.len(), 1);
|
||||
assert_eq!(cols[0].num_bytes(), 29);
|
||||
assert_eq!(cols[0].num_bytes(), 22);
|
||||
assert_eq!(cols[0].column_type(), ColumnType::Bool);
|
||||
let dyn_bool_col = cols[0].open().unwrap();
|
||||
let DynamicColumn::Bool(bool_col) = dyn_bool_col else { panic!(); };
|
||||
@@ -36,6 +53,59 @@ fn test_dataframe_writer_bool() {
|
||||
assert_eq!(&vals, &[None, Some(false), None, Some(true), None,]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dataframe_writer_u64_multivalued() {
|
||||
let mut dataframe_writer = ColumnarWriter::default();
|
||||
dataframe_writer.record_numerical(2u32, "divisor", 2u64);
|
||||
dataframe_writer.record_numerical(3u32, "divisor", 3u64);
|
||||
dataframe_writer.record_numerical(4u32, "divisor", 2u64);
|
||||
dataframe_writer.record_numerical(5u32, "divisor", 5u64);
|
||||
dataframe_writer.record_numerical(6u32, "divisor", 2u64);
|
||||
dataframe_writer.record_numerical(6u32, "divisor", 3u64);
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer.serialize(7, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar.num_columns(), 1);
|
||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("divisor").unwrap();
|
||||
assert_eq!(cols.len(), 1);
|
||||
assert_eq!(cols[0].num_bytes(), 29);
|
||||
let dyn_i64_col = cols[0].open().unwrap();
|
||||
let DynamicColumn::I64(divisor_col) = dyn_i64_col else { panic!(); };
|
||||
assert_eq!(
|
||||
divisor_col.get_cardinality(),
|
||||
crate::Cardinality::Multivalued
|
||||
);
|
||||
assert_eq!(divisor_col.num_rows(), 7);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dataframe_writer_ip_addr() {
|
||||
let mut dataframe_writer = ColumnarWriter::default();
|
||||
dataframe_writer.record_ip_addr(1, "ip_addr", Ipv6Addr::from_u128(1001));
|
||||
dataframe_writer.record_ip_addr(3, "ip_addr", Ipv6Addr::from_u128(1050));
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer.serialize(5, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar.num_columns(), 1);
|
||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("ip_addr").unwrap();
|
||||
assert_eq!(cols.len(), 1);
|
||||
assert_eq!(cols[0].num_bytes(), 42);
|
||||
assert_eq!(cols[0].column_type(), ColumnType::IpAddr);
|
||||
let dyn_bool_col = cols[0].open().unwrap();
|
||||
let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else { panic!(); };
|
||||
let vals: Vec<Option<Ipv6Addr>> = (0..5).map(|row_id| ip_col.first(row_id)).collect();
|
||||
assert_eq!(
|
||||
&vals,
|
||||
&[
|
||||
None,
|
||||
Some(Ipv6Addr::from_u128(1001)),
|
||||
None,
|
||||
Some(Ipv6Addr::from_u128(1050)),
|
||||
None,
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dataframe_writer_numerical() {
|
||||
let mut dataframe_writer = ColumnarWriter::default();
|
||||
@@ -53,7 +123,7 @@ fn test_dataframe_writer_numerical() {
|
||||
// - header 14 bytes
|
||||
// - vals 8 //< due to padding? could have been 1byte?.
|
||||
// - null footer 6 bytes
|
||||
assert_eq!(cols[0].num_bytes(), 40);
|
||||
assert_eq!(cols[0].num_bytes(), 33);
|
||||
let column = cols[0].open().unwrap();
|
||||
let DynamicColumn::I64(column_i64) = column else { panic!(); };
|
||||
assert_eq!(column_i64.idx.get_cardinality(), Cardinality::Optional);
|
||||
@@ -67,18 +137,76 @@ fn test_dataframe_writer_numerical() {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dictionary_encoded() {
|
||||
fn test_dictionary_encoded_str() {
|
||||
let mut buffer = Vec::new();
|
||||
let mut columnar_writer = ColumnarWriter::default();
|
||||
columnar_writer.record_str(1, "my.column", "my.key");
|
||||
columnar_writer.record_str(3, "my.column", "my.key2");
|
||||
columnar_writer.record_str(1, "my.column", "a");
|
||||
columnar_writer.record_str(3, "my.column", "c");
|
||||
columnar_writer.record_str(3, "my.column2", "different_column!");
|
||||
columnar_writer.record_str(4, "my.column", "b");
|
||||
columnar_writer.serialize(5, &mut buffer).unwrap();
|
||||
let columnar_reader = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar_reader.num_columns(), 2);
|
||||
let col_handles = columnar_reader.read_columns("my.column").unwrap();
|
||||
assert_eq!(col_handles.len(), 1);
|
||||
let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else { panic!(); };
|
||||
let index: Vec<Option<u64>> = (0..5).map(|row_id| str_col.ords().first(row_id)).collect();
|
||||
assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
|
||||
assert_eq!(str_col.num_rows(), 5);
|
||||
// let term_ords = (0..)
|
||||
let mut term_buffer = String::new();
|
||||
let term_ords = str_col.ords();
|
||||
assert_eq!(term_ords.first(0), None);
|
||||
assert_eq!(term_ords.first(1), Some(0));
|
||||
str_col.ord_to_str(0u64, &mut term_buffer).unwrap();
|
||||
assert_eq!(term_buffer, "a");
|
||||
assert_eq!(term_ords.first(2), None);
|
||||
assert_eq!(term_ords.first(3), Some(2));
|
||||
str_col.ord_to_str(2u64, &mut term_buffer).unwrap();
|
||||
assert_eq!(term_buffer, "c");
|
||||
assert_eq!(term_ords.first(4), Some(1));
|
||||
str_col.ord_to_str(1u64, &mut term_buffer).unwrap();
|
||||
assert_eq!(term_buffer, "b");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dictionary_encoded_bytes() {
|
||||
let mut buffer = Vec::new();
|
||||
let mut columnar_writer = ColumnarWriter::default();
|
||||
columnar_writer.record_bytes(1, "my.column", b"a");
|
||||
columnar_writer.record_bytes(3, "my.column", b"c");
|
||||
columnar_writer.record_bytes(3, "my.column2", b"different_column!");
|
||||
columnar_writer.record_bytes(4, "my.column", b"b");
|
||||
columnar_writer.serialize(5, &mut buffer).unwrap();
|
||||
let columnar_reader = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar_reader.num_columns(), 2);
|
||||
let col_handles = columnar_reader.read_columns("my.column").unwrap();
|
||||
assert_eq!(col_handles.len(), 1);
|
||||
let DynamicColumn::Bytes(bytes_col) = col_handles[0].open().unwrap() else { panic!(); };
|
||||
let index: Vec<Option<u64>> = (0..5)
|
||||
.map(|row_id| bytes_col.ords().first(row_id))
|
||||
.collect();
|
||||
assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
|
||||
assert_eq!(bytes_col.num_rows(), 5);
|
||||
let mut term_buffer = Vec::new();
|
||||
let term_ords = bytes_col.ords();
|
||||
assert_eq!(term_ords.first(0), None);
|
||||
assert_eq!(term_ords.first(1), Some(0));
|
||||
bytes_col
|
||||
.dictionary
|
||||
.ord_to_term(0u64, &mut term_buffer)
|
||||
.unwrap();
|
||||
assert_eq!(term_buffer, b"a");
|
||||
assert_eq!(term_ords.first(2), None);
|
||||
assert_eq!(term_ords.first(3), Some(2));
|
||||
bytes_col
|
||||
.dictionary
|
||||
.ord_to_term(2u64, &mut term_buffer)
|
||||
.unwrap();
|
||||
assert_eq!(term_buffer, b"c");
|
||||
assert_eq!(term_ords.first(4), Some(1));
|
||||
bytes_col
|
||||
.dictionary
|
||||
.ord_to_term(1u64, &mut term_buffer)
|
||||
.unwrap();
|
||||
assert_eq!(term_buffer, b"b");
|
||||
}
|
||||
|
||||
@@ -1,12 +1,22 @@
|
||||
use crate::{Column, ColumnType, InvalidData};
|
||||
use crate::InvalidData;
|
||||
|
||||
#[derive(Copy, Clone, Debug, PartialEq)]
|
||||
#[derive(Copy, Clone, PartialEq, Debug)]
|
||||
pub enum NumericalValue {
|
||||
I64(i64),
|
||||
U64(u64),
|
||||
F64(f64),
|
||||
}
|
||||
|
||||
impl NumericalValue {
|
||||
pub fn numerical_type(&self) -> NumericalType {
|
||||
match self {
|
||||
NumericalValue::I64(_) => NumericalType::I64,
|
||||
NumericalValue::U64(_) => NumericalType::U64,
|
||||
NumericalValue::F64(_) => NumericalType::F64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u64> for NumericalValue {
|
||||
fn from(val: u64) -> NumericalValue {
|
||||
NumericalValue::U64(val)
|
||||
@@ -25,18 +35,6 @@ impl From<f64> for NumericalValue {
|
||||
}
|
||||
}
|
||||
|
||||
impl NumericalValue {
|
||||
pub fn numerical_type(&self) -> NumericalType {
|
||||
match self {
|
||||
NumericalValue::F64(_) => NumericalType::F64,
|
||||
NumericalValue::I64(_) => NumericalType::I64,
|
||||
NumericalValue::U64(_) => NumericalType::U64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for NumericalValue {}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default, Hash, Eq, PartialEq)]
|
||||
#[repr(u8)]
|
||||
pub enum NumericalType {
|
||||
|
||||
@@ -13,7 +13,7 @@ use tantivy::aggregation::agg_result::AggregationResults;
|
||||
use tantivy::aggregation::metric::AverageAggregation;
|
||||
use tantivy::aggregation::AggregationCollector;
|
||||
use tantivy::query::TermQuery;
|
||||
use tantivy::schema::{self, IndexRecordOption, Schema, TextFieldIndexing};
|
||||
use tantivy::schema::{self, Cardinality, IndexRecordOption, Schema, TextFieldIndexing};
|
||||
use tantivy::{doc, Index, Term};
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
@@ -25,7 +25,7 @@ fn main() -> tantivy::Result<()> {
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let score_fieldtype =
|
||||
crate::schema::NumericOptions::default().set_fast();
|
||||
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
|
||||
let highscore_field = schema_builder.add_f64_field("highscore", score_fieldtype.clone());
|
||||
let price_field = schema_builder.add_f64_field("price", score_fieldtype);
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::{DateOptions, Schema, Value, INDEXED, STORED, STRING};
|
||||
use tantivy::schema::{Cardinality, DateOptions, Schema, Value, INDEXED, STORED, STRING};
|
||||
use tantivy::Index;
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
@@ -12,7 +12,7 @@ fn main() -> tantivy::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let opts = DateOptions::from(INDEXED)
|
||||
.set_stored()
|
||||
.set_fast()
|
||||
.set_fast(Cardinality::SingleValue)
|
||||
.set_precision(tantivy::DatePrecision::Seconds);
|
||||
let occurred_at = schema_builder.add_date_field("occurred_at", opts);
|
||||
let event_type = schema_builder.add_text_field("event", STRING | STORED);
|
||||
@@ -14,7 +14,6 @@ repository = "https://github.com/quickwit-oss/tantivy"
|
||||
[dependencies]
|
||||
common = { version = "0.5", path = "../common/", package = "tantivy-common" }
|
||||
tantivy-bitpacker = { version= "0.3", path = "../bitpacker/" }
|
||||
columnar = { version= "0.1", path="../columnar", package="tantivy-columnar" }
|
||||
prettytable-rs = {version="0.10.0", optional= true}
|
||||
rand = {version="0.8.3", optional= true}
|
||||
fastdivide = "0.4"
|
||||
|
||||
@@ -2,11 +2,81 @@ use std::fmt::{self, Debug};
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::{Range, RangeInclusive};
|
||||
|
||||
pub use columnar::ColumnValues as Column;
|
||||
use tantivy_bitpacker::minmax;
|
||||
|
||||
use crate::monotonic_mapping::StrictlyMonotonicFn;
|
||||
|
||||
/// `Column` provides columnar access on a field.
|
||||
pub trait Column<T: PartialOrd + Debug = u64>: Send + Sync {
|
||||
/// Return the value associated with the given idx.
|
||||
///
|
||||
/// This accessor should return as fast as possible.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `idx` is greater than the column length.
|
||||
fn get_val(&self, idx: u32) -> T;
|
||||
|
||||
/// Fills an output buffer with the fast field values
|
||||
/// associated with the `DocId` going from
|
||||
/// `start` to `start + output.len()`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Must panic if `start + output.len()` is greater than
|
||||
/// the segment's `maxdoc`.
|
||||
#[inline]
|
||||
fn get_range(&self, start: u64, output: &mut [T]) {
|
||||
for (out, idx) in output.iter_mut().zip(start..) {
|
||||
*out = self.get_val(idx as u32);
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the positions of values which are in the provided value range.
|
||||
///
|
||||
/// Note that position == docid for single value fast fields
|
||||
#[inline]
|
||||
fn get_docids_for_value_range(
|
||||
&self,
|
||||
value_range: RangeInclusive<T>,
|
||||
doc_id_range: Range<u32>,
|
||||
positions: &mut Vec<u32>,
|
||||
) {
|
||||
let doc_id_range = doc_id_range.start..doc_id_range.end.min(self.num_vals());
|
||||
|
||||
for idx in doc_id_range.start..doc_id_range.end {
|
||||
let val = self.get_val(idx);
|
||||
if value_range.contains(&val) {
|
||||
positions.push(idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the minimum value for this fast field.
|
||||
///
|
||||
/// This min_value may not be exact.
|
||||
/// For instance, the min value does not take in account of possible
|
||||
/// deleted document. All values are however guaranteed to be higher than
|
||||
/// `.min_value()`.
|
||||
fn min_value(&self) -> T;
|
||||
|
||||
/// Returns the maximum value for this fast field.
|
||||
///
|
||||
/// This max_value may not be exact.
|
||||
/// For instance, the max value does not take in account of possible
|
||||
/// deleted document. All values are however guaranteed to be higher than
|
||||
/// `.max_value()`.
|
||||
fn max_value(&self) -> T;
|
||||
|
||||
/// The number of values in the column.
|
||||
fn num_vals(&self) -> u32;
|
||||
|
||||
/// Returns a iterator over the data
|
||||
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = T> + 'a> {
|
||||
Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
|
||||
}
|
||||
}
|
||||
|
||||
/// VecColumn provides `Column` over a slice.
|
||||
pub struct VecColumn<'a, T = u64> {
|
||||
values: &'a [T],
|
||||
@@ -14,6 +84,32 @@ pub struct VecColumn<'a, T = u64> {
|
||||
max_value: T,
|
||||
}
|
||||
|
||||
impl<'a, C: Column<T>, T: Copy + PartialOrd + fmt::Debug> Column<T> for &'a C {
|
||||
fn get_val(&self, idx: u32) -> T {
|
||||
(*self).get_val(idx)
|
||||
}
|
||||
|
||||
fn min_value(&self) -> T {
|
||||
(*self).min_value()
|
||||
}
|
||||
|
||||
fn max_value(&self) -> T {
|
||||
(*self).max_value()
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u32 {
|
||||
(*self).num_vals()
|
||||
}
|
||||
|
||||
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = T> + 'b> {
|
||||
(*self).iter()
|
||||
}
|
||||
|
||||
fn get_range(&self, start: u64, output: &mut [T]) {
|
||||
(*self).get_range(start, output)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: Copy + PartialOrd + Send + Sync + Debug> Column<T> for VecColumn<'a, T> {
|
||||
fn get_val(&self, position: u32) -> T {
|
||||
self.values[position as usize]
|
||||
|
||||
@@ -402,8 +402,8 @@ mod tests {
|
||||
let mut buffer = Vec::new();
|
||||
let col = VecColumn::from(&[false, true][..]);
|
||||
serialize(col, &mut buffer, &ALL_CODEC_TYPES).unwrap();
|
||||
// 5 bytes of header, 1 byte of value, 7 bytes of padding.
|
||||
assert_eq!(buffer.len(), 3 + 5 + 8 + 4 + 2);
|
||||
// 5 bytes of header, 1 byte of value
|
||||
assert_eq!(buffer.len(), 3 + 5 + 1 + 4 + 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -411,8 +411,8 @@ mod tests {
|
||||
let mut buffer = Vec::new();
|
||||
let col = VecColumn::from(&[true][..]);
|
||||
serialize(col, &mut buffer, &ALL_CODEC_TYPES).unwrap();
|
||||
// 5 bytes of header, 0 bytes of value, 7 bytes of padding.
|
||||
assert_eq!(buffer.len(), 3 + 5 + 7 + 4 + 2);
|
||||
// 5 bytes of header, 0 bytes of value
|
||||
assert_eq!(buffer.len(), 3 + 5 + 4 + 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -422,6 +422,6 @@ mod tests {
|
||||
let col = VecColumn::from(&vals[..]);
|
||||
serialize(col, &mut buffer, &[FastFieldCodecType::Bitpacked]).unwrap();
|
||||
// Values are stored over 3 bits.
|
||||
assert_eq!(buffer.len(), 3 + 7 + (3 * 80 / 8) + 7 + 4 + 2);
|
||||
assert_eq!(buffer.len(), 3 + 7 + (3 * 80 / 8) + 4 + 2);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,2 +0,0 @@
|
||||
#!/bin/bash
|
||||
cargo test
|
||||
@@ -15,7 +15,7 @@ use super::metric::{
|
||||
use super::segment_agg_result::BucketCount;
|
||||
use super::VecWithNames;
|
||||
use crate::fastfield::{type_and_cardinality, MultiValuedFastFieldReader};
|
||||
use crate::schema::Type;
|
||||
use crate::schema::{Cardinality, Type};
|
||||
use crate::{InvertedIndexReader, SegmentReader, TantivyError};
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
|
||||
@@ -6,14 +6,13 @@ use super::{IntermediateStats, SegmentStatsCollector};
|
||||
|
||||
/// A single-value metric aggregation that computes the average of numeric values that are
|
||||
/// extracted from the aggregated documents.
|
||||
/// Supported field types are u64, i64, and f64.
|
||||
/// See [super::SingleMetricResult] for return value.
|
||||
///
|
||||
/// # JSON Format
|
||||
/// ```json
|
||||
/// {
|
||||
/// "avg": {
|
||||
/// "field": "score",
|
||||
/// "field": "score"
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
|
||||
@@ -6,14 +6,13 @@ use super::{IntermediateStats, SegmentStatsCollector};
|
||||
|
||||
/// A single-value metric aggregation that counts the number of values that are
|
||||
/// extracted from the aggregated documents.
|
||||
/// Supported field types are u64, i64, and f64.
|
||||
/// See [super::SingleMetricResult] for return value.
|
||||
///
|
||||
/// # JSON Format
|
||||
/// ```json
|
||||
/// {
|
||||
/// "value_count": {
|
||||
/// "field": "score",
|
||||
/// "field": "score"
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
|
||||
@@ -6,14 +6,13 @@ use super::{IntermediateStats, SegmentStatsCollector};
|
||||
|
||||
/// A single-value metric aggregation that computes the maximum of numeric values that are
|
||||
/// extracted from the aggregated documents.
|
||||
/// Supported field types are u64, i64, and f64.
|
||||
/// See [super::SingleMetricResult] for return value.
|
||||
///
|
||||
/// # JSON Format
|
||||
/// ```json
|
||||
/// {
|
||||
/// "max": {
|
||||
/// "field": "score",
|
||||
/// "field": "score"
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
|
||||
@@ -6,14 +6,13 @@ use super::{IntermediateStats, SegmentStatsCollector};
|
||||
|
||||
/// A single-value metric aggregation that computes the minimum of numeric values that are
|
||||
/// extracted from the aggregated documents.
|
||||
/// Supported field types are u64, i64, and f64.
|
||||
/// See [super::SingleMetricResult] for return value.
|
||||
///
|
||||
/// # JSON Format
|
||||
/// ```json
|
||||
/// {
|
||||
/// "min": {
|
||||
/// "field": "score",
|
||||
/// "field": "score"
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
|
||||
@@ -43,13 +43,13 @@ mod tests {
|
||||
use crate::aggregation::agg_result::AggregationResults;
|
||||
use crate::aggregation::AggregationCollector;
|
||||
use crate::query::AllQuery;
|
||||
use crate::schema::{NumericOptions, Schema};
|
||||
use crate::schema::{Cardinality, NumericOptions, Schema};
|
||||
use crate::Index;
|
||||
|
||||
#[test]
|
||||
fn test_metric_aggregations() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field_options = NumericOptions::default().set_fast();
|
||||
let field_options = NumericOptions::default().set_fast(Cardinality::SingleValue);
|
||||
let field = schema_builder.add_f64_field("price", field_options);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
|
||||
@@ -7,14 +7,13 @@ use crate::{DocId, TantivyError};
|
||||
|
||||
/// A multi-value metric aggregation that computes a collection of statistics on numeric values that
|
||||
/// are extracted from the aggregated documents.
|
||||
/// Supported field types are `u64`, `i64`, and `f64`.
|
||||
/// See [`Stats`] for returned statistics.
|
||||
///
|
||||
/// # JSON Format
|
||||
/// ```json
|
||||
/// {
|
||||
/// "stats": {
|
||||
/// "field": "score",
|
||||
/// "field": "score"
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
|
||||
@@ -6,14 +6,13 @@ use super::{IntermediateStats, SegmentStatsCollector};
|
||||
|
||||
/// A single-value metric aggregation that sums up numeric values that are
|
||||
/// extracted from the aggregated documents.
|
||||
/// Supported field types are u64, i64, and f64.
|
||||
/// See [super::SingleMetricResult] for return value.
|
||||
///
|
||||
/// # JSON Format
|
||||
/// ```json
|
||||
/// {
|
||||
/// "sum": {
|
||||
/// "field": "score",
|
||||
/// "field": "score"
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
//! # Aggregations
|
||||
//!
|
||||
//!
|
||||
//! An aggregation summarizes your data as statistics on buckets or metrics.
|
||||
//!
|
||||
//! Aggregations can provide answer to questions like:
|
||||
@@ -41,6 +40,10 @@
|
||||
//! - [Metric](metric)
|
||||
//! - [Average](metric::AverageAggregation)
|
||||
//! - [Stats](metric::StatsAggregation)
|
||||
//! - [Min](metric::MinAggregation)
|
||||
//! - [Max](metric::MaxAggregation)
|
||||
//! - [Sum](metric::SumAggregation)
|
||||
//! - [Count](metric::CountAggregation)
|
||||
//!
|
||||
//! # Example
|
||||
//! Compute the average metric, by building [`agg_req::Aggregations`], which is built from an
|
||||
@@ -75,7 +78,7 @@
|
||||
//! }
|
||||
//! ```
|
||||
//! # Example JSON
|
||||
//! Requests are compatible with the elasticsearch json request format.
|
||||
//! Requests are compatible with the elasticsearch JSON request format.
|
||||
//!
|
||||
//! ```
|
||||
//! use tantivy::aggregation::agg_req::Aggregations;
|
||||
@@ -430,13 +433,13 @@ mod tests {
|
||||
let text_field_id = schema_builder.add_text_field("text_id", text_fieldtype);
|
||||
let string_field_id = schema_builder.add_text_field("string_id", STRING | FAST);
|
||||
let score_fieldtype =
|
||||
crate::schema::NumericOptions::default().set_fast();
|
||||
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
|
||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
|
||||
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
|
||||
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
|
||||
let fraction_field = schema_builder.add_f64_field(
|
||||
"fraction_f64",
|
||||
crate::schema::NumericOptions::default().set_fast(),
|
||||
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue),
|
||||
);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
{
|
||||
@@ -654,12 +657,12 @@ mod tests {
|
||||
let date_field = schema_builder.add_date_field("date", FAST);
|
||||
schema_builder.add_text_field("dummy_text", STRING);
|
||||
let score_fieldtype =
|
||||
crate::schema::NumericOptions::default().set_fast();
|
||||
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
|
||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
|
||||
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
|
||||
|
||||
let multivalue =
|
||||
crate::schema::NumericOptions::default().set_fast();
|
||||
crate::schema::NumericOptions::default().set_fast(Cardinality::MultiValues);
|
||||
let scores_field_i64 = schema_builder.add_i64_field("scores_i64", multivalue);
|
||||
|
||||
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
|
||||
@@ -1187,7 +1190,7 @@ mod tests {
|
||||
let text_field_few_terms =
|
||||
schema_builder.add_text_field("text_few_terms", STRING | FAST);
|
||||
let score_fieldtype =
|
||||
crate::schema::NumericOptions::default().set_fast();
|
||||
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
|
||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
|
||||
let score_field_f64 =
|
||||
schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
|
||||
|
||||
@@ -12,10 +12,10 @@
|
||||
use std::marker::PhantomData;
|
||||
use std::sync::Arc;
|
||||
|
||||
use columnar::{DynamicColumn, HasAssociatedColumnType};
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::fastfield::FastValue;
|
||||
use crate::schema::Field;
|
||||
use crate::{Score, SegmentReader, TantivyError};
|
||||
|
||||
@@ -61,7 +61,7 @@ use crate::{Score, SegmentReader, TantivyError};
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: Default>
|
||||
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: FastValue>
|
||||
where TPredicate: 'static + Clone
|
||||
{
|
||||
field: Field,
|
||||
@@ -70,7 +70,7 @@ where TPredicate: 'static + Clone
|
||||
t_predicate_value: PhantomData<TPredicateValue>,
|
||||
}
|
||||
|
||||
impl<TCollector, TPredicate, TPredicateValue: Default>
|
||||
impl<TCollector, TPredicate, TPredicateValue: FastValue>
|
||||
FilterCollector<TCollector, TPredicate, TPredicateValue>
|
||||
where
|
||||
TCollector: Collector + Send + Sync,
|
||||
@@ -91,13 +91,12 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
impl<TCollector, TPredicate, TPredicateValue: Default> Collector
|
||||
impl<TCollector, TPredicate, TPredicateValue: FastValue> Collector
|
||||
for FilterCollector<TCollector, TPredicate, TPredicateValue>
|
||||
where
|
||||
TCollector: Collector + Send + Sync,
|
||||
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync + Clone,
|
||||
TPredicateValue: HasAssociatedColumnType,
|
||||
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
|
||||
TPredicateValue: FastValue,
|
||||
{
|
||||
// That's the type of our result.
|
||||
// Our standard deviation will be a float.
|
||||
@@ -118,10 +117,20 @@ where
|
||||
field_entry.name()
|
||||
)));
|
||||
}
|
||||
let requested_type = TPredicateValue::to_type();
|
||||
let field_schema_type = field_entry.field_type().value_type();
|
||||
if requested_type != field_schema_type {
|
||||
return Err(TantivyError::SchemaError(format!(
|
||||
"Field {:?} is of type {:?}!={:?}",
|
||||
field_entry.name(),
|
||||
requested_type,
|
||||
field_schema_type
|
||||
)));
|
||||
}
|
||||
|
||||
let fast_field_reader = segment_reader
|
||||
.fast_fields()
|
||||
.typed_column_first_or_default(schema.get_field_name(self.field))?;
|
||||
.typed_fast_field_reader(schema.get_field_name(self.field))?;
|
||||
|
||||
let segment_collector = self
|
||||
.collector
|
||||
@@ -150,7 +159,7 @@ where
|
||||
pub struct FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
|
||||
where
|
||||
TPredicate: 'static,
|
||||
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
|
||||
TPredicateValue: FastValue,
|
||||
{
|
||||
fast_field_reader: Arc<dyn Column<TPredicateValue>>,
|
||||
segment_collector: TSegmentCollector,
|
||||
@@ -162,9 +171,8 @@ impl<TSegmentCollector, TPredicate, TPredicateValue> SegmentCollector
|
||||
for FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
|
||||
where
|
||||
TSegmentCollector: SegmentCollector,
|
||||
TPredicateValue: HasAssociatedColumnType,
|
||||
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync,
|
||||
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
|
||||
TPredicateValue: FastValue,
|
||||
{
|
||||
type Fruit = TSegmentCollector::Fruit;
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ use fastdivide::DividerU64;
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
|
||||
use crate::fastfield::FastValue;
|
||||
use crate::schema::Type;
|
||||
use crate::{DocId, Score};
|
||||
|
||||
@@ -87,14 +87,14 @@ impl HistogramComputer {
|
||||
}
|
||||
pub struct SegmentHistogramCollector {
|
||||
histogram_computer: HistogramComputer,
|
||||
column_u64: Arc<dyn Column<u64>>,
|
||||
ff_reader: Arc<dyn Column<u64>>,
|
||||
}
|
||||
|
||||
impl SegmentCollector for SegmentHistogramCollector {
|
||||
type Fruit = Vec<u64>;
|
||||
|
||||
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||
let value = self.column_u64.get_val(doc);
|
||||
let value = self.ff_reader.get_val(doc);
|
||||
self.histogram_computer.add_value(value);
|
||||
}
|
||||
|
||||
@@ -112,18 +112,14 @@ impl Collector for HistogramCollector {
|
||||
_segment_local_id: crate::SegmentOrdinal,
|
||||
segment: &crate::SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let column_opt = segment.fast_fields().u64_lenient(&self.field)?;
|
||||
let column = column_opt.ok_or_else(|| FastFieldNotAvailableError {
|
||||
field_name: self.field.clone(),
|
||||
})?;
|
||||
let column_u64 = column.first_or_default_col(0u64);
|
||||
let ff_reader = segment.fast_fields().u64_lenient(&self.field)?;
|
||||
Ok(SegmentHistogramCollector {
|
||||
histogram_computer: HistogramComputer {
|
||||
counts: vec![0; self.num_buckets],
|
||||
min_value: self.min_value,
|
||||
divider: self.divider,
|
||||
},
|
||||
column_u64,
|
||||
ff_reader,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -104,8 +104,9 @@ pub use self::custom_score_top_collector::{CustomScorer, CustomSegmentScorer};
|
||||
|
||||
mod tweak_score_top_collector;
|
||||
pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker};
|
||||
// mod facet_collector;
|
||||
// pub use self::facet_collector::{FacetCollector, FacetCounts};
|
||||
|
||||
mod facet_collector;
|
||||
pub use self::facet_collector::{FacetCollector, FacetCounts};
|
||||
use crate::query::Weight;
|
||||
|
||||
mod docset_collector;
|
||||
|
||||
@@ -5,6 +5,7 @@ use fastfield_codecs::Column;
|
||||
use super::*;
|
||||
use crate::collector::{Count, FilterCollector, TopDocs};
|
||||
use crate::core::SegmentReader;
|
||||
use crate::fastfield::BytesFastFieldReader;
|
||||
use crate::query::{AllQuery, QueryParser};
|
||||
use crate::schema::{Field, Schema, FAST, TEXT};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
@@ -57,10 +58,9 @@ pub fn test_filter_collector() -> crate::Result<()> {
|
||||
|
||||
assert_eq!(filtered_top_docs.len(), 0);
|
||||
|
||||
fn date_filter(value: columnar::DateTime) -> bool {
|
||||
(crate::DateTime::from(value).into_utc()
|
||||
- OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())
|
||||
.whole_weeks()
|
||||
fn date_filter(value: DateTime) -> bool {
|
||||
(value.into_utc() - OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())
|
||||
.whole_weeks()
|
||||
> 0
|
||||
}
|
||||
|
||||
@@ -164,10 +164,8 @@ pub struct FastFieldSegmentCollector {
|
||||
}
|
||||
|
||||
impl FastFieldTestCollector {
|
||||
pub fn for_field(field: impl ToString) -> FastFieldTestCollector {
|
||||
FastFieldTestCollector {
|
||||
field: field.to_string(),
|
||||
}
|
||||
pub fn for_field(field: String) -> FastFieldTestCollector {
|
||||
FastFieldTestCollector { field }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -212,62 +210,64 @@ impl SegmentCollector for FastFieldSegmentCollector {
|
||||
}
|
||||
}
|
||||
|
||||
// /// Collects in order all of the fast field bytes for all of the
|
||||
// /// docs in the `DocSet`
|
||||
// ///
|
||||
// /// This collector is mainly useful for tests.
|
||||
// pub struct BytesFastFieldTestCollector {
|
||||
// field: Field,
|
||||
// }
|
||||
/// Collects in order all of the fast field bytes for all of the
|
||||
/// docs in the `DocSet`
|
||||
///
|
||||
/// This collector is mainly useful for tests.
|
||||
pub struct BytesFastFieldTestCollector {
|
||||
field: Field,
|
||||
}
|
||||
|
||||
// pub struct BytesFastFieldSegmentCollector {
|
||||
// vals: Vec<u8>,
|
||||
// reader: BytesFastFieldReader,
|
||||
// }
|
||||
pub struct BytesFastFieldSegmentCollector {
|
||||
vals: Vec<u8>,
|
||||
reader: BytesFastFieldReader,
|
||||
}
|
||||
|
||||
// impl BytesFastFieldTestCollector {
|
||||
// pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
|
||||
// BytesFastFieldTestCollector { field }
|
||||
// }
|
||||
// }
|
||||
impl BytesFastFieldTestCollector {
|
||||
pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
|
||||
BytesFastFieldTestCollector { field }
|
||||
}
|
||||
}
|
||||
|
||||
// impl Collector for BytesFastFieldTestCollector {
|
||||
// type Fruit = Vec<u8>;
|
||||
// type Child = BytesFastFieldSegmentCollector;
|
||||
impl Collector for BytesFastFieldTestCollector {
|
||||
type Fruit = Vec<u8>;
|
||||
type Child = BytesFastFieldSegmentCollector;
|
||||
|
||||
// fn for_segment(
|
||||
// &self,
|
||||
// _segment_local_id: u32,
|
||||
// segment_reader: &SegmentReader,
|
||||
// ) -> crate::Result<BytesFastFieldSegmentCollector> {
|
||||
// let reader = segment_reader.fast_fields().bytes(self.field)?;
|
||||
// Ok(BytesFastFieldSegmentCollector {
|
||||
// vals: Vec::new(),
|
||||
// reader,
|
||||
// })
|
||||
// }
|
||||
fn for_segment(
|
||||
&self,
|
||||
_segment_local_id: u32,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> crate::Result<BytesFastFieldSegmentCollector> {
|
||||
let reader = segment_reader
|
||||
.fast_fields()
|
||||
.bytes(segment_reader.schema().get_field_name(self.field))?;
|
||||
Ok(BytesFastFieldSegmentCollector {
|
||||
vals: Vec::new(),
|
||||
reader,
|
||||
})
|
||||
}
|
||||
|
||||
// fn requires_scoring(&self) -> bool {
|
||||
// false
|
||||
// }
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
// fn merge_fruits(&self, children: Vec<Vec<u8>>) -> crate::Result<Vec<u8>> {
|
||||
// Ok(children.into_iter().flat_map(|c| c.into_iter()).collect())
|
||||
// }
|
||||
// }
|
||||
fn merge_fruits(&self, children: Vec<Vec<u8>>) -> crate::Result<Vec<u8>> {
|
||||
Ok(children.into_iter().flat_map(|c| c.into_iter()).collect())
|
||||
}
|
||||
}
|
||||
|
||||
// impl SegmentCollector for BytesFastFieldSegmentCollector {
|
||||
// type Fruit = Vec<u8>;
|
||||
impl SegmentCollector for BytesFastFieldSegmentCollector {
|
||||
type Fruit = Vec<u8>;
|
||||
|
||||
// fn collect(&mut self, doc: u32, _score: Score) {
|
||||
// let data = self.reader.get_bytes(doc);
|
||||
// self.vals.extend(data);
|
||||
// }
|
||||
fn collect(&mut self, doc: u32, _score: Score) {
|
||||
let data = self.reader.get_bytes(doc);
|
||||
self.vals.extend(data);
|
||||
}
|
||||
|
||||
// fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
||||
// self.vals
|
||||
// }
|
||||
// }
|
||||
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
||||
self.vals
|
||||
}
|
||||
}
|
||||
|
||||
fn make_test_searcher() -> crate::Result<Searcher> {
|
||||
let schema = Schema::builder().build();
|
||||
|
||||
@@ -12,7 +12,7 @@ use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
|
||||
use crate::collector::{
|
||||
CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
|
||||
};
|
||||
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
|
||||
use crate::fastfield::FastValue;
|
||||
use crate::query::Weight;
|
||||
use crate::schema::Field;
|
||||
use crate::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};
|
||||
@@ -22,7 +22,7 @@ struct FastFieldConvertCollector<
|
||||
TFastValue: FastValue,
|
||||
> {
|
||||
pub collector: TCollector,
|
||||
pub field: String,
|
||||
pub field: Field,
|
||||
pub fast_value: std::marker::PhantomData<TFastValue>,
|
||||
}
|
||||
|
||||
@@ -41,8 +41,7 @@ where
|
||||
segment: &SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let schema = segment.schema();
|
||||
let field = schema.get_field(&self.field)?;
|
||||
let field_entry = schema.get_field_entry(field);
|
||||
let field_entry = schema.get_field_entry(self.field);
|
||||
if !field_entry.is_fast() {
|
||||
return Err(TantivyError::SchemaError(format!(
|
||||
"Field {:?} is not a fast field.",
|
||||
@@ -133,17 +132,17 @@ impl fmt::Debug for TopDocs {
|
||||
}
|
||||
|
||||
struct ScorerByFastFieldReader {
|
||||
sort_column: Arc<dyn Column<u64>>,
|
||||
ff_reader: Arc<dyn Column<u64>>,
|
||||
}
|
||||
|
||||
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
|
||||
fn score(&mut self, doc: DocId) -> u64 {
|
||||
self.sort_column.get_val(doc)
|
||||
self.ff_reader.get_val(doc)
|
||||
}
|
||||
}
|
||||
|
||||
struct ScorerByField {
|
||||
field: String,
|
||||
field: Field,
|
||||
}
|
||||
|
||||
impl CustomScorer<u64> for ScorerByField {
|
||||
@@ -155,13 +154,10 @@ impl CustomScorer<u64> for ScorerByField {
|
||||
// mapping is monotonic, so it is sufficient to compute our top-K docs.
|
||||
//
|
||||
// The conversion will then happen only on the top-K docs.
|
||||
let sort_column_opt = segment_reader.fast_fields().u64_lenient(&self.field)?;
|
||||
let sort_column = sort_column_opt
|
||||
.ok_or_else(|| FastFieldNotAvailableError {
|
||||
field_name: self.field.clone(),
|
||||
})?
|
||||
.first_or_default_col(0u64);
|
||||
Ok(ScorerByFastFieldReader { sort_column })
|
||||
let ff_reader = segment_reader
|
||||
.fast_fields()
|
||||
.typed_fast_field_reader(segment_reader.schema().get_field_name(self.field))?;
|
||||
Ok(ScorerByFastFieldReader { ff_reader })
|
||||
}
|
||||
}
|
||||
|
||||
@@ -294,14 +290,9 @@ impl TopDocs {
|
||||
/// the [.order_by_fast_field(...)](TopDocs::order_by_fast_field) method.
|
||||
pub fn order_by_u64_field(
|
||||
self,
|
||||
field: impl ToString,
|
||||
field: Field,
|
||||
) -> impl Collector<Fruit = Vec<(u64, DocAddress)>> {
|
||||
CustomScoreTopCollector::new(
|
||||
ScorerByField {
|
||||
field: field.to_string(),
|
||||
},
|
||||
self.0.into_tscore(),
|
||||
)
|
||||
CustomScoreTopCollector::new(ScorerByField { field }, self.0.into_tscore())
|
||||
}
|
||||
|
||||
/// Set top-K to rank documents by a given fast field.
|
||||
@@ -376,15 +367,15 @@ impl TopDocs {
|
||||
/// ```
|
||||
pub fn order_by_fast_field<TFastValue>(
|
||||
self,
|
||||
fast_field: impl ToString,
|
||||
fast_field: Field,
|
||||
) -> impl Collector<Fruit = Vec<(TFastValue, DocAddress)>>
|
||||
where
|
||||
TFastValue: FastValue,
|
||||
{
|
||||
let u64_collector = self.order_by_u64_field(fast_field.to_string());
|
||||
let u64_collector = self.order_by_u64_field(fast_field);
|
||||
FastFieldConvertCollector {
|
||||
collector: u64_collector,
|
||||
field: fast_field.to_string(),
|
||||
field: fast_field,
|
||||
fast_value: PhantomData,
|
||||
}
|
||||
}
|
||||
@@ -886,7 +877,7 @@ mod tests {
|
||||
});
|
||||
let searcher = index.reader()?.searcher();
|
||||
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE);
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
|
||||
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector)?;
|
||||
assert_eq!(
|
||||
&top_docs[..],
|
||||
@@ -925,7 +916,7 @@ mod tests {
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field("birthday");
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field(birthday);
|
||||
let top_docs: Vec<(DateTime, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
|
||||
assert_eq!(
|
||||
&top_docs[..],
|
||||
@@ -955,7 +946,7 @@ mod tests {
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude");
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
|
||||
let top_docs: Vec<(i64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
|
||||
assert_eq!(
|
||||
&top_docs[..],
|
||||
@@ -985,7 +976,7 @@ mod tests {
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude");
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
|
||||
let top_docs: Vec<(f64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
|
||||
assert_eq!(
|
||||
&top_docs[..],
|
||||
@@ -1013,7 +1004,7 @@ mod tests {
|
||||
.unwrap();
|
||||
});
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field("missing_field");
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field(Field::from_field_id(2));
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
top_collector
|
||||
.for_segment(0, segment_reader)
|
||||
@@ -1031,7 +1022,7 @@ mod tests {
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let segment = searcher.segment_reader(0);
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE);
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
|
||||
let err = top_collector.for_segment(0, segment).err().unwrap();
|
||||
assert!(matches!(err, crate::TantivyError::SchemaError(_)));
|
||||
Ok(())
|
||||
@@ -1048,7 +1039,7 @@ mod tests {
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let segment = searcher.segment_reader(0);
|
||||
let top_collector = TopDocs::with_limit(4).order_by_fast_field::<i64>(SIZE);
|
||||
let top_collector = TopDocs::with_limit(4).order_by_fast_field::<i64>(size);
|
||||
let err = top_collector.for_segment(0, segment).err().unwrap();
|
||||
assert!(
|
||||
matches!(err, crate::TantivyError::SchemaError(msg) if msg == "Field \"size\" is not a fast field.")
|
||||
|
||||
@@ -19,7 +19,7 @@ use crate::error::{DataCorruption, TantivyError};
|
||||
use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_ARENA_NUM_BYTES_MIN};
|
||||
use crate::indexer::segment_updater::save_metas;
|
||||
use crate::reader::{IndexReader, IndexReaderBuilder};
|
||||
use crate::schema::{Field, FieldType, Schema};
|
||||
use crate::schema::{Cardinality, Field, FieldType, Schema};
|
||||
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
|
||||
use crate::IndexWriter;
|
||||
|
||||
@@ -93,7 +93,7 @@ fn save_new_metas(
|
||||
/// let body_field = schema_builder.add_text_field("body", TEXT);
|
||||
/// let number_field = schema_builder.add_u64_field(
|
||||
/// "number",
|
||||
/// NumericOptions::default().set_fast(),
|
||||
/// NumericOptions::default().set_fast(Cardinality::SingleValue),
|
||||
/// );
|
||||
///
|
||||
/// let schema = schema_builder.build();
|
||||
@@ -245,6 +245,12 @@ impl IndexBuilder {
|
||||
sort_by_field.field
|
||||
)));
|
||||
}
|
||||
if entry.field_type().fastfield_cardinality() != Some(Cardinality::SingleValue) {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"Only single value fast field Cardinality supported for sorting index {}",
|
||||
sort_by_field.field
|
||||
)));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
} else {
|
||||
|
||||
@@ -7,7 +7,7 @@ use fail::fail_point;
|
||||
use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
|
||||
use crate::directory::{CompositeFile, FileSlice};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FastFieldReaders};
|
||||
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders};
|
||||
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
|
||||
use crate::schema::{Field, FieldType, IndexRecordOption, Schema};
|
||||
use crate::space_usage::SegmentSpaceUsage;
|
||||
@@ -90,8 +90,25 @@ impl SegmentReader {
|
||||
}
|
||||
|
||||
/// Accessor to the `FacetReader` associated with a given `Field`.
|
||||
pub fn facet_reader(&self, field: Field) -> crate::Result<()> {
|
||||
todo!();
|
||||
pub fn facet_reader(&self, field: Field) -> crate::Result<FacetReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
|
||||
match field_entry.field_type() {
|
||||
FieldType::Facet(_) => {
|
||||
let term_ords_reader =
|
||||
self.fast_fields().u64s(self.schema.get_field_name(field))?;
|
||||
let termdict = self
|
||||
.termdict_composite
|
||||
.open_read(field)
|
||||
.map(TermDictionary::open)
|
||||
.unwrap_or_else(|| Ok(TermDictionary::empty()))?;
|
||||
Ok(FacetReader::new(term_ords_reader, termdict))
|
||||
}
|
||||
_ => Err(crate::TantivyError::InvalidArgument(format!(
|
||||
"Field {:?} is not a facet field.",
|
||||
field_entry.name()
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
/// Accessor to the segment's `Field norms`'s reader.
|
||||
@@ -153,7 +170,9 @@ impl SegmentReader {
|
||||
let schema = segment.schema();
|
||||
|
||||
let fast_fields_data = segment.open_read(SegmentComponent::FastFields)?;
|
||||
let fast_fields_readers = Arc::new(FastFieldReaders::open(fast_fields_data)?);
|
||||
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
|
||||
let fast_fields_readers =
|
||||
Arc::new(FastFieldReaders::new(schema.clone(), fast_fields_composite));
|
||||
let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
|
||||
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ use crate::schema::FieldEntry;
|
||||
#[derive(Debug, Error)]
|
||||
#[error("Fast field not available: '{field_name:?}'")]
|
||||
pub struct FastFieldNotAvailableError {
|
||||
pub(crate) field_name: String,
|
||||
field_name: String,
|
||||
}
|
||||
|
||||
impl FastFieldNotAvailableError {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -38,7 +38,7 @@ mod tests {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_u64_field(
|
||||
"multifield",
|
||||
NumericOptions::default().set_fast(),
|
||||
NumericOptions::default().set_fast(Cardinality::MultiValues),
|
||||
);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
@@ -74,7 +74,7 @@ mod tests {
|
||||
let date_field = schema_builder.add_date_field(
|
||||
"multi_date_field",
|
||||
DateOptions::default()
|
||||
.set_fast()
|
||||
.set_fast(Cardinality::MultiValues)
|
||||
.set_indexed()
|
||||
.set_fieldnorm()
|
||||
.set_stored(),
|
||||
@@ -215,7 +215,7 @@ mod tests {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_i64_field(
|
||||
"multifield",
|
||||
NumericOptions::default().set_fast(),
|
||||
NumericOptions::default().set_fast(Cardinality::MultiValues),
|
||||
);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
@@ -246,7 +246,7 @@ mod tests {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let bool_field = schema_builder.add_bool_field(
|
||||
"multifield",
|
||||
NumericOptions::default().set_fast(),
|
||||
NumericOptions::default().set_fast(Cardinality::MultiValues),
|
||||
);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
@@ -278,7 +278,7 @@ mod tests {
|
||||
let field = schema_builder.add_u64_field(
|
||||
"multifield",
|
||||
NumericOptions::default()
|
||||
.set_fast()
|
||||
.set_fast(Cardinality::MultiValues)
|
||||
.set_indexed(),
|
||||
);
|
||||
let schema = schema_builder.build();
|
||||
@@ -424,7 +424,7 @@ mod bench {
|
||||
let mut builder = crate::schema::SchemaBuilder::new();
|
||||
|
||||
let fast_multi =
|
||||
crate::schema::NumericOptions::default().set_fast();
|
||||
crate::schema::NumericOptions::default().set_fast(Cardinality::MultiValues);
|
||||
let multi_field = builder.add_f64_field("f64s", fast_multi);
|
||||
|
||||
let index = crate::Index::create_in_ram(builder.build());
|
||||
@@ -504,7 +504,7 @@ mod bench {
|
||||
let path = Path::new("test");
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
let field = {
|
||||
let options = NumericOptions::default().set_fast();
|
||||
let options = NumericOptions::default().set_fast(Cardinality::MultiValues);
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_u64_field("field", options);
|
||||
let schema = schema_builder.build();
|
||||
@@ -562,7 +562,7 @@ mod bench {
|
||||
|
||||
b.iter(|| {
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
let options = NumericOptions::default().set_fast();
|
||||
let options = NumericOptions::default().set_fast(Cardinality::MultiValues);
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_u64_field("field", options);
|
||||
let schema = schema_builder.build();
|
||||
@@ -595,7 +595,7 @@ mod bench {
|
||||
|
||||
b.iter(|| {
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
let options = NumericOptions::default().set_fast();
|
||||
let options = NumericOptions::default().set_fast(Cardinality::MultiValues);
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_u64_field("field", options);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
@@ -137,7 +137,7 @@ mod tests {
|
||||
let date_field = schema_builder.add_date_field(
|
||||
"multi_date_field",
|
||||
DateOptions::default()
|
||||
.set_fast()
|
||||
.set_fast(Cardinality::MultiValues)
|
||||
.set_indexed()
|
||||
.set_fieldnorm()
|
||||
.set_precision(DatePrecision::Microseconds)
|
||||
@@ -188,7 +188,7 @@ mod tests {
|
||||
let date_field = schema_builder.add_date_field(
|
||||
"multi_date_field",
|
||||
DateOptions::default()
|
||||
.set_fast()
|
||||
.set_fast(Cardinality::MultiValues)
|
||||
// TODO: Test different precision after fixing https://github.com/quickwit-oss/tantivy/issues/1783
|
||||
.set_precision(DatePrecision::Microseconds)
|
||||
.set_indexed()
|
||||
@@ -307,7 +307,7 @@ mod tests {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field_options = NumericOptions::default()
|
||||
.set_indexed()
|
||||
.set_fast();
|
||||
.set_fast(Cardinality::MultiValues);
|
||||
let item_field = schema_builder.add_i64_field("items", field_options);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
@@ -1,16 +1,12 @@
|
||||
use std::io;
|
||||
use std::net::Ipv6Addr;
|
||||
use std::sync::Arc;
|
||||
|
||||
use columnar::{
|
||||
ColumnType, ColumnValues, ColumnarReader, DynamicColumn, DynamicColumnHandle,
|
||||
HasAssociatedColumnType, NumericalType,
|
||||
};
|
||||
use fastfield_codecs::{open, open_u128, Column};
|
||||
|
||||
use super::multivalued::MultiValuedFastFieldReader;
|
||||
use crate::directory::{CompositeFile, FileSlice};
|
||||
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
|
||||
use crate::schema::{Field, FieldType, Schema};
|
||||
use crate::fastfield::{BytesFastFieldReader, FastFieldNotAvailableError, FastValue};
|
||||
use crate::schema::{Cardinality, Field, FieldType, Schema};
|
||||
use crate::space_usage::PerFieldSpaceUsage;
|
||||
use crate::{DateTime, TantivyError};
|
||||
|
||||
@@ -20,152 +16,315 @@ use crate::{DateTime, TantivyError};
|
||||
/// and just wraps several `HashMap`.
|
||||
#[derive(Clone)]
|
||||
pub struct FastFieldReaders {
|
||||
columnar: Arc<ColumnarReader>,
|
||||
schema: Schema,
|
||||
fast_fields_composite: CompositeFile,
|
||||
}
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
pub(crate) enum FastType {
|
||||
I64,
|
||||
U64,
|
||||
U128,
|
||||
F64,
|
||||
Bool,
|
||||
Date,
|
||||
}
|
||||
|
||||
pub(crate) fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality)> {
|
||||
match field_type {
|
||||
FieldType::U64(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::U64, cardinality)),
|
||||
FieldType::I64(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::I64, cardinality)),
|
||||
FieldType::F64(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::F64, cardinality)),
|
||||
FieldType::Bool(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::Bool, cardinality)),
|
||||
FieldType::Date(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::Date, cardinality)),
|
||||
FieldType::Facet(_) => Some((FastType::U64, Cardinality::MultiValues)),
|
||||
FieldType::Str(options) if options.is_fast() => {
|
||||
Some((FastType::U64, Cardinality::MultiValues))
|
||||
}
|
||||
FieldType::IpAddr(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::U128, cardinality)),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldReaders {
|
||||
pub(crate) fn open(fast_field_file: FileSlice) -> io::Result<FastFieldReaders> {
|
||||
let columnar = Arc::new(ColumnarReader::open(fast_field_file)?);
|
||||
Ok(FastFieldReaders { columnar })
|
||||
pub(crate) fn new(schema: Schema, fast_fields_composite: CompositeFile) -> FastFieldReaders {
|
||||
FastFieldReaders {
|
||||
schema,
|
||||
fast_fields_composite,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn space_usage(&self) -> PerFieldSpaceUsage {
|
||||
todo!()
|
||||
self.fast_fields_composite.space_usage()
|
||||
}
|
||||
|
||||
pub fn typed_column_opt<T>(
|
||||
#[doc(hidden)]
|
||||
pub fn fast_field_data(&self, field: Field, idx: usize) -> crate::Result<FileSlice> {
|
||||
self.fast_fields_composite
|
||||
.open_read_with_idx(field, idx)
|
||||
.ok_or_else(|| {
|
||||
let field_name = self.schema.get_field_entry(field).name();
|
||||
TantivyError::SchemaError(format!("Field({}) data was not found", field_name))
|
||||
})
|
||||
}
|
||||
|
||||
fn check_type(
|
||||
&self,
|
||||
field: Field,
|
||||
expected_fast_type: FastType,
|
||||
expected_cardinality: Cardinality,
|
||||
) -> crate::Result<()> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let (fast_type, cardinality) =
|
||||
type_and_cardinality(field_entry.field_type()).ok_or_else(|| {
|
||||
crate::TantivyError::SchemaError(format!(
|
||||
"Field {:?} is not a fast field.",
|
||||
field_entry.name()
|
||||
))
|
||||
})?;
|
||||
if fast_type != expected_fast_type {
|
||||
return Err(crate::TantivyError::SchemaError(format!(
|
||||
"Field {:?} is of type {:?}, expected {:?}.",
|
||||
field_entry.name(),
|
||||
fast_type,
|
||||
expected_fast_type
|
||||
)));
|
||||
}
|
||||
if cardinality != expected_cardinality {
|
||||
return Err(crate::TantivyError::SchemaError(format!(
|
||||
"Field {:?} is of cardinality {:?}, expected {:?}.",
|
||||
field_entry.name(),
|
||||
cardinality,
|
||||
expected_cardinality
|
||||
)));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn typed_fast_field_reader_with_idx<TFastValue: FastValue>(
|
||||
&self,
|
||||
field_name: &str,
|
||||
) -> crate::Result<Option<columnar::Column<T>>>
|
||||
where
|
||||
T: PartialOrd + Copy + HasAssociatedColumnType + Send + Sync + Default + 'static,
|
||||
DynamicColumn: Into<Option<columnar::Column<T>>>,
|
||||
{
|
||||
let column_type = T::column_type();
|
||||
let Some(dynamic_column_handle) = self.column_handle(field_name, column_type)?
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
let dynamic_column = dynamic_column_handle.open()?;
|
||||
Ok(dynamic_column.into())
|
||||
index: usize,
|
||||
) -> crate::Result<Arc<dyn Column<TFastValue>>> {
|
||||
let field = self.schema.get_field(field_name)?;
|
||||
|
||||
let fast_field_slice = self.fast_field_data(field, index)?;
|
||||
let bytes = fast_field_slice.read_bytes()?;
|
||||
let column = fastfield_codecs::open(bytes)?;
|
||||
Ok(column)
|
||||
}
|
||||
|
||||
pub fn column_num_bytes(&self, field: &str) -> crate::Result<usize> {
|
||||
Ok(self
|
||||
.columnar
|
||||
.read_columns(field)?
|
||||
.into_iter()
|
||||
.map(|column_handle| column_handle.num_bytes())
|
||||
.sum())
|
||||
pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
|
||||
&self,
|
||||
field_name: &str,
|
||||
) -> crate::Result<Arc<dyn Column<TFastValue>>> {
|
||||
self.typed_fast_field_reader_with_idx(field_name, 0)
|
||||
}
|
||||
|
||||
pub fn typed_column_first_or_default<T>(&self, field: &str) -> crate::Result<Arc<dyn Column<T>>>
|
||||
where
|
||||
T: PartialOrd + Copy + HasAssociatedColumnType + Send + Sync + Default + 'static,
|
||||
DynamicColumn: Into<Option<columnar::Column<T>>>,
|
||||
{
|
||||
let col_opt = self.typed_column_opt(field)?;
|
||||
if let Some(col) = col_opt {
|
||||
Ok(col.first_or_default_col(T::default()))
|
||||
} else {
|
||||
todo!();
|
||||
}
|
||||
pub(crate) fn typed_fast_field_multi_reader<TFastValue: FastValue>(
|
||||
&self,
|
||||
field_name: &str,
|
||||
) -> crate::Result<MultiValuedFastFieldReader<TFastValue>> {
|
||||
let idx_reader = self.typed_fast_field_reader(field_name)?;
|
||||
let vals_reader = self.typed_fast_field_reader_with_idx(field_name, 1)?;
|
||||
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
|
||||
}
|
||||
|
||||
/// Returns the `u64` fast field reader reader associated with `field`.
|
||||
///
|
||||
/// If `field` is not a u64 fast field, this method returns an Error.
|
||||
pub fn u64(&self, field: &str) -> crate::Result<Arc<dyn ColumnValues<u64>>> {
|
||||
self.typed_column_first_or_default(field)
|
||||
}
|
||||
|
||||
/// Returns the `date` fast field reader reader associated with `field`.
|
||||
///
|
||||
/// If `field` is not a date fast field, this method returns an Error.
|
||||
pub fn date(&self, field: &str) -> crate::Result<Arc<dyn ColumnValues<columnar::DateTime>>> {
|
||||
self.typed_column_first_or_default(field)
|
||||
pub fn u64(&self, field_name: &str) -> crate::Result<Arc<dyn Column<u64>>> {
|
||||
self.check_type(
|
||||
self.schema.get_field(field_name)?,
|
||||
FastType::U64,
|
||||
Cardinality::SingleValue,
|
||||
)?;
|
||||
self.typed_fast_field_reader(field_name)
|
||||
}
|
||||
|
||||
/// Returns the `ip` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a u128 fast field, this method returns an Error.
|
||||
pub fn ip_addr(&self, field: &str) -> crate::Result<Arc<dyn Column<Ipv6Addr>>> {
|
||||
todo!();
|
||||
// self.check_type(field, FastType::U128)?;
|
||||
// let bytes = self.fast_field_data(field, 0)?.read_bytes()?;
|
||||
// Ok(open_u128::<Ipv6Addr>(bytes)?)
|
||||
pub fn ip_addr(&self, field_name: &str) -> crate::Result<Arc<dyn Column<Ipv6Addr>>> {
|
||||
let field = self.schema.get_field(field_name)?;
|
||||
self.check_type(field, FastType::U128, Cardinality::SingleValue)?;
|
||||
let bytes = self.fast_field_data(field, 0)?.read_bytes()?;
|
||||
Ok(open_u128::<Ipv6Addr>(bytes)?)
|
||||
}
|
||||
|
||||
/// Returns the `ip` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a u128 fast field, this method returns an Error.
|
||||
pub fn ip_addrs(
|
||||
&self,
|
||||
field_name: &str,
|
||||
) -> crate::Result<MultiValuedFastFieldReader<Ipv6Addr>> {
|
||||
let field = self.schema.get_field(field_name)?;
|
||||
self.check_type(field, FastType::U128, Cardinality::MultiValues)?;
|
||||
let idx_reader: Arc<dyn Column<u64>> = self.typed_fast_field_reader(field_name)?;
|
||||
|
||||
let bytes = self.fast_field_data(field, 1)?.read_bytes()?;
|
||||
let vals_reader = open_u128::<Ipv6Addr>(bytes)?;
|
||||
|
||||
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
|
||||
}
|
||||
|
||||
/// Returns the `u128` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a u128 fast field, this method returns an Error.
|
||||
pub(crate) fn u128(&self, field: &str) -> crate::Result<Arc<dyn Column<u128>>> {
|
||||
todo!();
|
||||
pub(crate) fn u128(&self, field_name: &str) -> crate::Result<Arc<dyn Column<u128>>> {
|
||||
let field = self.schema.get_field(field_name)?;
|
||||
self.check_type(field, FastType::U128, Cardinality::SingleValue)?;
|
||||
let bytes = self.fast_field_data(field, 0)?.read_bytes()?;
|
||||
Ok(open_u128::<u128>(bytes)?)
|
||||
}
|
||||
|
||||
pub fn column_handle(
|
||||
&self,
|
||||
field_name: &str,
|
||||
column_type: ColumnType,
|
||||
) -> crate::Result<Option<DynamicColumnHandle>> {
|
||||
let dynamic_column_handle_opt = self
|
||||
.columnar
|
||||
.read_columns(field_name)?
|
||||
.into_iter()
|
||||
.filter(|column| column.column_type() == column_type)
|
||||
.next();
|
||||
Ok(dynamic_column_handle_opt)
|
||||
/// Returns the `u128` multi-valued fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a u128 multi-valued fast field, this method returns an Error.
|
||||
pub fn u128s(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<u128>> {
|
||||
let field = self.schema.get_field(field_name)?;
|
||||
self.check_type(field, FastType::U128, Cardinality::MultiValues)?;
|
||||
let idx_reader: Arc<dyn Column<u64>> =
|
||||
self.typed_fast_field_reader(self.schema.get_field_name(field))?;
|
||||
|
||||
let bytes = self.fast_field_data(field, 1)?.read_bytes()?;
|
||||
let vals_reader = open_u128::<u128>(bytes)?;
|
||||
|
||||
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
|
||||
}
|
||||
|
||||
pub fn u64_lenient(&self, field_name: &str) -> crate::Result<Option<columnar::Column<u64>>> {
|
||||
for col in self.columnar.read_columns(field_name)? {
|
||||
if let Some(col_u64) = col.open_u64_lenient()? {
|
||||
return Ok(Some(col_u64));
|
||||
}
|
||||
}
|
||||
Ok(None)
|
||||
/// Returns the `u64` fast field reader reader associated with `field`, regardless of whether
|
||||
/// the given field is effectively of type `u64` or not.
|
||||
///
|
||||
/// If not, the fastfield reader will returns the u64-value associated with the original
|
||||
/// FastValue.
|
||||
pub fn u64_lenient(&self, field_name: &str) -> crate::Result<Arc<dyn Column<u64>>> {
|
||||
self.typed_fast_field_reader(field_name)
|
||||
}
|
||||
|
||||
/// Returns the `i64` fast field reader reader associated with `field`.
|
||||
///
|
||||
/// If `field` is not a i64 fast field, this method returns an Error.
|
||||
pub fn i64(&self, field_name: &str) -> crate::Result<Arc<dyn Column<i64>>> {
|
||||
self.typed_column_first_or_default(field_name)
|
||||
let field = self.schema.get_field(field_name)?;
|
||||
self.check_type(field, FastType::I64, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(self.schema.get_field_name(field))
|
||||
}
|
||||
|
||||
/// Returns the `date` fast field reader reader associated with `field`.
|
||||
///
|
||||
/// If `field` is not a date fast field, this method returns an Error.
|
||||
pub fn date(&self, field_name: &str) -> crate::Result<Arc<dyn Column<DateTime>>> {
|
||||
let field = self.schema.get_field(field_name)?;
|
||||
self.check_type(field, FastType::Date, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field_name)
|
||||
}
|
||||
|
||||
/// Returns the `f64` fast field reader reader associated with `field`.
|
||||
///
|
||||
/// If `field` is not a f64 fast field, this method returns an Error.
|
||||
pub fn f64(&self, field_name: &str) -> crate::Result<Arc<dyn Column<f64>>> {
|
||||
self.typed_column_first_or_default(field_name)
|
||||
let field = self.schema.get_field(field_name)?;
|
||||
self.check_type(field, FastType::F64, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field_name)
|
||||
}
|
||||
|
||||
/// Returns the `bool` fast field reader reader associated with `field`.
|
||||
///
|
||||
/// If `field` is not a bool fast field, this method returns an Error.
|
||||
pub fn bool(&self, field_name: &str) -> crate::Result<Arc<dyn Column<bool>>> {
|
||||
self.typed_column_first_or_default(field_name)
|
||||
let field = self.schema.get_field(field_name)?;
|
||||
self.check_type(field, FastType::Bool, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field_name)
|
||||
}
|
||||
|
||||
// Returns the `bytes` fast field reader associated with `field`.
|
||||
//
|
||||
// If `field` is not a bytes fast field, returns an Error.
|
||||
// pub fn bytes(&self, field: Field) -> crate::Result<BytesFastFieldReader> {
|
||||
// let field_entry = self.schema.get_field_entry(field);
|
||||
// if let FieldType::Bytes(bytes_option) = field_entry.field_type() {
|
||||
// if !bytes_option.is_fast() {
|
||||
// return Err(crate::TantivyError::SchemaError(format!(
|
||||
// "Field {:?} is not a fast field.",
|
||||
// field_entry.name()
|
||||
// )));
|
||||
// }
|
||||
// let fast_field_idx_file = self.fast_field_data(field, 0)?;
|
||||
// let fast_field_idx_bytes = fast_field_idx_file.read_bytes()?;
|
||||
// let idx_reader = open(fast_field_idx_bytes)?;
|
||||
// let data = self.fast_field_data(field, 1)?;
|
||||
// BytesFastFieldReader::open(idx_reader, data)
|
||||
// } else {
|
||||
// Err(FastFieldNotAvailableError::new(field_entry).into())
|
||||
// }
|
||||
// }
|
||||
/// Returns a `u64s` multi-valued fast field reader reader associated with `field`.
|
||||
///
|
||||
/// If `field` is not a u64 multi-valued fast field, this method returns an Error.
|
||||
pub fn u64s(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<u64>> {
|
||||
let field = self.schema.get_field(field_name)?;
|
||||
self.check_type(field, FastType::U64, Cardinality::MultiValues)?;
|
||||
self.typed_fast_field_multi_reader(field_name)
|
||||
}
|
||||
|
||||
/// Returns a `u64s` multi-valued fast field reader reader associated with `field`, regardless
|
||||
/// of whether the given field is effectively of type `u64` or not.
|
||||
///
|
||||
/// If `field` is not a u64 multi-valued fast field, this method returns an Error.
|
||||
pub fn u64s_lenient(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<u64>> {
|
||||
self.typed_fast_field_multi_reader(field_name)
|
||||
}
|
||||
|
||||
/// Returns a `i64s` multi-valued fast field reader reader associated with `field`.
|
||||
///
|
||||
/// If `field` is not a i64 multi-valued fast field, this method returns an Error.
|
||||
pub fn i64s(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<i64>> {
|
||||
let field = self.schema.get_field(field_name)?;
|
||||
self.check_type(field, FastType::I64, Cardinality::MultiValues)?;
|
||||
self.typed_fast_field_multi_reader(self.schema.get_field_name(field))
|
||||
}
|
||||
|
||||
/// Returns a `f64s` multi-valued fast field reader reader associated with `field`.
|
||||
///
|
||||
/// If `field` is not a f64 multi-valued fast field, this method returns an Error.
|
||||
pub fn f64s(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<f64>> {
|
||||
let field = self.schema.get_field(field_name)?;
|
||||
self.check_type(field, FastType::F64, Cardinality::MultiValues)?;
|
||||
self.typed_fast_field_multi_reader(self.schema.get_field_name(field))
|
||||
}
|
||||
|
||||
/// Returns a `bools` multi-valued fast field reader reader associated with `field`.
|
||||
///
|
||||
/// If `field` is not a bool multi-valued fast field, this method returns an Error.
|
||||
pub fn bools(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<bool>> {
|
||||
let field = self.schema.get_field(field_name)?;
|
||||
self.check_type(field, FastType::Bool, Cardinality::MultiValues)?;
|
||||
self.typed_fast_field_multi_reader(self.schema.get_field_name(field))
|
||||
}
|
||||
|
||||
/// Returns a `time::OffsetDateTime` multi-valued fast field reader reader associated with
|
||||
/// `field`.
|
||||
///
|
||||
/// If `field` is not a `time::OffsetDateTime` multi-valued fast field, this method returns an
|
||||
/// Error.
|
||||
pub fn dates(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<DateTime>> {
|
||||
let field = self.schema.get_field(field_name)?;
|
||||
self.check_type(field, FastType::Date, Cardinality::MultiValues)?;
|
||||
self.typed_fast_field_multi_reader(self.schema.get_field_name(field))
|
||||
}
|
||||
|
||||
/// Returns the `bytes` fast field reader associated with `field`.
|
||||
///
|
||||
/// If `field` is not a bytes fast field, returns an Error.
|
||||
pub fn bytes(&self, field_name: &str) -> crate::Result<BytesFastFieldReader> {
|
||||
let field = self.schema.get_field(field_name)?;
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if let FieldType::Bytes(bytes_option) = field_entry.field_type() {
|
||||
if !bytes_option.is_fast() {
|
||||
return Err(crate::TantivyError::SchemaError(format!(
|
||||
"Field {:?} is not a fast field.",
|
||||
field_entry.name()
|
||||
)));
|
||||
}
|
||||
let fast_field_idx_file = self.fast_field_data(field, 0)?;
|
||||
let fast_field_idx_bytes = fast_field_idx_file.read_bytes()?;
|
||||
let idx_reader = open(fast_field_idx_bytes)?;
|
||||
let data = self.fast_field_data(field, 1)?;
|
||||
BytesFastFieldReader::open(idx_reader, data)
|
||||
} else {
|
||||
Err(FastFieldNotAvailableError::new(field_entry).into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,141 +1,558 @@
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
|
||||
use columnar::{ColumnType, ColumnarWriter, NumericalType, NumericalValue};
|
||||
use common;
|
||||
use fastfield_codecs::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
|
||||
use rustc_hash::FxHashMap;
|
||||
use tantivy_bitpacker::BlockedBitpacker;
|
||||
|
||||
use super::multivalued::{MultiValueU128FastFieldWriter, MultiValuedFastFieldWriter};
|
||||
use super::FastFieldType;
|
||||
use crate::fastfield::CompositeFastFieldSerializer;
|
||||
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::schema::{Document, Field, FieldEntry, FieldType, Schema, Type, Value};
|
||||
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema, Value};
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::{DatePrecision, DocId};
|
||||
use crate::DatePrecision;
|
||||
|
||||
/// The `FastFieldsWriter` groups all of the fast field writers.
|
||||
pub struct FastFieldsWriter {
|
||||
columnar_writer: ColumnarWriter,
|
||||
fast_field_names: Vec<Option<String>>, //< TODO see if we can cash the field name hash too.
|
||||
date_precisions: Vec<DatePrecision>,
|
||||
num_docs: DocId,
|
||||
term_id_writers: Vec<MultiValuedFastFieldWriter>,
|
||||
single_value_writers: Vec<IntFastFieldWriter>,
|
||||
u128_value_writers: Vec<U128FastFieldWriter>,
|
||||
u128_multi_value_writers: Vec<MultiValueU128FastFieldWriter>,
|
||||
multi_values_writers: Vec<MultiValuedFastFieldWriter>,
|
||||
bytes_value_writers: Vec<BytesFastFieldWriter>,
|
||||
}
|
||||
|
||||
pub(crate) fn unexpected_value(expected: &str, actual: &Value) -> crate::TantivyError {
|
||||
crate::TantivyError::SchemaError(format!(
|
||||
"Expected a {:?} in fast field, but got {:?}",
|
||||
expected, actual
|
||||
))
|
||||
}
|
||||
|
||||
fn fast_field_default_value(field_entry: &FieldEntry) -> u64 {
|
||||
match *field_entry.field_type() {
|
||||
FieldType::I64(_) | FieldType::Date(_) => common::i64_to_u64(0i64),
|
||||
FieldType::F64(_) => common::f64_to_u64(0.0f64),
|
||||
_ => 0u64,
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldsWriter {
|
||||
/// Create all `FastFieldWriter` required by the schema.
|
||||
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
|
||||
let mut columnar_writer = ColumnarWriter::default();
|
||||
let mut fast_fields: Vec<Option<String>> = vec![None; schema.num_fields()];
|
||||
let mut date_precisions: Vec<DatePrecision> =
|
||||
std::iter::repeat_with(DatePrecision::default)
|
||||
.take(schema.num_fields())
|
||||
.collect();
|
||||
// TODO see other types
|
||||
for (field_id, field_entry) in schema.fields() {
|
||||
if !field_entry.field_type().is_fast() {
|
||||
continue;
|
||||
}
|
||||
fast_fields[field_id.field_id() as usize] = Some(field_entry.name().to_string());
|
||||
let column_type = match field_entry.field_type().value_type() {
|
||||
Type::Str => ColumnType::Str,
|
||||
Type::U64 => ColumnType::Numerical(NumericalType::U64),
|
||||
Type::I64 => ColumnType::Numerical(NumericalType::I64),
|
||||
Type::F64 => ColumnType::Numerical(NumericalType::F64),
|
||||
Type::Bool => ColumnType::Bool,
|
||||
Type::Date => ColumnType::DateTime,
|
||||
Type::Facet => ColumnType::Str,
|
||||
Type::Bytes => todo!(),
|
||||
Type::Json => {
|
||||
continue;
|
||||
let mut u128_value_writers = Vec::new();
|
||||
let mut u128_multi_value_writers = Vec::new();
|
||||
let mut single_value_writers = Vec::new();
|
||||
let mut term_id_writers = Vec::new();
|
||||
let mut multi_values_writers = Vec::new();
|
||||
let mut bytes_value_writers = Vec::new();
|
||||
|
||||
for (field, field_entry) in schema.fields() {
|
||||
match field_entry.field_type() {
|
||||
FieldType::I64(ref int_options)
|
||||
| FieldType::U64(ref int_options)
|
||||
| FieldType::F64(ref int_options)
|
||||
| FieldType::Bool(ref int_options) => {
|
||||
match int_options.get_fastfield_cardinality() {
|
||||
Some(Cardinality::SingleValue) => {
|
||||
let mut fast_field_writer = IntFastFieldWriter::new(field, None);
|
||||
let default_value = fast_field_default_value(field_entry);
|
||||
fast_field_writer.set_val_if_missing(default_value);
|
||||
single_value_writers.push(fast_field_writer);
|
||||
}
|
||||
Some(Cardinality::MultiValues) => {
|
||||
let fast_field_writer = MultiValuedFastFieldWriter::new(
|
||||
field,
|
||||
FastFieldType::Numeric,
|
||||
None,
|
||||
);
|
||||
multi_values_writers.push(fast_field_writer);
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
}
|
||||
Type::IpAddr => todo!(),
|
||||
};
|
||||
if let FieldType::Date(date_options) = field_entry.field_type() {
|
||||
date_precisions[field_id.field_id() as usize] = date_options.get_precision();
|
||||
FieldType::Date(ref options) => match options.get_fastfield_cardinality() {
|
||||
Some(Cardinality::SingleValue) => {
|
||||
let mut fast_field_writer =
|
||||
IntFastFieldWriter::new(field, Some(options.get_precision()));
|
||||
let default_value = fast_field_default_value(field_entry);
|
||||
fast_field_writer.set_val_if_missing(default_value);
|
||||
single_value_writers.push(fast_field_writer);
|
||||
}
|
||||
Some(Cardinality::MultiValues) => {
|
||||
let fast_field_writer = MultiValuedFastFieldWriter::new(
|
||||
field,
|
||||
FastFieldType::Numeric,
|
||||
Some(options.get_precision()),
|
||||
);
|
||||
multi_values_writers.push(fast_field_writer);
|
||||
}
|
||||
None => {}
|
||||
},
|
||||
FieldType::Facet(_) => {
|
||||
let fast_field_writer =
|
||||
MultiValuedFastFieldWriter::new(field, FastFieldType::Facet, None);
|
||||
term_id_writers.push(fast_field_writer);
|
||||
}
|
||||
FieldType::Str(_) if field_entry.is_fast() => {
|
||||
let fast_field_writer =
|
||||
MultiValuedFastFieldWriter::new(field, FastFieldType::String, None);
|
||||
term_id_writers.push(fast_field_writer);
|
||||
}
|
||||
FieldType::Bytes(bytes_option) => {
|
||||
if bytes_option.is_fast() {
|
||||
let fast_field_writer = BytesFastFieldWriter::new(field);
|
||||
bytes_value_writers.push(fast_field_writer);
|
||||
}
|
||||
}
|
||||
FieldType::IpAddr(opt) => {
|
||||
if opt.is_fast() {
|
||||
match opt.get_fastfield_cardinality() {
|
||||
Some(Cardinality::SingleValue) => {
|
||||
let fast_field_writer = U128FastFieldWriter::new(field);
|
||||
u128_value_writers.push(fast_field_writer);
|
||||
}
|
||||
Some(Cardinality::MultiValues) => {
|
||||
let fast_field_writer = MultiValueU128FastFieldWriter::new(field);
|
||||
u128_multi_value_writers.push(fast_field_writer);
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
FieldType::Str(_) | FieldType::JsonObject(_) => {}
|
||||
}
|
||||
columnar_writer.record_column_type(field_entry.name(), column_type);
|
||||
}
|
||||
FastFieldsWriter {
|
||||
columnar_writer,
|
||||
fast_field_names: fast_fields,
|
||||
num_docs: 0u32,
|
||||
date_precisions,
|
||||
u128_value_writers,
|
||||
u128_multi_value_writers,
|
||||
term_id_writers,
|
||||
single_value_writers,
|
||||
multi_values_writers,
|
||||
bytes_value_writers,
|
||||
}
|
||||
}
|
||||
|
||||
/// The memory used (inclusive childs)
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.columnar_writer.mem_usage()
|
||||
self.term_id_writers
|
||||
.iter()
|
||||
.map(|w| w.mem_usage())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.single_value_writers
|
||||
.iter()
|
||||
.map(|w| w.mem_usage())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.multi_values_writers
|
||||
.iter()
|
||||
.map(|w| w.mem_usage())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.bytes_value_writers
|
||||
.iter()
|
||||
.map(|w| w.mem_usage())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.u128_value_writers
|
||||
.iter()
|
||||
.map(|w| w.mem_usage())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.u128_multi_value_writers
|
||||
.iter()
|
||||
.map(|w| w.mem_usage())
|
||||
.sum::<usize>()
|
||||
}
|
||||
|
||||
/// Get the `FastFieldWriter` associated with a field.
|
||||
pub fn get_term_id_writer(&self, field: Field) -> Option<&MultiValuedFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.term_id_writers
|
||||
.iter()
|
||||
.find(|field_writer| field_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Get the `FastFieldWriter` associated with a field.
|
||||
pub fn get_field_writer(&self, field: Field) -> Option<&IntFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.single_value_writers
|
||||
.iter()
|
||||
.find(|field_writer| field_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Get the `FastFieldWriter` associated with a field.
|
||||
pub fn get_field_writer_mut(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.single_value_writers
|
||||
.iter_mut()
|
||||
.find(|field_writer| field_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Get the `FastFieldWriter` associated with a field.
|
||||
pub fn get_term_id_writer_mut(
|
||||
&mut self,
|
||||
field: Field,
|
||||
) -> Option<&mut MultiValuedFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.term_id_writers
|
||||
.iter_mut()
|
||||
.find(|field_writer| field_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Returns the fast field multi-value writer for the given field.
|
||||
///
|
||||
/// Returns `None` if the field does not exist, or is not
|
||||
/// configured as a multivalued fastfield in the schema.
|
||||
pub fn get_multivalue_writer_mut(
|
||||
&mut self,
|
||||
field: Field,
|
||||
) -> Option<&mut MultiValuedFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.multi_values_writers
|
||||
.iter_mut()
|
||||
.find(|multivalue_writer| multivalue_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Returns the bytes fast field writer for the given field.
|
||||
///
|
||||
/// Returns `None` if the field does not exist, or is not
|
||||
/// configured as a bytes fastfield in the schema.
|
||||
pub fn get_bytes_writer_mut(&mut self, field: Field) -> Option<&mut BytesFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.bytes_value_writers
|
||||
.iter_mut()
|
||||
.find(|field_writer| field_writer.field() == field)
|
||||
}
|
||||
/// Indexes all of the fastfields of a new document.
|
||||
pub fn add_document(&mut self, doc: &Document) -> crate::Result<()> {
|
||||
let doc_id = self.num_docs;
|
||||
for field_value in doc.field_values() {
|
||||
if let Some(field_name) =
|
||||
self.fast_field_names[field_value.field().field_id() as usize].as_ref()
|
||||
{
|
||||
match &field_value.value {
|
||||
Value::U64(u64_val) => {
|
||||
self.columnar_writer.record_numerical(
|
||||
doc_id,
|
||||
field_name.as_str(),
|
||||
NumericalValue::from(*u64_val),
|
||||
);
|
||||
}
|
||||
Value::I64(i64_val) => {
|
||||
self.columnar_writer.record_numerical(
|
||||
doc_id,
|
||||
field_name.as_str(),
|
||||
NumericalValue::from(*i64_val),
|
||||
);
|
||||
}
|
||||
Value::F64(f64_val) => {
|
||||
self.columnar_writer.record_numerical(
|
||||
doc_id,
|
||||
field_name.as_str(),
|
||||
NumericalValue::from(*f64_val),
|
||||
);
|
||||
}
|
||||
Value::Str(_) => todo!(),
|
||||
Value::PreTokStr(_) => todo!(),
|
||||
Value::Bool(bool_val) => {
|
||||
self.columnar_writer
|
||||
.record_bool(doc_id, field_name.as_str(), *bool_val);
|
||||
}
|
||||
Value::Date(datetime) => {
|
||||
let date_precision =
|
||||
self.date_precisions[field_value.field().field_id() as usize];
|
||||
let truncated_datetime = datetime.truncate(date_precision);
|
||||
self.columnar_writer.record_datetime(
|
||||
doc_id,
|
||||
field_name.as_str(),
|
||||
truncated_datetime.into(),
|
||||
);
|
||||
}
|
||||
Value::Facet(_) => todo!(),
|
||||
Value::Bytes(_) => todo!(),
|
||||
Value::JsonObject(_) => todo!(),
|
||||
Value::IpAddr(_) => todo!(),
|
||||
}
|
||||
}
|
||||
for field_writer in &mut self.term_id_writers {
|
||||
field_writer.add_document(doc)?;
|
||||
}
|
||||
for field_writer in &mut self.single_value_writers {
|
||||
field_writer.add_document(doc)?;
|
||||
}
|
||||
for field_writer in &mut self.multi_values_writers {
|
||||
field_writer.add_document(doc)?;
|
||||
}
|
||||
for field_writer in &mut self.bytes_value_writers {
|
||||
field_writer.add_document(doc)?;
|
||||
}
|
||||
for field_writer in &mut self.u128_value_writers {
|
||||
field_writer.add_document(doc)?;
|
||||
}
|
||||
for field_writer in &mut self.u128_multi_value_writers {
|
||||
field_writer.add_document(doc)?;
|
||||
}
|
||||
self.num_docs += 1;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Serializes all of the `FastFieldWriter`s by pushing them in
|
||||
/// order to the fast field serializer.
|
||||
pub fn serialize(
|
||||
mut self,
|
||||
wrt: &mut dyn io::Write,
|
||||
self,
|
||||
serializer: &mut CompositeFastFieldSerializer,
|
||||
mapping: &HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>>,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> io::Result<()> {
|
||||
assert!(doc_id_map.is_none()); // TODO handle doc id map
|
||||
let num_docs = self.num_docs;
|
||||
self.columnar_writer.serialize(num_docs, wrt)?;
|
||||
for field_writer in self.term_id_writers {
|
||||
let field = field_writer.field();
|
||||
field_writer.serialize(serializer, mapping.get(&field), doc_id_map)?;
|
||||
}
|
||||
for field_writer in &self.single_value_writers {
|
||||
field_writer.serialize(serializer, doc_id_map)?;
|
||||
}
|
||||
|
||||
for field_writer in self.multi_values_writers {
|
||||
let field = field_writer.field();
|
||||
field_writer.serialize(serializer, mapping.get(&field), doc_id_map)?;
|
||||
}
|
||||
for field_writer in self.bytes_value_writers {
|
||||
field_writer.serialize(serializer, doc_id_map)?;
|
||||
}
|
||||
for field_writer in self.u128_value_writers {
|
||||
field_writer.serialize(serializer, doc_id_map)?;
|
||||
}
|
||||
for field_writer in self.u128_multi_value_writers {
|
||||
field_writer.serialize(serializer, doc_id_map)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Fast field writer for u128 values.
|
||||
/// The fast field writer just keeps the values in memory.
|
||||
///
|
||||
/// Only when the segment writer can be closed and
|
||||
/// persisted on disk, the fast field writer is
|
||||
/// sent to a `FastFieldSerializer` via the `.serialize(...)`
|
||||
/// method.
|
||||
///
|
||||
/// We cannot serialize earlier as the values are
|
||||
/// compressed to a compact number space and the number of
|
||||
/// bits required for bitpacking can only been known once
|
||||
/// we have seen all of the values.
|
||||
pub struct U128FastFieldWriter {
|
||||
field: Field,
|
||||
vals: Vec<u128>,
|
||||
val_count: u32,
|
||||
}
|
||||
|
||||
impl U128FastFieldWriter {
|
||||
/// Creates a new `IntFastFieldWriter`
|
||||
pub fn new(field: Field) -> Self {
|
||||
Self {
|
||||
field,
|
||||
vals: vec![],
|
||||
val_count: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// The memory used (inclusive childs)
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.vals.len() * 16
|
||||
}
|
||||
|
||||
/// Records a new value.
|
||||
///
|
||||
/// The n-th value being recorded is implicitely
|
||||
/// associated to the document with the `DocId` n.
|
||||
/// (Well, `n-1` actually because of 0-indexing)
|
||||
pub fn add_val(&mut self, val: u128) {
|
||||
self.vals.push(val);
|
||||
}
|
||||
|
||||
/// Extract the fast field value from the document
|
||||
/// (or use the default value) and records it.
|
||||
///
|
||||
/// Extract the value associated to the fast field for
|
||||
/// this document.
|
||||
pub fn add_document(&mut self, doc: &Document) -> crate::Result<()> {
|
||||
match doc.get_first(self.field) {
|
||||
Some(v) => {
|
||||
let ip_addr = v.as_ip_addr().ok_or_else(|| unexpected_value("ip", v))?;
|
||||
let value = ip_addr.to_u128();
|
||||
self.add_val(value);
|
||||
}
|
||||
None => {
|
||||
self.add_val(0); // TODO fix null handling
|
||||
}
|
||||
};
|
||||
self.val_count += 1;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Push the fast fields value to the `FastFieldWriter`.
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> io::Result<()> {
|
||||
if let Some(doc_id_map) = doc_id_map {
|
||||
let iter_gen = || {
|
||||
doc_id_map
|
||||
.iter_old_doc_ids()
|
||||
.map(|idx| self.vals[idx as usize])
|
||||
};
|
||||
|
||||
serializer.create_u128_fast_field_with_idx(self.field, iter_gen, self.val_count, 0)?;
|
||||
} else {
|
||||
let iter_gen = || self.vals.iter().cloned();
|
||||
serializer.create_u128_fast_field_with_idx(self.field, iter_gen, self.val_count, 0)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Fast field writer for ints.
|
||||
/// The fast field writer just keeps the values in memory.
|
||||
///
|
||||
/// Only when the segment writer can be closed and
|
||||
/// persisted on disk, the fast field writer is
|
||||
/// sent to a `FastFieldSerializer` via the `.serialize(...)`
|
||||
/// method.
|
||||
///
|
||||
/// We cannot serialize earlier as the values are
|
||||
/// bitpacked and the number of bits required for bitpacking
|
||||
/// can only been known once we have seen all of the values.
|
||||
///
|
||||
/// Both u64, i64 and f64 use the same writer.
|
||||
/// i64 and f64 are just remapped to the `0..2^64 - 1`
|
||||
/// using `common::i64_to_u64` and `common::f64_to_u64`.
|
||||
pub struct IntFastFieldWriter {
|
||||
field: Field,
|
||||
precision_opt: Option<DatePrecision>,
|
||||
vals: BlockedBitpacker,
|
||||
val_count: usize,
|
||||
val_if_missing: u64,
|
||||
val_min: u64,
|
||||
val_max: u64,
|
||||
}
|
||||
|
||||
impl IntFastFieldWriter {
|
||||
/// Creates a new `IntFastFieldWriter`
|
||||
pub fn new(field: Field, precision_opt: Option<DatePrecision>) -> IntFastFieldWriter {
|
||||
IntFastFieldWriter {
|
||||
field,
|
||||
precision_opt,
|
||||
vals: BlockedBitpacker::new(),
|
||||
val_count: 0,
|
||||
val_if_missing: 0u64,
|
||||
val_min: u64::MAX,
|
||||
val_max: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// The memory used (inclusive childs)
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.vals.mem_usage()
|
||||
}
|
||||
|
||||
/// Returns the field that this writer is targeting.
|
||||
pub fn field(&self) -> Field {
|
||||
self.field
|
||||
}
|
||||
|
||||
/// Sets the default value.
|
||||
///
|
||||
/// This default value is recorded for documents if
|
||||
/// a document does not have any value.
|
||||
fn set_val_if_missing(&mut self, val_if_missing: u64) {
|
||||
self.val_if_missing = val_if_missing;
|
||||
}
|
||||
|
||||
/// Records a new value.
|
||||
///
|
||||
/// The n-th value being recorded is implicitly
|
||||
/// associated with the document with the `DocId` n.
|
||||
/// (Well, `n-1` actually because of 0-indexing)
|
||||
pub fn add_val(&mut self, val: u64) {
|
||||
self.vals.add(val);
|
||||
|
||||
if val > self.val_max {
|
||||
self.val_max = val;
|
||||
}
|
||||
if val < self.val_min {
|
||||
self.val_min = val;
|
||||
}
|
||||
|
||||
self.val_count += 1;
|
||||
}
|
||||
|
||||
/// Extract the fast field value from the document
|
||||
/// (or use the default value) and records it.
|
||||
///
|
||||
///
|
||||
/// Extract the value associated with the fast field for
|
||||
/// this document.
|
||||
///
|
||||
/// i64 and f64 are remapped to u64 using the logic
|
||||
/// in `common::i64_to_u64` and `common::f64_to_u64`.
|
||||
///
|
||||
/// If the value is missing, then the default value is used
|
||||
/// instead.
|
||||
/// If the document has more than one value for the given field,
|
||||
/// only the first one is taken in account.
|
||||
///
|
||||
/// Values on text fast fields are skipped.
|
||||
pub fn add_document(&mut self, doc: &Document) -> crate::Result<()> {
|
||||
match doc.get_first(self.field) {
|
||||
Some(v) => {
|
||||
let value = match (self.precision_opt, v) {
|
||||
(Some(precision), Value::Date(date_val)) => {
|
||||
date_val.truncate(precision).to_u64()
|
||||
}
|
||||
_ => super::value_to_u64(v)?,
|
||||
};
|
||||
self.add_val(value);
|
||||
}
|
||||
None => {
|
||||
self.add_val(self.val_if_missing);
|
||||
}
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// get iterator over the data
|
||||
pub(crate) fn iter(&self) -> impl Iterator<Item = u64> + '_ {
|
||||
self.vals.iter()
|
||||
}
|
||||
|
||||
/// Push the fast fields value to the `FastFieldWriter`.
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> io::Result<()> {
|
||||
let (min, max) = if self.val_min > self.val_max {
|
||||
(0, 0)
|
||||
} else {
|
||||
(self.val_min, self.val_max)
|
||||
};
|
||||
|
||||
let fastfield_accessor = WriterFastFieldAccessProvider {
|
||||
doc_id_map,
|
||||
vals: &self.vals,
|
||||
min_value: min,
|
||||
max_value: max,
|
||||
num_vals: self.val_count as u32,
|
||||
};
|
||||
|
||||
serializer.create_auto_detect_u64_fast_field(self.field, fastfield_accessor)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct WriterFastFieldAccessProvider<'map, 'bitp> {
|
||||
doc_id_map: Option<&'map DocIdMapping>,
|
||||
vals: &'bitp BlockedBitpacker,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
num_vals: u32,
|
||||
}
|
||||
|
||||
impl<'map, 'bitp> Column for WriterFastFieldAccessProvider<'map, 'bitp> {
|
||||
/// Return the value associated with the given doc.
|
||||
///
|
||||
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance
|
||||
/// reasons.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `doc` is greater than the index.
|
||||
fn get_val(&self, _doc: u32) -> u64 {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
|
||||
if let Some(doc_id_map) = self.doc_id_map {
|
||||
Box::new(
|
||||
doc_id_map
|
||||
.iter_old_doc_ids()
|
||||
.map(|doc_id| self.vals.get(doc_id as usize)),
|
||||
)
|
||||
} else {
|
||||
Box::new(self.vals.iter())
|
||||
}
|
||||
}
|
||||
|
||||
fn min_value(&self) -> u64 {
|
||||
self.min_value
|
||||
}
|
||||
|
||||
fn max_value(&self) -> u64 {
|
||||
self.max_value
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u32 {
|
||||
self.num_vals
|
||||
}
|
||||
}
|
||||
|
||||
@@ -113,35 +113,34 @@ pub(crate) fn get_doc_id_mapping_from_field(
|
||||
sort_by_field: IndexSortByField,
|
||||
segment_writer: &SegmentWriter,
|
||||
) -> crate::Result<DocIdMapping> {
|
||||
todo!()
|
||||
// let schema = segment_writer.segment_serializer.segment().schema();
|
||||
// let field_id = expect_field_id_for_sort_field(&schema, &sort_by_field)?; // for now expect
|
||||
// fastfield, but not strictly required let fast_field = segment_writer
|
||||
// .fast_field_writers
|
||||
// .get_field_writer(field_id)
|
||||
// .ok_or_else(|| {
|
||||
// TantivyError::InvalidArgument(format!(
|
||||
// "sort index by field is required to be a fast field {:?}",
|
||||
// sort_by_field.field
|
||||
// ))
|
||||
// })?;
|
||||
let schema = segment_writer.segment_serializer.segment().schema();
|
||||
let field_id = expect_field_id_for_sort_field(&schema, &sort_by_field)?; // for now expect fastfield, but not strictly required
|
||||
let fast_field = segment_writer
|
||||
.fast_field_writers
|
||||
.get_field_writer(field_id)
|
||||
.ok_or_else(|| {
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"sort index by field is required to be a fast field {:?}",
|
||||
sort_by_field.field
|
||||
))
|
||||
})?;
|
||||
|
||||
// // create new doc_id to old doc_id index (used in fast_field_writers)
|
||||
// let mut doc_id_and_data = fast_field
|
||||
// .iter()
|
||||
// .enumerate()
|
||||
// .map(|el| (el.0 as DocId, el.1))
|
||||
// .collect::<Vec<_>>();
|
||||
// if sort_by_field.order == Order::Desc {
|
||||
// doc_id_and_data.sort_by_key(|k| Reverse(k.1));
|
||||
// } else {
|
||||
// doc_id_and_data.sort_by_key(|k| k.1);
|
||||
// }
|
||||
// let new_doc_id_to_old = doc_id_and_data
|
||||
// .into_iter()
|
||||
// .map(|el| el.0)
|
||||
// .collect::<Vec<_>>();
|
||||
// Ok(DocIdMapping::from_new_id_to_old_id(new_doc_id_to_old))
|
||||
// create new doc_id to old doc_id index (used in fast_field_writers)
|
||||
let mut doc_id_and_data = fast_field
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|el| (el.0 as DocId, el.1))
|
||||
.collect::<Vec<_>>();
|
||||
if sort_by_field.order == Order::Desc {
|
||||
doc_id_and_data.sort_by_key(|k| Reverse(k.1));
|
||||
} else {
|
||||
doc_id_and_data.sort_by_key(|k| k.1);
|
||||
}
|
||||
let new_doc_id_to_old = doc_id_and_data
|
||||
.into_iter()
|
||||
.map(|el| el.0)
|
||||
.collect::<Vec<_>>();
|
||||
Ok(DocIdMapping::from_new_id_to_old_id(new_doc_id_to_old))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -160,11 +159,15 @@ mod tests_indexsorting {
|
||||
|
||||
let my_text_field = schema_builder.add_text_field("text_field", text_field_options);
|
||||
let my_string_field = schema_builder.add_text_field("string_field", STRING | STORED);
|
||||
let my_number =
|
||||
schema_builder.add_u64_field("my_number", NumericOptions::default().set_fast());
|
||||
let my_number = schema_builder.add_u64_field(
|
||||
"my_number",
|
||||
NumericOptions::default().set_fast(Cardinality::SingleValue),
|
||||
);
|
||||
|
||||
let multi_numbers =
|
||||
schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast());
|
||||
let multi_numbers = schema_builder.add_u64_field(
|
||||
"multi_numbers",
|
||||
NumericOptions::default().set_fast(Cardinality::MultiValues),
|
||||
);
|
||||
|
||||
let schema = schema_builder.build();
|
||||
let mut index_builder = Index::builder().schema(schema);
|
||||
@@ -438,48 +441,47 @@ mod tests_indexsorting {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// #[test]
|
||||
// fn test_sort_index_fast_field() -> crate::Result<()> {
|
||||
// let index = create_test_index(
|
||||
// Some(IndexSettings {
|
||||
// sort_by_field: Some(IndexSortByField {
|
||||
// field: "my_number".to_string(),
|
||||
// order: Order::Asc,
|
||||
// }),
|
||||
// ..Default::default()
|
||||
// }),
|
||||
// get_text_options(),
|
||||
// )?;
|
||||
// assert_eq!(
|
||||
// index.settings().sort_by_field.as_ref().unwrap().field,
|
||||
// "my_number".to_string()
|
||||
// );
|
||||
#[test]
|
||||
fn test_sort_index_fast_field() -> crate::Result<()> {
|
||||
let index = create_test_index(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
get_text_options(),
|
||||
)?;
|
||||
assert_eq!(
|
||||
index.settings().sort_by_field.as_ref().unwrap().field,
|
||||
"my_number".to_string()
|
||||
);
|
||||
|
||||
// let searcher = index.reader()?.searcher();
|
||||
// assert_eq!(searcher.segment_readers().len(), 1);
|
||||
// let segment_reader = searcher.segment_reader(0);
|
||||
// let fast_fields = segment_reader.fast_fields();
|
||||
// let my_number = index.schema().get_field("my_number").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
index.schema().get_field("my_number").unwrap();
|
||||
|
||||
// let fast_field = fast_fields.u64(my_number).unwrap();
|
||||
// assert_eq!(fast_field.get_val(0), 10u64);
|
||||
// assert_eq!(fast_field.get_val(1), 20u64);
|
||||
// assert_eq!(fast_field.get_val(2), 30u64);
|
||||
let fast_field = fast_fields.u64("my_number").unwrap();
|
||||
assert_eq!(fast_field.get_val(0), 10u64);
|
||||
assert_eq!(fast_field.get_val(1), 20u64);
|
||||
assert_eq!(fast_field.get_val(2), 30u64);
|
||||
|
||||
// let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
|
||||
// let multifield = fast_fields.u64s(multi_numbers).unwrap();
|
||||
// let mut vals = vec![];
|
||||
// multifield.get_vals(0u32, &mut vals);
|
||||
// assert_eq!(vals, &[] as &[u64]);
|
||||
// let mut vals = vec![];
|
||||
// multifield.get_vals(1u32, &mut vals);
|
||||
// assert_eq!(vals, &[5, 6]);
|
||||
let multifield = fast_fields.u64s("multi_numbers").unwrap();
|
||||
let mut vals = vec![];
|
||||
multifield.get_vals(0u32, &mut vals);
|
||||
assert_eq!(vals, &[] as &[u64]);
|
||||
let mut vals = vec![];
|
||||
multifield.get_vals(1u32, &mut vals);
|
||||
assert_eq!(vals, &[5, 6]);
|
||||
|
||||
// let mut vals = vec![];
|
||||
// multifield.get_vals(2u32, &mut vals);
|
||||
// assert_eq!(vals, &[3]);
|
||||
// Ok(())
|
||||
// }
|
||||
let mut vals = vec![];
|
||||
multifield.get_vals(2u32, &mut vals);
|
||||
assert_eq!(vals, &[3]);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_doc_mapping() {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -150,6 +150,7 @@ fn index_json_value(
|
||||
json_term_writer.term_buffer,
|
||||
ctx,
|
||||
indexing_position,
|
||||
None,
|
||||
);
|
||||
}
|
||||
TextOrDateTime::DateTime(dt) => {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -2,17 +2,19 @@
|
||||
mod tests {
|
||||
use crate::collector::TopDocs;
|
||||
use crate::core::Index;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::fastfield::{AliveBitSet, MultiValuedFastFieldReader};
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::{
|
||||
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
|
||||
self, BytesOptions, Cardinality, Facet, FacetOptions, IndexRecordOption, NumericOptions,
|
||||
TextFieldIndexing, TextOptions,
|
||||
};
|
||||
use crate::{DocAddress, DocSet, IndexSettings, IndexSortByField, Order, Postings, Term};
|
||||
|
||||
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let int_options = NumericOptions::default().set_fast().set_indexed();
|
||||
let int_options = NumericOptions::default()
|
||||
.set_fast(Cardinality::SingleValue)
|
||||
.set_indexed();
|
||||
let int_field = schema_builder.add_u64_field("intval", int_options);
|
||||
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
@@ -60,7 +62,7 @@ mod tests {
|
||||
) -> crate::Result<Index> {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let int_options = NumericOptions::default()
|
||||
.set_fast()
|
||||
.set_fast(Cardinality::SingleValue)
|
||||
.set_stored()
|
||||
.set_indexed();
|
||||
let int_field = schema_builder.add_u64_field("intval", int_options);
|
||||
@@ -69,8 +71,10 @@ mod tests {
|
||||
let bytes_field = schema_builder.add_bytes_field("bytes", bytes_options);
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
|
||||
let multi_numbers =
|
||||
schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast());
|
||||
let multi_numbers = schema_builder.add_u64_field(
|
||||
"multi_numbers",
|
||||
NumericOptions::default().set_fast(Cardinality::MultiValues),
|
||||
);
|
||||
let text_field_options = TextOptions::default()
|
||||
.set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
@@ -345,130 +349,128 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
// #[test]
|
||||
// fn test_merge_sorted_index_asc() {
|
||||
// let index = create_test_index(
|
||||
// Some(IndexSettings {
|
||||
// sort_by_field: Some(IndexSortByField {
|
||||
// field: "intval".to_string(),
|
||||
// order: Order::Asc,
|
||||
// }),
|
||||
// ..Default::default()
|
||||
// }),
|
||||
// false,
|
||||
// )
|
||||
// .unwrap();
|
||||
#[test]
|
||||
fn test_merge_sorted_index_asc() {
|
||||
let index = create_test_index(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
false,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// let int_field = index.schema().get_field("intval").unwrap();
|
||||
// let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
|
||||
// let bytes_field = index.schema().get_field("bytes").unwrap();
|
||||
// let reader = index.reader().unwrap();
|
||||
// let searcher = reader.searcher();
|
||||
// assert_eq!(searcher.segment_readers().len(), 1);
|
||||
// let segment_reader = searcher.segment_readers().last().unwrap();
|
||||
let int_field = index.schema().get_field("intval").unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let segment_reader = searcher.segment_readers().last().unwrap();
|
||||
|
||||
// let fast_fields = segment_reader.fast_fields();
|
||||
// let fast_field = fast_fields.u64(int_field).unwrap();
|
||||
// assert_eq!(fast_field.get_val(0), 1u64);
|
||||
// assert_eq!(fast_field.get_val(1), 2u64);
|
||||
// assert_eq!(fast_field.get_val(2), 3u64);
|
||||
// assert_eq!(fast_field.get_val(3), 10u64);
|
||||
// assert_eq!(fast_field.get_val(4), 20u64);
|
||||
// assert_eq!(fast_field.get_val(5), 1_000u64);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let fast_field = fast_fields.u64("intval").unwrap();
|
||||
assert_eq!(fast_field.get_val(0), 1u64);
|
||||
assert_eq!(fast_field.get_val(1), 2u64);
|
||||
assert_eq!(fast_field.get_val(2), 3u64);
|
||||
assert_eq!(fast_field.get_val(3), 10u64);
|
||||
assert_eq!(fast_field.get_val(4), 20u64);
|
||||
assert_eq!(fast_field.get_val(5), 1_000u64);
|
||||
|
||||
// let get_vals = |fast_field: &MultiValuedFastFieldReader<u64>, doc_id: u32| -> Vec<u64> {
|
||||
// let mut vals = vec![];
|
||||
// fast_field.get_vals(doc_id, &mut vals);
|
||||
// vals
|
||||
// };
|
||||
// let fast_fields = segment_reader.fast_fields();
|
||||
// let fast_field = fast_fields.u64s(multi_numbers).unwrap();
|
||||
// assert_eq!(&get_vals(&fast_field, 0), &[] as &[u64]);
|
||||
// assert_eq!(&get_vals(&fast_field, 1), &[2, 3]);
|
||||
// assert_eq!(&get_vals(&fast_field, 2), &[3, 4]);
|
||||
// assert_eq!(&get_vals(&fast_field, 3), &[10, 11]);
|
||||
// assert_eq!(&get_vals(&fast_field, 4), &[20]);
|
||||
// assert_eq!(&get_vals(&fast_field, 5), &[1001, 1002]);
|
||||
let get_vals = |fast_field: &MultiValuedFastFieldReader<u64>, doc_id: u32| -> Vec<u64> {
|
||||
let mut vals = vec![];
|
||||
fast_field.get_vals(doc_id, &mut vals);
|
||||
vals
|
||||
};
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let fast_field = fast_fields.u64s("multi_numbers").unwrap();
|
||||
assert_eq!(&get_vals(&fast_field, 0), &[] as &[u64]);
|
||||
assert_eq!(&get_vals(&fast_field, 1), &[2, 3]);
|
||||
assert_eq!(&get_vals(&fast_field, 2), &[3, 4]);
|
||||
assert_eq!(&get_vals(&fast_field, 3), &[10, 11]);
|
||||
assert_eq!(&get_vals(&fast_field, 4), &[20]);
|
||||
assert_eq!(&get_vals(&fast_field, 5), &[1001, 1002]);
|
||||
|
||||
// let fast_field = fast_fields.bytes(bytes_field).unwrap();
|
||||
// assert_eq!(fast_field.get_bytes(0), &[] as &[u8]);
|
||||
// assert_eq!(fast_field.get_bytes(2), &[1, 2, 3]);
|
||||
// assert_eq!(fast_field.get_bytes(5), &[5, 5]);
|
||||
let fast_field = fast_fields.bytes("bytes").unwrap();
|
||||
assert_eq!(fast_field.get_bytes(0), &[] as &[u8]);
|
||||
assert_eq!(fast_field.get_bytes(2), &[1, 2, 3]);
|
||||
assert_eq!(fast_field.get_bytes(5), &[5, 5]);
|
||||
|
||||
// // test new field norm mapping
|
||||
// {
|
||||
// let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
// let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
|
||||
// assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
|
||||
// assert_eq!(fieldnorm_reader.fieldnorm(1), 4);
|
||||
// assert_eq!(fieldnorm_reader.fieldnorm(2), 2); // some text
|
||||
// assert_eq!(fieldnorm_reader.fieldnorm(3), 1);
|
||||
// assert_eq!(fieldnorm_reader.fieldnorm(5), 3); // the biggest num
|
||||
// }
|
||||
// test new field norm mapping
|
||||
{
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 4);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(2), 2); // some text
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(3), 1);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(5), 3); // the biggest num
|
||||
}
|
||||
|
||||
// let searcher = index.reader().unwrap().searcher();
|
||||
// {
|
||||
// let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
{
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
|
||||
// let do_search = |term: &str| {
|
||||
// let query = QueryParser::for_index(&index, vec![my_text_field])
|
||||
// .parse_query(term)
|
||||
// .unwrap();
|
||||
// let top_docs: Vec<(f32, DocAddress)> =
|
||||
// searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
|
||||
let do_search = |term: &str| {
|
||||
let query = QueryParser::for_index(&index, vec![my_text_field])
|
||||
.parse_query(term)
|
||||
.unwrap();
|
||||
let top_docs: Vec<(f32, DocAddress)> =
|
||||
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
|
||||
|
||||
// top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
|
||||
// };
|
||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
// assert_eq!(do_search("some"), vec![2]);
|
||||
// assert_eq!(do_search("blubber"), vec![3]);
|
||||
// assert_eq!(do_search("biggest"), vec![5]);
|
||||
// }
|
||||
assert_eq!(do_search("some"), vec![2]);
|
||||
assert_eq!(do_search("blubber"), vec![3]);
|
||||
assert_eq!(do_search("biggest"), vec![5]);
|
||||
}
|
||||
|
||||
// // postings file
|
||||
// {
|
||||
// let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
// let term_a = Term::from_field_text(my_text_field, "text");
|
||||
// let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
|
||||
// let mut postings = inverted_index
|
||||
// .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
||||
// .unwrap()
|
||||
// .unwrap();
|
||||
// postings file
|
||||
{
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
let term_a = Term::from_field_text(my_text_field, "text");
|
||||
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
|
||||
// assert_eq!(postings.doc_freq(), 2);
|
||||
// let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
|
||||
// assert_eq!(
|
||||
// postings.doc_freq_given_deletes(
|
||||
// segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
|
||||
// ),
|
||||
// 2
|
||||
// );
|
||||
assert_eq!(postings.doc_freq(), 2);
|
||||
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
|
||||
assert_eq!(
|
||||
postings.doc_freq_given_deletes(
|
||||
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
|
||||
),
|
||||
2
|
||||
);
|
||||
|
||||
// let mut output = vec![];
|
||||
// postings.positions(&mut output);
|
||||
// assert_eq!(output, vec![1, 3]);
|
||||
// postings.advance();
|
||||
let mut output = vec![];
|
||||
postings.positions(&mut output);
|
||||
assert_eq!(output, vec![1, 3]);
|
||||
postings.advance();
|
||||
|
||||
// postings.positions(&mut output);
|
||||
// assert_eq!(output, vec![1]);
|
||||
// }
|
||||
postings.positions(&mut output);
|
||||
assert_eq!(output, vec![1]);
|
||||
}
|
||||
|
||||
// // access doc store
|
||||
// {
|
||||
// let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
|
||||
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1));
|
||||
// let doc = searcher.doc(DocAddress::new(0, 1)).unwrap();
|
||||
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(2));
|
||||
// let doc = searcher.doc(DocAddress::new(0, 2)).unwrap();
|
||||
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(3));
|
||||
// let doc = searcher.doc(DocAddress::new(0, 3)).unwrap();
|
||||
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(10));
|
||||
// let doc = searcher.doc(DocAddress::new(0, 4)).unwrap();
|
||||
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(20));
|
||||
// let doc = searcher.doc(DocAddress::new(0, 5)).unwrap();
|
||||
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1_000));
|
||||
// }
|
||||
// }
|
||||
// access doc store
|
||||
{
|
||||
let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
|
||||
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1));
|
||||
let doc = searcher.doc(DocAddress::new(0, 1)).unwrap();
|
||||
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(2));
|
||||
let doc = searcher.doc(DocAddress::new(0, 2)).unwrap();
|
||||
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(3));
|
||||
let doc = searcher.doc(DocAddress::new(0, 3)).unwrap();
|
||||
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(10));
|
||||
let doc = searcher.doc(DocAddress::new(0, 4)).unwrap();
|
||||
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(20));
|
||||
let doc = searcher.doc(DocAddress::new(0, 5)).unwrap();
|
||||
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1_000));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
@@ -485,7 +487,9 @@ mod bench_sorted_index_merge {
|
||||
use crate::{IndexSettings, IndexSortByField, IndexWriter, Order};
|
||||
fn create_index(sort_by_field: Option<IndexSortByField>) -> Index {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let int_options = NumericOptions::default().set_fast().set_indexed();
|
||||
let int_options = NumericOptions::default()
|
||||
.set_fast(Cardinality::SingleValue)
|
||||
.set_indexed();
|
||||
let int_field = schema_builder.add_u64_field("intval", int_options);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
|
||||
@@ -19,8 +19,8 @@ mod segment_register;
|
||||
pub mod segment_serializer;
|
||||
pub mod segment_updater;
|
||||
mod segment_writer;
|
||||
// mod sorted_doc_id_column;
|
||||
// mod sorted_doc_id_multivalue_column;
|
||||
mod sorted_doc_id_column;
|
||||
mod sorted_doc_id_multivalue_column;
|
||||
mod stamper;
|
||||
|
||||
use crossbeam_channel as channel;
|
||||
@@ -58,7 +58,7 @@ type AddBatchReceiver = channel::Receiver<AddBatch>;
|
||||
#[cfg(test)]
|
||||
mod tests_mmap {
|
||||
use crate::collector::Count;
|
||||
// use crate::query::QueryParser;
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::{JsonObjectOptions, Schema, TEXT};
|
||||
use crate::{Index, Term};
|
||||
|
||||
@@ -79,45 +79,45 @@ mod tests_mmap {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// #[test]
|
||||
// fn test_json_field_expand_dots_disabled_dot_escaped_required() {
|
||||
// let mut schema_builder = Schema::builder();
|
||||
// let json_field = schema_builder.add_json_field("json", TEXT);
|
||||
// let index = Index::create_in_ram(schema_builder.build());
|
||||
// let mut index_writer = index.writer_for_tests().unwrap();
|
||||
// let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
|
||||
// index_writer.add_document(doc!(json_field=>json)).unwrap();
|
||||
// index_writer.commit().unwrap();
|
||||
// let reader = index.reader().unwrap();
|
||||
// let searcher = reader.searcher();
|
||||
// assert_eq!(searcher.num_docs(), 1);
|
||||
// let parse_query = QueryParser::for_index(&index, Vec::new());
|
||||
// let query = parse_query
|
||||
// .parse_query(r#"json.k8s\.container\.name:prometheus"#)
|
||||
// .unwrap();
|
||||
// let num_docs = searcher.search(&query, &Count).unwrap();
|
||||
// assert_eq!(num_docs, 1);
|
||||
// }
|
||||
#[test]
|
||||
fn test_json_field_expand_dots_disabled_dot_escaped_required() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_field = schema_builder.add_json_field("json", TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
|
||||
index_writer.add_document(doc!(json_field=>json)).unwrap();
|
||||
index_writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.num_docs(), 1);
|
||||
let parse_query = QueryParser::for_index(&index, Vec::new());
|
||||
let query = parse_query
|
||||
.parse_query(r#"json.k8s\.container\.name:prometheus"#)
|
||||
.unwrap();
|
||||
let num_docs = searcher.search(&query, &Count).unwrap();
|
||||
assert_eq!(num_docs, 1);
|
||||
}
|
||||
|
||||
// #[test]
|
||||
// fn test_json_field_expand_dots_enabled_dot_escape_not_required() {
|
||||
// let mut schema_builder = Schema::builder();
|
||||
// let json_options: JsonObjectOptions =
|
||||
// JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
|
||||
// let json_field = schema_builder.add_json_field("json", json_options);
|
||||
// let index = Index::create_in_ram(schema_builder.build());
|
||||
// let mut index_writer = index.writer_for_tests().unwrap();
|
||||
// let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
|
||||
// index_writer.add_document(doc!(json_field=>json)).unwrap();
|
||||
// index_writer.commit().unwrap();
|
||||
// let reader = index.reader().unwrap();
|
||||
// let searcher = reader.searcher();
|
||||
// assert_eq!(searcher.num_docs(), 1);
|
||||
// let parse_query = QueryParser::for_index(&index, Vec::new());
|
||||
// let query = parse_query
|
||||
// .parse_query(r#"json.k8s.container.name:prometheus"#)
|
||||
// .unwrap();
|
||||
// let num_docs = searcher.search(&query, &Count).unwrap();
|
||||
// assert_eq!(num_docs, 1);
|
||||
// }
|
||||
#[test]
|
||||
fn test_json_field_expand_dots_enabled_dot_escape_not_required() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_options: JsonObjectOptions =
|
||||
JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
|
||||
let json_field = schema_builder.add_json_field("json", json_options);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
|
||||
index_writer.add_document(doc!(json_field=>json)).unwrap();
|
||||
index_writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.num_docs(), 1);
|
||||
let parse_query = QueryParser::for_index(&index, Vec::new());
|
||||
let query = parse_query
|
||||
.parse_query(r#"json.k8s.container.name:prometheus"#)
|
||||
.unwrap();
|
||||
let num_docs = searcher.search(&query, &Count).unwrap();
|
||||
assert_eq!(num_docs, 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,4 @@
|
||||
use common::TerminatingWrite;
|
||||
|
||||
use crate::core::{Segment, SegmentComponent};
|
||||
use crate::directory::WritePtr;
|
||||
use crate::fastfield::CompositeFastFieldSerializer;
|
||||
use crate::fieldnorm::FieldNormsSerializer;
|
||||
use crate::postings::InvertedIndexSerializer;
|
||||
@@ -12,7 +9,7 @@ use crate::store::StoreWriter;
|
||||
pub struct SegmentSerializer {
|
||||
segment: Segment,
|
||||
pub(crate) store_writer: StoreWriter,
|
||||
fast_field_write: WritePtr,
|
||||
fast_field_serializer: CompositeFastFieldSerializer,
|
||||
fieldnorms_serializer: Option<FieldNormsSerializer>,
|
||||
postings_serializer: InvertedIndexSerializer,
|
||||
}
|
||||
@@ -50,6 +47,7 @@ impl SegmentSerializer {
|
||||
};
|
||||
|
||||
let fast_field_write = segment.open_write(SegmentComponent::FastFields)?;
|
||||
let fast_field_serializer = CompositeFastFieldSerializer::from_write(fast_field_write)?;
|
||||
|
||||
let fieldnorms_write = segment.open_write(SegmentComponent::FieldNorms)?;
|
||||
let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;
|
||||
@@ -58,7 +56,7 @@ impl SegmentSerializer {
|
||||
Ok(SegmentSerializer {
|
||||
segment,
|
||||
store_writer,
|
||||
fast_field_write,
|
||||
fast_field_serializer,
|
||||
fieldnorms_serializer: Some(fieldnorms_serializer),
|
||||
postings_serializer,
|
||||
})
|
||||
@@ -83,8 +81,8 @@ impl SegmentSerializer {
|
||||
}
|
||||
|
||||
/// Accessor to the `FastFieldSerializer`.
|
||||
pub fn get_fast_field_write(&mut self) -> &mut WritePtr {
|
||||
&mut self.fast_field_write
|
||||
pub fn get_fast_field_serializer(&mut self) -> &mut CompositeFastFieldSerializer {
|
||||
&mut self.fast_field_serializer
|
||||
}
|
||||
|
||||
/// Extract the field norm serializer.
|
||||
@@ -104,7 +102,7 @@ impl SegmentSerializer {
|
||||
if let Some(fieldnorms_serializer) = self.extract_fieldnorms_serializer() {
|
||||
fieldnorms_serializer.close()?;
|
||||
}
|
||||
self.fast_field_write.terminate()?;
|
||||
self.fast_field_serializer.close()?;
|
||||
self.postings_serializer.close()?;
|
||||
self.store_writer.close()?;
|
||||
Ok(())
|
||||
|
||||
@@ -139,6 +139,7 @@ impl SegmentWriter {
|
||||
self.ctx,
|
||||
self.fast_field_writers,
|
||||
&self.fieldnorms_writer,
|
||||
&self.schema,
|
||||
self.segment_serializer,
|
||||
mapping.as_ref(),
|
||||
)?;
|
||||
@@ -184,15 +185,22 @@ impl SegmentWriter {
|
||||
for value in values {
|
||||
let facet = value.as_facet().ok_or_else(make_schema_error)?;
|
||||
let facet_str = facet.encoded_str();
|
||||
let mut facet_tokenizer = FacetTokenizer.token_stream(facet_str);
|
||||
let mut indexing_position = IndexingPosition::default();
|
||||
postings_writer.index_text(
|
||||
doc_id,
|
||||
&mut *facet_tokenizer,
|
||||
term_buffer,
|
||||
ctx,
|
||||
&mut indexing_position,
|
||||
);
|
||||
let mut unordered_term_id_opt = None;
|
||||
FacetTokenizer
|
||||
.token_stream(facet_str)
|
||||
.process(&mut |token| {
|
||||
term_buffer.set_text(&token.text);
|
||||
let unordered_term_id =
|
||||
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
|
||||
// TODO pass indexing context directly in subscribe function
|
||||
unordered_term_id_opt = Some(unordered_term_id);
|
||||
});
|
||||
if let Some(unordered_term_id) = unordered_term_id_opt {
|
||||
self.fast_field_writers
|
||||
.get_term_id_writer_mut(field)
|
||||
.expect("writer for facet missing")
|
||||
.add_val(unordered_term_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
FieldType::Str(_) => {
|
||||
@@ -219,6 +227,7 @@ impl SegmentWriter {
|
||||
term_buffer,
|
||||
ctx,
|
||||
&mut indexing_position,
|
||||
self.fast_field_writers.get_term_id_writer_mut(field),
|
||||
);
|
||||
}
|
||||
if field_entry.has_fieldnorms() {
|
||||
@@ -374,6 +383,7 @@ fn remap_and_write(
|
||||
ctx: IndexingContext,
|
||||
fast_field_writers: FastFieldsWriter,
|
||||
fieldnorms_writer: &FieldNormsWriter,
|
||||
schema: &Schema,
|
||||
mut serializer: SegmentSerializer,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> crate::Result<()> {
|
||||
@@ -385,15 +395,20 @@ fn remap_and_write(
|
||||
.segment()
|
||||
.open_read(SegmentComponent::FieldNorms)?;
|
||||
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
|
||||
serialize_postings(
|
||||
let term_ord_map = serialize_postings(
|
||||
ctx,
|
||||
per_field_postings_writers,
|
||||
fieldnorm_readers,
|
||||
doc_id_map,
|
||||
schema,
|
||||
serializer.get_postings_serializer(),
|
||||
)?;
|
||||
debug!("fastfield-serialize");
|
||||
fast_field_writers.serialize(serializer.get_fast_field_write(), doc_id_map)?;
|
||||
fast_field_writers.serialize(
|
||||
serializer.get_fast_field_serializer(),
|
||||
&term_ord_map,
|
||||
doc_id_map,
|
||||
)?;
|
||||
|
||||
// finalize temp docstore and create version, which reflects the doc_id_map
|
||||
if let Some(doc_id_map) = doc_id_map {
|
||||
|
||||
18
src/lib.rs
18
src/lib.rs
@@ -147,22 +147,6 @@ pub struct DateTime {
|
||||
pub(crate) timestamp_micros: i64,
|
||||
}
|
||||
|
||||
impl From<columnar::DateTime> for DateTime {
|
||||
fn from(columnar_datetime: columnar::DateTime) -> Self {
|
||||
DateTime {
|
||||
timestamp_micros: columnar_datetime.timestamp_micros,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<DateTime> for columnar::DateTime {
|
||||
fn from(datetime: crate::DateTime) -> Self {
|
||||
columnar::DateTime {
|
||||
timestamp_micros: datetime.timestamp_micros,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl DateTime {
|
||||
/// Create new from UNIX timestamp in seconds
|
||||
pub const fn from_timestamp_secs(seconds: i64) -> Self {
|
||||
@@ -279,7 +263,7 @@ mod indexer;
|
||||
pub mod error;
|
||||
pub mod tokenizer;
|
||||
|
||||
// pub mod aggregation;
|
||||
pub mod aggregation;
|
||||
pub mod collector;
|
||||
pub mod directory;
|
||||
pub mod fastfield;
|
||||
|
||||
@@ -2,10 +2,13 @@ use std::io;
|
||||
|
||||
use stacker::Addr;
|
||||
|
||||
use crate::fastfield::MultiValuedFastFieldWriter;
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::postings_writer::SpecializedPostingsWriter;
|
||||
use crate::postings::recorder::{BufferLender, DocIdRecorder, Recorder};
|
||||
use crate::postings::{FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter};
|
||||
use crate::postings::{
|
||||
FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter, UnorderedTermId,
|
||||
};
|
||||
use crate::schema::term::as_json_path_type_value_bytes;
|
||||
use crate::schema::Type;
|
||||
use crate::tokenizer::TokenStream;
|
||||
@@ -30,8 +33,8 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
||||
pos: u32,
|
||||
term: &crate::Term,
|
||||
ctx: &mut IndexingContext,
|
||||
) {
|
||||
self.non_str_posting_writer.subscribe(doc, pos, term, ctx);
|
||||
) -> UnorderedTermId {
|
||||
self.non_str_posting_writer.subscribe(doc, pos, term, ctx)
|
||||
}
|
||||
|
||||
fn index_text(
|
||||
@@ -41,6 +44,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
||||
term_buffer: &mut Term,
|
||||
ctx: &mut IndexingContext,
|
||||
indexing_position: &mut IndexingPosition,
|
||||
_fast_field_writer: Option<&mut MultiValuedFastFieldWriter>,
|
||||
) {
|
||||
self.str_posting_writer.index_text(
|
||||
doc_id,
|
||||
@@ -48,19 +52,20 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
||||
term_buffer,
|
||||
ctx,
|
||||
indexing_position,
|
||||
None,
|
||||
);
|
||||
}
|
||||
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(Term<&[u8]>, Addr)],
|
||||
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
ctx: &IndexingContext,
|
||||
serializer: &mut FieldSerializer,
|
||||
) -> io::Result<()> {
|
||||
let mut buffer_lender = BufferLender::default();
|
||||
for (term, addr) in term_addrs {
|
||||
for (term, addr, _) in term_addrs {
|
||||
// TODO optimization opportunity here.
|
||||
if let Some((_, typ, _)) = as_json_path_type_value_bytes(term.value_bytes()) {
|
||||
if typ == Type::Str {
|
||||
|
||||
@@ -6,6 +6,7 @@ use std::ops::Range;
|
||||
use rustc_hash::FxHashMap;
|
||||
use stacker::Addr;
|
||||
|
||||
use crate::fastfield::MultiValuedFastFieldWriter;
|
||||
use crate::fieldnorm::FieldNormReaders;
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::recorder::{BufferLender, Recorder};
|
||||
@@ -20,10 +21,12 @@ use crate::DocId;
|
||||
|
||||
const POSITION_GAP: u32 = 1;
|
||||
|
||||
fn make_field_partition(term_offsets: &[(Term<&[u8]>, Addr)]) -> Vec<(Field, Range<usize>)> {
|
||||
fn make_field_partition(
|
||||
term_offsets: &[(Term<&[u8]>, Addr, UnorderedTermId)],
|
||||
) -> Vec<(Field, Range<usize>)> {
|
||||
let term_offsets_it = term_offsets
|
||||
.iter()
|
||||
.map(|(term, _)| term.field())
|
||||
.map(|(term, _, _)| term.field())
|
||||
.enumerate();
|
||||
let mut prev_field_opt = None;
|
||||
let mut fields = vec![];
|
||||
@@ -51,18 +54,48 @@ pub(crate) fn serialize_postings(
|
||||
per_field_postings_writers: &PerFieldPostingsWriter,
|
||||
fieldnorm_readers: FieldNormReaders,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
schema: &Schema,
|
||||
serializer: &mut InvertedIndexSerializer,
|
||||
) -> crate::Result<()> {
|
||||
let mut term_offsets: Vec<(Term<&[u8]>, Addr)> = Vec::with_capacity(ctx.term_index.len());
|
||||
) -> crate::Result<HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>>> {
|
||||
let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> =
|
||||
Vec::with_capacity(ctx.term_index.len());
|
||||
term_offsets.extend(
|
||||
ctx.term_index
|
||||
.iter()
|
||||
.map(|(bytes, addr, _unordered_id)| (Term::wrap(bytes), addr)),
|
||||
.map(|(bytes, addr, unordered_id)| (Term::wrap(bytes), addr, unordered_id)),
|
||||
);
|
||||
term_offsets.sort_unstable_by_key(|(k, _)| k.clone());
|
||||
term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone());
|
||||
let mut unordered_term_mappings: HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>> =
|
||||
HashMap::new();
|
||||
|
||||
let field_offsets = make_field_partition(&term_offsets);
|
||||
for (field, byte_offsets) in field_offsets {
|
||||
let field_entry = schema.get_field_entry(field);
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Str(_) | FieldType::Facet(_) => {
|
||||
// populating the (unordered term ord) -> (ordered term ord) mapping
|
||||
// for the field.
|
||||
let unordered_term_ids = term_offsets[byte_offsets.clone()]
|
||||
.iter()
|
||||
.map(|&(_, _, bucket)| bucket);
|
||||
let mapping: FxHashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
|
||||
.enumerate()
|
||||
.map(|(term_ord, unord_term_id)| {
|
||||
(unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
|
||||
})
|
||||
.collect();
|
||||
unordered_term_mappings.insert(field, mapping);
|
||||
}
|
||||
FieldType::U64(_)
|
||||
| FieldType::I64(_)
|
||||
| FieldType::F64(_)
|
||||
| FieldType::Date(_)
|
||||
| FieldType::Bool(_) => {}
|
||||
FieldType::Bytes(_) => {}
|
||||
FieldType::JsonObject(_) => {}
|
||||
FieldType::IpAddr(_) => {}
|
||||
}
|
||||
|
||||
let postings_writer = per_field_postings_writers.get_for_field(field);
|
||||
let fieldnorm_reader = fieldnorm_readers.get_field(field)?;
|
||||
let mut field_serializer =
|
||||
@@ -75,7 +108,7 @@ pub(crate) fn serialize_postings(
|
||||
)?;
|
||||
field_serializer.close()?;
|
||||
}
|
||||
Ok(())
|
||||
Ok(unordered_term_mappings)
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
@@ -96,13 +129,19 @@ pub(crate) trait PostingsWriter: Send + Sync {
|
||||
/// * term - the term
|
||||
/// * ctx - Contains a term hashmap and a memory arena to store all necessary posting list
|
||||
/// information.
|
||||
fn subscribe(&mut self, doc: DocId, pos: u32, term: &Term, ctx: &mut IndexingContext);
|
||||
fn subscribe(
|
||||
&mut self,
|
||||
doc: DocId,
|
||||
pos: u32,
|
||||
term: &Term,
|
||||
ctx: &mut IndexingContext,
|
||||
) -> UnorderedTermId;
|
||||
|
||||
/// Serializes the postings on disk.
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(Term<&[u8]>, Addr)],
|
||||
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
ctx: &IndexingContext,
|
||||
serializer: &mut FieldSerializer,
|
||||
@@ -116,6 +155,7 @@ pub(crate) trait PostingsWriter: Send + Sync {
|
||||
term_buffer: &mut Term,
|
||||
ctx: &mut IndexingContext,
|
||||
indexing_position: &mut IndexingPosition,
|
||||
mut term_id_fast_field_writer_opt: Option<&mut MultiValuedFastFieldWriter>,
|
||||
) {
|
||||
let end_of_path_idx = term_buffer.len_bytes();
|
||||
let mut num_tokens = 0;
|
||||
@@ -135,7 +175,11 @@ pub(crate) trait PostingsWriter: Send + Sync {
|
||||
term_buffer.append_bytes(token.text.as_bytes());
|
||||
let start_position = indexing_position.end_position + token.position as u32;
|
||||
end_position = end_position.max(start_position + token.position_length as u32);
|
||||
self.subscribe(doc_id, start_position, term_buffer, ctx);
|
||||
let unordered_term_id = self.subscribe(doc_id, start_position, term_buffer, ctx);
|
||||
if let Some(term_id_fast_field_writer) = term_id_fast_field_writer_opt.as_mut() {
|
||||
term_id_fast_field_writer.add_val(unordered_term_id);
|
||||
}
|
||||
|
||||
num_tokens += 1;
|
||||
});
|
||||
|
||||
@@ -183,7 +227,13 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
|
||||
}
|
||||
|
||||
impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
|
||||
fn subscribe(&mut self, doc: DocId, position: u32, term: &Term, ctx: &mut IndexingContext) {
|
||||
fn subscribe(
|
||||
&mut self,
|
||||
doc: DocId,
|
||||
position: u32,
|
||||
term: &Term,
|
||||
ctx: &mut IndexingContext,
|
||||
) -> UnorderedTermId {
|
||||
debug_assert!(term.as_slice().len() >= 4);
|
||||
self.total_num_tokens += 1;
|
||||
let (term_index, arena) = (&mut ctx.term_index, &mut ctx.arena);
|
||||
@@ -202,18 +252,18 @@ impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
|
||||
recorder.record_position(position, arena);
|
||||
recorder
|
||||
}
|
||||
});
|
||||
}) as UnorderedTermId
|
||||
}
|
||||
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(Term<&[u8]>, Addr)],
|
||||
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
ctx: &IndexingContext,
|
||||
serializer: &mut FieldSerializer,
|
||||
) -> io::Result<()> {
|
||||
let mut buffer_lender = BufferLender::default();
|
||||
for (term, addr) in term_addrs {
|
||||
for (term, addr, _) in term_addrs {
|
||||
Self::serialize_one_term(term, *addr, doc_id_map, &mut buffer_lender, ctx, serializer)?;
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -15,7 +15,7 @@ mod more_like_this;
|
||||
mod phrase_query;
|
||||
mod query;
|
||||
mod query_parser;
|
||||
// mod range_query;
|
||||
mod range_query;
|
||||
mod regex_query;
|
||||
mod reqopt_scorer;
|
||||
mod scorer;
|
||||
@@ -50,7 +50,7 @@ pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
|
||||
pub use self::phrase_query::PhraseQuery;
|
||||
pub use self::query::{EnableScoring, Query, QueryClone};
|
||||
pub use self::query_parser::{QueryParser, QueryParserError};
|
||||
// pub use self::range_query::RangeQuery;
|
||||
pub use self::range_query::RangeQuery;
|
||||
pub use self::regex_query::RegexQuery;
|
||||
pub use self::reqopt_scorer::RequiredOptionalScorer;
|
||||
pub use self::score_combiner::{
|
||||
|
||||
@@ -13,19 +13,10 @@ use crate::core::Index;
|
||||
use crate::indexer::{
|
||||
convert_to_fast_value_and_get_term, set_string_and_get_terms, JsonTermWriter,
|
||||
};
|
||||
// use crate::query::range_query::is_type_valid_for_fastfield_range_query;
|
||||
use crate::query::range_query::is_type_valid_for_fastfield_range_query;
|
||||
use crate::query::{
|
||||
AllQuery,
|
||||
BooleanQuery,
|
||||
BoostQuery,
|
||||
EmptyQuery,
|
||||
FuzzyTermQuery,
|
||||
Occur,
|
||||
PhraseQuery,
|
||||
Query,
|
||||
// RangeQuery,
|
||||
TermQuery,
|
||||
TermSetQuery,
|
||||
AllQuery, BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhraseQuery, Query,
|
||||
RangeQuery, TermQuery, TermSetQuery,
|
||||
};
|
||||
use crate::schema::{
|
||||
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
|
||||
@@ -343,90 +334,89 @@ impl QueryParser {
|
||||
json_path: &str,
|
||||
phrase: &str,
|
||||
) -> Result<Term, QueryParserError> {
|
||||
todo!();
|
||||
// let field_entry = self.schema.get_field_entry(field);
|
||||
// let field_type = field_entry.field_type();
|
||||
// let field_supports_ff_range_queries = field_type.is_fast()
|
||||
// && is_type_valid_for_fastfield_range_query(field_type.value_type());
|
||||
//
|
||||
// if !field_type.is_indexed() && !field_supports_ff_range_queries {
|
||||
// return Err(QueryParserError::FieldNotIndexed(
|
||||
// field_entry.name().to_string(),
|
||||
// ));
|
||||
// }
|
||||
// if !json_path.is_empty() && field_type.value_type() != Type::Json {
|
||||
// return Err(QueryParserError::UnsupportedQuery(format!(
|
||||
// "Json path is not supported for field {:?}",
|
||||
// field_entry.name()
|
||||
// )));
|
||||
// }
|
||||
// match *field_type {
|
||||
// FieldType::U64(_) => {
|
||||
// let val: u64 = u64::from_str(phrase)?;
|
||||
// Ok(Term::from_field_u64(field, val))
|
||||
// }
|
||||
// FieldType::I64(_) => {
|
||||
// let val: i64 = i64::from_str(phrase)?;
|
||||
// Ok(Term::from_field_i64(field, val))
|
||||
// }
|
||||
// FieldType::F64(_) => {
|
||||
// let val: f64 = f64::from_str(phrase)?;
|
||||
// Ok(Term::from_field_f64(field, val))
|
||||
// }
|
||||
// FieldType::Bool(_) => {
|
||||
// let val: bool = bool::from_str(phrase)?;
|
||||
// Ok(Term::from_field_bool(field, val))
|
||||
// }
|
||||
// FieldType::Date(_) => {
|
||||
// let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
|
||||
// Ok(Term::from_field_date(field, DateTime::from_utc(dt)))
|
||||
// }
|
||||
// FieldType::Str(ref str_options) => {
|
||||
// let option = str_options.get_indexing_options().ok_or_else(|| {
|
||||
// This should have been seen earlier really.
|
||||
// QueryParserError::FieldNotIndexed(field_entry.name().to_string())
|
||||
// })?;
|
||||
// let text_analyzer =
|
||||
// self.tokenizer_manager
|
||||
// .get(option.tokenizer())
|
||||
// .ok_or_else(|| QueryParserError::UnknownTokenizer {
|
||||
// field: field_entry.name().to_string(),
|
||||
// tokenizer: option.tokenizer().to_string(),
|
||||
// })?;
|
||||
// let mut terms: Vec<Term> = Vec::new();
|
||||
// let mut token_stream = text_analyzer.token_stream(phrase);
|
||||
// token_stream.process(&mut |token| {
|
||||
// let term = Term::from_field_text(field, &token.text);
|
||||
// terms.push(term);
|
||||
// });
|
||||
// if terms.len() != 1 {
|
||||
// return Err(QueryParserError::UnsupportedQuery(format!(
|
||||
// "Range query boundary cannot have multiple tokens: {phrase:?}."
|
||||
// )));
|
||||
// }
|
||||
// Ok(terms.into_iter().next().unwrap())
|
||||
// }
|
||||
// FieldType::JsonObject(_) => {
|
||||
// Json range are not supported.
|
||||
// Err(QueryParserError::UnsupportedQuery(
|
||||
// "Range query are not supported on json field.".to_string(),
|
||||
// ))
|
||||
// }
|
||||
// FieldType::Facet(_) => match Facet::from_text(phrase) {
|
||||
// Ok(facet) => Ok(Term::from_facet(field, &facet)),
|
||||
// Err(e) => Err(QueryParserError::from(e)),
|
||||
// },
|
||||
// FieldType::Bytes(_) => {
|
||||
// let bytes = BASE64
|
||||
// .decode(phrase)
|
||||
// .map_err(QueryParserError::ExpectedBase64)?;
|
||||
// Ok(Term::from_field_bytes(field, &bytes))
|
||||
// }
|
||||
// FieldType::IpAddr(_) => {
|
||||
// let ip_v6 = IpAddr::from_str(phrase)?.into_ipv6_addr();
|
||||
// Ok(Term::from_field_ip_addr(field, ip_v6))
|
||||
// }
|
||||
// }
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
let field_supports_ff_range_queries = field_type.is_fast()
|
||||
&& is_type_valid_for_fastfield_range_query(field_type.value_type());
|
||||
|
||||
if !field_type.is_indexed() && !field_supports_ff_range_queries {
|
||||
return Err(QueryParserError::FieldNotIndexed(
|
||||
field_entry.name().to_string(),
|
||||
));
|
||||
}
|
||||
if !json_path.is_empty() && field_type.value_type() != Type::Json {
|
||||
return Err(QueryParserError::UnsupportedQuery(format!(
|
||||
"Json path is not supported for field {:?}",
|
||||
field_entry.name()
|
||||
)));
|
||||
}
|
||||
match *field_type {
|
||||
FieldType::U64(_) => {
|
||||
let val: u64 = u64::from_str(phrase)?;
|
||||
Ok(Term::from_field_u64(field, val))
|
||||
}
|
||||
FieldType::I64(_) => {
|
||||
let val: i64 = i64::from_str(phrase)?;
|
||||
Ok(Term::from_field_i64(field, val))
|
||||
}
|
||||
FieldType::F64(_) => {
|
||||
let val: f64 = f64::from_str(phrase)?;
|
||||
Ok(Term::from_field_f64(field, val))
|
||||
}
|
||||
FieldType::Bool(_) => {
|
||||
let val: bool = bool::from_str(phrase)?;
|
||||
Ok(Term::from_field_bool(field, val))
|
||||
}
|
||||
FieldType::Date(_) => {
|
||||
let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
|
||||
Ok(Term::from_field_date(field, DateTime::from_utc(dt)))
|
||||
}
|
||||
FieldType::Str(ref str_options) => {
|
||||
let option = str_options.get_indexing_options().ok_or_else(|| {
|
||||
// This should have been seen earlier really.
|
||||
QueryParserError::FieldNotIndexed(field_entry.name().to_string())
|
||||
})?;
|
||||
let text_analyzer =
|
||||
self.tokenizer_manager
|
||||
.get(option.tokenizer())
|
||||
.ok_or_else(|| QueryParserError::UnknownTokenizer {
|
||||
field: field_entry.name().to_string(),
|
||||
tokenizer: option.tokenizer().to_string(),
|
||||
})?;
|
||||
let mut terms: Vec<Term> = Vec::new();
|
||||
let mut token_stream = text_analyzer.token_stream(phrase);
|
||||
token_stream.process(&mut |token| {
|
||||
let term = Term::from_field_text(field, &token.text);
|
||||
terms.push(term);
|
||||
});
|
||||
if terms.len() != 1 {
|
||||
return Err(QueryParserError::UnsupportedQuery(format!(
|
||||
"Range query boundary cannot have multiple tokens: {phrase:?}."
|
||||
)));
|
||||
}
|
||||
Ok(terms.into_iter().next().unwrap())
|
||||
}
|
||||
FieldType::JsonObject(_) => {
|
||||
// Json range are not supported.
|
||||
Err(QueryParserError::UnsupportedQuery(
|
||||
"Range query are not supported on json field.".to_string(),
|
||||
))
|
||||
}
|
||||
FieldType::Facet(_) => match Facet::from_text(phrase) {
|
||||
Ok(facet) => Ok(Term::from_facet(field, &facet)),
|
||||
Err(e) => Err(QueryParserError::from(e)),
|
||||
},
|
||||
FieldType::Bytes(_) => {
|
||||
let bytes = BASE64
|
||||
.decode(phrase)
|
||||
.map_err(QueryParserError::ExpectedBase64)?;
|
||||
Ok(Term::from_field_bytes(field, &bytes))
|
||||
}
|
||||
FieldType::IpAddr(_) => {
|
||||
let ip_v6 = IpAddr::from_str(phrase)?.into_ipv6_addr();
|
||||
Ok(Term::from_field_ip_addr(field, ip_v6))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn compute_logical_ast_for_leaf(
|
||||
@@ -750,12 +740,9 @@ fn convert_literal_to_query(
|
||||
value_type,
|
||||
lower,
|
||||
upper,
|
||||
} => {
|
||||
todo!();
|
||||
// Box::new(RangeQuery::new_term_bounds(
|
||||
// field, value_type, &lower, &upper,
|
||||
// ))
|
||||
}
|
||||
} => Box::new(RangeQuery::new_term_bounds(
|
||||
field, value_type, &lower, &upper,
|
||||
)),
|
||||
LogicalLiteral::Set { elements, .. } => Box::new(TermSetQuery::new(elements)),
|
||||
LogicalLiteral::All => Box::new(AllQuery),
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ use std::sync::Arc;
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use crate::fastfield::MakeZero;
|
||||
use crate::fastfield::{MakeZero, MultiValuedFastFieldReader};
|
||||
use crate::{DocId, DocSet, TERMINATED};
|
||||
|
||||
/// Helper to have a cursor over a vec of docids
|
||||
|
||||
@@ -8,13 +8,10 @@ use std::ops::{Bound, RangeInclusive};
|
||||
use common::BinarySerializable;
|
||||
use fastfield_codecs::MonotonicallyMappableToU128;
|
||||
|
||||
use super::fast_field_range_query::{FastFieldCardinality, RangeDocSet};
|
||||
use super::range_query::map_bound;
|
||||
use crate::query::{ConstScorer, Explanation, Scorer, Weight};
|
||||
<<<<<<< HEAD
|
||||
use crate::schema::Cardinality;
|
||||
=======
|
||||
use crate::schema::Field;
|
||||
>>>>>>> fd1deefd12 (Disconnected facet / fast field merges / examples)
|
||||
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
|
||||
|
||||
/// `IPFastFieldRangeWeight` uses the ip address fast field to execute range queries.
|
||||
@@ -43,7 +40,6 @@ impl IPFastFieldRangeWeight {
|
||||
|
||||
impl Weight for IPFastFieldRangeWeight {
|
||||
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
||||
<<<<<<< HEAD
|
||||
let field_type = reader
|
||||
.schema()
|
||||
.get_field_entry(reader.schema().get_field(&self.field)?)
|
||||
@@ -78,40 +74,6 @@ impl Weight for IPFastFieldRangeWeight {
|
||||
Ok(Box::new(ConstScorer::new(docset, boost)))
|
||||
}
|
||||
}
|
||||
=======
|
||||
todo!();
|
||||
// let field_type = reader.schema().get_field_entry(self.field).field_type();
|
||||
// match field_type.fastfield_cardinality().unwrap() {
|
||||
// Cardinality::SingleValue => {
|
||||
// let ip_addr_fast_field = reader.fast_fields().ip_addr(self.field)?;
|
||||
// let value_range = bound_to_value_range(
|
||||
// &self.left_bound,
|
||||
// &self.right_bound,
|
||||
// ip_addr_fast_field.min_value(),
|
||||
// ip_addr_fast_field.max_value(),
|
||||
// );
|
||||
// let docset = RangeDocSet::new(
|
||||
// value_range,
|
||||
// FastFieldCardinality::SingleValue(ip_addr_fast_field),
|
||||
// );
|
||||
// Ok(Box::new(ConstScorer::new(docset, boost)))
|
||||
// }
|
||||
// Cardinality::MultiValues => {
|
||||
// let ip_addr_fast_field = reader.fast_fields().ip_addrs(self.field)?;
|
||||
// let value_range = bound_to_value_range(
|
||||
// &self.left_bound,
|
||||
// &self.right_bound,
|
||||
// ip_addr_fast_field.min_value(),
|
||||
// ip_addr_fast_field.max_value(),
|
||||
// );
|
||||
// let docset = RangeDocSet::new(
|
||||
// value_range,
|
||||
// FastFieldCardinality::MultiValue(ip_addr_fast_field),
|
||||
// );
|
||||
// Ok(Box::new(ConstScorer::new(docset, boost)))
|
||||
// }
|
||||
// }
|
||||
>>>>>>> fd1deefd12 (Disconnected facet / fast field merges / examples)
|
||||
}
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
|
||||
@@ -228,7 +190,7 @@ mod tests {
|
||||
let ips_field = schema_builder.add_ip_addr_field(
|
||||
"ips",
|
||||
IpAddrOptions::default()
|
||||
.set_fast()
|
||||
.set_fast(Cardinality::MultiValues)
|
||||
.set_indexed(),
|
||||
);
|
||||
let text_field = schema_builder.add_text_field("id", STRING | STORED);
|
||||
|
||||
@@ -6,14 +6,10 @@ use std::ops::{Bound, RangeInclusive};
|
||||
|
||||
use fastfield_codecs::MonotonicallyMappableToU64;
|
||||
|
||||
use super::fast_field_range_query::RangeDocSet;
|
||||
use super::fast_field_range_query::{FastFieldCardinality, RangeDocSet};
|
||||
use super::range_query::map_bound;
|
||||
use crate::query::{ConstScorer, Explanation, Scorer, Weight};
|
||||
<<<<<<< HEAD
|
||||
use crate::schema::Cardinality;
|
||||
=======
|
||||
use crate::schema::Field;
|
||||
>>>>>>> fd1deefd12 (Disconnected facet / fast field merges / examples)
|
||||
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
|
||||
|
||||
/// `FastFieldRangeWeight` uses the fast field to execute range queries.
|
||||
@@ -37,7 +33,6 @@ impl FastFieldRangeWeight {
|
||||
|
||||
impl Weight for FastFieldRangeWeight {
|
||||
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
||||
<<<<<<< HEAD
|
||||
let field_type = reader
|
||||
.schema()
|
||||
.get_field_entry(reader.schema().get_field(&self.field)?)
|
||||
@@ -68,36 +63,6 @@ impl Weight for FastFieldRangeWeight {
|
||||
Ok(Box::new(ConstScorer::new(docset, boost)))
|
||||
}
|
||||
}
|
||||
=======
|
||||
todo!();
|
||||
// let field_type = reader.schema().get_field_entry(self.field).field_type();
|
||||
// match field_type.fastfield_cardinality().unwrap() {
|
||||
// Cardinality::SingleValue => {
|
||||
// let fast_field = reader.fast_fields().u64_lenient(self.field)?;
|
||||
// let value_range = bound_to_value_range(
|
||||
// &self.left_bound,
|
||||
// &self.right_bound,
|
||||
// fast_field.min_value(),
|
||||
// fast_field.max_value(),
|
||||
// );
|
||||
// let docset =
|
||||
// RangeDocSet::new(value_range, FastFieldCardinality::SingleValue(fast_field));
|
||||
// Ok(Box::new(ConstScorer::new(docset, boost)))
|
||||
// }
|
||||
// Cardinality::MultiValues => {
|
||||
// let fast_field = reader.fast_fields().u64s_lenient(self.field)?;
|
||||
// let value_range = bound_to_value_range(
|
||||
// &self.left_bound,
|
||||
// &self.right_bound,
|
||||
// fast_field.min_value(),
|
||||
// fast_field.max_value(),
|
||||
// );
|
||||
// let docset =
|
||||
// RangeDocSet::new(value_range, FastFieldCardinality::MultiValue(fast_field));
|
||||
// Ok(Box::new(ConstScorer::new(docset, boost)))
|
||||
// }
|
||||
// }
|
||||
>>>>>>> fd1deefd12 (Disconnected facet / fast field merges / examples)
|
||||
}
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
|
||||
@@ -221,7 +186,7 @@ mod tests {
|
||||
let ids_u64_field = schema_builder.add_u64_field(
|
||||
"ids",
|
||||
NumericOptions::default()
|
||||
.set_fast()
|
||||
.set_fast(Cardinality::MultiValues)
|
||||
.set_indexed(),
|
||||
);
|
||||
|
||||
@@ -229,7 +194,7 @@ mod tests {
|
||||
let ids_f64_field = schema_builder.add_f64_field(
|
||||
"ids_f64",
|
||||
NumericOptions::default()
|
||||
.set_fast()
|
||||
.set_fast(Cardinality::MultiValues)
|
||||
.set_indexed(),
|
||||
);
|
||||
|
||||
@@ -237,7 +202,7 @@ mod tests {
|
||||
let ids_i64_field = schema_builder.add_i64_field(
|
||||
"ids_i64",
|
||||
NumericOptions::default()
|
||||
.set_fast()
|
||||
.set_fast(Cardinality::MultiValues)
|
||||
.set_indexed(),
|
||||
);
|
||||
|
||||
|
||||
@@ -2,16 +2,14 @@ use std::ops::BitOr;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::Cardinality;
|
||||
use crate::schema::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag};
|
||||
|
||||
/// DateTime Precision
|
||||
#[derive(
|
||||
Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize, Default,
|
||||
)]
|
||||
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum DatePrecision {
|
||||
/// Seconds precision
|
||||
#[default]
|
||||
Seconds,
|
||||
/// Milli-seconds precision.
|
||||
Milliseconds,
|
||||
@@ -19,13 +17,20 @@ pub enum DatePrecision {
|
||||
Microseconds,
|
||||
}
|
||||
|
||||
impl Default for DatePrecision {
|
||||
fn default() -> Self {
|
||||
DatePrecision::Seconds
|
||||
}
|
||||
}
|
||||
|
||||
/// Defines how DateTime field should be handled by tantivy.
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
pub struct DateOptions {
|
||||
indexed: bool,
|
||||
// This boolean has no effect if the field is not marked as indexed true.
|
||||
fieldnorms: bool,
|
||||
fast: bool,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
fast: Option<Cardinality>,
|
||||
stored: bool,
|
||||
// Internal storage precision, used to optimize storage
|
||||
// compression on fast fields.
|
||||
@@ -49,9 +54,18 @@ impl DateOptions {
|
||||
self.fieldnorms && self.indexed
|
||||
}
|
||||
|
||||
/// Returns true iff the value is a fast field and multivalue.
|
||||
pub fn is_multivalue_fast(&self) -> bool {
|
||||
if let Some(cardinality) = self.fast {
|
||||
cardinality == Cardinality::MultiValues
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true iff the value is a fast field.
|
||||
pub fn is_fast(&self) -> bool {
|
||||
self.fast
|
||||
self.fast.is_some()
|
||||
}
|
||||
|
||||
/// Set the field as stored.
|
||||
@@ -93,11 +107,19 @@ impl DateOptions {
|
||||
/// If more than one value is associated with a fast field, only the last one is
|
||||
/// kept.
|
||||
#[must_use]
|
||||
pub fn set_fast(mut self) -> DateOptions {
|
||||
self.fast = true;
|
||||
pub fn set_fast(mut self, cardinality: Cardinality) -> DateOptions {
|
||||
self.fast = Some(cardinality);
|
||||
self
|
||||
}
|
||||
|
||||
/// Returns the cardinality of the fastfield.
|
||||
///
|
||||
/// If the field has not been declared as a fastfield, then
|
||||
/// the method returns `None`.
|
||||
pub fn get_fastfield_cardinality(&self) -> Option<Cardinality> {
|
||||
self.fast
|
||||
}
|
||||
|
||||
/// Sets the precision for this DateTime field.
|
||||
///
|
||||
/// Internal storage precision, used to optimize storage
|
||||
@@ -125,7 +147,10 @@ impl From<()> for DateOptions {
|
||||
impl From<FastFlag> for DateOptions {
|
||||
fn from(_: FastFlag) -> Self {
|
||||
DateOptions {
|
||||
fast: true,
|
||||
indexed: false,
|
||||
fieldnorms: false,
|
||||
stored: false,
|
||||
fast: Some(Cardinality::SingleValue),
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
@@ -134,7 +159,10 @@ impl From<FastFlag> for DateOptions {
|
||||
impl From<StoredFlag> for DateOptions {
|
||||
fn from(_: StoredFlag) -> Self {
|
||||
DateOptions {
|
||||
indexed: false,
|
||||
fieldnorms: false,
|
||||
stored: true,
|
||||
fast: None,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
@@ -145,6 +173,8 @@ impl From<IndexedFlag> for DateOptions {
|
||||
DateOptions {
|
||||
indexed: true,
|
||||
fieldnorms: true,
|
||||
stored: false,
|
||||
fast: None,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
@@ -159,7 +189,7 @@ impl<T: Into<DateOptions>> BitOr<T> for DateOptions {
|
||||
indexed: self.indexed | other.indexed,
|
||||
fieldnorms: self.fieldnorms | other.fieldnorms,
|
||||
stored: self.stored | other.stored,
|
||||
fast: self.fast | other.fast,
|
||||
fast: self.fast.or(other.fast),
|
||||
precision: self.precision,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@ use serde_json::Value as JsonValue;
|
||||
use thiserror::Error;
|
||||
|
||||
use super::ip_options::IpAddrOptions;
|
||||
use super::IntoIpv6Addr;
|
||||
use super::{Cardinality, IntoIpv6Addr};
|
||||
use crate::schema::bytes_options::BytesOptions;
|
||||
use crate::schema::facet_options::FacetOptions;
|
||||
use crate::schema::{
|
||||
@@ -241,6 +241,26 @@ impl FieldType {
|
||||
}
|
||||
}
|
||||
|
||||
/// returns true if the field is fast.
|
||||
pub fn fastfield_cardinality(&self) -> Option<Cardinality> {
|
||||
match *self {
|
||||
FieldType::Bytes(ref bytes_options) => {
|
||||
bytes_options.is_fast().then_some(Cardinality::SingleValue)
|
||||
}
|
||||
FieldType::Str(ref text_options) => {
|
||||
text_options.is_fast().then_some(Cardinality::MultiValues)
|
||||
}
|
||||
FieldType::U64(ref int_options)
|
||||
| FieldType::I64(ref int_options)
|
||||
| FieldType::F64(ref int_options)
|
||||
| FieldType::Bool(ref int_options) => int_options.get_fastfield_cardinality(),
|
||||
FieldType::Date(ref date_options) => date_options.get_fastfield_cardinality(),
|
||||
FieldType::Facet(_) => Some(Cardinality::MultiValues),
|
||||
FieldType::JsonObject(_) => None,
|
||||
FieldType::IpAddr(ref ip_addr_options) => ip_addr_options.get_fastfield_cardinality(),
|
||||
}
|
||||
}
|
||||
|
||||
/// returns true if the field is normed (see [fieldnorms](crate::fieldnorm)).
|
||||
pub fn has_fieldnorms(&self) -> bool {
|
||||
match *self {
|
||||
|
||||
@@ -4,6 +4,7 @@ use std::ops::BitOr;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag};
|
||||
use super::Cardinality;
|
||||
|
||||
/// Trait to convert into an Ipv6Addr.
|
||||
pub trait IntoIpv6Addr {
|
||||
@@ -23,7 +24,8 @@ impl IntoIpv6Addr for IpAddr {
|
||||
/// Define how an ip field should be handled by tantivy.
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
pub struct IpAddrOptions {
|
||||
fast: bool,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
fast: Option<Cardinality>,
|
||||
stored: bool,
|
||||
indexed: bool,
|
||||
fieldnorms: bool,
|
||||
@@ -32,7 +34,7 @@ pub struct IpAddrOptions {
|
||||
impl IpAddrOptions {
|
||||
/// Returns true iff the value is a fast field.
|
||||
pub fn is_fast(&self) -> bool {
|
||||
self.fast
|
||||
self.fast.is_some()
|
||||
}
|
||||
|
||||
/// Returns `true` if the ip address should be stored in the doc store.
|
||||
@@ -50,6 +52,14 @@ impl IpAddrOptions {
|
||||
self.fieldnorms
|
||||
}
|
||||
|
||||
/// Returns the cardinality of the fastfield.
|
||||
///
|
||||
/// If the field has not been declared as a fastfield, then
|
||||
/// the method returns None.
|
||||
pub fn get_fastfield_cardinality(&self) -> Option<Cardinality> {
|
||||
self.fast
|
||||
}
|
||||
|
||||
/// Set the field as normed.
|
||||
///
|
||||
/// Setting an integer as normed will generate
|
||||
@@ -87,8 +97,8 @@ impl IpAddrOptions {
|
||||
/// If more than one value is associated with a fast field, only the last one is
|
||||
/// kept.
|
||||
#[must_use]
|
||||
pub fn set_fast(mut self) -> Self {
|
||||
self.fast = true;
|
||||
pub fn set_fast(mut self, cardinality: Cardinality) -> Self {
|
||||
self.fast = Some(cardinality);
|
||||
self
|
||||
}
|
||||
}
|
||||
@@ -105,7 +115,7 @@ impl From<FastFlag> for IpAddrOptions {
|
||||
fieldnorms: false,
|
||||
indexed: false,
|
||||
stored: false,
|
||||
fast: true,
|
||||
fast: Some(Cardinality::SingleValue),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -116,7 +126,7 @@ impl From<StoredFlag> for IpAddrOptions {
|
||||
fieldnorms: false,
|
||||
indexed: false,
|
||||
stored: true,
|
||||
fast: false,
|
||||
fast: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -127,7 +137,7 @@ impl From<IndexedFlag> for IpAddrOptions {
|
||||
fieldnorms: true,
|
||||
indexed: true,
|
||||
stored: false,
|
||||
fast: false,
|
||||
fast: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -141,7 +151,7 @@ impl<T: Into<IpAddrOptions>> BitOr<T> for IpAddrOptions {
|
||||
fieldnorms: self.fieldnorms | other.fieldnorms,
|
||||
indexed: self.indexed | other.indexed,
|
||||
stored: self.stored | other.stored,
|
||||
fast: self.fast | other.fast,
|
||||
fast: self.fast.or(other.fast),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -141,9 +141,9 @@ pub use self::index_record_option::IndexRecordOption;
|
||||
pub use self::ip_options::{IntoIpv6Addr, IpAddrOptions};
|
||||
pub use self::json_object_options::JsonObjectOptions;
|
||||
pub use self::named_field_document::NamedFieldDocument;
|
||||
#[allow(deprecated)]
|
||||
pub use self::numeric_options::IntOptions;
|
||||
pub use self::numeric_options::NumericOptions;
|
||||
#[allow(deprecated)]
|
||||
pub use self::numeric_options::{Cardinality, IntOptions};
|
||||
pub use self::schema::{DocParsingError, Schema, SchemaBuilder};
|
||||
pub use self::term::Term;
|
||||
pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};
|
||||
|
||||
@@ -4,6 +4,18 @@ use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::schema::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag};
|
||||
|
||||
/// Express whether a field is single-value or multi-valued.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Debug, Serialize, Deserialize)]
|
||||
pub enum Cardinality {
|
||||
/// The document must have exactly one value associated with the document.
|
||||
#[serde(rename = "single")]
|
||||
SingleValue,
|
||||
/// The document can have any number of values associated with the document.
|
||||
/// This is more memory and CPU expensive than the `SingleValue` solution.
|
||||
#[serde(rename = "multi")]
|
||||
MultiValues,
|
||||
}
|
||||
|
||||
#[deprecated(since = "0.17.0", note = "Use NumericOptions instead.")]
|
||||
/// Deprecated use [`NumericOptions`] instead.
|
||||
pub type IntOptions = NumericOptions;
|
||||
@@ -15,7 +27,8 @@ pub struct NumericOptions {
|
||||
indexed: bool,
|
||||
// This boolean has no effect if the field is not marked as indexed too.
|
||||
fieldnorms: bool, // This attribute only has an effect if indexed is true.
|
||||
fast: bool,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
fast: Option<Cardinality>,
|
||||
stored: bool,
|
||||
}
|
||||
|
||||
@@ -29,7 +42,8 @@ struct NumericOptionsDeser {
|
||||
indexed: bool,
|
||||
#[serde(default)]
|
||||
fieldnorms: Option<bool>, // This attribute only has an effect if indexed is true.
|
||||
fast: bool,
|
||||
#[serde(default)]
|
||||
fast: Option<Cardinality>,
|
||||
stored: bool,
|
||||
}
|
||||
|
||||
@@ -60,9 +74,18 @@ impl NumericOptions {
|
||||
self.fieldnorms && self.indexed
|
||||
}
|
||||
|
||||
/// Returns true iff the value is a fast field and multivalue.
|
||||
pub fn is_multivalue_fast(&self) -> bool {
|
||||
if let Some(cardinality) = self.fast {
|
||||
cardinality == Cardinality::MultiValues
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true iff the value is a fast field.
|
||||
pub fn is_fast(&self) -> bool {
|
||||
self.fast
|
||||
self.fast.is_some()
|
||||
}
|
||||
|
||||
/// Set the field as stored.
|
||||
@@ -104,10 +127,18 @@ impl NumericOptions {
|
||||
/// If more than one value is associated with a fast field, only the last one is
|
||||
/// kept.
|
||||
#[must_use]
|
||||
pub fn set_fast(mut self) -> NumericOptions {
|
||||
self.fast = true;
|
||||
pub fn set_fast(mut self, cardinality: Cardinality) -> NumericOptions {
|
||||
self.fast = Some(cardinality);
|
||||
self
|
||||
}
|
||||
|
||||
/// Returns the cardinality of the fastfield.
|
||||
///
|
||||
/// If the field has not been declared as a fastfield, then
|
||||
/// the method returns `None`.
|
||||
pub fn get_fastfield_cardinality(&self) -> Option<Cardinality> {
|
||||
self.fast
|
||||
}
|
||||
}
|
||||
|
||||
impl From<()> for NumericOptions {
|
||||
@@ -122,7 +153,7 @@ impl From<FastFlag> for NumericOptions {
|
||||
indexed: false,
|
||||
fieldnorms: false,
|
||||
stored: false,
|
||||
fast: true,
|
||||
fast: Some(Cardinality::SingleValue),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -133,7 +164,7 @@ impl From<StoredFlag> for NumericOptions {
|
||||
indexed: false,
|
||||
fieldnorms: false,
|
||||
stored: true,
|
||||
fast: false,
|
||||
fast: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -144,7 +175,7 @@ impl From<IndexedFlag> for NumericOptions {
|
||||
indexed: true,
|
||||
fieldnorms: true,
|
||||
stored: false,
|
||||
fast: false,
|
||||
fast: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -158,7 +189,7 @@ impl<T: Into<NumericOptions>> BitOr<T> for NumericOptions {
|
||||
indexed: self.indexed | other.indexed,
|
||||
fieldnorms: self.fieldnorms | other.fieldnorms,
|
||||
stored: self.stored | other.stored,
|
||||
fast: self.fast | other.fast,
|
||||
fast: self.fast.or(other.fast),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -190,7 +221,7 @@ mod tests {
|
||||
&NumericOptions {
|
||||
indexed: true,
|
||||
fieldnorms: true,
|
||||
fast: false,
|
||||
fast: None,
|
||||
stored: false
|
||||
}
|
||||
);
|
||||
@@ -208,7 +239,7 @@ mod tests {
|
||||
&NumericOptions {
|
||||
indexed: false,
|
||||
fieldnorms: false,
|
||||
fast: false,
|
||||
fast: None,
|
||||
stored: false
|
||||
}
|
||||
);
|
||||
@@ -227,7 +258,7 @@ mod tests {
|
||||
&NumericOptions {
|
||||
indexed: true,
|
||||
fieldnorms: false,
|
||||
fast: false,
|
||||
fast: None,
|
||||
stored: false
|
||||
}
|
||||
);
|
||||
@@ -247,7 +278,7 @@ mod tests {
|
||||
&NumericOptions {
|
||||
indexed: false,
|
||||
fieldnorms: true,
|
||||
fast: false,
|
||||
fast: None,
|
||||
stored: false
|
||||
}
|
||||
);
|
||||
|
||||
@@ -484,6 +484,7 @@ mod tests {
|
||||
use serde_json;
|
||||
|
||||
use crate::schema::field_type::ValueParsingError;
|
||||
use crate::schema::numeric_options::Cardinality::SingleValue;
|
||||
use crate::schema::schema::DocParsingError::InvalidJson;
|
||||
use crate::schema::*;
|
||||
|
||||
@@ -505,13 +506,19 @@ mod tests {
|
||||
#[test]
|
||||
pub fn test_schema_serialization() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let count_options = NumericOptions::default().set_stored().set_fast();
|
||||
let popularity_options = NumericOptions::default().set_stored().set_fast();
|
||||
let count_options = NumericOptions::default()
|
||||
.set_stored()
|
||||
.set_fast(Cardinality::SingleValue);
|
||||
let popularity_options = NumericOptions::default()
|
||||
.set_stored()
|
||||
.set_fast(Cardinality::SingleValue);
|
||||
let score_options = NumericOptions::default()
|
||||
.set_indexed()
|
||||
.set_fieldnorm()
|
||||
.set_fast();
|
||||
let is_read_options = NumericOptions::default().set_stored().set_fast();
|
||||
.set_fast(Cardinality::SingleValue);
|
||||
let is_read_options = NumericOptions::default()
|
||||
.set_stored()
|
||||
.set_fast(Cardinality::SingleValue);
|
||||
schema_builder.add_text_field("title", TEXT);
|
||||
schema_builder.add_text_field(
|
||||
"author",
|
||||
@@ -636,8 +643,12 @@ mod tests {
|
||||
#[test]
|
||||
pub fn test_document_to_json() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let count_options = NumericOptions::default().set_stored().set_fast();
|
||||
let is_read_options = NumericOptions::default().set_stored().set_fast();
|
||||
let count_options = NumericOptions::default()
|
||||
.set_stored()
|
||||
.set_fast(Cardinality::SingleValue);
|
||||
let is_read_options = NumericOptions::default()
|
||||
.set_stored()
|
||||
.set_fast(Cardinality::SingleValue);
|
||||
schema_builder.add_text_field("title", TEXT);
|
||||
schema_builder.add_text_field("author", STRING);
|
||||
schema_builder.add_u64_field("count", count_options);
|
||||
@@ -737,9 +748,15 @@ mod tests {
|
||||
#[test]
|
||||
pub fn test_parse_document() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let count_options = NumericOptions::default().set_stored().set_fast();
|
||||
let popularity_options = NumericOptions::default().set_stored().set_fast();
|
||||
let score_options = NumericOptions::default().set_indexed().set_fast();
|
||||
let count_options = NumericOptions::default()
|
||||
.set_stored()
|
||||
.set_fast(Cardinality::SingleValue);
|
||||
let popularity_options = NumericOptions::default()
|
||||
.set_stored()
|
||||
.set_fast(Cardinality::SingleValue);
|
||||
let score_options = NumericOptions::default()
|
||||
.set_indexed()
|
||||
.set_fast(Cardinality::SingleValue);
|
||||
let title_field = schema_builder.add_text_field("title", TEXT);
|
||||
let author_field = schema_builder.add_text_field("author", STRING);
|
||||
let count_field = schema_builder.add_u64_field("count", count_options);
|
||||
@@ -890,7 +907,7 @@ mod tests {
|
||||
.set_stored()
|
||||
.set_indexed()
|
||||
.set_fieldnorm()
|
||||
.set_fast();
|
||||
.set_fast(SingleValue);
|
||||
schema_builder.add_text_field("_id", id_options);
|
||||
schema_builder.add_date_field("_timestamp", timestamp_options);
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user