mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-27 20:42:54 +00:00
Compare commits
24 Commits
optional_c
...
revert-171
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
08a723d936 | ||
|
|
a05a0035f8 | ||
|
|
976128a412 | ||
|
|
f27b3e312d | ||
|
|
56dea6f08d | ||
|
|
789d29cf45 | ||
|
|
a36b50d825 | ||
|
|
09f65e5467 | ||
|
|
2c2f5c3877 | ||
|
|
96c93a6ba3 | ||
|
|
11b01e4141 | ||
|
|
3e8852c606 | ||
|
|
725f1ecb80 | ||
|
|
afa27afe7d | ||
|
|
495824361a | ||
|
|
485a8f507e | ||
|
|
1119e59eae | ||
|
|
ee1f2c1f28 | ||
|
|
600548fd26 | ||
|
|
9929c0c221 | ||
|
|
f53e65648b | ||
|
|
0281b22b77 | ||
|
|
a05c184830 | ||
|
|
0b40a7fe43 |
@@ -1,9 +1,13 @@
|
||||
Tantivy 0.19
|
||||
================================
|
||||
#### Bugfixes
|
||||
- Fix missing fieldnorms for u64, i64, f64, bool, bytes and date [#1620](https://github.com/quickwit-oss/tantivy/pull/1620) (@PSeitz)
|
||||
- Fix interpolation overflow in linear interpolation fastfield codec [#1480](https://github.com/quickwit-oss/tantivy/pull/1480 (@PSeitz @fulmicoton)
|
||||
|
||||
#### Features/Improvements
|
||||
- Add support for `IN` in queryparser , e.g. `field: IN [val1 val2 val3]` [#1683](https://github.com/quickwit-oss/tantivy/pull/1683) (@trinity-1686a)
|
||||
- Skip score calculation, when no scoring is required [#1646](https://github.com/quickwit-oss/tantivy/pull/1646) (@PSeitz)
|
||||
- Limit fast fields to u32 (`get_val(u32)`) [#1644](https://github.com/quickwit-oss/tantivy/pull/1644) (@PSeitz)
|
||||
- Major bugfix: Fix missing fieldnorms for u64, i64, f64, bool, bytes and date [#1620](https://github.com/quickwit-oss/tantivy/pull/1620) (@PSeitz)
|
||||
- Updated [Date Field Type](https://github.com/quickwit-oss/tantivy/pull/1396)
|
||||
The `DateTime` type has been updated to hold timestamps with microseconds precision.
|
||||
`DateOptions` and `DatePrecision` have been added to configure Date fields. The precision is used to hint on fast values compression. Otherwise, seconds precision is used everywhere else (i.e terms, indexing). (@evanxg852000)
|
||||
@@ -11,7 +15,6 @@ Tantivy 0.19
|
||||
- Add boolean field type [#1382](https://github.com/quickwit-oss/tantivy/pull/1382) (@boraarslan)
|
||||
- Remove Searcher pool and make `Searcher` cloneable. (@PSeitz)
|
||||
- Validate settings on create [#1570](https://github.com/quickwit-oss/tantivy/pull/1570 (@PSeitz)
|
||||
- Fix interpolation overflow in linear interpolation fastfield codec [#1480](https://github.com/quickwit-oss/tantivy/pull/1480 (@PSeitz @fulmicoton)
|
||||
- Detect and apply gcd on fastfield codecs [#1418](https://github.com/quickwit-oss/tantivy/pull/1418) (@PSeitz)
|
||||
- Doc store
|
||||
- use separate thread to compress block store [#1389](https://github.com/quickwit-oss/tantivy/pull/1389) [#1510](https://github.com/quickwit-oss/tantivy/pull/1510 (@PSeitz @fulmicoton)
|
||||
@@ -21,6 +24,7 @@ Tantivy 0.19
|
||||
- Make `tantivy::TantivyError` cloneable [#1402](https://github.com/quickwit-oss/tantivy/pull/1402) (@PSeitz)
|
||||
- Add support for phrase slop in query language [#1393](https://github.com/quickwit-oss/tantivy/pull/1393) (@saroh)
|
||||
- Aggregation
|
||||
- Add aggregation support for date type [#1693](https://github.com/quickwit-oss/tantivy/pull/1693)(@PSeitz)
|
||||
- Add support for keyed parameter in range and histgram aggregations [#1424](https://github.com/quickwit-oss/tantivy/pull/1424) (@k-yomo)
|
||||
- Add aggregation bucket limit [#1363](https://github.com/quickwit-oss/tantivy/pull/1363) (@PSeitz)
|
||||
- Faster indexing
|
||||
|
||||
17
Cargo.toml
17
Cargo.toml
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy"
|
||||
version = "0.19.0-dev"
|
||||
version = "0.19.0"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
@@ -25,7 +25,7 @@ tantivy-fst = "0.4.0"
|
||||
memmap2 = { version = "0.5.3", optional = true }
|
||||
lz4_flex = { version = "0.9.2", default-features = false, features = ["checked-decode"], optional = true }
|
||||
brotli = { version = "3.3.4", optional = true }
|
||||
zstd = { version = "0.11", optional = true, default-features = false }
|
||||
zstd = { version = "0.12", optional = true, default-features = false }
|
||||
snap = { version = "1.0.5", optional = true }
|
||||
tempfile = { version = "3.3.0", optional = true }
|
||||
log = "0.4.16"
|
||||
@@ -36,11 +36,6 @@ fs2 = { version = "0.4.3", optional = true }
|
||||
levenshtein_automata = "0.2.1"
|
||||
uuid = { version = "1.0.0", features = ["v4", "serde"] }
|
||||
crossbeam-channel = "0.5.4"
|
||||
tantivy-query-grammar = { version="0.18.0", path="./query-grammar" }
|
||||
tantivy-bitpacker = { version="0.2", path="./bitpacker" }
|
||||
common = { version = "0.3", path = "./common/", package = "tantivy-common" }
|
||||
fastfield_codecs = { version="0.2", path="./fastfield_codecs", default-features = false }
|
||||
ownedbytes = { version="0.3", path="./ownedbytes" }
|
||||
stable_deref_trait = "1.2.0"
|
||||
rust-stemmers = "1.2.0"
|
||||
downcast-rs = "1.2.0"
|
||||
@@ -62,6 +57,12 @@ ciborium = { version = "0.2", optional = true}
|
||||
async-trait = "0.1.53"
|
||||
arc-swap = "1.5.0"
|
||||
|
||||
tantivy-query-grammar = { version= "0.19.0", path="./query-grammar" }
|
||||
tantivy-bitpacker = { version= "0.3", path="./bitpacker" }
|
||||
common = { version= "0.4", path = "./common/", package = "tantivy-common" }
|
||||
fastfield_codecs = { version= "0.3", path="./fastfield_codecs", default-features = false }
|
||||
ownedbytes = { version= "0.4", path="./ownedbytes" }
|
||||
|
||||
[target.'cfg(windows)'.dependencies]
|
||||
winapi = "0.3.9"
|
||||
|
||||
@@ -73,7 +74,7 @@ pretty_assertions = "1.2.1"
|
||||
proptest = "1.0.0"
|
||||
criterion = "0.4"
|
||||
test-log = "0.2.10"
|
||||
env_logger = "0.9.0"
|
||||
env_logger = "0.10.0"
|
||||
pprof = { version = "0.11.0", features = ["flamegraph", "criterion"] }
|
||||
futures = "0.3.21"
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy-bitpacker"
|
||||
version = "0.2.0"
|
||||
version = "0.3.0"
|
||||
edition = "2021"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
@@ -8,6 +8,8 @@ categories = []
|
||||
description = """Tantivy-sub crate: bitpacking"""
|
||||
repository = "https://github.com/quickwit-oss/tantivy"
|
||||
keywords = []
|
||||
documentation = "https://docs.rs/tantivy-bitpacker/latest/tantivy_bitpacker"
|
||||
homepage = "https://github.com/quickwit-oss/tantivy"
|
||||
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
@@ -1,16 +1,20 @@
|
||||
[package]
|
||||
name = "tantivy-common"
|
||||
version = "0.3.0"
|
||||
version = "0.4.0"
|
||||
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
|
||||
license = "MIT"
|
||||
edition = "2021"
|
||||
description = "common traits and utility functions used by multiple tantivy subcrates"
|
||||
documentation = "https://docs.rs/tantivy_common/"
|
||||
homepage = "https://github.com/quickwit-oss/tantivy"
|
||||
repository = "https://github.com/quickwit-oss/tantivy"
|
||||
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
byteorder = "1.4.3"
|
||||
ownedbytes = { version="0.3", path="../ownedbytes" }
|
||||
ownedbytes = { version= "0.4", path="../ownedbytes" }
|
||||
|
||||
[dev-dependencies]
|
||||
proptest = "1.0.0"
|
||||
|
||||
@@ -94,6 +94,20 @@ impl FixedSize for u32 {
|
||||
const SIZE_IN_BYTES: usize = 4;
|
||||
}
|
||||
|
||||
impl BinarySerializable for u16 {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
writer.write_u16::<Endianness>(*self)
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<u16> {
|
||||
reader.read_u16::<Endianness>()
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for u16 {
|
||||
const SIZE_IN_BYTES: usize = 2;
|
||||
}
|
||||
|
||||
impl BinarySerializable for u64 {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
writer.write_u64::<Endianness>(*self)
|
||||
|
||||
@@ -118,7 +118,7 @@ fn main() -> tantivy::Result<()> {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use fastfield_codecs::OptionalColumn;
|
||||
use fastfield_codecs::Column;
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
use tantivy::collector::{Collector, SegmentCollector};
|
||||
@@ -97,7 +97,7 @@ impl Collector for StatsCollector {
|
||||
}
|
||||
|
||||
struct StatsSegmentCollector {
|
||||
fast_field_reader: Arc<dyn OptionalColumn<u64>>,
|
||||
fast_field_reader: Arc<dyn Column<u64>>,
|
||||
stats: Stats,
|
||||
}
|
||||
|
||||
@@ -105,12 +105,10 @@ impl SegmentCollector for StatsSegmentCollector {
|
||||
type Fruit = Option<Stats>;
|
||||
|
||||
fn collect(&mut self, doc: u32, _score: Score) {
|
||||
if let Some(value) = self.fast_field_reader.get_val(doc) {
|
||||
let value = value as f64;
|
||||
self.stats.count += 1;
|
||||
self.stats.sum += value;
|
||||
self.stats.squared_sum += value * value;
|
||||
}
|
||||
let value = self.fast_field_reader.get_val(doc) as f64;
|
||||
self.stats.count += 1;
|
||||
self.stats.sum += value;
|
||||
self.stats.squared_sum += value * value;
|
||||
}
|
||||
|
||||
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
||||
|
||||
@@ -51,7 +51,7 @@ impl Warmer for DynamicPriceColumn {
|
||||
let product_id_reader = segment.fast_fields().u64(self.field)?;
|
||||
let product_ids: Vec<ProductId> = segment
|
||||
.doc_ids_alive()
|
||||
.flat_map(|doc| product_id_reader.get_val(doc))
|
||||
.map(|doc| product_id_reader.get_val(doc))
|
||||
.collect();
|
||||
let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
|
||||
let mut price_vals: Vec<Price> = Vec::new();
|
||||
|
||||
@@ -1,17 +1,20 @@
|
||||
[package]
|
||||
name = "fastfield_codecs"
|
||||
version = "0.2.0"
|
||||
version = "0.3.0"
|
||||
authors = ["Pascal Seitz <pascal@quickwit.io>"]
|
||||
license = "MIT"
|
||||
edition = "2021"
|
||||
description = "Fast field codecs used by tantivy"
|
||||
documentation = "https://docs.rs/fastfield_codecs/"
|
||||
homepage = "https://github.com/quickwit-oss/tantivy"
|
||||
repository = "https://github.com/quickwit-oss/tantivy"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
common = { version = "0.3", path = "../common/", package = "tantivy-common" }
|
||||
tantivy-bitpacker = { version="0.2", path = "../bitpacker/" }
|
||||
ownedbytes = { version = "0.3.0", path = "../ownedbytes" }
|
||||
common = { version = "0.4", path = "../common/", package = "tantivy-common" }
|
||||
tantivy-bitpacker = { version= "0.3", path = "../bitpacker/" }
|
||||
ownedbytes = { version = "0.4.0", path = "../ownedbytes" }
|
||||
prettytable-rs = {version="0.9.0", optional= true}
|
||||
rand = {version="0.8.3", optional= true}
|
||||
fastdivide = "0.4"
|
||||
|
||||
@@ -41,7 +41,7 @@ mod tests {
|
||||
) -> Arc<dyn Column<T>> {
|
||||
let mut buffer = Vec::new();
|
||||
serialize(VecColumn::from(&column), &mut buffer, &ALL_CODEC_TYPES).unwrap();
|
||||
open(OwnedBytes::new(buffer)).unwrap().to_full().unwrap()
|
||||
open(OwnedBytes::new(buffer)).unwrap()
|
||||
}
|
||||
|
||||
#[bench]
|
||||
@@ -103,7 +103,7 @@ mod tests {
|
||||
let iter_gen = || data.iter().cloned();
|
||||
serialize_u128(iter_gen, data.len() as u32, &mut out).unwrap();
|
||||
let out = OwnedBytes::new(out);
|
||||
open_u128::<u128>(out).unwrap().to_full().unwrap()
|
||||
open_u128::<u128>(out).unwrap()
|
||||
}
|
||||
|
||||
#[bench]
|
||||
|
||||
@@ -456,6 +456,8 @@ impl CompactSpaceDecompressor {
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::format_version::read_format_version;
|
||||
use crate::null_index_footer::read_null_index_footer;
|
||||
use crate::serialize::U128Header;
|
||||
use crate::{open_u128, serialize_u128};
|
||||
|
||||
@@ -541,7 +543,10 @@ mod tests {
|
||||
.unwrap();
|
||||
|
||||
let data = OwnedBytes::new(out);
|
||||
let (data, _format_version) = read_format_version(data).unwrap();
|
||||
let (data, _null_index_footer) = read_null_index_footer(data).unwrap();
|
||||
test_all(data.clone(), u128_vals);
|
||||
|
||||
data
|
||||
}
|
||||
|
||||
@@ -559,6 +564,7 @@ mod tests {
|
||||
333u128,
|
||||
];
|
||||
let mut data = test_aux_vals(vals);
|
||||
|
||||
let _header = U128Header::deserialize(&mut data);
|
||||
let decomp = CompactSpaceDecompressor::open(data).unwrap();
|
||||
let complete_range = 0..vals.len() as u32;
|
||||
@@ -731,10 +737,7 @@ mod tests {
|
||||
];
|
||||
let mut out = Vec::new();
|
||||
serialize_u128(|| vals.iter().cloned(), vals.len() as u32, &mut out).unwrap();
|
||||
let decomp = open_u128::<u128>(OwnedBytes::new(out))
|
||||
.unwrap()
|
||||
.to_full()
|
||||
.unwrap();
|
||||
let decomp = open_u128::<u128>(OwnedBytes::new(out)).unwrap();
|
||||
let complete_range = 0..vals.len() as u32;
|
||||
|
||||
assert_eq!(
|
||||
|
||||
39
fastfield_codecs/src/format_version.rs
Normal file
39
fastfield_codecs/src/format_version.rs
Normal file
@@ -0,0 +1,39 @@
|
||||
use std::io;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use ownedbytes::OwnedBytes;
|
||||
|
||||
const MAGIC_NUMBER: u16 = 4335u16;
|
||||
const FASTFIELD_FORMAT_VERSION: u8 = 1;
|
||||
|
||||
pub(crate) fn append_format_version(output: &mut impl io::Write) -> io::Result<()> {
|
||||
FASTFIELD_FORMAT_VERSION.serialize(output)?;
|
||||
MAGIC_NUMBER.serialize(output)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn read_format_version(data: OwnedBytes) -> io::Result<(OwnedBytes, u8)> {
|
||||
let (data, magic_number_bytes) = data.rsplit(2);
|
||||
|
||||
let magic_number = u16::deserialize(&mut magic_number_bytes.as_slice())?;
|
||||
if magic_number != MAGIC_NUMBER {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
format!("magic number mismatch {} != {}", magic_number, MAGIC_NUMBER),
|
||||
));
|
||||
}
|
||||
let (data, format_version_bytes) = data.rsplit(1);
|
||||
let format_version = u8::deserialize(&mut format_version_bytes.as_slice())?;
|
||||
if format_version > FASTFIELD_FORMAT_VERSION {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
format!(
|
||||
"Unsupported fastfield format version: {}. Max supported version: {}",
|
||||
format_version, FASTFIELD_FORMAT_VERSION
|
||||
),
|
||||
));
|
||||
}
|
||||
|
||||
Ok((data, format_version))
|
||||
}
|
||||
@@ -59,11 +59,11 @@ mod tests {
|
||||
crate::serialize(VecColumn::from(&vals), &mut buffer, &[codec_type])?;
|
||||
let buffer = OwnedBytes::new(buffer);
|
||||
let column = crate::open::<i64>(buffer.clone())?;
|
||||
assert_eq!(column.get_val(0), Some(-4000i64));
|
||||
assert_eq!(column.get_val(1), Some(-3000i64));
|
||||
assert_eq!(column.get_val(2), Some(-2000i64));
|
||||
assert_eq!(column.max_value(), Some((num_vals as i64 - 5) * 1000));
|
||||
assert_eq!(column.min_value(), Some(-4000i64));
|
||||
assert_eq!(column.get_val(0), -4000i64);
|
||||
assert_eq!(column.get_val(1), -3000i64);
|
||||
assert_eq!(column.get_val(2), -2000i64);
|
||||
assert_eq!(column.max_value(), (num_vals as i64 - 5) * 1000);
|
||||
assert_eq!(column.min_value(), -4000i64);
|
||||
|
||||
// Can't apply gcd
|
||||
let mut buffer_without_gcd = Vec::new();
|
||||
@@ -101,11 +101,11 @@ mod tests {
|
||||
crate::serialize(VecColumn::from(&vals), &mut buffer, &[codec_type])?;
|
||||
let buffer = OwnedBytes::new(buffer);
|
||||
let column = crate::open::<u64>(buffer.clone())?;
|
||||
assert_eq!(column.get_val(0), Some(1000u64));
|
||||
assert_eq!(column.get_val(1), Some(2000u64));
|
||||
assert_eq!(column.get_val(2), Some(3000u64));
|
||||
assert_eq!(column.max_value(), Some(num_vals as u64 * 1000));
|
||||
assert_eq!(column.min_value(), Some(1000u64));
|
||||
assert_eq!(column.get_val(0), 1000u64);
|
||||
assert_eq!(column.get_val(1), 2000u64);
|
||||
assert_eq!(column.get_val(2), 3000u64);
|
||||
assert_eq!(column.max_value(), num_vals as u64 * 1000);
|
||||
assert_eq!(column.min_value(), 1000u64);
|
||||
|
||||
// Can't apply gcd
|
||||
let mut buffer_without_gcd = Vec::new();
|
||||
|
||||
@@ -20,23 +20,24 @@ use std::sync::Arc;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use compact_space::CompactSpaceDecompressor;
|
||||
use format_version::read_format_version;
|
||||
use monotonic_mapping::{
|
||||
StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
|
||||
StrictlyMonotonicMappingToInternalBaseval, StrictlyMonotonicMappingToInternalGCDBaseval,
|
||||
};
|
||||
pub use optional_column::OptionalColumn;
|
||||
use optional_column::ToOptionalColumn;
|
||||
use null_index_footer::read_null_index_footer;
|
||||
use ownedbytes::OwnedBytes;
|
||||
use serialize::{Header, U128Header};
|
||||
|
||||
mod bitpacked;
|
||||
mod blockwise_linear;
|
||||
mod compact_space;
|
||||
mod format_version;
|
||||
mod line;
|
||||
mod linear;
|
||||
mod monotonic_mapping;
|
||||
mod monotonic_mapping_u128;
|
||||
mod optional_column;
|
||||
mod null_index_footer;
|
||||
|
||||
mod column;
|
||||
mod gcd;
|
||||
@@ -132,23 +133,22 @@ impl U128FastFieldCodecType {
|
||||
|
||||
/// Returns the correct codec reader wrapped in the `Arc` for the data.
|
||||
pub fn open_u128<Item: MonotonicallyMappableToU128>(
|
||||
mut bytes: OwnedBytes,
|
||||
) -> io::Result<Arc<dyn OptionalColumn<Item>>> {
|
||||
bytes: OwnedBytes,
|
||||
) -> io::Result<Arc<dyn Column<Item>>> {
|
||||
let (bytes, _format_version) = read_format_version(bytes)?;
|
||||
let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
|
||||
let header = U128Header::deserialize(&mut bytes)?;
|
||||
assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace);
|
||||
let reader = CompactSpaceDecompressor::open(bytes)?;
|
||||
let inverted: StrictlyMonotonicMappingInverter<StrictlyMonotonicMappingToInternal<Item>> =
|
||||
StrictlyMonotonicMappingToInternal::<Item>::new().into();
|
||||
|
||||
Ok(Arc::new(ToOptionalColumn::new(Arc::new(
|
||||
monotonic_map_column(reader, inverted),
|
||||
))))
|
||||
Ok(Arc::new(monotonic_map_column(reader, inverted)))
|
||||
}
|
||||
|
||||
/// Returns the correct codec reader wrapped in the `Arc` for the data.
|
||||
pub fn open<T: MonotonicallyMappableToU64>(
|
||||
mut bytes: OwnedBytes,
|
||||
) -> io::Result<Arc<dyn OptionalColumn<T>>> {
|
||||
pub fn open<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::Result<Arc<dyn Column<T>>> {
|
||||
let (bytes, _format_version) = read_format_version(bytes)?;
|
||||
let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
|
||||
let header = Header::deserialize(&mut bytes)?;
|
||||
match header.codec_type {
|
||||
FastFieldCodecType::Bitpacked => open_specific_codec::<BitpackedCodec, _>(bytes, &header),
|
||||
@@ -162,7 +162,7 @@ pub fn open<T: MonotonicallyMappableToU64>(
|
||||
fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64>(
|
||||
bytes: OwnedBytes,
|
||||
header: &Header,
|
||||
) -> io::Result<Arc<dyn OptionalColumn<Item>>> {
|
||||
) -> io::Result<Arc<dyn Column<Item>>> {
|
||||
let normalized_header = header.normalized();
|
||||
let reader = C::open_from_bytes(bytes, normalized_header)?;
|
||||
let min_value = header.min_value;
|
||||
@@ -170,16 +170,12 @@ fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64>(
|
||||
let mapping = StrictlyMonotonicMappingInverter::from(
|
||||
StrictlyMonotonicMappingToInternalGCDBaseval::new(gcd.get(), min_value),
|
||||
);
|
||||
Ok(Arc::new(ToOptionalColumn::new(Arc::new(
|
||||
monotonic_map_column(reader, mapping),
|
||||
))))
|
||||
Ok(Arc::new(monotonic_map_column(reader, mapping)))
|
||||
} else {
|
||||
let mapping = StrictlyMonotonicMappingInverter::from(
|
||||
StrictlyMonotonicMappingToInternalBaseval::new(min_value),
|
||||
);
|
||||
Ok(Arc::new(ToOptionalColumn::new(Arc::new(
|
||||
monotonic_map_column(reader, mapping),
|
||||
))))
|
||||
Ok(Arc::new(monotonic_map_column(reader, mapping)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -250,9 +246,8 @@ mod tests {
|
||||
for (doc, orig_val) in data.iter().copied().enumerate() {
|
||||
let val = reader.get_val(doc as u32);
|
||||
assert_eq!(
|
||||
val,
|
||||
Some(orig_val),
|
||||
"val `{val:?}` does not match orig_val {orig_val:?}, in data set {name}, data \
|
||||
val, orig_val,
|
||||
"val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data \
|
||||
`{data:?}`",
|
||||
);
|
||||
}
|
||||
|
||||
@@ -113,10 +113,7 @@ fn bench_ip() {
|
||||
(data.len() * 8) as f32 / dataset.len() as f32
|
||||
);
|
||||
|
||||
let decompressor = open_u128::<u128>(OwnedBytes::new(data))
|
||||
.unwrap()
|
||||
.to_full()
|
||||
.unwrap();
|
||||
let decompressor = open_u128::<u128>(OwnedBytes::new(data)).unwrap();
|
||||
// Sample some ranges
|
||||
let mut doc_values = Vec::new();
|
||||
for value in dataset.iter().take(1110).skip(1100).cloned() {
|
||||
|
||||
144
fastfield_codecs/src/null_index_footer.rs
Normal file
144
fastfield_codecs/src/null_index_footer.rs
Normal file
@@ -0,0 +1,144 @@
|
||||
use std::io::{self, Write};
|
||||
use std::ops::Range;
|
||||
|
||||
use common::{BinarySerializable, CountingWriter, VInt};
|
||||
use ownedbytes::OwnedBytes;
|
||||
|
||||
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
|
||||
pub(crate) enum FastFieldCardinality {
|
||||
Single = 1,
|
||||
}
|
||||
|
||||
impl BinarySerializable for FastFieldCardinality {
|
||||
fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
|
||||
self.to_code().serialize(wrt)
|
||||
}
|
||||
|
||||
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let code = u8::deserialize(reader)?;
|
||||
let codec_type: Self = Self::from_code(code)
|
||||
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code `{code}.`"))?;
|
||||
Ok(codec_type)
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldCardinality {
|
||||
pub(crate) fn to_code(self) -> u8 {
|
||||
self as u8
|
||||
}
|
||||
|
||||
pub(crate) fn from_code(code: u8) -> Option<Self> {
|
||||
match code {
|
||||
1 => Some(Self::Single),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub(crate) enum NullIndexCodec {
|
||||
Full = 1,
|
||||
}
|
||||
|
||||
impl BinarySerializable for NullIndexCodec {
|
||||
fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
|
||||
self.to_code().serialize(wrt)
|
||||
}
|
||||
|
||||
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let code = u8::deserialize(reader)?;
|
||||
let codec_type: Self = Self::from_code(code)
|
||||
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code `{code}.`"))?;
|
||||
Ok(codec_type)
|
||||
}
|
||||
}
|
||||
|
||||
impl NullIndexCodec {
|
||||
pub(crate) fn to_code(self) -> u8 {
|
||||
self as u8
|
||||
}
|
||||
|
||||
pub(crate) fn from_code(code: u8) -> Option<Self> {
|
||||
match code {
|
||||
1 => Some(Self::Full),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||
pub(crate) struct NullIndexFooter {
|
||||
pub(crate) cardinality: FastFieldCardinality,
|
||||
pub(crate) null_index_codec: NullIndexCodec,
|
||||
// Unused for NullIndexCodec::Full
|
||||
pub(crate) null_index_byte_range: Range<u64>,
|
||||
}
|
||||
|
||||
impl BinarySerializable for NullIndexFooter {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
self.cardinality.serialize(writer)?;
|
||||
self.null_index_codec.serialize(writer)?;
|
||||
VInt(self.null_index_byte_range.start).serialize(writer)?;
|
||||
VInt(self.null_index_byte_range.end - self.null_index_byte_range.start)
|
||||
.serialize(writer)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let cardinality = FastFieldCardinality::deserialize(reader)?;
|
||||
let null_index_codec = NullIndexCodec::deserialize(reader)?;
|
||||
let null_index_byte_range_start = VInt::deserialize(reader)?.0;
|
||||
let null_index_byte_range_end = VInt::deserialize(reader)?.0 + null_index_byte_range_start;
|
||||
Ok(Self {
|
||||
cardinality,
|
||||
null_index_codec,
|
||||
null_index_byte_range: null_index_byte_range_start..null_index_byte_range_end,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn append_null_index_footer(
|
||||
output: &mut impl io::Write,
|
||||
null_index_footer: NullIndexFooter,
|
||||
) -> io::Result<()> {
|
||||
let mut counting_write = CountingWriter::wrap(output);
|
||||
null_index_footer.serialize(&mut counting_write)?;
|
||||
let footer_payload_len = counting_write.written_bytes();
|
||||
BinarySerializable::serialize(&(footer_payload_len as u16), &mut counting_write)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn read_null_index_footer(
|
||||
data: OwnedBytes,
|
||||
) -> io::Result<(OwnedBytes, NullIndexFooter)> {
|
||||
let (data, null_footer_length_bytes) = data.rsplit(2);
|
||||
|
||||
let footer_length = u16::deserialize(&mut null_footer_length_bytes.as_slice())?;
|
||||
let (data, null_index_footer_bytes) = data.rsplit(footer_length as usize);
|
||||
let null_index_footer = NullIndexFooter::deserialize(&mut null_index_footer_bytes.as_ref())?;
|
||||
|
||||
Ok((data, null_index_footer))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn null_index_footer_deser_test() {
|
||||
let null_index_footer = NullIndexFooter {
|
||||
cardinality: FastFieldCardinality::Single,
|
||||
null_index_codec: NullIndexCodec::Full,
|
||||
null_index_byte_range: 100..120,
|
||||
};
|
||||
|
||||
let mut out = vec![];
|
||||
null_index_footer.serialize(&mut out).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
null_index_footer,
|
||||
NullIndexFooter::deserialize(&mut &out[..]).unwrap()
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -1,119 +0,0 @@
|
||||
use std::ops::{Range, RangeInclusive};
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::Column;
|
||||
|
||||
/// `OptionalColumn` provides columnar access on a field.
|
||||
pub trait OptionalColumn<T: PartialOrd = u64>: Send + Sync {
|
||||
/// Return the value associated with the given idx.
|
||||
///
|
||||
/// This accessor should return as fast as possible.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `idx` is greater than the column length.
|
||||
fn get_val(&self, idx: u32) -> Option<T>;
|
||||
|
||||
/// Fills an output buffer with the fast field values
|
||||
/// associated with the `DocId` going from
|
||||
/// `start` to `start + output.len()`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Must panic if `start + output.len()` is greater than
|
||||
/// the segment's `maxdoc`.
|
||||
fn get_range(&self, start: u64, output: &mut [Option<T>]) {
|
||||
for (out, idx) in output.iter_mut().zip(start..) {
|
||||
*out = self.get_val(idx as u32);
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the positions of values which are in the provided range.
|
||||
fn get_docids_for_value_range(
|
||||
&self,
|
||||
value_range: RangeInclusive<T>,
|
||||
doc_id_range: Range<u32>,
|
||||
positions: &mut Vec<u32>,
|
||||
) {
|
||||
let doc_id_range = doc_id_range.start..doc_id_range.end.min(self.num_vals());
|
||||
|
||||
for idx in doc_id_range {
|
||||
let val = self.get_val(idx);
|
||||
if let Some(val) = val {
|
||||
if value_range.contains(&val) {
|
||||
positions.push(idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the minimum value for this fast field.
|
||||
///
|
||||
/// This min_value may not be exact.
|
||||
/// For instance, the min value does not take in account of possible
|
||||
/// deleted document. All values are however guaranteed to be higher than
|
||||
/// `.min_value()`.
|
||||
fn min_value(&self) -> Option<T>;
|
||||
|
||||
/// Returns the maximum value for this fast field.
|
||||
///
|
||||
/// This max_value may not be exact.
|
||||
/// For instance, the max value does not take in account of possible
|
||||
/// deleted document. All values are however guaranteed to be higher than
|
||||
/// `.max_value()`.
|
||||
fn max_value(&self) -> Option<T>;
|
||||
|
||||
/// The number of values including `None` in the column.
|
||||
fn num_vals(&self) -> u32;
|
||||
|
||||
/// Returns a iterator over the data
|
||||
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = Option<T>> + 'a> {
|
||||
Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
|
||||
}
|
||||
|
||||
/// return full column if all values are set and is not empty
|
||||
fn to_full(&self) -> Option<Arc<dyn Column<T>>> {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Temporary wrapper to migrate to optional column
|
||||
pub(crate) struct ToOptionalColumn<T> {
|
||||
column: Arc<dyn Column<T>>,
|
||||
}
|
||||
|
||||
impl<T: PartialOrd> ToOptionalColumn<T> {
|
||||
pub(crate) fn new(column: Arc<dyn Column<T>>) -> Self {
|
||||
Self { column }
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: PartialOrd> OptionalColumn<T> for ToOptionalColumn<T> {
|
||||
#[inline]
|
||||
fn get_val(&self, idx: u32) -> Option<T> {
|
||||
let val = self.column.get_val(idx);
|
||||
Some(val)
|
||||
}
|
||||
|
||||
fn min_value(&self) -> Option<T> {
|
||||
let min_value = self.column.min_value();
|
||||
Some(min_value)
|
||||
}
|
||||
|
||||
fn max_value(&self) -> Option<T> {
|
||||
let max_value = self.column.max_value();
|
||||
Some(max_value)
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u32 {
|
||||
self.column.num_vals()
|
||||
}
|
||||
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = Option<T>> + '_> {
|
||||
Box::new(self.column.iter().map(|el| Some(el)))
|
||||
}
|
||||
/// return full column if all values are set and is not empty
|
||||
fn to_full(&self) -> Option<Arc<dyn Column<T>>> {
|
||||
Some(self.column.clone())
|
||||
}
|
||||
}
|
||||
@@ -28,11 +28,15 @@ use ownedbytes::OwnedBytes;
|
||||
use crate::bitpacked::BitpackedCodec;
|
||||
use crate::blockwise_linear::BlockwiseLinearCodec;
|
||||
use crate::compact_space::CompactSpaceCompressor;
|
||||
use crate::format_version::append_format_version;
|
||||
use crate::linear::LinearCodec;
|
||||
use crate::monotonic_mapping::{
|
||||
StrictlyMonotonicFn, StrictlyMonotonicMappingToInternal,
|
||||
StrictlyMonotonicMappingToInternalGCDBaseval,
|
||||
};
|
||||
use crate::null_index_footer::{
|
||||
append_null_index_footer, FastFieldCardinality, NullIndexCodec, NullIndexFooter,
|
||||
};
|
||||
use crate::{
|
||||
monotonic_map_column, Column, FastFieldCodec, FastFieldCodecType, MonotonicallyMappableToU64,
|
||||
U128FastFieldCodecType, VecColumn, ALL_CODEC_TYPES,
|
||||
@@ -198,6 +202,14 @@ pub fn serialize_u128<F: Fn() -> I, I: Iterator<Item = u128>>(
|
||||
let compressor = CompactSpaceCompressor::train_from(iter_gen(), num_vals);
|
||||
compressor.compress_into(iter_gen(), output).unwrap();
|
||||
|
||||
let null_index_footer = NullIndexFooter {
|
||||
cardinality: FastFieldCardinality::Single,
|
||||
null_index_codec: NullIndexCodec::Full,
|
||||
null_index_byte_range: 0..0,
|
||||
};
|
||||
append_null_index_footer(output, null_index_footer)?;
|
||||
append_format_version(output)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -221,6 +233,15 @@ pub fn serialize<T: MonotonicallyMappableToU64>(
|
||||
let normalized_column = header.normalize_column(column);
|
||||
assert_eq!(normalized_column.min_value(), 0u64);
|
||||
serialize_given_codec(normalized_column, header.codec_type, output)?;
|
||||
|
||||
let null_index_footer = NullIndexFooter {
|
||||
cardinality: FastFieldCardinality::Single,
|
||||
null_index_codec: NullIndexCodec::Full,
|
||||
null_index_byte_range: 0..0,
|
||||
};
|
||||
append_null_index_footer(output, null_index_footer)?;
|
||||
append_format_version(output)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -278,10 +299,7 @@ pub fn serialize_and_load<T: MonotonicallyMappableToU64 + Ord + Default>(
|
||||
) -> Arc<dyn Column<T>> {
|
||||
let mut buffer = Vec::new();
|
||||
super::serialize(VecColumn::from(&column), &mut buffer, &ALL_CODEC_TYPES).unwrap();
|
||||
super::open(OwnedBytes::new(buffer))
|
||||
.unwrap()
|
||||
.to_full()
|
||||
.unwrap()
|
||||
super::open(OwnedBytes::new(buffer)).unwrap()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -313,7 +331,7 @@ mod tests {
|
||||
let col = VecColumn::from(&[false, true][..]);
|
||||
serialize(col, &mut buffer, &ALL_CODEC_TYPES).unwrap();
|
||||
// 5 bytes of header, 1 byte of value, 7 bytes of padding.
|
||||
assert_eq!(buffer.len(), 5 + 8);
|
||||
assert_eq!(buffer.len(), 3 + 5 + 8 + 4 + 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -322,7 +340,7 @@ mod tests {
|
||||
let col = VecColumn::from(&[true][..]);
|
||||
serialize(col, &mut buffer, &ALL_CODEC_TYPES).unwrap();
|
||||
// 5 bytes of header, 0 bytes of value, 7 bytes of padding.
|
||||
assert_eq!(buffer.len(), 5 + 7);
|
||||
assert_eq!(buffer.len(), 3 + 5 + 7 + 4 + 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -332,6 +350,6 @@ mod tests {
|
||||
let col = VecColumn::from(&vals[..]);
|
||||
serialize(col, &mut buffer, &[FastFieldCodecType::Bitpacked]).unwrap();
|
||||
// Values are stored over 3 bits.
|
||||
assert_eq!(buffer.len(), 7 + (3 * 80 / 8) + 7);
|
||||
assert_eq!(buffer.len(), 3 + 7 + (3 * 80 / 8) + 7 + 4 + 2);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,14 @@
|
||||
[package]
|
||||
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
|
||||
name = "ownedbytes"
|
||||
version = "0.3.0"
|
||||
version = "0.4.0"
|
||||
edition = "2021"
|
||||
description = "Expose data as static slice"
|
||||
license = "MIT"
|
||||
documentation = "https://docs.rs/ownedbytes/"
|
||||
homepage = "https://github.com/quickwit-oss/tantivy"
|
||||
repository = "https://github.com/quickwit-oss/tantivy"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
|
||||
@@ -80,6 +80,21 @@ impl OwnedBytes {
|
||||
(left, right)
|
||||
}
|
||||
|
||||
/// Splits the OwnedBytes into two OwnedBytes `(left, right)`.
|
||||
///
|
||||
/// Right will hold `split_len` bytes.
|
||||
///
|
||||
/// This operation is cheap and does not require to copy any memory.
|
||||
/// On the other hand, both `left` and `right` retain a handle over
|
||||
/// the entire slice of memory. In other words, the memory will only
|
||||
/// be released when both left and right are dropped.
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn rsplit(self, split_len: usize) -> (OwnedBytes, OwnedBytes) {
|
||||
let data_len = self.data.len();
|
||||
self.split(data_len - split_len)
|
||||
}
|
||||
|
||||
/// Splits the right part of the `OwnedBytes` at the given offset.
|
||||
///
|
||||
/// `self` is truncated to `split_len`, left with the remaining bytes.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy-query-grammar"
|
||||
version = "0.18.0"
|
||||
version = "0.19.0"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
|
||||
@@ -4,14 +4,14 @@ use std::rc::Rc;
|
||||
use std::sync::atomic::AtomicU32;
|
||||
use std::sync::Arc;
|
||||
|
||||
use fastfield_codecs::OptionalColumn;
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use super::agg_req::{Aggregation, Aggregations, BucketAggregationType, MetricAggregation};
|
||||
use super::bucket::{HistogramAggregation, RangeAggregation, TermsAggregation};
|
||||
use super::metric::{AverageAggregation, StatsAggregation};
|
||||
use super::segment_agg_result::BucketCount;
|
||||
use super::VecWithNames;
|
||||
use crate::fastfield::{type_and_cardinality, FastType, MultiValuedFastFieldReader};
|
||||
use crate::fastfield::{type_and_cardinality, MultiValuedFastFieldReader};
|
||||
use crate::schema::{Cardinality, Type};
|
||||
use crate::{InvertedIndexReader, SegmentReader, TantivyError};
|
||||
|
||||
@@ -37,16 +37,16 @@ impl AggregationsWithAccessor {
|
||||
#[derive(Clone)]
|
||||
pub(crate) enum FastFieldAccessor {
|
||||
Multi(MultiValuedFastFieldReader<u64>),
|
||||
Single(Arc<dyn OptionalColumn<u64>>),
|
||||
Single(Arc<dyn Column<u64>>),
|
||||
}
|
||||
impl FastFieldAccessor {
|
||||
pub fn as_single(&self) -> Option<&dyn OptionalColumn<u64>> {
|
||||
pub fn as_single(&self) -> Option<&dyn Column<u64>> {
|
||||
match self {
|
||||
FastFieldAccessor::Multi(_) => None,
|
||||
FastFieldAccessor::Single(reader) => Some(&**reader),
|
||||
}
|
||||
}
|
||||
pub fn into_single(self) -> Option<Arc<dyn OptionalColumn<u64>>> {
|
||||
pub fn into_single(self) -> Option<Arc<dyn Column<u64>>> {
|
||||
match self {
|
||||
FastFieldAccessor::Multi(_) => None,
|
||||
FastFieldAccessor::Single(reader) => Some(reader),
|
||||
@@ -124,7 +124,7 @@ impl BucketAggregationWithAccessor {
|
||||
pub struct MetricAggregationWithAccessor {
|
||||
pub metric: MetricAggregation,
|
||||
pub field_type: Type,
|
||||
pub accessor: Arc<dyn OptionalColumn>,
|
||||
pub accessor: Arc<dyn Column>,
|
||||
}
|
||||
|
||||
impl MetricAggregationWithAccessor {
|
||||
@@ -194,13 +194,7 @@ fn get_ff_reader_and_validate(
|
||||
.ok_or_else(|| TantivyError::FieldNotFound(field_name.to_string()))?;
|
||||
let field_type = reader.schema().get_field_entry(field).field_type();
|
||||
|
||||
if let Some((ff_type, field_cardinality)) = type_and_cardinality(field_type) {
|
||||
if ff_type == FastType::Date {
|
||||
return Err(TantivyError::InvalidArgument(
|
||||
"Unsupported field type date in aggregation".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
if let Some((_ff_type, field_cardinality)) = type_and_cardinality(field_type) {
|
||||
if cardinality != field_cardinality {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"Invalid field cardinality on field {} expected {:?}, but got {:?}",
|
||||
|
||||
@@ -12,6 +12,7 @@ use super::bucket::GetDocCount;
|
||||
use super::intermediate_agg_result::{IntermediateBucketResult, IntermediateMetricResult};
|
||||
use super::metric::{SingleMetricResult, Stats};
|
||||
use super::Key;
|
||||
use crate::schema::Schema;
|
||||
use crate::TantivyError;
|
||||
|
||||
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
|
||||
@@ -129,9 +130,12 @@ pub enum BucketResult {
|
||||
}
|
||||
|
||||
impl BucketResult {
|
||||
pub(crate) fn empty_from_req(req: &BucketAggregationInternal) -> crate::Result<Self> {
|
||||
pub(crate) fn empty_from_req(
|
||||
req: &BucketAggregationInternal,
|
||||
schema: &Schema,
|
||||
) -> crate::Result<Self> {
|
||||
let empty_bucket = IntermediateBucketResult::empty_from_req(&req.bucket_agg);
|
||||
empty_bucket.into_final_bucket_result(req)
|
||||
empty_bucket.into_final_bucket_result(req, schema)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -174,6 +178,9 @@ pub enum BucketEntries<T> {
|
||||
/// ```
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct BucketEntry {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
/// The string representation of the bucket.
|
||||
pub key_as_string: Option<String>,
|
||||
/// The identifier of the bucket.
|
||||
pub key: Key,
|
||||
/// Number of documents in the bucket.
|
||||
@@ -238,4 +245,10 @@ pub struct RangeBucketEntry {
|
||||
/// The to range of the bucket. Equals `f64::MAX` when `None`.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub to: Option<f64>,
|
||||
/// The optional string representation for the `from` range.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub from_as_string: Option<String>,
|
||||
/// The optional string representation for the `to` range.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub to_as_string: Option<String>,
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::cmp::Ordering;
|
||||
use std::fmt::Display;
|
||||
|
||||
use fastfield_codecs::OptionalColumn;
|
||||
use fastfield_codecs::Column;
|
||||
use itertools::Itertools;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
@@ -10,12 +10,12 @@ use crate::aggregation::agg_req_with_accessor::{
|
||||
AggregationsWithAccessor, BucketAggregationWithAccessor,
|
||||
};
|
||||
use crate::aggregation::agg_result::BucketEntry;
|
||||
use crate::aggregation::f64_from_fastfield_u64;
|
||||
use crate::aggregation::intermediate_agg_result::{
|
||||
IntermediateAggregationResults, IntermediateBucketResult, IntermediateHistogramBucketEntry,
|
||||
};
|
||||
use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector;
|
||||
use crate::schema::Type;
|
||||
use crate::aggregation::{f64_from_fastfield_u64, format_date};
|
||||
use crate::schema::{Schema, Type};
|
||||
use crate::{DocId, TantivyError};
|
||||
|
||||
/// Histogram is a bucket aggregation, where buckets are created dynamically for given `interval`.
|
||||
@@ -263,17 +263,13 @@ impl SegmentHistogramCollector {
|
||||
req: &HistogramAggregation,
|
||||
sub_aggregation: &AggregationsWithAccessor,
|
||||
field_type: Type,
|
||||
accessor: &dyn OptionalColumn<u64>,
|
||||
accessor: &dyn Column<u64>,
|
||||
) -> crate::Result<Self> {
|
||||
req.validate()?;
|
||||
let min_max_u64 = accessor.min_value().zip(accessor.max_value());
|
||||
let min_max_f64 = min_max_u64.map(|(min, max)| {
|
||||
let min = f64_from_fastfield_u64(min, &field_type);
|
||||
let max = f64_from_fastfield_u64(max, &field_type);
|
||||
(min, max)
|
||||
});
|
||||
let min = f64_from_fastfield_u64(accessor.min_value(), &field_type);
|
||||
let max = f64_from_fastfield_u64(accessor.max_value(), &field_type);
|
||||
|
||||
let (min, max) = get_req_min_max(req, min_max_f64);
|
||||
let (min, max) = get_req_min_max(req, Some((min, max)));
|
||||
|
||||
// We compute and generate the buckets range (min, max) based on the request and the min
|
||||
// max in the fast field, but this is likely not ideal when this is a subbucket, where many
|
||||
@@ -335,58 +331,47 @@ impl SegmentHistogramCollector {
|
||||
.expect("unexpected fast field cardinatility");
|
||||
let mut iter = doc.chunks_exact(4);
|
||||
for docs in iter.by_ref() {
|
||||
if let Some(val) = accessor.get_val(docs[0]) {
|
||||
let val = self.f64_from_fastfield_u64(val);
|
||||
let bucket_pos = get_bucket_num(val);
|
||||
self.increment_bucket_if_in_bounds(
|
||||
val,
|
||||
&bounds,
|
||||
bucket_pos,
|
||||
docs[0],
|
||||
&bucket_with_accessor.sub_aggregation,
|
||||
)?;
|
||||
}
|
||||
let val0 = self.f64_from_fastfield_u64(accessor.get_val(docs[0]));
|
||||
let val1 = self.f64_from_fastfield_u64(accessor.get_val(docs[1]));
|
||||
let val2 = self.f64_from_fastfield_u64(accessor.get_val(docs[2]));
|
||||
let val3 = self.f64_from_fastfield_u64(accessor.get_val(docs[3]));
|
||||
|
||||
if let Some(val) = accessor.get_val(docs[1]) {
|
||||
let val = self.f64_from_fastfield_u64(val);
|
||||
let bucket_pos = get_bucket_num(val);
|
||||
self.increment_bucket_if_in_bounds(
|
||||
val,
|
||||
&bounds,
|
||||
bucket_pos,
|
||||
docs[1],
|
||||
&bucket_with_accessor.sub_aggregation,
|
||||
)?;
|
||||
}
|
||||
let bucket_pos0 = get_bucket_num(val0);
|
||||
let bucket_pos1 = get_bucket_num(val1);
|
||||
let bucket_pos2 = get_bucket_num(val2);
|
||||
let bucket_pos3 = get_bucket_num(val3);
|
||||
|
||||
if let Some(val) = accessor.get_val(docs[2]) {
|
||||
let val = self.f64_from_fastfield_u64(val);
|
||||
let bucket_pos = get_bucket_num(val);
|
||||
self.increment_bucket_if_in_bounds(
|
||||
val,
|
||||
&bounds,
|
||||
bucket_pos,
|
||||
docs[2],
|
||||
&bucket_with_accessor.sub_aggregation,
|
||||
)?;
|
||||
}
|
||||
|
||||
if let Some(val) = accessor.get_val(docs[3]) {
|
||||
let val = self.f64_from_fastfield_u64(val);
|
||||
let bucket_pos = get_bucket_num(val);
|
||||
self.increment_bucket_if_in_bounds(
|
||||
val,
|
||||
&bounds,
|
||||
bucket_pos,
|
||||
docs[3],
|
||||
&bucket_with_accessor.sub_aggregation,
|
||||
)?;
|
||||
}
|
||||
self.increment_bucket_if_in_bounds(
|
||||
val0,
|
||||
&bounds,
|
||||
bucket_pos0,
|
||||
docs[0],
|
||||
&bucket_with_accessor.sub_aggregation,
|
||||
)?;
|
||||
self.increment_bucket_if_in_bounds(
|
||||
val1,
|
||||
&bounds,
|
||||
bucket_pos1,
|
||||
docs[1],
|
||||
&bucket_with_accessor.sub_aggregation,
|
||||
)?;
|
||||
self.increment_bucket_if_in_bounds(
|
||||
val2,
|
||||
&bounds,
|
||||
bucket_pos2,
|
||||
docs[2],
|
||||
&bucket_with_accessor.sub_aggregation,
|
||||
)?;
|
||||
self.increment_bucket_if_in_bounds(
|
||||
val3,
|
||||
&bounds,
|
||||
bucket_pos3,
|
||||
docs[3],
|
||||
&bucket_with_accessor.sub_aggregation,
|
||||
)?;
|
||||
}
|
||||
for &doc in iter.remainder() {
|
||||
let Some(val) = accessor.get_val(doc).map(|val|f64_from_fastfield_u64(val, &self.field_type)) else{
|
||||
continue;
|
||||
};
|
||||
let val = f64_from_fastfield_u64(accessor.get_val(doc), &self.field_type);
|
||||
if !bounds.contains(val) {
|
||||
continue;
|
||||
}
|
||||
@@ -466,6 +451,7 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
|
||||
buckets: Vec<IntermediateHistogramBucketEntry>,
|
||||
histogram_req: &HistogramAggregation,
|
||||
sub_aggregation: &AggregationsInternal,
|
||||
schema: &Schema,
|
||||
) -> crate::Result<Vec<BucketEntry>> {
|
||||
// Generate the full list of buckets without gaps.
|
||||
//
|
||||
@@ -506,7 +492,9 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
|
||||
sub_aggregation: empty_sub_aggregation.clone(),
|
||||
},
|
||||
})
|
||||
.map(|intermediate_bucket| intermediate_bucket.into_final_bucket_entry(sub_aggregation))
|
||||
.map(|intermediate_bucket| {
|
||||
intermediate_bucket.into_final_bucket_entry(sub_aggregation, schema)
|
||||
})
|
||||
.collect::<crate::Result<Vec<_>>>()
|
||||
}
|
||||
|
||||
@@ -515,20 +503,43 @@ pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
|
||||
buckets: Vec<IntermediateHistogramBucketEntry>,
|
||||
histogram_req: &HistogramAggregation,
|
||||
sub_aggregation: &AggregationsInternal,
|
||||
schema: &Schema,
|
||||
) -> crate::Result<Vec<BucketEntry>> {
|
||||
if histogram_req.min_doc_count() == 0 {
|
||||
let mut buckets = if histogram_req.min_doc_count() == 0 {
|
||||
// With min_doc_count != 0, we may need to add buckets, so that there are no
|
||||
// gaps, since intermediate result does not contain empty buckets (filtered to
|
||||
// reduce serialization size).
|
||||
|
||||
intermediate_buckets_to_final_buckets_fill_gaps(buckets, histogram_req, sub_aggregation)
|
||||
intermediate_buckets_to_final_buckets_fill_gaps(
|
||||
buckets,
|
||||
histogram_req,
|
||||
sub_aggregation,
|
||||
schema,
|
||||
)?
|
||||
} else {
|
||||
buckets
|
||||
.into_iter()
|
||||
.filter(|histogram_bucket| histogram_bucket.doc_count >= histogram_req.min_doc_count())
|
||||
.map(|histogram_bucket| histogram_bucket.into_final_bucket_entry(sub_aggregation))
|
||||
.collect::<crate::Result<Vec<_>>>()
|
||||
.map(|histogram_bucket| {
|
||||
histogram_bucket.into_final_bucket_entry(sub_aggregation, schema)
|
||||
})
|
||||
.collect::<crate::Result<Vec<_>>>()?
|
||||
};
|
||||
|
||||
// If we have a date type on the histogram buckets, we add the `key_as_string` field as rfc339
|
||||
let field = schema
|
||||
.get_field(&histogram_req.field)
|
||||
.ok_or_else(|| TantivyError::FieldNotFound(histogram_req.field.to_string()))?;
|
||||
if schema.get_field_entry(field).field_type().is_date() {
|
||||
for bucket in buckets.iter_mut() {
|
||||
if let crate::aggregation::Key::F64(val) = bucket.key {
|
||||
let key_as_string = format_date(val as i64)?;
|
||||
bucket.key_as_string = Some(key_as_string);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(buckets)
|
||||
}
|
||||
|
||||
/// Applies req extended_bounds/hard_bounds on the min_max value
|
||||
@@ -1387,6 +1398,63 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn histogram_date_test_single_segment() -> crate::Result<()> {
|
||||
histogram_date_test_with_opt(true)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn histogram_date_test_multi_segment() -> crate::Result<()> {
|
||||
histogram_date_test_with_opt(false)
|
||||
}
|
||||
|
||||
fn histogram_date_test_with_opt(merge_segments: bool) -> crate::Result<()> {
|
||||
let index = get_test_index_2_segments(merge_segments)?;
|
||||
|
||||
let agg_req: Aggregations = vec![(
|
||||
"histogram".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Histogram(HistogramAggregation {
|
||||
field: "date".to_string(),
|
||||
interval: 86400000000.0, // one day in microseconds
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let agg_res = exec_request(agg_req, &index)?;
|
||||
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
|
||||
assert_eq!(res["histogram"]["buckets"][0]["key"], 1546300800000000.0);
|
||||
assert_eq!(
|
||||
res["histogram"]["buckets"][0]["key_as_string"],
|
||||
"2019-01-01T00:00:00Z"
|
||||
);
|
||||
assert_eq!(res["histogram"]["buckets"][0]["doc_count"], 1);
|
||||
|
||||
assert_eq!(res["histogram"]["buckets"][1]["key"], 1546387200000000.0);
|
||||
assert_eq!(
|
||||
res["histogram"]["buckets"][1]["key_as_string"],
|
||||
"2019-01-02T00:00:00Z"
|
||||
);
|
||||
|
||||
assert_eq!(res["histogram"]["buckets"][1]["doc_count"], 5);
|
||||
|
||||
assert_eq!(res["histogram"]["buckets"][2]["key"], 1546473600000000.0);
|
||||
assert_eq!(
|
||||
res["histogram"]["buckets"][2]["key_as_string"],
|
||||
"2019-01-03T00:00:00Z"
|
||||
);
|
||||
|
||||
assert_eq!(res["histogram"]["buckets"][3], Value::Null);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn histogram_invalid_request() -> crate::Result<()> {
|
||||
let index = get_test_index_2_segments(true)?;
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::fmt::Debug;
|
||||
use std::ops::Range;
|
||||
|
||||
use fastfield_codecs::MonotonicallyMappableToU64;
|
||||
use rustc_hash::FxHashMap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
@@ -11,7 +12,9 @@ use crate::aggregation::intermediate_agg_result::{
|
||||
IntermediateBucketResult, IntermediateRangeBucketEntry, IntermediateRangeBucketResult,
|
||||
};
|
||||
use crate::aggregation::segment_agg_result::{BucketCount, SegmentAggregationResultsCollector};
|
||||
use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64, Key, SerializedKey};
|
||||
use crate::aggregation::{
|
||||
f64_from_fastfield_u64, f64_to_fastfield_u64, format_date, Key, SerializedKey,
|
||||
};
|
||||
use crate::schema::Type;
|
||||
use crate::{DocId, TantivyError};
|
||||
|
||||
@@ -181,7 +184,7 @@ impl SegmentRangeCollector {
|
||||
.into_iter()
|
||||
.map(move |range_bucket| {
|
||||
Ok((
|
||||
range_to_string(&range_bucket.range, &field_type),
|
||||
range_to_string(&range_bucket.range, &field_type)?,
|
||||
range_bucket
|
||||
.bucket
|
||||
.into_intermediate_bucket_entry(&agg_with_accessor.sub_aggregation)?,
|
||||
@@ -209,8 +212,8 @@ impl SegmentRangeCollector {
|
||||
let key = range
|
||||
.key
|
||||
.clone()
|
||||
.map(Key::Str)
|
||||
.unwrap_or_else(|| range_to_key(&range.range, &field_type));
|
||||
.map(|key| Ok(Key::Str(key)))
|
||||
.unwrap_or_else(|| range_to_key(&range.range, &field_type))?;
|
||||
let to = if range.range.end == u64::MAX {
|
||||
None
|
||||
} else {
|
||||
@@ -228,6 +231,7 @@ impl SegmentRangeCollector {
|
||||
sub_aggregation,
|
||||
)?)
|
||||
};
|
||||
|
||||
Ok(SegmentRangeAndBucketEntry {
|
||||
range: range.range.clone(),
|
||||
bucket: SegmentRangeBucketEntry {
|
||||
@@ -267,29 +271,20 @@ impl SegmentRangeCollector {
|
||||
let val2 = accessor.get_val(docs[1]);
|
||||
let val3 = accessor.get_val(docs[2]);
|
||||
let val4 = accessor.get_val(docs[3]);
|
||||
if let Some(val) = val1 {
|
||||
let bucket_pos = self.get_bucket_pos(val);
|
||||
self.increment_bucket(bucket_pos, docs[0], &bucket_with_accessor.sub_aggregation)?;
|
||||
}
|
||||
if let Some(val) = val2 {
|
||||
let bucket_pos = self.get_bucket_pos(val);
|
||||
self.increment_bucket(bucket_pos, docs[1], &bucket_with_accessor.sub_aggregation)?;
|
||||
}
|
||||
if let Some(val) = val3 {
|
||||
let bucket_pos = self.get_bucket_pos(val);
|
||||
self.increment_bucket(bucket_pos, docs[2], &bucket_with_accessor.sub_aggregation)?;
|
||||
}
|
||||
if let Some(val) = val4 {
|
||||
let bucket_pos = self.get_bucket_pos(val);
|
||||
self.increment_bucket(bucket_pos, docs[3], &bucket_with_accessor.sub_aggregation)?;
|
||||
}
|
||||
let bucket_pos1 = self.get_bucket_pos(val1);
|
||||
let bucket_pos2 = self.get_bucket_pos(val2);
|
||||
let bucket_pos3 = self.get_bucket_pos(val3);
|
||||
let bucket_pos4 = self.get_bucket_pos(val4);
|
||||
|
||||
self.increment_bucket(bucket_pos1, docs[0], &bucket_with_accessor.sub_aggregation)?;
|
||||
self.increment_bucket(bucket_pos2, docs[1], &bucket_with_accessor.sub_aggregation)?;
|
||||
self.increment_bucket(bucket_pos3, docs[2], &bucket_with_accessor.sub_aggregation)?;
|
||||
self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation)?;
|
||||
}
|
||||
for &doc in iter.remainder() {
|
||||
let val = accessor.get_val(doc);
|
||||
if let Some(val) = val {
|
||||
let bucket_pos = self.get_bucket_pos(val);
|
||||
self.increment_bucket(bucket_pos, doc, &bucket_with_accessor.sub_aggregation)?;
|
||||
}
|
||||
let bucket_pos = self.get_bucket_pos(val);
|
||||
self.increment_bucket(bucket_pos, doc, &bucket_with_accessor.sub_aggregation)?;
|
||||
}
|
||||
if force_flush {
|
||||
for bucket in &mut self.buckets {
|
||||
@@ -411,34 +406,45 @@ fn extend_validate_ranges(
|
||||
Ok(converted_buckets)
|
||||
}
|
||||
|
||||
pub(crate) fn range_to_string(range: &Range<u64>, field_type: &Type) -> String {
|
||||
pub(crate) fn range_to_string(range: &Range<u64>, field_type: &Type) -> crate::Result<String> {
|
||||
// is_start is there for malformed requests, e.g. ig the user passes the range u64::MIN..0.0,
|
||||
// it should be rendered as "*-0" and not "*-*"
|
||||
let to_str = |val: u64, is_start: bool| {
|
||||
if (is_start && val == u64::MIN) || (!is_start && val == u64::MAX) {
|
||||
"*".to_string()
|
||||
Ok("*".to_string())
|
||||
} else if *field_type == Type::Date {
|
||||
let val = i64::from_u64(val);
|
||||
format_date(val)
|
||||
} else {
|
||||
f64_from_fastfield_u64(val, field_type).to_string()
|
||||
Ok(f64_from_fastfield_u64(val, field_type).to_string())
|
||||
}
|
||||
};
|
||||
|
||||
format!("{}-{}", to_str(range.start, true), to_str(range.end, false))
|
||||
Ok(format!(
|
||||
"{}-{}",
|
||||
to_str(range.start, true)?,
|
||||
to_str(range.end, false)?
|
||||
))
|
||||
}
|
||||
|
||||
pub(crate) fn range_to_key(range: &Range<u64>, field_type: &Type) -> Key {
|
||||
Key::Str(range_to_string(range, field_type))
|
||||
pub(crate) fn range_to_key(range: &Range<u64>, field_type: &Type) -> crate::Result<Key> {
|
||||
Ok(Key::Str(range_to_string(range, field_type)?))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use fastfield_codecs::MonotonicallyMappableToU64;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::*;
|
||||
use crate::aggregation::agg_req::{
|
||||
Aggregation, Aggregations, BucketAggregation, BucketAggregationType,
|
||||
};
|
||||
use crate::aggregation::tests::{exec_request_with_query, get_test_index_with_num_docs};
|
||||
use crate::aggregation::tests::{
|
||||
exec_request, exec_request_with_query, get_test_index_2_segments,
|
||||
get_test_index_with_num_docs,
|
||||
};
|
||||
|
||||
pub fn get_collector_from_ranges(
|
||||
ranges: Vec<RangeAggregationRange>,
|
||||
@@ -576,6 +582,77 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn range_date_test_single_segment() -> crate::Result<()> {
|
||||
range_date_test_with_opt(true)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn range_date_test_multi_segment() -> crate::Result<()> {
|
||||
range_date_test_with_opt(false)
|
||||
}
|
||||
|
||||
fn range_date_test_with_opt(merge_segments: bool) -> crate::Result<()> {
|
||||
let index = get_test_index_2_segments(merge_segments)?;
|
||||
|
||||
let agg_req: Aggregations = vec![(
|
||||
"date_ranges".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Range(RangeAggregation {
|
||||
field: "date".to_string(),
|
||||
ranges: vec![
|
||||
RangeAggregationRange {
|
||||
key: None,
|
||||
from: None,
|
||||
to: Some(1546300800000000.0f64),
|
||||
},
|
||||
RangeAggregationRange {
|
||||
key: None,
|
||||
from: Some(1546300800000000.0f64),
|
||||
to: Some(1546387200000000.0f64),
|
||||
},
|
||||
],
|
||||
keyed: false,
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let agg_res = exec_request(agg_req, &index)?;
|
||||
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
|
||||
assert_eq!(
|
||||
res["date_ranges"]["buckets"][0]["from_as_string"],
|
||||
Value::Null
|
||||
);
|
||||
assert_eq!(
|
||||
res["date_ranges"]["buckets"][0]["key"],
|
||||
"*-2019-01-01T00:00:00Z"
|
||||
);
|
||||
assert_eq!(
|
||||
res["date_ranges"]["buckets"][1]["from_as_string"],
|
||||
"2019-01-01T00:00:00Z"
|
||||
);
|
||||
assert_eq!(
|
||||
res["date_ranges"]["buckets"][1]["to_as_string"],
|
||||
"2019-01-02T00:00:00Z"
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
res["date_ranges"]["buckets"][2]["from_as_string"],
|
||||
"2019-01-02T00:00:00Z"
|
||||
);
|
||||
assert_eq!(
|
||||
res["date_ranges"]["buckets"][2]["to_as_string"],
|
||||
Value::Null
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn range_custom_key_keyed_buckets_test() -> crate::Result<()> {
|
||||
let index = get_test_index_with_num_docs(false, 100)?;
|
||||
|
||||
@@ -7,6 +7,7 @@ use super::intermediate_agg_result::IntermediateAggregationResults;
|
||||
use super::segment_agg_result::SegmentAggregationResultsCollector;
|
||||
use crate::aggregation::agg_req_with_accessor::get_aggs_with_accessor_and_validate;
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::schema::Schema;
|
||||
use crate::{SegmentReader, TantivyError};
|
||||
|
||||
/// The default max bucket count, before the aggregation fails.
|
||||
@@ -16,6 +17,7 @@ pub const MAX_BUCKET_COUNT: u32 = 65000;
|
||||
///
|
||||
/// The collector collects all aggregations by the underlying aggregation request.
|
||||
pub struct AggregationCollector {
|
||||
schema: Schema,
|
||||
agg: Aggregations,
|
||||
max_bucket_count: u32,
|
||||
}
|
||||
@@ -25,8 +27,9 @@ impl AggregationCollector {
|
||||
///
|
||||
/// Aggregation fails when the total bucket count is higher than max_bucket_count.
|
||||
/// max_bucket_count will default to `MAX_BUCKET_COUNT` (65000) when unset
|
||||
pub fn from_aggs(agg: Aggregations, max_bucket_count: Option<u32>) -> Self {
|
||||
pub fn from_aggs(agg: Aggregations, max_bucket_count: Option<u32>, schema: Schema) -> Self {
|
||||
Self {
|
||||
schema,
|
||||
agg,
|
||||
max_bucket_count: max_bucket_count.unwrap_or(MAX_BUCKET_COUNT),
|
||||
}
|
||||
@@ -113,7 +116,7 @@ impl Collector for AggregationCollector {
|
||||
segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
|
||||
) -> crate::Result<Self::Fruit> {
|
||||
let res = merge_fruits(segment_fruits)?;
|
||||
res.into_final_bucket_result(self.agg.clone())
|
||||
res.into_final_bucket_result(self.agg.clone(), &self.schema)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
18
src/aggregation/date.rs
Normal file
18
src/aggregation/date.rs
Normal file
@@ -0,0 +1,18 @@
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::TantivyError;
|
||||
|
||||
pub(crate) fn format_date(val: i64) -> crate::Result<String> {
|
||||
let datetime =
|
||||
OffsetDateTime::from_unix_timestamp_nanos(1_000 * (val as i128)).map_err(|err| {
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"Could not convert {:?} to OffsetDateTime, err {:?}",
|
||||
val, err
|
||||
))
|
||||
})?;
|
||||
let key_as_string = datetime
|
||||
.format(&Rfc3339)
|
||||
.map_err(|_err| TantivyError::InvalidArgument("Could not serialize date".to_string()))?;
|
||||
Ok(key_as_string)
|
||||
}
|
||||
@@ -10,7 +10,7 @@ use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::agg_req::{
|
||||
Aggregations, AggregationsInternal, BucketAggregationInternal, BucketAggregationType,
|
||||
MetricAggregation,
|
||||
MetricAggregation, RangeAggregation,
|
||||
};
|
||||
use super::agg_result::{AggregationResult, BucketResult, RangeBucketEntry};
|
||||
use super::bucket::{
|
||||
@@ -19,9 +19,11 @@ use super::bucket::{
|
||||
};
|
||||
use super::metric::{IntermediateAverage, IntermediateStats};
|
||||
use super::segment_agg_result::SegmentMetricResultCollector;
|
||||
use super::{Key, SerializedKey, VecWithNames};
|
||||
use super::{format_date, Key, SerializedKey, VecWithNames};
|
||||
use crate::aggregation::agg_result::{AggregationResults, BucketEntries, BucketEntry};
|
||||
use crate::aggregation::bucket::TermsAggregationInternal;
|
||||
use crate::schema::Schema;
|
||||
use crate::TantivyError;
|
||||
|
||||
/// Contains the intermediate aggregation result, which is optimized to be merged with other
|
||||
/// intermediate results.
|
||||
@@ -35,8 +37,12 @@ pub struct IntermediateAggregationResults {
|
||||
|
||||
impl IntermediateAggregationResults {
|
||||
/// Convert intermediate result and its aggregation request to the final result.
|
||||
pub fn into_final_bucket_result(self, req: Aggregations) -> crate::Result<AggregationResults> {
|
||||
self.into_final_bucket_result_internal(&(req.into()))
|
||||
pub fn into_final_bucket_result(
|
||||
self,
|
||||
req: Aggregations,
|
||||
schema: &Schema,
|
||||
) -> crate::Result<AggregationResults> {
|
||||
self.into_final_bucket_result_internal(&(req.into()), schema)
|
||||
}
|
||||
|
||||
/// Convert intermediate result and its aggregation request to the final result.
|
||||
@@ -46,6 +52,7 @@ impl IntermediateAggregationResults {
|
||||
pub(crate) fn into_final_bucket_result_internal(
|
||||
self,
|
||||
req: &AggregationsInternal,
|
||||
schema: &Schema,
|
||||
) -> crate::Result<AggregationResults> {
|
||||
// Important assumption:
|
||||
// When the tree contains buckets/metric, we expect it to have all buckets/metrics from the
|
||||
@@ -53,11 +60,11 @@ impl IntermediateAggregationResults {
|
||||
let mut results: FxHashMap<String, AggregationResult> = FxHashMap::default();
|
||||
|
||||
if let Some(buckets) = self.buckets {
|
||||
convert_and_add_final_buckets_to_result(&mut results, buckets, &req.buckets)?
|
||||
convert_and_add_final_buckets_to_result(&mut results, buckets, &req.buckets, schema)?
|
||||
} else {
|
||||
// When there are no buckets, we create empty buckets, so that the serialized json
|
||||
// format is constant
|
||||
add_empty_final_buckets_to_result(&mut results, &req.buckets)?
|
||||
add_empty_final_buckets_to_result(&mut results, &req.buckets, schema)?
|
||||
};
|
||||
|
||||
if let Some(metrics) = self.metrics {
|
||||
@@ -158,10 +165,12 @@ fn add_empty_final_metrics_to_result(
|
||||
fn add_empty_final_buckets_to_result(
|
||||
results: &mut FxHashMap<String, AggregationResult>,
|
||||
req_buckets: &VecWithNames<BucketAggregationInternal>,
|
||||
schema: &Schema,
|
||||
) -> crate::Result<()> {
|
||||
let requested_buckets = req_buckets.iter();
|
||||
for (key, req) in requested_buckets {
|
||||
let empty_bucket = AggregationResult::BucketResult(BucketResult::empty_from_req(req)?);
|
||||
let empty_bucket =
|
||||
AggregationResult::BucketResult(BucketResult::empty_from_req(req, schema)?);
|
||||
results.insert(key.to_string(), empty_bucket);
|
||||
}
|
||||
Ok(())
|
||||
@@ -171,12 +180,13 @@ fn convert_and_add_final_buckets_to_result(
|
||||
results: &mut FxHashMap<String, AggregationResult>,
|
||||
buckets: VecWithNames<IntermediateBucketResult>,
|
||||
req_buckets: &VecWithNames<BucketAggregationInternal>,
|
||||
schema: &Schema,
|
||||
) -> crate::Result<()> {
|
||||
assert_eq!(buckets.len(), req_buckets.len());
|
||||
|
||||
let buckets_with_request = buckets.into_iter().zip(req_buckets.values());
|
||||
for ((key, bucket), req) in buckets_with_request {
|
||||
let result = AggregationResult::BucketResult(bucket.into_final_bucket_result(req)?);
|
||||
let result = AggregationResult::BucketResult(bucket.into_final_bucket_result(req, schema)?);
|
||||
results.insert(key, result);
|
||||
}
|
||||
Ok(())
|
||||
@@ -266,13 +276,21 @@ impl IntermediateBucketResult {
|
||||
pub(crate) fn into_final_bucket_result(
|
||||
self,
|
||||
req: &BucketAggregationInternal,
|
||||
schema: &Schema,
|
||||
) -> crate::Result<BucketResult> {
|
||||
match self {
|
||||
IntermediateBucketResult::Range(range_res) => {
|
||||
let mut buckets: Vec<RangeBucketEntry> = range_res
|
||||
.buckets
|
||||
.into_iter()
|
||||
.map(|(_, bucket)| bucket.into_final_bucket_entry(&req.sub_aggregation))
|
||||
.map(|(_, bucket)| {
|
||||
bucket.into_final_bucket_entry(
|
||||
&req.sub_aggregation,
|
||||
schema,
|
||||
req.as_range()
|
||||
.expect("unexpected aggregation, expected histogram aggregation"),
|
||||
)
|
||||
})
|
||||
.collect::<crate::Result<Vec<_>>>()?;
|
||||
|
||||
buckets.sort_by(|left, right| {
|
||||
@@ -303,6 +321,7 @@ impl IntermediateBucketResult {
|
||||
req.as_histogram()
|
||||
.expect("unexpected aggregation, expected histogram aggregation"),
|
||||
&req.sub_aggregation,
|
||||
schema,
|
||||
)?;
|
||||
|
||||
let buckets = if req.as_histogram().unwrap().keyed {
|
||||
@@ -321,6 +340,7 @@ impl IntermediateBucketResult {
|
||||
req.as_term()
|
||||
.expect("unexpected aggregation, expected term aggregation"),
|
||||
&req.sub_aggregation,
|
||||
schema,
|
||||
),
|
||||
}
|
||||
}
|
||||
@@ -411,6 +431,7 @@ impl IntermediateTermBucketResult {
|
||||
self,
|
||||
req: &TermsAggregation,
|
||||
sub_aggregation_req: &AggregationsInternal,
|
||||
schema: &Schema,
|
||||
) -> crate::Result<BucketResult> {
|
||||
let req = TermsAggregationInternal::from_req(req);
|
||||
let mut buckets: Vec<BucketEntry> = self
|
||||
@@ -419,11 +440,12 @@ impl IntermediateTermBucketResult {
|
||||
.filter(|bucket| bucket.1.doc_count >= req.min_doc_count)
|
||||
.map(|(key, entry)| {
|
||||
Ok(BucketEntry {
|
||||
key_as_string: None,
|
||||
key: Key::Str(key),
|
||||
doc_count: entry.doc_count,
|
||||
sub_aggregation: entry
|
||||
.sub_aggregation
|
||||
.into_final_bucket_result_internal(sub_aggregation_req)?,
|
||||
.into_final_bucket_result_internal(sub_aggregation_req, schema)?,
|
||||
})
|
||||
})
|
||||
.collect::<crate::Result<_>>()?;
|
||||
@@ -528,13 +550,15 @@ impl IntermediateHistogramBucketEntry {
|
||||
pub(crate) fn into_final_bucket_entry(
|
||||
self,
|
||||
req: &AggregationsInternal,
|
||||
schema: &Schema,
|
||||
) -> crate::Result<BucketEntry> {
|
||||
Ok(BucketEntry {
|
||||
key_as_string: None,
|
||||
key: Key::F64(self.key),
|
||||
doc_count: self.doc_count,
|
||||
sub_aggregation: self
|
||||
.sub_aggregation
|
||||
.into_final_bucket_result_internal(req)?,
|
||||
.into_final_bucket_result_internal(req, schema)?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -571,16 +595,38 @@ impl IntermediateRangeBucketEntry {
|
||||
pub(crate) fn into_final_bucket_entry(
|
||||
self,
|
||||
req: &AggregationsInternal,
|
||||
schema: &Schema,
|
||||
range_req: &RangeAggregation,
|
||||
) -> crate::Result<RangeBucketEntry> {
|
||||
Ok(RangeBucketEntry {
|
||||
let mut range_bucket_entry = RangeBucketEntry {
|
||||
key: self.key,
|
||||
doc_count: self.doc_count,
|
||||
sub_aggregation: self
|
||||
.sub_aggregation
|
||||
.into_final_bucket_result_internal(req)?,
|
||||
.into_final_bucket_result_internal(req, schema)?,
|
||||
to: self.to,
|
||||
from: self.from,
|
||||
})
|
||||
to_as_string: None,
|
||||
from_as_string: None,
|
||||
};
|
||||
|
||||
// If we have a date type on the histogram buckets, we add the `key_as_string` field as
|
||||
// rfc339
|
||||
let field = schema
|
||||
.get_field(&range_req.field)
|
||||
.ok_or_else(|| TantivyError::FieldNotFound(range_req.field.to_string()))?;
|
||||
if schema.get_field_entry(field).field_type().is_date() {
|
||||
if let Some(val) = range_bucket_entry.to {
|
||||
let key_as_string = format_date(val as i64)?;
|
||||
range_bucket_entry.to_as_string = Some(key_as_string);
|
||||
}
|
||||
if let Some(val) = range_bucket_entry.from {
|
||||
let key_as_string = format_date(val as i64)?;
|
||||
range_bucket_entry.from_as_string = Some(key_as_string);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(range_bucket_entry)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::fmt::Debug;
|
||||
|
||||
use fastfield_codecs::OptionalColumn;
|
||||
use fastfield_codecs::Column;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::aggregation::f64_from_fastfield_u64;
|
||||
@@ -57,33 +57,26 @@ impl SegmentAverageCollector {
|
||||
data: Default::default(),
|
||||
}
|
||||
}
|
||||
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn OptionalColumn<u64>) {
|
||||
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column<u64>) {
|
||||
let mut iter = doc.chunks_exact(4);
|
||||
for docs in iter.by_ref() {
|
||||
if let Some(val) = field.get_val(docs[0]) {
|
||||
let val = f64_from_fastfield_u64(val, &self.field_type);
|
||||
self.data.collect(val);
|
||||
}
|
||||
if let Some(val) = field.get_val(docs[1]) {
|
||||
let val = f64_from_fastfield_u64(val, &self.field_type);
|
||||
self.data.collect(val);
|
||||
}
|
||||
|
||||
if let Some(val) = field.get_val(docs[2]) {
|
||||
let val = f64_from_fastfield_u64(val, &self.field_type);
|
||||
self.data.collect(val);
|
||||
}
|
||||
|
||||
if let Some(val) = field.get_val(docs[3]) {
|
||||
let val = f64_from_fastfield_u64(val, &self.field_type);
|
||||
self.data.collect(val);
|
||||
}
|
||||
let val1 = field.get_val(docs[0]);
|
||||
let val2 = field.get_val(docs[1]);
|
||||
let val3 = field.get_val(docs[2]);
|
||||
let val4 = field.get_val(docs[3]);
|
||||
let val1 = f64_from_fastfield_u64(val1, &self.field_type);
|
||||
let val2 = f64_from_fastfield_u64(val2, &self.field_type);
|
||||
let val3 = f64_from_fastfield_u64(val3, &self.field_type);
|
||||
let val4 = f64_from_fastfield_u64(val4, &self.field_type);
|
||||
self.data.collect(val1);
|
||||
self.data.collect(val2);
|
||||
self.data.collect(val3);
|
||||
self.data.collect(val4);
|
||||
}
|
||||
for &doc in iter.remainder() {
|
||||
if let Some(val) = field.get_val(doc) {
|
||||
let val = f64_from_fastfield_u64(val, &self.field_type);
|
||||
self.data.collect(val);
|
||||
}
|
||||
let val = field.get_val(doc);
|
||||
let val = f64_from_fastfield_u64(val, &self.field_type);
|
||||
self.data.collect(val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use fastfield_codecs::OptionalColumn;
|
||||
use fastfield_codecs::Column;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::aggregation::f64_from_fastfield_u64;
|
||||
@@ -163,31 +163,26 @@ impl SegmentStatsCollector {
|
||||
stats: IntermediateStats::default(),
|
||||
}
|
||||
}
|
||||
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn OptionalColumn<u64>) {
|
||||
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column<u64>) {
|
||||
let mut iter = doc.chunks_exact(4);
|
||||
for docs in iter.by_ref() {
|
||||
if let Some(val) = field.get_val(docs[0]) {
|
||||
let val = f64_from_fastfield_u64(val, &self.field_type);
|
||||
self.stats.collect(val);
|
||||
}
|
||||
if let Some(val) = field.get_val(docs[1]) {
|
||||
let val = f64_from_fastfield_u64(val, &self.field_type);
|
||||
self.stats.collect(val);
|
||||
}
|
||||
if let Some(val) = field.get_val(docs[2]) {
|
||||
let val = f64_from_fastfield_u64(val, &self.field_type);
|
||||
self.stats.collect(val);
|
||||
}
|
||||
if let Some(val) = field.get_val(docs[3]) {
|
||||
let val = f64_from_fastfield_u64(val, &self.field_type);
|
||||
self.stats.collect(val);
|
||||
}
|
||||
let val1 = field.get_val(docs[0]);
|
||||
let val2 = field.get_val(docs[1]);
|
||||
let val3 = field.get_val(docs[2]);
|
||||
let val4 = field.get_val(docs[3]);
|
||||
let val1 = f64_from_fastfield_u64(val1, &self.field_type);
|
||||
let val2 = f64_from_fastfield_u64(val2, &self.field_type);
|
||||
let val3 = f64_from_fastfield_u64(val3, &self.field_type);
|
||||
let val4 = f64_from_fastfield_u64(val4, &self.field_type);
|
||||
self.stats.collect(val1);
|
||||
self.stats.collect(val2);
|
||||
self.stats.collect(val3);
|
||||
self.stats.collect(val4);
|
||||
}
|
||||
for &doc in iter.remainder() {
|
||||
if let Some(val) = field.get_val(doc) {
|
||||
let val = f64_from_fastfield_u64(val, &self.field_type);
|
||||
self.stats.collect(val);
|
||||
}
|
||||
let val = field.get_val(doc);
|
||||
let val = f64_from_fastfield_u64(val, &self.field_type);
|
||||
self.stats.collect(val);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -227,7 +222,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
@@ -305,7 +300,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
//!
|
||||
//! ## Prerequisite
|
||||
//! Currently aggregations work only on [fast fields](`crate::fastfield`). Single value fast fields
|
||||
//! of type `u64`, `f64`, `i64` and fast fields on text fields.
|
||||
//! of type `u64`, `f64`, `i64`, `date` and fast fields on text fields.
|
||||
//!
|
||||
//! ## Usage
|
||||
//! To use aggregations, build an aggregation request by constructing
|
||||
@@ -53,9 +53,10 @@
|
||||
//! use tantivy::query::AllQuery;
|
||||
//! use tantivy::aggregation::agg_result::AggregationResults;
|
||||
//! use tantivy::IndexReader;
|
||||
//! use tantivy::schema::Schema;
|
||||
//!
|
||||
//! # #[allow(dead_code)]
|
||||
//! fn aggregate_on_index(reader: &IndexReader) {
|
||||
//! fn aggregate_on_index(reader: &IndexReader, schema: Schema) {
|
||||
//! let agg_req: Aggregations = vec![
|
||||
//! (
|
||||
//! "average".to_string(),
|
||||
@@ -67,7 +68,7 @@
|
||||
//! .into_iter()
|
||||
//! .collect();
|
||||
//!
|
||||
//! let collector = AggregationCollector::from_aggs(agg_req, None);
|
||||
//! let collector = AggregationCollector::from_aggs(agg_req, None, schema);
|
||||
//!
|
||||
//! let searcher = reader.searcher();
|
||||
//! let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
|
||||
@@ -157,6 +158,7 @@ mod agg_req_with_accessor;
|
||||
pub mod agg_result;
|
||||
pub mod bucket;
|
||||
mod collector;
|
||||
mod date;
|
||||
pub mod intermediate_agg_result;
|
||||
pub mod metric;
|
||||
mod segment_agg_result;
|
||||
@@ -167,6 +169,7 @@ pub use collector::{
|
||||
AggregationCollector, AggregationSegmentCollector, DistributedAggregationCollector,
|
||||
MAX_BUCKET_COUNT,
|
||||
};
|
||||
pub(crate) use date::format_date;
|
||||
use fastfield_codecs::MonotonicallyMappableToU64;
|
||||
use itertools::Itertools;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -283,11 +286,11 @@ impl Display for Key {
|
||||
/// Inverse of `to_fastfield_u64`. Used to convert to `f64` for metrics.
|
||||
///
|
||||
/// # Panics
|
||||
/// Only `u64`, `f64`, and `i64` are supported.
|
||||
/// Only `u64`, `f64`, `date`, and `i64` are supported.
|
||||
pub(crate) fn f64_from_fastfield_u64(val: u64, field_type: &Type) -> f64 {
|
||||
match field_type {
|
||||
Type::U64 => val as f64,
|
||||
Type::I64 => i64::from_u64(val) as f64,
|
||||
Type::I64 | Type::Date => i64::from_u64(val) as f64,
|
||||
Type::F64 => f64::from_u64(val),
|
||||
_ => {
|
||||
panic!("unexpected type {:?}. This should not happen", field_type)
|
||||
@@ -295,10 +298,9 @@ pub(crate) fn f64_from_fastfield_u64(val: u64, field_type: &Type) -> f64 {
|
||||
}
|
||||
}
|
||||
|
||||
/// Converts the `f64` value to fast field value space.
|
||||
/// Converts the `f64` value to fast field value space, which is always u64.
|
||||
///
|
||||
/// If the fast field has `u64`, values are stored as `u64` in the fast field.
|
||||
/// A `f64` value of e.g. `2.0` therefore needs to be converted to `1u64`.
|
||||
/// If the fast field has `u64`, values are stored unchanged as `u64` in the fast field.
|
||||
///
|
||||
/// If the fast field has `f64` values are converted and stored to `u64` using a
|
||||
/// monotonic mapping.
|
||||
@@ -308,7 +310,7 @@ pub(crate) fn f64_from_fastfield_u64(val: u64, field_type: &Type) -> f64 {
|
||||
pub(crate) fn f64_to_fastfield_u64(val: f64, field_type: &Type) -> Option<u64> {
|
||||
match field_type {
|
||||
Type::U64 => Some(val as u64),
|
||||
Type::I64 => Some((val as i64).to_u64()),
|
||||
Type::I64 | Type::Date => Some((val as i64).to_u64()),
|
||||
Type::F64 => Some(val.to_u64()),
|
||||
_ => None,
|
||||
}
|
||||
@@ -317,6 +319,7 @@ pub(crate) fn f64_to_fastfield_u64(val: f64, field_type: &Type) -> Option<u64> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use super::agg_req::{Aggregation, Aggregations, BucketAggregation};
|
||||
use super::bucket::RangeAggregation;
|
||||
@@ -332,7 +335,7 @@ mod tests {
|
||||
use crate::aggregation::DistributedAggregationCollector;
|
||||
use crate::query::{AllQuery, TermQuery};
|
||||
use crate::schema::{Cardinality, IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
|
||||
use crate::{Index, Term};
|
||||
use crate::{DateTime, Index, Term};
|
||||
|
||||
fn get_avg_req(field_name: &str) -> Aggregation {
|
||||
Aggregation::Metric(MetricAggregation::Average(
|
||||
@@ -358,7 +361,7 @@ mod tests {
|
||||
index: &Index,
|
||||
query: Option<(&str, &str)>,
|
||||
) -> crate::Result<Value> {
|
||||
let collector = AggregationCollector::from_aggs(agg_req, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req, None, index.schema());
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
@@ -552,10 +555,10 @@ mod tests {
|
||||
let searcher = reader.searcher();
|
||||
let intermediate_agg_result = searcher.search(&AllQuery, &collector).unwrap();
|
||||
intermediate_agg_result
|
||||
.into_final_bucket_result(agg_req)
|
||||
.into_final_bucket_result(agg_req, &index.schema())
|
||||
.unwrap()
|
||||
} else {
|
||||
let collector = AggregationCollector::from_aggs(agg_req, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req, None, index.schema());
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&AllQuery, &collector).unwrap()
|
||||
@@ -648,6 +651,7 @@ mod tests {
|
||||
.set_fast()
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let date_field = schema_builder.add_date_field("date", FAST);
|
||||
schema_builder.add_text_field("dummy_text", STRING);
|
||||
let score_fieldtype =
|
||||
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
|
||||
@@ -665,6 +669,7 @@ mod tests {
|
||||
// writing the segment
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
date_field => DateTime::from_utc(OffsetDateTime::from_unix_timestamp(1_546_300_800).unwrap()),
|
||||
score_field => 1u64,
|
||||
score_field_f64 => 1f64,
|
||||
score_field_i64 => 1i64,
|
||||
@@ -673,6 +678,7 @@ mod tests {
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
date_field => DateTime::from_utc(OffsetDateTime::from_unix_timestamp(1_546_300_800 + 86400).unwrap()),
|
||||
score_field => 3u64,
|
||||
score_field_f64 => 3f64,
|
||||
score_field_i64 => 3i64,
|
||||
@@ -681,18 +687,21 @@ mod tests {
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
date_field => DateTime::from_utc(OffsetDateTime::from_unix_timestamp(1_546_300_800 + 86400).unwrap()),
|
||||
score_field => 5u64,
|
||||
score_field_f64 => 5f64,
|
||||
score_field_i64 => 5i64,
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "nohit",
|
||||
date_field => DateTime::from_utc(OffsetDateTime::from_unix_timestamp(1_546_300_800 + 86400).unwrap()),
|
||||
score_field => 6u64,
|
||||
score_field_f64 => 6f64,
|
||||
score_field_i64 => 6i64,
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
date_field => DateTime::from_utc(OffsetDateTime::from_unix_timestamp(1_546_300_800 + 86400).unwrap()),
|
||||
score_field => 7u64,
|
||||
score_field_f64 => 7f64,
|
||||
score_field_i64 => 7i64,
|
||||
@@ -700,12 +709,14 @@ mod tests {
|
||||
index_writer.commit()?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
date_field => DateTime::from_utc(OffsetDateTime::from_unix_timestamp(1_546_300_800 + 86400).unwrap()),
|
||||
score_field => 11u64,
|
||||
score_field_f64 => 11f64,
|
||||
score_field_i64 => 11i64,
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
date_field => DateTime::from_utc(OffsetDateTime::from_unix_timestamp(1_546_300_800 + 86400 + 86400).unwrap()),
|
||||
score_field => 14u64,
|
||||
score_field_f64 => 14f64,
|
||||
score_field_i64 => 14i64,
|
||||
@@ -713,6 +724,7 @@ mod tests {
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
date_field => DateTime::from_utc(OffsetDateTime::from_unix_timestamp(1_546_300_800 + 86400 + 86400).unwrap()),
|
||||
score_field => 44u64,
|
||||
score_field_f64 => 44.5f64,
|
||||
score_field_i64 => 44i64,
|
||||
@@ -723,6 +735,7 @@ mod tests {
|
||||
// no hits segment
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "nohit",
|
||||
date_field => DateTime::from_utc(OffsetDateTime::from_unix_timestamp(1_546_300_800 + 86400 + 86400).unwrap()),
|
||||
score_field => 44u64,
|
||||
score_field_f64 => 44.5f64,
|
||||
score_field_i64 => 44i64,
|
||||
@@ -795,7 +808,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
|
||||
@@ -995,9 +1008,10 @@ mod tests {
|
||||
// Test de/serialization roundtrip on intermediate_agg_result
|
||||
let res: IntermediateAggregationResults =
|
||||
serde_json::from_str(&serde_json::to_string(&res).unwrap()).unwrap();
|
||||
res.into_final_bucket_result(agg_req.clone()).unwrap()
|
||||
res.into_final_bucket_result(agg_req.clone(), &index.schema())
|
||||
.unwrap()
|
||||
} else {
|
||||
let collector = AggregationCollector::from_aggs(agg_req.clone(), None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req.clone(), None, index.schema());
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&term_query, &collector).unwrap()
|
||||
@@ -1055,7 +1069,7 @@ mod tests {
|
||||
);
|
||||
|
||||
// Test empty result set
|
||||
let collector = AggregationCollector::from_aggs(agg_req, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req, None, index.schema());
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&query_with_no_hits, &collector).unwrap();
|
||||
|
||||
@@ -1120,7 +1134,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
|
||||
|
||||
let searcher = reader.searcher();
|
||||
|
||||
@@ -1233,7 +1247,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
@@ -1264,7 +1278,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
@@ -1295,7 +1309,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
@@ -1334,7 +1348,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
@@ -1363,7 +1377,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req, None, index.schema());
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
@@ -1392,7 +1406,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req, None, index.schema());
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
@@ -1429,7 +1443,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
@@ -1464,7 +1478,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
@@ -1503,7 +1517,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
@@ -1533,7 +1547,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
@@ -1590,7 +1604,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
|
||||
@@ -130,9 +130,7 @@ where
|
||||
|
||||
let fast_field_reader = segment_reader
|
||||
.fast_fields()
|
||||
.typed_fast_field_reader(self.field)?
|
||||
.to_full()
|
||||
.expect("temp migration solution");
|
||||
.typed_fast_field_reader(self.field)?;
|
||||
|
||||
let segment_collector = self
|
||||
.collector
|
||||
|
||||
@@ -112,11 +112,7 @@ impl Collector for HistogramCollector {
|
||||
_segment_local_id: crate::SegmentOrdinal,
|
||||
segment: &crate::SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let ff_reader = segment
|
||||
.fast_fields()
|
||||
.u64_lenient(self.field)?
|
||||
.to_full()
|
||||
.expect("temp migration solution");
|
||||
let ff_reader = segment.fast_fields().u64_lenient(self.field)?;
|
||||
Ok(SegmentHistogramCollector {
|
||||
histogram_computer: HistogramComputer {
|
||||
counts: vec![0; self.num_buckets],
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use fastfield_codecs::OptionalColumn;
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use super::*;
|
||||
use crate::collector::{Count, FilterCollector, TopDocs};
|
||||
@@ -160,7 +160,7 @@ pub struct FastFieldTestCollector {
|
||||
|
||||
pub struct FastFieldSegmentCollector {
|
||||
vals: Vec<u64>,
|
||||
reader: Arc<dyn OptionalColumn<u64>>,
|
||||
reader: Arc<dyn Column<u64>>,
|
||||
}
|
||||
|
||||
impl FastFieldTestCollector {
|
||||
@@ -202,9 +202,7 @@ impl SegmentCollector for FastFieldSegmentCollector {
|
||||
|
||||
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||
let val = self.reader.get_val(doc);
|
||||
if let Some(val) = val {
|
||||
self.vals.push(val);
|
||||
}
|
||||
self.vals.push(val);
|
||||
}
|
||||
|
||||
fn harvest(self) -> Vec<u64> {
|
||||
|
||||
@@ -156,9 +156,7 @@ impl CustomScorer<u64> for ScorerByField {
|
||||
// The conversion will then happen only on the top-K docs.
|
||||
let ff_reader = segment_reader
|
||||
.fast_fields()
|
||||
.typed_fast_field_reader(self.field)?
|
||||
.to_full()
|
||||
.expect("temp migration solution");
|
||||
.typed_fast_field_reader(self.field)?;
|
||||
Ok(ScorerByFastFieldReader { ff_reader })
|
||||
}
|
||||
}
|
||||
@@ -460,7 +458,7 @@ impl TopDocs {
|
||||
///
|
||||
/// // We can now define our actual scoring function
|
||||
/// move |doc: DocId, original_score: Score| {
|
||||
/// let popularity: u64 = popularity_reader.get_val(doc).unwrap();
|
||||
/// let popularity: u64 = popularity_reader.get_val(doc);
|
||||
/// // Well.. For the sake of the example we use a simple logarithm
|
||||
/// // function.
|
||||
/// let popularity_boost_score = ((2u64 + popularity) as Score).log2();
|
||||
@@ -569,8 +567,8 @@ impl TopDocs {
|
||||
///
|
||||
/// // We can now define our actual scoring function
|
||||
/// move |doc: DocId| {
|
||||
/// let popularity: u64 = popularity_reader.get_val(doc).unwrap();
|
||||
/// let boosted: u64 = boosted_reader.get_val(doc).unwrap();
|
||||
/// let popularity: u64 = popularity_reader.get_val(doc);
|
||||
/// let boosted: u64 = boosted_reader.get_val(doc);
|
||||
/// // Score do not have to be `f64` in tantivy.
|
||||
/// // Here we return a couple to get lexicographical order
|
||||
/// // for free.
|
||||
|
||||
@@ -149,7 +149,8 @@ impl IndexBuilder {
|
||||
/// Creates a new index using the [`RamDirectory`].
|
||||
///
|
||||
/// The index will be allocated in anonymous memory.
|
||||
/// This should only be used for unit tests.
|
||||
/// This is useful for indexing small set of documents
|
||||
/// for instances like unit test or temporary in memory index.
|
||||
pub fn create_in_ram(self) -> Result<Index, TantivyError> {
|
||||
let ram_directory = RamDirectory::create();
|
||||
self.create(ram_directory)
|
||||
|
||||
@@ -30,8 +30,8 @@ pub use self::multivalued::{
|
||||
MultiValueIndex, MultiValueU128FastFieldWriter, MultiValuedFastFieldReader,
|
||||
MultiValuedFastFieldWriter, MultiValuedU128FastFieldReader,
|
||||
};
|
||||
pub(crate) use self::readers::type_and_cardinality;
|
||||
pub use self::readers::FastFieldReaders;
|
||||
pub(crate) use self::readers::{type_and_cardinality, FastType};
|
||||
pub use self::serializer::{Column, CompositeFastFieldSerializer};
|
||||
use self::writer::unexpected_value;
|
||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||
@@ -207,10 +207,10 @@ mod tests {
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
assert_eq!(file.len(), 25);
|
||||
assert_eq!(file.len(), 34);
|
||||
let composite_file = CompositeFile::open(&file)?;
|
||||
let fast_field_bytes = composite_file.open_read(*FIELD).unwrap().read_bytes()?;
|
||||
let fast_field_reader = open::<u64>(fast_field_bytes)?.to_full().unwrap();
|
||||
let fast_field_reader = open::<u64>(fast_field_bytes)?;
|
||||
assert_eq!(fast_field_reader.get_val(0), 13u64);
|
||||
assert_eq!(fast_field_reader.get_val(1), 14u64);
|
||||
assert_eq!(fast_field_reader.get_val(2), 2u64);
|
||||
@@ -256,14 +256,14 @@ mod tests {
|
||||
serializer.close()?;
|
||||
}
|
||||
let file = directory.open_read(path)?;
|
||||
assert_eq!(file.len(), 53);
|
||||
assert_eq!(file.len(), 62);
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file)?;
|
||||
let data = fast_fields_composite
|
||||
.open_read(*FIELD)
|
||||
.unwrap()
|
||||
.read_bytes()?;
|
||||
let fast_field_reader = open::<u64>(data)?.to_full().unwrap();
|
||||
let fast_field_reader = open::<u64>(data)?;
|
||||
assert_eq!(fast_field_reader.get_val(0), 4u64);
|
||||
assert_eq!(fast_field_reader.get_val(1), 14_082_001u64);
|
||||
assert_eq!(fast_field_reader.get_val(2), 3_052u64);
|
||||
@@ -297,14 +297,14 @@ mod tests {
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
assert_eq!(file.len(), 26);
|
||||
assert_eq!(file.len(), 35);
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file).unwrap();
|
||||
let data = fast_fields_composite
|
||||
.open_read(*FIELD)
|
||||
.unwrap()
|
||||
.read_bytes()?;
|
||||
let fast_field_reader = open::<u64>(data)?.to_full().unwrap();
|
||||
let fast_field_reader = open::<u64>(data)?;
|
||||
for doc in 0..10_000 {
|
||||
assert_eq!(fast_field_reader.get_val(doc), 100_000u64);
|
||||
}
|
||||
@@ -336,14 +336,14 @@ mod tests {
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
assert_eq!(file.len(), 80040);
|
||||
assert_eq!(file.len(), 80049);
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file)?;
|
||||
let data = fast_fields_composite
|
||||
.open_read(*FIELD)
|
||||
.unwrap()
|
||||
.read_bytes()?;
|
||||
let fast_field_reader = open::<u64>(data)?.to_full().unwrap();
|
||||
let fast_field_reader = open::<u64>(data)?;
|
||||
assert_eq!(fast_field_reader.get_val(0), 0u64);
|
||||
for doc in 1..10_001 {
|
||||
assert_eq!(
|
||||
@@ -378,7 +378,7 @@ mod tests {
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
assert_eq!(file.len(), 40_usize);
|
||||
assert_eq!(file.len(), 49_usize);
|
||||
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file)?;
|
||||
@@ -386,7 +386,7 @@ mod tests {
|
||||
.open_read(i64_field)
|
||||
.unwrap()
|
||||
.read_bytes()?;
|
||||
let fast_field_reader = open::<i64>(data)?.to_full().unwrap();
|
||||
let fast_field_reader = open::<i64>(data)?;
|
||||
|
||||
assert_eq!(fast_field_reader.min_value(), -100i64);
|
||||
assert_eq!(fast_field_reader.max_value(), 9_999i64);
|
||||
@@ -429,7 +429,7 @@ mod tests {
|
||||
.open_read(i64_field)
|
||||
.unwrap()
|
||||
.read_bytes()?;
|
||||
let fast_field_reader = open::<i64>(data)?.to_full().unwrap();
|
||||
let fast_field_reader = open::<i64>(data)?;
|
||||
assert_eq!(fast_field_reader.get_val(0), 0i64);
|
||||
}
|
||||
Ok(())
|
||||
@@ -470,7 +470,7 @@ mod tests {
|
||||
.open_read(*FIELD)
|
||||
.unwrap()
|
||||
.read_bytes()?;
|
||||
let fast_field_reader = open::<u64>(data)?.to_full().unwrap();
|
||||
let fast_field_reader = open::<u64>(data)?;
|
||||
|
||||
for a in 0..n {
|
||||
assert_eq!(fast_field_reader.get_val(a as u32), permutation[a as usize]);
|
||||
@@ -763,28 +763,19 @@ mod tests {
|
||||
let dates_fast_field = fast_fields.dates(multi_date_field).unwrap();
|
||||
let mut dates = vec![];
|
||||
{
|
||||
assert_eq!(
|
||||
date_fast_field.get_val(0).unwrap().into_timestamp_micros(),
|
||||
1i64
|
||||
);
|
||||
assert_eq!(date_fast_field.get_val(0).into_timestamp_micros(), 1i64);
|
||||
dates_fast_field.get_vals(0u32, &mut dates);
|
||||
assert_eq!(dates.len(), 2);
|
||||
assert_eq!(dates[0].into_timestamp_micros(), 2i64);
|
||||
assert_eq!(dates[1].into_timestamp_micros(), 3i64);
|
||||
}
|
||||
{
|
||||
assert_eq!(
|
||||
date_fast_field.get_val(1).unwrap().into_timestamp_micros(),
|
||||
4i64
|
||||
);
|
||||
assert_eq!(date_fast_field.get_val(1).into_timestamp_micros(), 4i64);
|
||||
dates_fast_field.get_vals(1u32, &mut dates);
|
||||
assert!(dates.is_empty());
|
||||
}
|
||||
{
|
||||
assert_eq!(
|
||||
date_fast_field.get_val(2).unwrap().into_timestamp_micros(),
|
||||
0i64
|
||||
);
|
||||
assert_eq!(date_fast_field.get_val(2).into_timestamp_micros(), 0i64);
|
||||
dates_fast_field.get_vals(2u32, &mut dates);
|
||||
assert_eq!(dates.len(), 2);
|
||||
assert_eq!(dates[0].into_timestamp_micros(), 5i64);
|
||||
@@ -831,10 +822,10 @@ mod tests {
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
assert_eq!(file.len(), 24);
|
||||
assert_eq!(file.len(), 33);
|
||||
let composite_file = CompositeFile::open(&file)?;
|
||||
let data = composite_file.open_read(field).unwrap().read_bytes()?;
|
||||
let fast_field_reader = open::<bool>(data)?.to_full().unwrap();
|
||||
let fast_field_reader = open::<bool>(data)?;
|
||||
assert_eq!(fast_field_reader.get_val(0), true);
|
||||
assert_eq!(fast_field_reader.get_val(1), false);
|
||||
assert_eq!(fast_field_reader.get_val(2), true);
|
||||
@@ -869,10 +860,10 @@ mod tests {
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
assert_eq!(file.len(), 36);
|
||||
assert_eq!(file.len(), 45);
|
||||
let composite_file = CompositeFile::open(&file)?;
|
||||
let data = composite_file.open_read(field).unwrap().read_bytes()?;
|
||||
let fast_field_reader = open::<bool>(data)?.to_full().unwrap();
|
||||
let fast_field_reader = open::<bool>(data)?;
|
||||
for i in 0..25 {
|
||||
assert_eq!(fast_field_reader.get_val(i * 2), true);
|
||||
assert_eq!(fast_field_reader.get_val(i * 2 + 1), false);
|
||||
@@ -901,9 +892,9 @@ mod tests {
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
let composite_file = CompositeFile::open(&file)?;
|
||||
assert_eq!(file.len(), 23);
|
||||
assert_eq!(file.len(), 32);
|
||||
let data = composite_file.open_read(field).unwrap().read_bytes()?;
|
||||
let fast_field_reader = open::<bool>(data)?.to_full().unwrap();
|
||||
let fast_field_reader = open::<bool>(data)?;
|
||||
assert_eq!(fast_field_reader.get_val(0), false);
|
||||
|
||||
Ok(())
|
||||
@@ -935,10 +926,10 @@ mod tests {
|
||||
pub fn test_gcd_date() -> crate::Result<()> {
|
||||
let size_prec_sec =
|
||||
test_gcd_date_with_codec(FastFieldCodecType::Bitpacked, DatePrecision::Seconds)?;
|
||||
assert_eq!(size_prec_sec, 28 + (1_000 * 13) / 8); // 13 bits per val = ceil(log_2(number of seconds in 2hours);
|
||||
assert_eq!(size_prec_sec, 5 + 4 + 28 + (1_000 * 13) / 8); // 13 bits per val = ceil(log_2(number of seconds in 2hours);
|
||||
let size_prec_micro =
|
||||
test_gcd_date_with_codec(FastFieldCodecType::Bitpacked, DatePrecision::Microseconds)?;
|
||||
assert_eq!(size_prec_micro, 26 + (1_000 * 33) / 8); // 33 bits per val = ceil(log_2(number of microsecsseconds in 2hours);
|
||||
assert_eq!(size_prec_micro, 5 + 4 + 26 + (1_000 * 33) / 8); // 33 bits per val = ceil(log_2(number of microsecsseconds in 2hours);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -971,9 +962,7 @@ mod tests {
|
||||
let composite_file = CompositeFile::open(&file)?;
|
||||
let file = composite_file.open_read(*FIELD).unwrap();
|
||||
let len = file.len();
|
||||
let test_fastfield = open::<DateTime>(file.read_bytes()?)?
|
||||
.to_full()
|
||||
.expect("temp migration solution");
|
||||
let test_fastfield = open::<DateTime>(file.read_bytes()?)?;
|
||||
|
||||
for (i, time) in times.iter().enumerate() {
|
||||
assert_eq!(test_fastfield.get_val(i as u32), time.truncate(precision));
|
||||
|
||||
@@ -533,17 +533,14 @@ mod bench {
|
||||
.unwrap()
|
||||
.read_bytes()
|
||||
.unwrap();
|
||||
let idx_reader = fastfield_codecs::open(data_idx).unwrap().to_full().unwrap();
|
||||
let idx_reader = fastfield_codecs::open(data_idx).unwrap();
|
||||
|
||||
let data_vals = fast_fields_composite
|
||||
.open_read_with_idx(field, 1)
|
||||
.unwrap()
|
||||
.read_bytes()
|
||||
.unwrap();
|
||||
let vals_reader = fastfield_codecs::open(data_vals)
|
||||
.unwrap()
|
||||
.to_full()
|
||||
.unwrap();
|
||||
let vals_reader = fastfield_codecs::open(data_vals).unwrap();
|
||||
let fast_field_reader = MultiValuedFastFieldReader::open(idx_reader, vals_reader);
|
||||
b.iter(|| {
|
||||
let mut sum = 0u64;
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::net::Ipv6Addr;
|
||||
use std::sync::Arc;
|
||||
|
||||
use fastfield_codecs::{open, open_u128, Column, OptionalColumn};
|
||||
use fastfield_codecs::{open, open_u128, Column};
|
||||
|
||||
use super::multivalued::MultiValuedU128FastFieldReader;
|
||||
use crate::directory::{CompositeFile, FileSlice};
|
||||
@@ -118,7 +118,7 @@ impl FastFieldReaders {
|
||||
&self,
|
||||
field: Field,
|
||||
index: usize,
|
||||
) -> crate::Result<Arc<dyn OptionalColumn<TFastValue>>> {
|
||||
) -> crate::Result<Arc<dyn Column<TFastValue>>> {
|
||||
let fast_field_slice = self.fast_field_data(field, index)?;
|
||||
let bytes = fast_field_slice.read_bytes()?;
|
||||
let column = fastfield_codecs::open(bytes)?;
|
||||
@@ -128,7 +128,7 @@ impl FastFieldReaders {
|
||||
pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
) -> crate::Result<Arc<dyn OptionalColumn<TFastValue>>> {
|
||||
) -> crate::Result<Arc<dyn Column<TFastValue>>> {
|
||||
self.typed_fast_field_reader_with_idx(field, 0)
|
||||
}
|
||||
|
||||
@@ -138,20 +138,13 @@ impl FastFieldReaders {
|
||||
) -> crate::Result<MultiValuedFastFieldReader<TFastValue>> {
|
||||
let idx_reader = self.typed_fast_field_reader(field)?;
|
||||
let vals_reader = self.typed_fast_field_reader_with_idx(field, 1)?;
|
||||
Ok(MultiValuedFastFieldReader::open(
|
||||
idx_reader
|
||||
.to_full()
|
||||
.expect("multivalue fast field are always full"),
|
||||
vals_reader
|
||||
.to_full()
|
||||
.expect("multivalue fast field are always full"),
|
||||
))
|
||||
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
|
||||
}
|
||||
|
||||
/// Returns the `u64` fast field reader reader associated with `field`.
|
||||
///
|
||||
/// If `field` is not a u64 fast field, this method returns an Error.
|
||||
pub fn u64(&self, field: Field) -> crate::Result<Arc<dyn OptionalColumn<u64>>> {
|
||||
pub fn u64(&self, field: Field) -> crate::Result<Arc<dyn Column<u64>>> {
|
||||
self.check_type(field, FastType::U64, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
@@ -159,7 +152,7 @@ impl FastFieldReaders {
|
||||
/// Returns the `ip` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a u128 fast field, this method returns an Error.
|
||||
pub fn ip_addr(&self, field: Field) -> crate::Result<Arc<dyn OptionalColumn<Ipv6Addr>>> {
|
||||
pub fn ip_addr(&self, field: Field) -> crate::Result<Arc<dyn Column<Ipv6Addr>>> {
|
||||
self.check_type(field, FastType::U128, Cardinality::SingleValue)?;
|
||||
let bytes = self.fast_field_data(field, 0)?.read_bytes()?;
|
||||
Ok(open_u128::<Ipv6Addr>(bytes)?)
|
||||
@@ -173,15 +166,10 @@ impl FastFieldReaders {
|
||||
field: Field,
|
||||
) -> crate::Result<MultiValuedU128FastFieldReader<Ipv6Addr>> {
|
||||
self.check_type(field, FastType::U128, Cardinality::MultiValues)?;
|
||||
let idx_reader: Arc<dyn Column<u64>> = self
|
||||
.typed_fast_field_reader(field)?
|
||||
.to_full()
|
||||
.expect("multivalue fast fields are always full");
|
||||
let idx_reader: Arc<dyn Column<u64>> = self.typed_fast_field_reader(field)?;
|
||||
|
||||
let bytes = self.fast_field_data(field, 1)?.read_bytes()?;
|
||||
let vals_reader = open_u128::<Ipv6Addr>(bytes)?
|
||||
.to_full()
|
||||
.expect("multivalue fields are always full");
|
||||
let vals_reader = open_u128::<Ipv6Addr>(bytes)?;
|
||||
|
||||
Ok(MultiValuedU128FastFieldReader::open(
|
||||
idx_reader,
|
||||
@@ -191,9 +179,8 @@ impl FastFieldReaders {
|
||||
|
||||
/// Returns the `u128` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a u128 base type fast field, this method returns an Error.
|
||||
/// Ip addresses use u128 as base type.
|
||||
pub(crate) fn u128(&self, field: Field) -> crate::Result<Arc<dyn OptionalColumn<u128>>> {
|
||||
/// If `field` is not a u128 fast field, this method returns an Error.
|
||||
pub(crate) fn u128(&self, field: Field) -> crate::Result<Arc<dyn Column<u128>>> {
|
||||
self.check_type(field, FastType::U128, Cardinality::SingleValue)?;
|
||||
let bytes = self.fast_field_data(field, 0)?.read_bytes()?;
|
||||
Ok(open_u128::<u128>(bytes)?)
|
||||
@@ -204,15 +191,10 @@ impl FastFieldReaders {
|
||||
/// If `field` is not a u128 multi-valued fast field, this method returns an Error.
|
||||
pub fn u128s(&self, field: Field) -> crate::Result<MultiValuedU128FastFieldReader<u128>> {
|
||||
self.check_type(field, FastType::U128, Cardinality::MultiValues)?;
|
||||
let idx_reader: Arc<dyn Column<u64>> = self
|
||||
.typed_fast_field_reader(field)?
|
||||
.to_full()
|
||||
.expect("multivalue fast fields are always full");
|
||||
let idx_reader: Arc<dyn Column<u64>> = self.typed_fast_field_reader(field)?;
|
||||
|
||||
let bytes = self.fast_field_data(field, 1)?.read_bytes()?;
|
||||
let vals_reader = open_u128::<u128>(bytes)?
|
||||
.to_full()
|
||||
.expect("multivalue fast fields are always full");
|
||||
let vals_reader = open_u128::<u128>(bytes)?;
|
||||
|
||||
Ok(MultiValuedU128FastFieldReader::open(
|
||||
idx_reader,
|
||||
@@ -225,14 +207,14 @@ impl FastFieldReaders {
|
||||
///
|
||||
/// If not, the fastfield reader will returns the u64-value associated with the original
|
||||
/// FastValue.
|
||||
pub fn u64_lenient(&self, field: Field) -> crate::Result<Arc<dyn OptionalColumn<u64>>> {
|
||||
pub fn u64_lenient(&self, field: Field) -> crate::Result<Arc<dyn Column<u64>>> {
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
|
||||
/// Returns the `i64` fast field reader reader associated with `field`.
|
||||
///
|
||||
/// If `field` is not a i64 fast field, this method returns an Error.
|
||||
pub fn i64(&self, field: Field) -> crate::Result<Arc<dyn OptionalColumn<i64>>> {
|
||||
pub fn i64(&self, field: Field) -> crate::Result<Arc<dyn Column<i64>>> {
|
||||
self.check_type(field, FastType::I64, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
@@ -240,7 +222,7 @@ impl FastFieldReaders {
|
||||
/// Returns the `date` fast field reader reader associated with `field`.
|
||||
///
|
||||
/// If `field` is not a date fast field, this method returns an Error.
|
||||
pub fn date(&self, field: Field) -> crate::Result<Arc<dyn OptionalColumn<DateTime>>> {
|
||||
pub fn date(&self, field: Field) -> crate::Result<Arc<dyn Column<DateTime>>> {
|
||||
self.check_type(field, FastType::Date, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
@@ -248,7 +230,7 @@ impl FastFieldReaders {
|
||||
/// Returns the `f64` fast field reader reader associated with `field`.
|
||||
///
|
||||
/// If `field` is not a f64 fast field, this method returns an Error.
|
||||
pub fn f64(&self, field: Field) -> crate::Result<Arc<dyn OptionalColumn<f64>>> {
|
||||
pub fn f64(&self, field: Field) -> crate::Result<Arc<dyn Column<f64>>> {
|
||||
self.check_type(field, FastType::F64, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
@@ -256,7 +238,7 @@ impl FastFieldReaders {
|
||||
/// Returns the `bool` fast field reader reader associated with `field`.
|
||||
///
|
||||
/// If `field` is not a bool fast field, this method returns an Error.
|
||||
pub fn bool(&self, field: Field) -> crate::Result<Arc<dyn OptionalColumn<bool>>> {
|
||||
pub fn bool(&self, field: Field) -> crate::Result<Arc<dyn Column<bool>>> {
|
||||
self.check_type(field, FastType::Bool, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
@@ -327,12 +309,7 @@ impl FastFieldReaders {
|
||||
let fast_field_idx_bytes = fast_field_idx_file.read_bytes()?;
|
||||
let idx_reader = open(fast_field_idx_bytes)?;
|
||||
let data = self.fast_field_data(field, 1)?;
|
||||
BytesFastFieldReader::open(
|
||||
idx_reader
|
||||
.to_full()
|
||||
.expect("multivalue fields are always full"),
|
||||
data,
|
||||
)
|
||||
BytesFastFieldReader::open(idx_reader, data)
|
||||
} else {
|
||||
Err(FastFieldNotAvailableError::new(field_entry).into())
|
||||
}
|
||||
|
||||
@@ -465,9 +465,9 @@ mod tests_indexsorting {
|
||||
let my_number = index.schema().get_field("my_number").unwrap();
|
||||
|
||||
let fast_field = fast_fields.u64(my_number).unwrap();
|
||||
assert_eq!(fast_field.get_val(0), Some(10u64));
|
||||
assert_eq!(fast_field.get_val(1), Some(20u64));
|
||||
assert_eq!(fast_field.get_val(2), Some(30u64));
|
||||
assert_eq!(fast_field.get_val(0), 10u64);
|
||||
assert_eq!(fast_field.get_val(1), 20u64);
|
||||
assert_eq!(fast_field.get_val(2), 30u64);
|
||||
|
||||
let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
|
||||
let multifield = fast_fields.u64s(multi_numbers).unwrap();
|
||||
|
||||
@@ -1467,7 +1467,7 @@ mod tests {
|
||||
let fast_field_reader = segment_reader.fast_fields().u64(id_field)?;
|
||||
let in_order_alive_ids: Vec<u64> = segment_reader
|
||||
.doc_ids_alive()
|
||||
.map(|doc| fast_field_reader.get_val(doc).unwrap())
|
||||
.map(|doc| fast_field_reader.get_val(doc))
|
||||
.collect();
|
||||
assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 1, 0]);
|
||||
Ok(())
|
||||
@@ -1528,7 +1528,7 @@ mod tests {
|
||||
let fast_field_reader = segment_reader.fast_fields().u64(id_field)?;
|
||||
let in_order_alive_ids: Vec<u64> = segment_reader
|
||||
.doc_ids_alive()
|
||||
.map(|doc| fast_field_reader.get_val(doc).unwrap())
|
||||
.map(|doc| fast_field_reader.get_val(doc))
|
||||
.collect();
|
||||
assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 2, 0]);
|
||||
Ok(())
|
||||
@@ -1777,12 +1777,7 @@ mod tests {
|
||||
.segment_readers()
|
||||
.iter()
|
||||
.flat_map(|segment_reader| {
|
||||
let ff_reader = segment_reader
|
||||
.fast_fields()
|
||||
.u64(id_field)
|
||||
.unwrap()
|
||||
.to_full()
|
||||
.unwrap();
|
||||
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
|
||||
segment_reader
|
||||
.doc_ids_alive()
|
||||
.map(move |doc| ff_reader.get_val(doc))
|
||||
@@ -1793,12 +1788,7 @@ mod tests {
|
||||
.segment_readers()
|
||||
.iter()
|
||||
.flat_map(|segment_reader| {
|
||||
let ff_reader = segment_reader
|
||||
.fast_fields()
|
||||
.u64(id_field)
|
||||
.unwrap()
|
||||
.to_full()
|
||||
.unwrap();
|
||||
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
|
||||
segment_reader
|
||||
.doc_ids_alive()
|
||||
.map(move |doc| ff_reader.get_val(doc))
|
||||
@@ -1874,7 +1864,7 @@ mod tests {
|
||||
.flat_map(|segment_reader| {
|
||||
let ff_reader = segment_reader.fast_fields().ip_addr(ip_field).unwrap();
|
||||
segment_reader.doc_ids_alive().flat_map(move |doc| {
|
||||
let val = ff_reader.get_val(doc).unwrap(); // TODO handle null
|
||||
let val = ff_reader.get_val(doc);
|
||||
if val == Ipv6Addr::from_u128(0) {
|
||||
// TODO Fix null handling
|
||||
None
|
||||
@@ -1931,7 +1921,7 @@ mod tests {
|
||||
ff_reader.get_vals(doc, &mut vals);
|
||||
assert_eq!(vals.len(), 2);
|
||||
assert_eq!(vals[0], vals[1]);
|
||||
assert_eq!(id_reader.get_val(doc).unwrap(), vals[0]);
|
||||
assert_eq!(id_reader.get_val(doc), vals[0]);
|
||||
|
||||
let mut bool_vals = vec![];
|
||||
bool_ff_reader.get_vals(doc, &mut bool_vals);
|
||||
@@ -2127,7 +2117,7 @@ mod tests {
|
||||
facet_reader
|
||||
.facet_from_ord(facet_ords[0], &mut facet)
|
||||
.unwrap();
|
||||
let id = ff_reader.get_val(doc_id).unwrap();
|
||||
let id = ff_reader.get_val(doc_id);
|
||||
let facet_expected = Facet::from(&("/cola/".to_string() + &id.to_string()));
|
||||
|
||||
assert_eq!(facet, facet_expected);
|
||||
|
||||
@@ -67,11 +67,12 @@ pub(crate) fn index_json_values<'a>(
|
||||
doc: DocId,
|
||||
json_values: impl Iterator<Item = crate::Result<&'a serde_json::Map<String, serde_json::Value>>>,
|
||||
text_analyzer: &TextAnalyzer,
|
||||
expand_dots_enabled: bool,
|
||||
term_buffer: &mut Term,
|
||||
postings_writer: &mut dyn PostingsWriter,
|
||||
ctx: &mut IndexingContext,
|
||||
) -> crate::Result<()> {
|
||||
let mut json_term_writer = JsonTermWriter::wrap(term_buffer);
|
||||
let mut json_term_writer = JsonTermWriter::wrap(term_buffer, expand_dots_enabled);
|
||||
let mut positions_per_path: IndexingPositionsPerPath = Default::default();
|
||||
for json_value_res in json_values {
|
||||
let json_value = json_value_res?;
|
||||
@@ -259,6 +260,7 @@ pub(crate) fn set_string_and_get_terms(
|
||||
pub struct JsonTermWriter<'a> {
|
||||
term_buffer: &'a mut Term,
|
||||
path_stack: Vec<usize>,
|
||||
expand_dots_enabled: bool,
|
||||
}
|
||||
|
||||
/// Splits a json path supplied to the query parser in such a way that
|
||||
@@ -298,23 +300,25 @@ impl<'a> JsonTermWriter<'a> {
|
||||
pub fn from_field_and_json_path(
|
||||
field: Field,
|
||||
json_path: &str,
|
||||
expand_dots_enabled: bool,
|
||||
term_buffer: &'a mut Term,
|
||||
) -> Self {
|
||||
term_buffer.set_field_and_type(field, Type::Json);
|
||||
let mut json_term_writer = Self::wrap(term_buffer);
|
||||
let mut json_term_writer = Self::wrap(term_buffer, expand_dots_enabled);
|
||||
for segment in split_json_path(json_path) {
|
||||
json_term_writer.push_path_segment(&segment);
|
||||
}
|
||||
json_term_writer
|
||||
}
|
||||
|
||||
pub fn wrap(term_buffer: &'a mut Term) -> Self {
|
||||
pub fn wrap(term_buffer: &'a mut Term, expand_dots_enabled: bool) -> Self {
|
||||
term_buffer.clear_with_type(Type::Json);
|
||||
let mut path_stack = Vec::with_capacity(10);
|
||||
path_stack.push(0);
|
||||
Self {
|
||||
term_buffer,
|
||||
path_stack,
|
||||
expand_dots_enabled,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -336,11 +340,24 @@ impl<'a> JsonTermWriter<'a> {
|
||||
self.trim_to_end_of_path();
|
||||
let buffer = self.term_buffer.value_bytes_mut();
|
||||
let buffer_len = buffer.len();
|
||||
|
||||
if self.path_stack.len() > 1 {
|
||||
buffer[buffer_len - 1] = JSON_PATH_SEGMENT_SEP;
|
||||
}
|
||||
self.term_buffer.append_bytes(segment.as_bytes());
|
||||
self.term_buffer.append_bytes(&[JSON_PATH_SEGMENT_SEP]);
|
||||
if self.expand_dots_enabled && segment.as_bytes().contains(&b'.') {
|
||||
// We need to replace `.` by JSON_PATH_SEGMENT_SEP.
|
||||
self.term_buffer
|
||||
.append_bytes(segment.as_bytes())
|
||||
.iter_mut()
|
||||
.for_each(|byte| {
|
||||
if *byte == b'.' {
|
||||
*byte = JSON_PATH_SEGMENT_SEP;
|
||||
}
|
||||
});
|
||||
} else {
|
||||
self.term_buffer.append_bytes(segment.as_bytes());
|
||||
}
|
||||
self.term_buffer.push_byte(JSON_PATH_SEGMENT_SEP);
|
||||
self.path_stack.push(self.term_buffer.len_bytes());
|
||||
}
|
||||
|
||||
@@ -391,7 +408,7 @@ mod tests {
|
||||
fn test_json_writer() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::with_type_and_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
|
||||
json_writer.push_path_segment("attributes");
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_str("red");
|
||||
@@ -425,7 +442,7 @@ mod tests {
|
||||
fn test_string_term() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::with_type_and_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_str("red");
|
||||
assert_eq!(
|
||||
@@ -438,7 +455,7 @@ mod tests {
|
||||
fn test_i64_term() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::with_type_and_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_fast_value(-4i64);
|
||||
assert_eq!(
|
||||
@@ -451,7 +468,7 @@ mod tests {
|
||||
fn test_u64_term() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::with_type_and_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_fast_value(4u64);
|
||||
assert_eq!(
|
||||
@@ -464,7 +481,7 @@ mod tests {
|
||||
fn test_f64_term() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::with_type_and_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_fast_value(4.0f64);
|
||||
assert_eq!(
|
||||
@@ -477,7 +494,7 @@ mod tests {
|
||||
fn test_bool_term() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::with_type_and_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_fast_value(true);
|
||||
assert_eq!(
|
||||
@@ -490,7 +507,7 @@ mod tests {
|
||||
fn test_push_after_set_path_segment() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::with_type_and_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
|
||||
json_writer.push_path_segment("attribute");
|
||||
json_writer.set_str("something");
|
||||
json_writer.push_path_segment("color");
|
||||
@@ -505,7 +522,7 @@ mod tests {
|
||||
fn test_pop_segment() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::with_type_and_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.push_path_segment("hue");
|
||||
json_writer.pop_path_segment();
|
||||
@@ -520,7 +537,7 @@ mod tests {
|
||||
fn test_json_writer_path() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::with_type_and_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
|
||||
json_writer.push_path_segment("color");
|
||||
assert_eq!(json_writer.path(), b"color");
|
||||
json_writer.push_path_segment("hue");
|
||||
@@ -529,6 +546,37 @@ mod tests {
|
||||
assert_eq!(json_writer.path(), b"color\x01hue");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_json_path_expand_dots_disabled() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::with_type_and_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
|
||||
json_writer.push_path_segment("color.hue");
|
||||
assert_eq!(json_writer.path(), b"color.hue");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_json_path_expand_dots_enabled() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::with_type_and_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term, true);
|
||||
json_writer.push_path_segment("color.hue");
|
||||
assert_eq!(json_writer.path(), b"color\x01hue");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_json_path_expand_dots_enabled_pop_segment() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::with_type_and_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term, true);
|
||||
json_writer.push_path_segment("hello");
|
||||
assert_eq!(json_writer.path(), b"hello");
|
||||
json_writer.push_path_segment("color.hue");
|
||||
assert_eq!(json_writer.path(), b"hello\x01color\x01hue");
|
||||
json_writer.pop_path_segment();
|
||||
assert_eq!(json_writer.path(), b"hello");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_split_json_path_simple() {
|
||||
let json_path = split_json_path("titi.toto");
|
||||
|
||||
@@ -401,15 +401,10 @@ impl IndexMerger {
|
||||
.readers
|
||||
.iter()
|
||||
.map(|reader| {
|
||||
let u128_reader: Arc<dyn Column<u128>> = reader
|
||||
.fast_fields()
|
||||
.u128(field)
|
||||
.expect(
|
||||
"Failed to find a reader for single fast field. This is a tantivy bug and \
|
||||
it should never happen.",
|
||||
)
|
||||
.to_full()
|
||||
.expect("temp migration solution");
|
||||
let u128_reader: Arc<dyn Column<u128>> = reader.fast_fields().u128(field).expect(
|
||||
"Failed to find a reader for single fast field. This is a tantivy bug and it \
|
||||
should never happen.",
|
||||
);
|
||||
u128_reader
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
@@ -470,11 +465,7 @@ impl IndexMerger {
|
||||
sort_by_field: &IndexSortByField,
|
||||
) -> crate::Result<Arc<dyn Column>> {
|
||||
let field_id = expect_field_id_for_sort_field(reader.schema(), sort_by_field)?; // for now expect fastfield, but not strictly required
|
||||
let value_accessor = reader
|
||||
.fast_fields()
|
||||
.u64_lenient(field_id)?
|
||||
.to_full()
|
||||
.expect("temp migration solution");
|
||||
let value_accessor = reader.fast_fields().u64_lenient(field_id)?;
|
||||
Ok(value_accessor)
|
||||
}
|
||||
/// Collecting value_accessors into a vec to bind the lifetime.
|
||||
@@ -1377,16 +1368,16 @@ mod tests {
|
||||
.fast_fields()
|
||||
.u64(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), Some(4000));
|
||||
assert_eq!(score_field_reader.max_value(), Some(7000));
|
||||
assert_eq!(score_field_reader.min_value(), 4000);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
|
||||
let score_field_reader = searcher
|
||||
.segment_reader(1)
|
||||
.fast_fields()
|
||||
.u64(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), Some(1));
|
||||
assert_eq!(score_field_reader.max_value(), Some(3));
|
||||
assert_eq!(score_field_reader.min_value(), 1);
|
||||
assert_eq!(score_field_reader.max_value(), 3);
|
||||
}
|
||||
{
|
||||
// merging the segments
|
||||
@@ -1431,8 +1422,8 @@ mod tests {
|
||||
.fast_fields()
|
||||
.u64(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), Some(3));
|
||||
assert_eq!(score_field_reader.max_value(), Some(7000));
|
||||
assert_eq!(score_field_reader.min_value(), 3);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
}
|
||||
{
|
||||
// test a commit with only deletes
|
||||
@@ -1478,8 +1469,8 @@ mod tests {
|
||||
.fast_fields()
|
||||
.u64(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), Some(3));
|
||||
assert_eq!(score_field_reader.max_value(), Some(7000));
|
||||
assert_eq!(score_field_reader.min_value(), 3);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
}
|
||||
{
|
||||
// Test merging a single segment in order to remove deletes.
|
||||
@@ -1525,8 +1516,8 @@ mod tests {
|
||||
.fast_fields()
|
||||
.u64(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), Some(6000));
|
||||
assert_eq!(score_field_reader.max_value(), Some(7000));
|
||||
assert_eq!(score_field_reader.min_value(), 6000);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
}
|
||||
|
||||
{
|
||||
|
||||
@@ -186,17 +186,17 @@ mod tests {
|
||||
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let fast_field = fast_fields.u64(int_field).unwrap();
|
||||
assert_eq!(fast_field.get_val(5), Some(1u64));
|
||||
assert_eq!(fast_field.get_val(4), Some(2u64));
|
||||
assert_eq!(fast_field.get_val(3), Some(3u64));
|
||||
assert_eq!(fast_field.get_val(5), 1u64);
|
||||
assert_eq!(fast_field.get_val(4), 2u64);
|
||||
assert_eq!(fast_field.get_val(3), 3u64);
|
||||
if force_disjunct_segment_sort_values {
|
||||
assert_eq!(fast_field.get_val(2), Some(20u64));
|
||||
assert_eq!(fast_field.get_val(1), Some(100u64));
|
||||
assert_eq!(fast_field.get_val(2), 20u64);
|
||||
assert_eq!(fast_field.get_val(1), 100u64);
|
||||
} else {
|
||||
assert_eq!(fast_field.get_val(2), Some(10u64));
|
||||
assert_eq!(fast_field.get_val(1), Some(20u64));
|
||||
assert_eq!(fast_field.get_val(2), 10u64);
|
||||
assert_eq!(fast_field.get_val(1), 20u64);
|
||||
}
|
||||
assert_eq!(fast_field.get_val(0), Some(1_000u64));
|
||||
assert_eq!(fast_field.get_val(0), 1_000u64);
|
||||
|
||||
// test new field norm mapping
|
||||
{
|
||||
@@ -373,12 +373,12 @@ mod tests {
|
||||
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let fast_field = fast_fields.u64(int_field).unwrap();
|
||||
assert_eq!(fast_field.get_val(0), Some(1u64));
|
||||
assert_eq!(fast_field.get_val(1), Some(2u64));
|
||||
assert_eq!(fast_field.get_val(2), Some(3u64));
|
||||
assert_eq!(fast_field.get_val(3), Some(10u64));
|
||||
assert_eq!(fast_field.get_val(4), Some(20u64));
|
||||
assert_eq!(fast_field.get_val(5), Some(1_000u64));
|
||||
assert_eq!(fast_field.get_val(0), 1u64);
|
||||
assert_eq!(fast_field.get_val(1), 2u64);
|
||||
assert_eq!(fast_field.get_val(2), 3u64);
|
||||
assert_eq!(fast_field.get_val(3), 10u64);
|
||||
assert_eq!(fast_field.get_val(4), 20u64);
|
||||
assert_eq!(fast_field.get_val(5), 1_000u64);
|
||||
|
||||
let get_vals = |fast_field: &MultiValuedFastFieldReader<u64>, doc_id: u32| -> Vec<u64> {
|
||||
let mut vals = vec![];
|
||||
@@ -535,15 +535,11 @@ mod bench_sorted_index_merge {
|
||||
b.iter(|| {
|
||||
let sorted_doc_ids = doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| {
|
||||
let reader = &merger.readers[doc_addr.segment_ord as usize];
|
||||
let u64_reader: Arc<dyn Column<u64>> = reader
|
||||
.fast_fields()
|
||||
.typed_fast_field_reader(field)
|
||||
.expect(
|
||||
let u64_reader: Arc<dyn Column<u64>> =
|
||||
reader.fast_fields().typed_fast_field_reader(field).expect(
|
||||
"Failed to find a reader for single fast field. This is a tantivy bug and \
|
||||
it should never happen.",
|
||||
)
|
||||
.to_full()
|
||||
.unwrap();
|
||||
);
|
||||
(doc_addr.doc_id, reader, u64_reader)
|
||||
});
|
||||
// add values in order of the new doc_ids
|
||||
|
||||
@@ -60,7 +60,7 @@ type AddBatchReceiver = channel::Receiver<AddBatch>;
|
||||
mod tests_mmap {
|
||||
use crate::collector::Count;
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::{Schema, STORED, TEXT};
|
||||
use crate::schema::{JsonObjectOptions, Schema, TEXT};
|
||||
use crate::{Index, Term};
|
||||
|
||||
#[test]
|
||||
@@ -81,9 +81,9 @@ mod tests_mmap {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_json_field_espace() {
|
||||
fn test_json_field_expand_dots_disabled_dot_escaped_required() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_field = schema_builder.add_json_field("json", TEXT | STORED);
|
||||
let json_field = schema_builder.add_json_field("json", TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
|
||||
@@ -99,4 +99,26 @@ mod tests_mmap {
|
||||
let num_docs = searcher.search(&query, &Count).unwrap();
|
||||
assert_eq!(num_docs, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_json_field_expand_dots_enabled_dot_escape_not_required() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_options: JsonObjectOptions =
|
||||
JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
|
||||
let json_field = schema_builder.add_json_field("json", json_options);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
|
||||
index_writer.add_document(doc!(json_field=>json)).unwrap();
|
||||
index_writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.num_docs(), 1);
|
||||
let parse_query = QueryParser::for_index(&index, Vec::new());
|
||||
let query = parse_query
|
||||
.parse_query(r#"json.k8s.container.name:prometheus"#)
|
||||
.unwrap();
|
||||
let num_docs = searcher.search(&query, &Count).unwrap();
|
||||
assert_eq!(num_docs, 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -180,7 +180,7 @@ impl SegmentWriter {
|
||||
self.per_field_postings_writers.get_for_field_mut(field);
|
||||
term_buffer.clear_with_field_and_type(field_entry.field_type().value_type(), field);
|
||||
|
||||
match *field_entry.field_type() {
|
||||
match field_entry.field_type() {
|
||||
FieldType::Facet(_) => {
|
||||
for value in values {
|
||||
let facet = value.as_facet().ok_or_else(make_schema_error)?;
|
||||
@@ -307,7 +307,7 @@ impl SegmentWriter {
|
||||
self.fieldnorms_writer.record(doc_id, field, num_vals);
|
||||
}
|
||||
}
|
||||
FieldType::JsonObject(_) => {
|
||||
FieldType::JsonObject(json_options) => {
|
||||
let text_analyzer = &self.per_field_text_analyzers[field.field_id() as usize];
|
||||
let json_values_it =
|
||||
values.map(|value| value.as_json().ok_or_else(make_schema_error));
|
||||
@@ -315,6 +315,7 @@ impl SegmentWriter {
|
||||
doc_id,
|
||||
json_values_it,
|
||||
text_analyzer,
|
||||
json_options.is_expand_dots_enabled(),
|
||||
term_buffer,
|
||||
postings_writer,
|
||||
ctx,
|
||||
@@ -557,7 +558,7 @@ mod tests {
|
||||
let mut term = Term::with_type_and_field(Type::Json, json_field);
|
||||
let mut term_stream = term_dict.stream().unwrap();
|
||||
|
||||
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
|
||||
let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
|
||||
|
||||
json_term_writer.push_path_segment("bool");
|
||||
json_term_writer.set_fast_value(true);
|
||||
@@ -648,7 +649,7 @@ mod tests {
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let inv_index = segment_reader.inverted_index(json_field).unwrap();
|
||||
let mut term = Term::with_type_and_field(Type::Json, json_field);
|
||||
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
|
||||
let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
|
||||
json_term_writer.push_path_segment("mykey");
|
||||
json_term_writer.set_str("token");
|
||||
let term_info = inv_index
|
||||
@@ -692,7 +693,7 @@ mod tests {
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let inv_index = segment_reader.inverted_index(json_field).unwrap();
|
||||
let mut term = Term::with_type_and_field(Type::Json, json_field);
|
||||
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
|
||||
let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
|
||||
json_term_writer.push_path_segment("mykey");
|
||||
json_term_writer.set_str("two tokens");
|
||||
let term_info = inv_index
|
||||
@@ -737,7 +738,7 @@ mod tests {
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let mut term = Term::with_type_and_field(Type::Json, json_field);
|
||||
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
|
||||
let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
|
||||
json_term_writer.push_path_segment("mykey");
|
||||
json_term_writer.push_path_segment("field");
|
||||
json_term_writer.set_str("hello");
|
||||
|
||||
@@ -46,15 +46,11 @@ impl<'a> RemappedDocIdColumn<'a> {
|
||||
let (min_value, max_value) = readers
|
||||
.iter()
|
||||
.filter_map(|reader| {
|
||||
let u64_reader: Arc<dyn Column<u64>> = reader
|
||||
.fast_fields()
|
||||
.typed_fast_field_reader(field)
|
||||
.expect(
|
||||
let u64_reader: Arc<dyn Column<u64>> =
|
||||
reader.fast_fields().typed_fast_field_reader(field).expect(
|
||||
"Failed to find a reader for single fast field. This is a tantivy bug and \
|
||||
it should never happen.",
|
||||
)
|
||||
.to_full()
|
||||
.expect("temp migration solution");
|
||||
);
|
||||
compute_min_max_val(&*u64_reader, reader)
|
||||
})
|
||||
.reduce(|a, b| (a.0.min(b.0), a.1.max(b.1)))
|
||||
@@ -63,15 +59,11 @@ impl<'a> RemappedDocIdColumn<'a> {
|
||||
let fast_field_readers = readers
|
||||
.iter()
|
||||
.map(|reader| {
|
||||
let u64_reader: Arc<dyn Column<u64>> = reader
|
||||
.fast_fields()
|
||||
.typed_fast_field_reader(field)
|
||||
.expect(
|
||||
let u64_reader: Arc<dyn Column<u64>> =
|
||||
reader.fast_fields().typed_fast_field_reader(field).expect(
|
||||
"Failed to find a reader for single fast field. This is a tantivy bug and \
|
||||
it should never happen.",
|
||||
)
|
||||
.to_full()
|
||||
.expect("temp migration solution");
|
||||
);
|
||||
u64_reader
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
@@ -1037,21 +1037,21 @@ pub mod tests {
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_unsigned);
|
||||
assert!(fast_field_reader_opt.is_ok());
|
||||
let fast_field_reader = fast_field_reader_opt.unwrap();
|
||||
assert_eq!(fast_field_reader.get_val(0), Some(4u64))
|
||||
assert_eq!(fast_field_reader.get_val(0), 4u64)
|
||||
}
|
||||
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_fields().i64(fast_field_signed);
|
||||
assert!(fast_field_reader_res.is_ok());
|
||||
let fast_field_reader = fast_field_reader_res.unwrap();
|
||||
assert_eq!(fast_field_reader.get_val(0), Some(4i64))
|
||||
assert_eq!(fast_field_reader.get_val(0), 4i64)
|
||||
}
|
||||
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_fields().f64(fast_field_float);
|
||||
assert!(fast_field_reader_res.is_ok());
|
||||
let fast_field_reader = fast_field_reader_res.unwrap();
|
||||
assert_eq!(fast_field_reader.get_val(0), Some(4f64))
|
||||
assert_eq!(fast_field_reader.get_val(0), 4f64)
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -16,7 +16,8 @@ use crate::query::{
|
||||
TermQuery, TermSetQuery,
|
||||
};
|
||||
use crate::schema::{
|
||||
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, Schema, Term, Type,
|
||||
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
|
||||
Schema, Term, Type,
|
||||
};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::OffsetDateTime;
|
||||
@@ -182,7 +183,6 @@ pub struct QueryParser {
|
||||
conjunction_by_default: bool,
|
||||
tokenizer_manager: TokenizerManager,
|
||||
boost: HashMap<Field, Score>,
|
||||
field_names: HashMap<String, Field>,
|
||||
}
|
||||
|
||||
fn all_negative(ast: &LogicalAst) -> bool {
|
||||
@@ -195,31 +195,6 @@ fn all_negative(ast: &LogicalAst) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the position (in byte offsets) of the unescaped '.' in the `field_path`.
|
||||
//
|
||||
// This function operates directly on bytes (as opposed to codepoint), relying
|
||||
// on a encoding property of utf-8 for its correctness.
|
||||
fn locate_splitting_dots(field_path: &str) -> Vec<usize> {
|
||||
let mut splitting_dots_pos = Vec::new();
|
||||
let mut escape_state = false;
|
||||
for (pos, b) in field_path.bytes().enumerate() {
|
||||
if escape_state {
|
||||
escape_state = false;
|
||||
continue;
|
||||
}
|
||||
match b {
|
||||
b'\\' => {
|
||||
escape_state = true;
|
||||
}
|
||||
b'.' => {
|
||||
splitting_dots_pos.push(pos);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
splitting_dots_pos
|
||||
}
|
||||
|
||||
impl QueryParser {
|
||||
/// Creates a `QueryParser`, given
|
||||
/// * schema - index Schema
|
||||
@@ -229,34 +204,19 @@ impl QueryParser {
|
||||
default_fields: Vec<Field>,
|
||||
tokenizer_manager: TokenizerManager,
|
||||
) -> QueryParser {
|
||||
let field_names = schema
|
||||
.fields()
|
||||
.map(|(field, field_entry)| (field_entry.name().to_string(), field))
|
||||
.collect();
|
||||
QueryParser {
|
||||
schema,
|
||||
default_fields,
|
||||
tokenizer_manager,
|
||||
conjunction_by_default: false,
|
||||
boost: Default::default(),
|
||||
field_names,
|
||||
}
|
||||
}
|
||||
|
||||
// Splits a full_path as written in a query, into a field name and a
|
||||
// json path.
|
||||
pub(crate) fn split_full_path<'a>(&self, full_path: &'a str) -> Option<(Field, &'a str)> {
|
||||
if let Some(field) = self.field_names.get(full_path) {
|
||||
return Some((*field, ""));
|
||||
}
|
||||
let mut splitting_period_pos: Vec<usize> = locate_splitting_dots(full_path);
|
||||
while let Some(pos) = splitting_period_pos.pop() {
|
||||
let (prefix, suffix) = full_path.split_at(pos);
|
||||
if let Some(field) = self.field_names.get(prefix) {
|
||||
return Some((*field, &suffix[1..]));
|
||||
}
|
||||
}
|
||||
None
|
||||
self.schema.find_field(full_path)
|
||||
}
|
||||
|
||||
/// Creates a `QueryParser`, given
|
||||
@@ -482,28 +442,14 @@ impl QueryParser {
|
||||
.into_iter()
|
||||
.collect())
|
||||
}
|
||||
FieldType::JsonObject(ref json_options) => {
|
||||
let option = json_options.get_text_indexing_options().ok_or_else(|| {
|
||||
// This should have been seen earlier really.
|
||||
QueryParserError::FieldNotIndexed(field_name.to_string())
|
||||
})?;
|
||||
let text_analyzer =
|
||||
self.tokenizer_manager
|
||||
.get(option.tokenizer())
|
||||
.ok_or_else(|| QueryParserError::UnknownTokenizer {
|
||||
field: field_name.to_string(),
|
||||
tokenizer: option.tokenizer().to_string(),
|
||||
})?;
|
||||
let index_record_option = option.index_option();
|
||||
generate_literals_for_json_object(
|
||||
field_name,
|
||||
field,
|
||||
json_path,
|
||||
phrase,
|
||||
&text_analyzer,
|
||||
index_record_option,
|
||||
)
|
||||
}
|
||||
FieldType::JsonObject(ref json_options) => generate_literals_for_json_object(
|
||||
field_name,
|
||||
field,
|
||||
json_path,
|
||||
phrase,
|
||||
&self.tokenizer_manager,
|
||||
json_options,
|
||||
),
|
||||
FieldType::Facet(_) => match Facet::from_text(phrase) {
|
||||
Ok(facet) => {
|
||||
let facet_term = Term::from_facet(field, &facet);
|
||||
@@ -767,17 +713,32 @@ fn generate_literals_for_json_object(
|
||||
field: Field,
|
||||
json_path: &str,
|
||||
phrase: &str,
|
||||
text_analyzer: &TextAnalyzer,
|
||||
index_record_option: IndexRecordOption,
|
||||
tokenizer_manager: &TokenizerManager,
|
||||
json_options: &JsonObjectOptions,
|
||||
) -> Result<Vec<LogicalLiteral>, QueryParserError> {
|
||||
let text_options = json_options.get_text_indexing_options().ok_or_else(|| {
|
||||
// This should have been seen earlier really.
|
||||
QueryParserError::FieldNotIndexed(field_name.to_string())
|
||||
})?;
|
||||
let text_analyzer = tokenizer_manager
|
||||
.get(text_options.tokenizer())
|
||||
.ok_or_else(|| QueryParserError::UnknownTokenizer {
|
||||
field: field_name.to_string(),
|
||||
tokenizer: text_options.tokenizer().to_string(),
|
||||
})?;
|
||||
let index_record_option = text_options.index_option();
|
||||
let mut logical_literals = Vec::new();
|
||||
let mut term = Term::with_capacity(100);
|
||||
let mut json_term_writer =
|
||||
JsonTermWriter::from_field_and_json_path(field, json_path, &mut term);
|
||||
let mut json_term_writer = JsonTermWriter::from_field_and_json_path(
|
||||
field,
|
||||
json_path,
|
||||
json_options.is_expand_dots_enabled(),
|
||||
&mut term,
|
||||
);
|
||||
if let Some(term) = convert_to_fast_value_and_get_term(&mut json_term_writer, phrase) {
|
||||
logical_literals.push(LogicalLiteral::Term(term));
|
||||
}
|
||||
let terms = set_string_and_get_terms(&mut json_term_writer, phrase, text_analyzer);
|
||||
let terms = set_string_and_get_terms(&mut json_term_writer, phrase, &text_analyzer);
|
||||
drop(json_term_writer);
|
||||
if terms.len() <= 1 {
|
||||
for (_, term) in terms {
|
||||
@@ -1564,13 +1525,6 @@ mod test {
|
||||
assert_eq!(query_parser.split_full_path("firsty"), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_locate_splitting_dots() {
|
||||
assert_eq!(&super::locate_splitting_dots("a.b.c"), &[1, 3]);
|
||||
assert_eq!(&super::locate_splitting_dots(r#"a\.b.c"#), &[4]);
|
||||
assert_eq!(&super::locate_splitting_dots(r#"a\..b.c"#), &[3, 5]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_slop() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
|
||||
@@ -7,7 +7,7 @@ use std::ops::{Bound, RangeInclusive};
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use fastfield_codecs::{MonotonicallyMappableToU128, OptionalColumn};
|
||||
use fastfield_codecs::{Column, MonotonicallyMappableToU128};
|
||||
|
||||
use super::range_query::map_bound;
|
||||
use super::{ConstScorer, Explanation, Scorer, Weight};
|
||||
@@ -45,10 +45,12 @@ impl Weight for IPFastFieldRangeWeight {
|
||||
match field_type.fastfield_cardinality().unwrap() {
|
||||
Cardinality::SingleValue => {
|
||||
let ip_addr_fast_field = reader.fast_fields().ip_addr(self.field)?;
|
||||
let minmax = ip_addr_fast_field
|
||||
.min_value()
|
||||
.zip(ip_addr_fast_field.max_value());
|
||||
let value_range = bound_to_value_range(&self.left_bound, &self.right_bound, minmax);
|
||||
let value_range = bound_to_value_range(
|
||||
&self.left_bound,
|
||||
&self.right_bound,
|
||||
ip_addr_fast_field.min_value(),
|
||||
ip_addr_fast_field.max_value(),
|
||||
);
|
||||
let docset = IpRangeDocSet::new(
|
||||
value_range,
|
||||
IpFastFieldCardinality::SingleValue(ip_addr_fast_field),
|
||||
@@ -60,10 +62,8 @@ impl Weight for IPFastFieldRangeWeight {
|
||||
let value_range = bound_to_value_range(
|
||||
&self.left_bound,
|
||||
&self.right_bound,
|
||||
Some((
|
||||
ip_addr_fast_field.min_value(),
|
||||
ip_addr_fast_field.max_value(),
|
||||
)),
|
||||
ip_addr_fast_field.min_value(),
|
||||
ip_addr_fast_field.max_value(),
|
||||
);
|
||||
let docset = IpRangeDocSet::new(
|
||||
value_range,
|
||||
@@ -91,10 +91,9 @@ impl Weight for IPFastFieldRangeWeight {
|
||||
fn bound_to_value_range(
|
||||
left_bound: &Bound<Ipv6Addr>,
|
||||
right_bound: &Bound<Ipv6Addr>,
|
||||
min_max: Option<(Ipv6Addr, Ipv6Addr)>,
|
||||
min_value: Ipv6Addr,
|
||||
max_value: Ipv6Addr,
|
||||
) -> RangeInclusive<Ipv6Addr> {
|
||||
let (min_value, max_value) =
|
||||
min_max.unwrap_or((Ipv6Addr::from(u128::MIN), Ipv6Addr::from(u128::MAX)));
|
||||
let start_value = match left_bound {
|
||||
Bound::Included(ip_addr) => *ip_addr,
|
||||
Bound::Excluded(ip_addr) => Ipv6Addr::from(ip_addr.to_u128() + 1),
|
||||
@@ -143,7 +142,7 @@ impl VecCursor {
|
||||
}
|
||||
|
||||
pub(crate) enum IpFastFieldCardinality {
|
||||
SingleValue(Arc<dyn OptionalColumn<Ipv6Addr>>),
|
||||
SingleValue(Arc<dyn Column<Ipv6Addr>>),
|
||||
MultiValue(MultiValuedU128FastFieldReader<Ipv6Addr>),
|
||||
}
|
||||
|
||||
|
||||
@@ -181,6 +181,11 @@ impl FieldType {
|
||||
matches!(self, FieldType::IpAddr(_))
|
||||
}
|
||||
|
||||
/// returns true if this is an date field
|
||||
pub fn is_date(&self) -> bool {
|
||||
matches!(self, FieldType::Date(_))
|
||||
}
|
||||
|
||||
/// returns true if the field is indexed.
|
||||
pub fn is_indexed(&self) -> bool {
|
||||
match *self {
|
||||
|
||||
@@ -13,6 +13,8 @@ pub struct JsonObjectOptions {
|
||||
// If set to some, int, date, f64 and text will be indexed.
|
||||
// Text will use the TextFieldIndexing setting for indexing.
|
||||
indexing: Option<TextFieldIndexing>,
|
||||
|
||||
expand_dots_enabled: bool,
|
||||
}
|
||||
|
||||
impl JsonObjectOptions {
|
||||
@@ -26,6 +28,29 @@ impl JsonObjectOptions {
|
||||
self.indexing.is_some()
|
||||
}
|
||||
|
||||
/// Returns `true` iff dots in json keys should be expanded.
|
||||
///
|
||||
/// When expand_dots is enabled, json object like
|
||||
/// `{"k8s.node.id": 5}` is processed as if it was
|
||||
/// `{"k8s": {"node": {"id": 5}}}`.
|
||||
/// It option has the merit of allowing users to
|
||||
/// write queries like `k8s.node.id:5`.
|
||||
/// On the other, enabling that feature can lead to
|
||||
/// ambiguity.
|
||||
///
|
||||
/// If disabled, the "." need to be escaped:
|
||||
/// `k8s\.node\.id:5`.
|
||||
pub fn is_expand_dots_enabled(&self) -> bool {
|
||||
self.expand_dots_enabled
|
||||
}
|
||||
|
||||
/// Sets `expands_dots` to true.
|
||||
/// See `is_expand_dots_enabled` for more information.
|
||||
pub fn set_expand_dots_enabled(mut self) -> Self {
|
||||
self.expand_dots_enabled = true;
|
||||
self
|
||||
}
|
||||
|
||||
/// Returns the text indexing options.
|
||||
///
|
||||
/// If set to `Some` then both int and str values will be indexed.
|
||||
@@ -55,6 +80,7 @@ impl From<StoredFlag> for JsonObjectOptions {
|
||||
JsonObjectOptions {
|
||||
stored: true,
|
||||
indexing: None,
|
||||
expand_dots_enabled: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -69,10 +95,11 @@ impl<T: Into<JsonObjectOptions>> BitOr<T> for JsonObjectOptions {
|
||||
type Output = JsonObjectOptions;
|
||||
|
||||
fn bitor(self, other: T) -> Self {
|
||||
let other = other.into();
|
||||
let other: JsonObjectOptions = other.into();
|
||||
JsonObjectOptions {
|
||||
indexing: self.indexing.or(other.indexing),
|
||||
stored: self.stored | other.stored,
|
||||
expand_dots_enabled: self.expand_dots_enabled | other.expand_dots_enabled,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -93,6 +120,7 @@ impl From<TextOptions> for JsonObjectOptions {
|
||||
JsonObjectOptions {
|
||||
stored: text_options.is_stored(),
|
||||
indexing: text_options.get_indexing_options().cloned(),
|
||||
expand_dots_enabled: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -252,6 +252,31 @@ impl Eq for InnerSchema {}
|
||||
#[derive(Clone, Eq, PartialEq, Debug)]
|
||||
pub struct Schema(Arc<InnerSchema>);
|
||||
|
||||
// Returns the position (in byte offsets) of the unescaped '.' in the `field_path`.
|
||||
//
|
||||
// This function operates directly on bytes (as opposed to codepoint), relying
|
||||
// on a encoding property of utf-8 for its correctness.
|
||||
fn locate_splitting_dots(field_path: &str) -> Vec<usize> {
|
||||
let mut splitting_dots_pos = Vec::new();
|
||||
let mut escape_state = false;
|
||||
for (pos, b) in field_path.bytes().enumerate() {
|
||||
if escape_state {
|
||||
escape_state = false;
|
||||
continue;
|
||||
}
|
||||
match b {
|
||||
b'\\' => {
|
||||
escape_state = true;
|
||||
}
|
||||
b'.' => {
|
||||
splitting_dots_pos.push(pos);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
splitting_dots_pos
|
||||
}
|
||||
|
||||
impl Schema {
|
||||
/// Return the `FieldEntry` associated with a `Field`.
|
||||
pub fn get_field_entry(&self, field: Field) -> &FieldEntry {
|
||||
@@ -358,6 +383,28 @@ impl Schema {
|
||||
}
|
||||
Ok(doc)
|
||||
}
|
||||
|
||||
/// Searches for a full_path in the schema, returning the field name and a JSON path.
|
||||
///
|
||||
/// This function works by checking if the field exists for the exact given full_path.
|
||||
/// If it's not, it splits the full_path at non-escaped '.' chars and tries to match the
|
||||
/// prefix with the field names, favoring the longest field names.
|
||||
///
|
||||
/// This does not check if field is a JSON field. It is possible for this functions to
|
||||
/// return a non-empty JSON path with a non-JSON field.
|
||||
pub fn find_field<'a>(&self, full_path: &'a str) -> Option<(Field, &'a str)> {
|
||||
if let Some(field) = self.0.fields_map.get(full_path) {
|
||||
return Some((*field, ""));
|
||||
}
|
||||
let mut splitting_period_pos: Vec<usize> = locate_splitting_dots(full_path);
|
||||
while let Some(pos) = splitting_period_pos.pop() {
|
||||
let (prefix, suffix) = full_path.split_at(pos);
|
||||
if let Some(field) = self.0.fields_map.get(prefix) {
|
||||
return Some((*field, &suffix[1..]));
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for Schema {
|
||||
@@ -436,6 +483,13 @@ mod tests {
|
||||
use crate::schema::schema::DocParsingError::InvalidJson;
|
||||
use crate::schema::*;
|
||||
|
||||
#[test]
|
||||
fn test_locate_splitting_dots() {
|
||||
assert_eq!(&super::locate_splitting_dots("a.b.c"), &[1, 3]);
|
||||
assert_eq!(&super::locate_splitting_dots(r#"a\.b.c"#), &[4]);
|
||||
assert_eq!(&super::locate_splitting_dots(r#"a\..b.c"#), &[3, 5]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn is_indexed_test() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -936,4 +990,46 @@ mod tests {
|
||||
]"#;
|
||||
assert_eq!(schema_json, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_find_field() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_json_field("foo", STRING);
|
||||
|
||||
schema_builder.add_text_field("bar", STRING);
|
||||
schema_builder.add_text_field("foo.bar", STRING);
|
||||
schema_builder.add_text_field("foo.bar.baz", STRING);
|
||||
schema_builder.add_text_field("bar.a.b.c", STRING);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
assert_eq!(
|
||||
schema.find_field("foo.bar"),
|
||||
Some((schema.get_field("foo.bar").unwrap(), ""))
|
||||
);
|
||||
assert_eq!(
|
||||
schema.find_field("foo.bar.bar"),
|
||||
Some((schema.get_field("foo.bar").unwrap(), "bar"))
|
||||
);
|
||||
assert_eq!(
|
||||
schema.find_field("foo.bar.baz"),
|
||||
Some((schema.get_field("foo.bar.baz").unwrap(), ""))
|
||||
);
|
||||
assert_eq!(
|
||||
schema.find_field("foo.toto"),
|
||||
Some((schema.get_field("foo").unwrap(), "toto"))
|
||||
);
|
||||
assert_eq!(
|
||||
schema.find_field("foo.bar"),
|
||||
Some((schema.get_field("foo.bar").unwrap(), ""))
|
||||
);
|
||||
assert_eq!(
|
||||
schema.find_field("bar.toto.titi"),
|
||||
Some((schema.get_field("bar").unwrap(), "toto.titi"))
|
||||
);
|
||||
|
||||
assert_eq!(schema.find_field("hello"), None);
|
||||
assert_eq!(schema.find_field(""), None);
|
||||
assert_eq!(schema.find_field("thiswouldbeareallylongfieldname"), None);
|
||||
assert_eq!(schema.find_field("baz.bar.foo"), None);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -197,8 +197,19 @@ impl Term {
|
||||
}
|
||||
|
||||
/// Appends value bytes to the Term.
|
||||
pub fn append_bytes(&mut self, bytes: &[u8]) {
|
||||
///
|
||||
/// This function returns the segment that has just been added.
|
||||
#[inline]
|
||||
pub fn append_bytes(&mut self, bytes: &[u8]) -> &mut [u8] {
|
||||
let len_before = self.0.len();
|
||||
self.0.extend_from_slice(bytes);
|
||||
&mut self.0[len_before..]
|
||||
}
|
||||
|
||||
/// Appends a single byte to the term.
|
||||
#[inline]
|
||||
pub fn push_byte(&mut self, byte: u8) {
|
||||
self.0.push(byte);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user