Compare commits

..

1 Commits

Author SHA1 Message Date
Paul Masurel
727d024a23 Bugfix position broken.
For a Field with several FieldValues, where one
value contained no token at all, the token position
was reinitialized to 0.

As a result, PhraseQueries can show some false positives.
In addition, after the computation of the position delta, we can
underflow u32 and end up with a gigantic delta.

We haven't been able to actually explain the bug in #1629, but it
is assumed that in some corner cases these deltas can cause a panic.

Closes #1629
2022-10-20 10:19:41 +09:00
121 changed files with 1015 additions and 6455 deletions

View File

@@ -48,7 +48,7 @@ jobs:
strategy:
matrix:
features: [
{ label: "all", flags: "mmap,stopwords,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
{ label: "all", flags: "mmap,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
{ label: "quickwit", flags: "mmap,quickwit,failpoints" }
]

View File

@@ -1,13 +1,7 @@
Tantivy 0.19
================================
#### Bugfixes
- Fix missing fieldnorms for u64, i64, f64, bool, bytes and date [#1620](https://github.com/quickwit-oss/tantivy/pull/1620) (@PSeitz)
- Fix interpolation overflow in linear interpolation fastfield codec [#1480](https://github.com/quickwit-oss/tantivy/pull/1480) (@PSeitz @fulmicoton)
#### Features/Improvements
- Add support for `IN` in queryparser, e.g. `field: IN [val1 val2 val3]` [#1683](https://github.com/quickwit-oss/tantivy/pull/1683) (@trinity-1686a)
- Skip score calculation, when no scoring is required [#1646](https://github.com/quickwit-oss/tantivy/pull/1646) (@PSeitz)
- Limit fast fields to u32 (`get_val(u32)`) [#1644](https://github.com/quickwit-oss/tantivy/pull/1644) (@PSeitz)
- Major bugfix: Fix missing fieldnorms for u64, i64, f64, bool, bytes and date [#1620](https://github.com/quickwit-oss/tantivy/pull/1620) (@PSeitz)
- Updated [Date Field Type](https://github.com/quickwit-oss/tantivy/pull/1396)
The `DateTime` type has been updated to hold timestamps with microseconds precision.
`DateOptions` and `DatePrecision` have been added to configure Date fields. The precision is used as a hint for fast value compression. Otherwise, seconds precision is used everywhere else (i.e. terms, indexing). (@evanxg852000)
@@ -15,6 +9,7 @@ Tantivy 0.19
- Add boolean field type [#1382](https://github.com/quickwit-oss/tantivy/pull/1382) (@boraarslan)
- Remove Searcher pool and make `Searcher` cloneable. (@PSeitz)
- Validate settings on create [#1570](https://github.com/quickwit-oss/tantivy/pull/1570) (@PSeitz)
- Fix interpolation overflow in linear interpolation fastfield codec [#1480](https://github.com/quickwit-oss/tantivy/pull/1480) (@PSeitz @fulmicoton)
- Detect and apply gcd on fastfield codecs [#1418](https://github.com/quickwit-oss/tantivy/pull/1418) (@PSeitz)
- Doc store
- use separate thread to compress block store [#1389](https://github.com/quickwit-oss/tantivy/pull/1389) [#1510](https://github.com/quickwit-oss/tantivy/pull/1510) (@PSeitz @fulmicoton)
@@ -24,15 +19,14 @@ Tantivy 0.19
- Make `tantivy::TantivyError` cloneable [#1402](https://github.com/quickwit-oss/tantivy/pull/1402) (@PSeitz)
- Add support for phrase slop in query language [#1393](https://github.com/quickwit-oss/tantivy/pull/1393) (@saroh)
- Aggregation
- Add aggregation support for date type [#1693](https://github.com/quickwit-oss/tantivy/pull/1693)(@PSeitz)
- Add support for keyed parameter in range and histogram aggregations [#1424](https://github.com/quickwit-oss/tantivy/pull/1424) (@k-yomo)
- Add aggregation bucket limit [#1363](https://github.com/quickwit-oss/tantivy/pull/1363) (@PSeitz)
- Faster indexing
- [#1610](https://github.com/quickwit-oss/tantivy/pull/1610) (@PSeitz)
- [#1594](https://github.com/quickwit-oss/tantivy/pull/1594) (@PSeitz)
- [#1582](https://github.com/quickwit-oss/tantivy/pull/1582) (@PSeitz)
- [#1611](https://github.com/quickwit-oss/tantivy/pull/1611) (@PSeitz)
- Added a pre-configured stop word filter for various languages [#1666](https://github.com/quickwit-oss/tantivy/pull/1666) (@adamreichold)
- [#1610](https://github.com/quickwit-oss/tantivy/pull/1610 (@PSeitz)
- [#1594](https://github.com/quickwit-oss/tantivy/pull/1594 (@PSeitz)
- [#1582](https://github.com/quickwit-oss/tantivy/pull/1582 (@PSeitz)
- [#1611](https://github.com/quickwit-oss/tantivy/pull/1611 (@PSeitz)
Tantivy 0.18
================================
@@ -50,10 +44,6 @@ Tantivy 0.18
- Add terms aggregation (@PSeitz)
- Add support for zstd compression (@kryesh)
Tantivy 0.18.1
================================
- Hotfix: positions computation. #1629 (@fmassot, @fulmicoton, @PSeitz)
Tantivy 0.17
================================

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.19.0"
version = "0.18.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -14,18 +14,17 @@ edition = "2021"
rust-version = "1.62"
[dependencies]
oneshot = "0.1.5"
oneshot = "0.1.3"
base64 = "0.13.0"
byteorder = "1.4.3"
crc32fast = "1.3.2"
once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
aho-corasick = "0.7"
tantivy-fst = "0.4.0"
memmap2 = { version = "0.5.3", optional = true }
lz4_flex = { version = "0.9.2", default-features = false, features = ["checked-decode"], optional = true }
brotli = { version = "3.3.4", optional = true }
zstd = { version = "0.12", optional = true, default-features = false }
zstd = { version = "0.11", optional = true, default-features = false }
snap = { version = "1.0.5", optional = true }
tempfile = { version = "3.3.0", optional = true }
log = "0.4.16"
@@ -36,12 +35,17 @@ fs2 = { version = "0.4.3", optional = true }
levenshtein_automata = "0.2.1"
uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4"
tantivy-query-grammar = { version="0.18.0", path="./query-grammar" }
tantivy-bitpacker = { version="0.2", path="./bitpacker" }
common = { version = "0.3", path = "./common/", package = "tantivy-common" }
fastfield_codecs = { version="0.2", path="./fastfield_codecs", default-features = false }
ownedbytes = { version="0.3", path="./ownedbytes" }
stable_deref_trait = "1.2.0"
rust-stemmers = "1.2.0"
downcast-rs = "1.2.0"
bitpacking = { version = "0.8.4", default-features = false, features = ["bitpacker4x"] }
census = "0.4.0"
rustc-hash = "1.1.0"
fnv = "1.0.7"
thiserror = "1.0.30"
htmlescape = "0.3.1"
fail = "0.5.0"
@@ -57,12 +61,6 @@ ciborium = { version = "0.2", optional = true}
async-trait = "0.1.53"
arc-swap = "1.5.0"
tantivy-query-grammar = { version= "0.19.0", path="./query-grammar" }
tantivy-bitpacker = { version= "0.3", path="./bitpacker" }
common = { version= "0.4", path = "./common/", package = "tantivy-common" }
fastfield_codecs = { version= "0.3", path="./fastfield_codecs", default-features = false }
ownedbytes = { version= "0.4", path="./ownedbytes" }
[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"
@@ -72,10 +70,10 @@ maplit = "1.0.2"
matches = "0.1.9"
pretty_assertions = "1.2.1"
proptest = "1.0.0"
criterion = "0.4"
criterion = "0.3.5"
test-log = "0.2.10"
env_logger = "0.10.0"
pprof = { version = "0.11.0", features = ["flamegraph", "criterion"] }
env_logger = "0.9.0"
pprof = { version = "0.10.0", features = ["flamegraph", "criterion"] }
futures = "0.3.21"
[dev-dependencies.fail]
@@ -92,9 +90,8 @@ debug-assertions = true
overflow-checks = true
[features]
default = ["mmap", "stopwords", "lz4-compression"]
default = ["mmap", "lz4-compression" ]
mmap = ["fs2", "tempfile", "memmap2"]
stopwords = []
brotli-compression = ["brotli"]
lz4-compression = ["lz4_flex"]

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-bitpacker"
version = "0.3.0"
version = "0.2.0"
edition = "2021"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
@@ -8,8 +8,6 @@ categories = []
description = """Tantivy-sub crate: bitpacking"""
repository = "https://github.com/quickwit-oss/tantivy"
keywords = []
documentation = "https://docs.rs/tantivy-bitpacker/latest/tantivy_bitpacker"
homepage = "https://github.com/quickwit-oss/tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

View File

@@ -87,15 +87,15 @@ impl BitUnpacker {
}
#[inline]
pub fn get(&self, idx: u32, data: &[u8]) -> u64 {
pub fn get(&self, idx: u64, data: &[u8]) -> u64 {
if self.num_bits == 0 {
return 0u64;
}
let addr_in_bits = idx * self.num_bits as u32;
let addr_in_bits = idx * self.num_bits;
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
debug_assert!(
addr + 8 <= data.len() as u32,
addr + 8 <= data.len() as u64,
"The fast field field should have been padded with 7 bytes."
);
let bytes: [u8; 8] = (&data[(addr as usize)..(addr as usize) + 8])
@@ -130,7 +130,7 @@ mod test {
fn test_bitpacker_util(len: usize, num_bits: u8) {
let (bitunpacker, vals, data) = create_fastfield_bitpacker(len, num_bits);
for (i, val) in vals.iter().enumerate() {
assert_eq!(bitunpacker.get(i as u32, &data), *val);
assert_eq!(bitunpacker.get(i as u64, &data), *val);
}
}

View File

@@ -130,7 +130,7 @@ impl BlockedBitpacker {
let pos_in_block = idx % BLOCK_SIZE as usize;
if let Some(metadata) = self.offset_and_bits.get(metadata_pos) {
let unpacked = BitUnpacker::new(metadata.num_bits()).get(
pos_in_block as u32,
pos_in_block as u64,
&self.compressed_blocks[metadata.offset() as usize..],
);
unpacked + metadata.base_value()

View File

@@ -1,20 +1,16 @@
[package]
name = "tantivy-common"
version = "0.4.0"
version = "0.3.0"
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2021"
description = "common traits and utility functions used by multiple tantivy subcrates"
documentation = "https://docs.rs/tantivy_common/"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
byteorder = "1.4.3"
ownedbytes = { version= "0.4", path="../ownedbytes" }
ownedbytes = { version="0.3", path="../ownedbytes" }
[dev-dependencies]
proptest = "1.0.0"

View File

@@ -94,20 +94,6 @@ impl FixedSize for u32 {
const SIZE_IN_BYTES: usize = 4;
}
impl BinarySerializable for u16 {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
writer.write_u16::<Endianness>(*self)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<u16> {
reader.read_u16::<Endianness>()
}
}
impl FixedSize for u16 {
const SIZE_IN_BYTES: usize = 2;
}
impl BinarySerializable for u64 {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
writer.write_u64::<Endianness>(*self)

View File

@@ -157,7 +157,7 @@ fn vint_len(data: &[u8]) -> usize {
/// If the buffer does not start with a valid
/// vint payload
pub fn read_u32_vint(data: &mut &[u8]) -> u32 {
let (result, vlen) = read_u32_vint_no_advance(data);
let (result, vlen) = read_u32_vint_no_advance(*data);
*data = &data[vlen..];
result
}

View File

@@ -118,7 +118,7 @@ fn main() -> tantivy::Result<()> {
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let collector = AggregationCollector::from_aggs(agg_req_1, None);
let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();

View File

@@ -105,7 +105,7 @@ impl SegmentCollector for StatsSegmentCollector {
type Fruit = Option<Stats>;
fn collect(&mut self, doc: u32, _score: Score) {
let value = self.fast_field_reader.get_val(doc) as f64;
let value = self.fast_field_reader.get_val(doc as u64) as f64;
self.stats.count += 1;
self.stats.sum += value;
self.stats.squared_sum += value * value;

View File

@@ -51,7 +51,7 @@ impl Warmer for DynamicPriceColumn {
let product_id_reader = segment.fast_fields().u64(self.field)?;
let product_ids: Vec<ProductId> = segment
.doc_ids_alive()
.map(|doc| product_id_reader.get_val(doc))
.map(|doc| product_id_reader.get_val(doc as u64))
.collect();
let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
let mut price_vals: Vec<Price> = Vec::new();

View File

@@ -1,20 +1,17 @@
[package]
name = "fastfield_codecs"
version = "0.3.0"
version = "0.2.0"
authors = ["Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2021"
description = "Fast field codecs used by tantivy"
documentation = "https://docs.rs/fastfield_codecs/"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
common = { version = "0.4", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.3", path = "../bitpacker/" }
ownedbytes = { version = "0.4.0", path = "../ownedbytes" }
common = { version = "0.3", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version="0.2", path = "../bitpacker/" }
ownedbytes = { version = "0.3.0", path = "../ownedbytes" }
prettytable-rs = {version="0.9.0", optional= true}
rand = {version="0.8.3", optional= true}
fastdivide = "0.4"

View File

@@ -65,7 +65,7 @@ mod tests {
b.iter(|| {
let mut a = 0u64;
for _ in 0..n {
a = column.get_val(a as u32);
a = column.get_val(a as u64);
}
a
});
@@ -101,7 +101,7 @@ mod tests {
fn get_u128_column_from_data(data: &[u128]) -> Arc<dyn Column<u128>> {
let mut out = vec![];
let iter_gen = || data.iter().cloned();
serialize_u128(iter_gen, data.len() as u32, &mut out).unwrap();
serialize_u128(iter_gen, data.len() as u64, &mut out).unwrap();
let out = OwnedBytes::new(out);
open_u128::<u128>(out).unwrap()
}
@@ -111,15 +111,7 @@ mod tests {
let (major_item, _minor_item, data) = get_data_50percent_item();
let column = get_u128_column_from_data(&data);
b.iter(|| {
let mut positions = Vec::new();
column.get_docids_for_value_range(
major_item..=major_item,
0..data.len() as u32,
&mut positions,
);
positions
});
b.iter(|| column.get_between_vals(major_item..=major_item));
}
#[bench]
@@ -127,15 +119,7 @@ mod tests {
let (_major_item, minor_item, data) = get_data_50percent_item();
let column = get_u128_column_from_data(&data);
b.iter(|| {
let mut positions = Vec::new();
column.get_docids_for_value_range(
minor_item..=minor_item,
0..data.len() as u32,
&mut positions,
);
positions
});
b.iter(|| column.get_between_vals(minor_item..=minor_item));
}
#[bench]
@@ -143,11 +127,7 @@ mod tests {
let (_major_item, _minor_item, data) = get_data_50percent_item();
let column = get_u128_column_from_data(&data);
b.iter(|| {
let mut positions = Vec::new();
column.get_docids_for_value_range(0..=u128::MAX, 0..data.len() as u32, &mut positions);
positions
});
b.iter(|| column.get_between_vals(0..=u128::MAX));
}
#[bench]
@@ -157,7 +137,7 @@ mod tests {
b.iter(|| {
let mut a = 0u128;
for i in 0u64..column.num_vals() as u64 {
a += column.get_val(i as u32);
a += column.get_val(i);
}
a
});
@@ -171,7 +151,7 @@ mod tests {
let n = column.num_vals();
let mut a = 0u128;
for i in (0..n / 5).map(|val| val * 5) {
a += column.get_val(i);
a += column.get_val(i as u64);
}
a
});
@@ -196,9 +176,9 @@ mod tests {
let n = permutation.len();
let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation);
b.iter(|| {
let mut a = 0;
let mut a = 0u64;
for i in (0..n / 7).map(|val| val * 7) {
a += column.get_val(i as u32);
a += column.get_val(i as u64);
}
a
});
@@ -211,7 +191,7 @@ mod tests {
let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation);
b.iter(|| {
let mut a = 0u64;
for i in 0u32..n as u32 {
for i in 0u64..n as u64 {
a += column.get_val(i);
}
a
@@ -225,8 +205,8 @@ mod tests {
let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation);
b.iter(|| {
let mut a = 0u64;
for i in 0..n {
a += column.get_val(i as u32);
for i in 0..n as u64 {
a += column.get_val(i);
}
a
});

View File

@@ -17,7 +17,7 @@ pub struct BitpackedReader {
impl Column for BitpackedReader {
#[inline]
fn get_val(&self, doc: u32) -> u64 {
fn get_val(&self, doc: u64) -> u64 {
self.bit_unpacker.get(doc, &self.data)
}
#[inline]
@@ -30,7 +30,7 @@ impl Column for BitpackedReader {
self.normalized_header.max_value
}
#[inline]
fn num_vals(&self) -> u32 {
fn num_vals(&self) -> u64 {
self.normalized_header.num_vals
}
}

View File

@@ -36,7 +36,7 @@ impl BinarySerializable for Block {
}
}
fn compute_num_blocks(num_vals: u32) -> usize {
fn compute_num_blocks(num_vals: u64) -> usize {
(num_vals as usize + CHUNK_SIZE - 1) / CHUNK_SIZE
}
@@ -72,13 +72,13 @@ impl FastFieldCodec for BlockwiseLinearCodec {
// Estimate first_chunk and extrapolate
fn estimate(column: &dyn crate::Column) -> Option<f32> {
if column.num_vals() < 10 * CHUNK_SIZE as u32 {
if column.num_vals() < 10 * CHUNK_SIZE as u64 {
return None;
}
let mut first_chunk: Vec<u64> = column.iter().take(CHUNK_SIZE as usize).collect();
let line = Line::train(&VecColumn::from(&first_chunk));
for (i, buffer_val) in first_chunk.iter_mut().enumerate() {
let interpolated_val = line.eval(i as u32);
let interpolated_val = line.eval(i as u64);
*buffer_val = buffer_val.wrapping_sub(interpolated_val);
}
let estimated_bit_width = first_chunk
@@ -95,7 +95,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
};
let num_bits = estimated_bit_width as u64 * column.num_vals() as u64
// function metadata per block
+ metadata_per_block as u64 * (column.num_vals() as u64 / CHUNK_SIZE as u64);
+ metadata_per_block as u64 * (column.num_vals() / CHUNK_SIZE as u64);
let num_bits_uncompressed = 64 * column.num_vals();
Some(num_bits as f32 / num_bits_uncompressed as f32)
}
@@ -121,7 +121,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
assert!(!buffer.is_empty());
for (i, buffer_val) in buffer.iter_mut().enumerate() {
let interpolated_val = line.eval(i as u32);
let interpolated_val = line.eval(i as u64);
*buffer_val = buffer_val.wrapping_sub(interpolated_val);
}
let bit_width = buffer.iter().copied().map(compute_num_bits).max().unwrap();
@@ -161,9 +161,9 @@ pub struct BlockwiseLinearReader {
impl Column for BlockwiseLinearReader {
#[inline(always)]
fn get_val(&self, idx: u32) -> u64 {
let block_id = (idx / CHUNK_SIZE as u32) as usize;
let idx_within_block = idx % (CHUNK_SIZE as u32);
fn get_val(&self, idx: u64) -> u64 {
let block_id = (idx / CHUNK_SIZE as u64) as usize;
let idx_within_block = idx % (CHUNK_SIZE as u64);
let block = &self.blocks[block_id];
let interpoled_val: u64 = block.line.eval(idx_within_block);
let block_bytes = &self.data[block.data_start_offset..];
@@ -180,7 +180,7 @@ impl Column for BlockwiseLinearReader {
self.normalized_header.max_value
}
fn num_vals(&self) -> u32 {
fn num_vals(&self) -> u64 {
self.normalized_header.num_vals
}
}

View File

@@ -1,5 +1,5 @@
use std::marker::PhantomData;
use std::ops::{Range, RangeInclusive};
use std::ops::RangeInclusive;
use tantivy_bitpacker::minmax;
@@ -14,7 +14,7 @@ pub trait Column<T: PartialOrd = u64>: Send + Sync {
/// # Panics
///
/// May panic if `idx` is greater than the column length.
fn get_val(&self, idx: u32) -> T;
fn get_val(&self, idx: u64) -> T;
/// Fills an output buffer with the fast field values
/// associated with the `DocId` going from
@@ -27,28 +27,21 @@ pub trait Column<T: PartialOrd = u64>: Send + Sync {
#[inline]
fn get_range(&self, start: u64, output: &mut [T]) {
for (out, idx) in output.iter_mut().zip(start..) {
*out = self.get_val(idx as u32);
*out = self.get_val(idx);
}
}
/// Get the positions of values which are in the provided value range.
///
/// Note that position == docid for single value fast fields
/// Return the positions of values which are in the provided range.
#[inline]
fn get_docids_for_value_range(
&self,
value_range: RangeInclusive<T>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
let doc_id_range = doc_id_range.start..doc_id_range.end.min(self.num_vals());
for idx in doc_id_range.start..doc_id_range.end {
fn get_between_vals(&self, range: RangeInclusive<T>) -> Vec<u64> {
let mut vals = Vec::new();
for idx in 0..self.num_vals() {
let val = self.get_val(idx);
if value_range.contains(&val) {
positions.push(idx);
if range.contains(&val) {
vals.push(idx);
}
}
vals
}
/// Returns the minimum value for this fast field.
@@ -68,7 +61,7 @@ pub trait Column<T: PartialOrd = u64>: Send + Sync {
fn max_value(&self) -> T;
/// The number of values in the column.
fn num_vals(&self) -> u32;
fn num_vals(&self) -> u64;
/// Returns a iterator over the data
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = T> + 'a> {
@@ -84,7 +77,7 @@ pub struct VecColumn<'a, T = u64> {
}
impl<'a, C: Column<T>, T: Copy + PartialOrd> Column<T> for &'a C {
fn get_val(&self, idx: u32) -> T {
fn get_val(&self, idx: u64) -> T {
(*self).get_val(idx)
}
@@ -96,7 +89,7 @@ impl<'a, C: Column<T>, T: Copy + PartialOrd> Column<T> for &'a C {
(*self).max_value()
}
fn num_vals(&self) -> u32 {
fn num_vals(&self) -> u64 {
(*self).num_vals()
}
@@ -110,7 +103,7 @@ impl<'a, C: Column<T>, T: Copy + PartialOrd> Column<T> for &'a C {
}
impl<'a, T: Copy + PartialOrd + Send + Sync> Column<T> for VecColumn<'a, T> {
fn get_val(&self, position: u32) -> T {
fn get_val(&self, position: u64) -> T {
self.values[position as usize]
}
@@ -126,8 +119,8 @@ impl<'a, T: Copy + PartialOrd + Send + Sync> Column<T> for VecColumn<'a, T> {
self.max_value
}
fn num_vals(&self) -> u32 {
self.values.len() as u32
fn num_vals(&self) -> u64 {
self.values.len() as u64
}
fn get_range(&self, start: u64, output: &mut [T]) {
@@ -163,7 +156,7 @@ struct MonotonicMappingColumn<C, T, Input> {
/// monotonic_mapping.inverse(monotonic_mapping.mapping(el)) == el
///
/// The inverse of the mapping is required for:
/// `fn get_positions_for_value_range(&self, range: RangeInclusive<T>) -> Vec<u64> `
/// `fn get_between_vals(&self, range: RangeInclusive<T>) -> Vec<u64> `
/// The user provides the original value range and we need to monotonic map them in the same way the
/// serialization does before calling the underlying column.
///
@@ -195,7 +188,7 @@ where
Output: PartialOrd + Send + Sync + Clone,
{
#[inline]
fn get_val(&self, idx: u32) -> Output {
fn get_val(&self, idx: u64) -> Output {
let from_val = self.from_column.get_val(idx);
self.monotonic_mapping.mapping(from_val)
}
@@ -210,7 +203,7 @@ where
self.monotonic_mapping.mapping(from_max_value)
}
fn num_vals(&self) -> u32 {
fn num_vals(&self) -> u64 {
self.from_column.num_vals()
}
@@ -222,17 +215,10 @@ where
)
}
fn get_docids_for_value_range(
&self,
range: RangeInclusive<Output>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
self.from_column.get_docids_for_value_range(
fn get_between_vals(&self, range: RangeInclusive<Output>) -> Vec<u64> {
self.from_column.get_between_vals(
self.monotonic_mapping.inverse(range.start().clone())
..=self.monotonic_mapping.inverse(range.end().clone()),
doc_id_range,
positions,
)
}
@@ -240,7 +226,6 @@ where
// and we do not have any specialized implementation anyway.
}
/// Wraps an iterator into a `Column`.
pub struct IterColumn<T>(T);
impl<T> From<T> for IterColumn<T>
@@ -256,7 +241,7 @@ where
T: Iterator + Clone + ExactSizeIterator + Send + Sync,
T::Item: PartialOrd,
{
fn get_val(&self, idx: u32) -> T::Item {
fn get_val(&self, idx: u64) -> T::Item {
self.0.clone().nth(idx as usize).unwrap()
}
@@ -268,8 +253,8 @@ where
self.0.clone().last().unwrap()
}
fn num_vals(&self) -> u32 {
self.0.len() as u32
fn num_vals(&self) -> u64 {
self.0.len() as u64
}
fn iter(&self) -> Box<dyn Iterator<Item = T::Item> + '_> {

View File

@@ -57,7 +57,7 @@ fn num_bits(val: u128) -> u8 {
/// metadata.
pub fn get_compact_space(
values_deduped_sorted: &BTreeSet<u128>,
total_num_values: u32,
total_num_values: u64,
cost_per_blank: usize,
) -> CompactSpace {
let mut compact_space_builder = CompactSpaceBuilder::new();

View File

@@ -14,7 +14,7 @@ use std::{
cmp::Ordering,
collections::BTreeSet,
io::{self, Write},
ops::{Range, RangeInclusive},
ops::RangeInclusive,
};
use common::{BinarySerializable, CountingWriter, VInt, VIntU128};
@@ -165,13 +165,13 @@ pub struct IPCodecParams {
bit_unpacker: BitUnpacker,
min_value: u128,
max_value: u128,
num_vals: u32,
num_vals: u64,
num_bits: u8,
}
impl CompactSpaceCompressor {
/// Taking the vals as Vec may cost a lot of memory. It is used to sort the vals.
pub fn train_from(iter: impl Iterator<Item = u128>, num_vals: u32) -> Self {
pub fn train_from(iter: impl Iterator<Item = u128>, num_vals: u64) -> Self {
let mut values_sorted = BTreeSet::new();
values_sorted.extend(iter);
let total_num_values = num_vals;
@@ -200,7 +200,7 @@ impl CompactSpaceCompressor {
bit_unpacker: BitUnpacker::new(num_bits),
min_value,
max_value,
num_vals: total_num_values,
num_vals: total_num_values as u64,
num_bits,
},
}
@@ -267,7 +267,7 @@ impl BinarySerializable for IPCodecParams {
let _header_flags = u64::deserialize(reader)?;
let min_value = VIntU128::deserialize(reader)?.0;
let max_value = VIntU128::deserialize(reader)?.0;
let num_vals = VIntU128::deserialize(reader)?.0 as u32;
let num_vals = VIntU128::deserialize(reader)?.0 as u64;
let num_bits = u8::deserialize(reader)?;
let compact_space = CompactSpace::deserialize(reader)?;
@@ -284,7 +284,7 @@ impl BinarySerializable for IPCodecParams {
impl Column<u128> for CompactSpaceDecompressor {
#[inline]
fn get_val(&self, doc: u32) -> u128 {
fn get_val(&self, doc: u64) -> u128 {
self.get(doc)
}
@@ -296,7 +296,7 @@ impl Column<u128> for CompactSpaceDecompressor {
self.max_value()
}
fn num_vals(&self) -> u32 {
fn num_vals(&self) -> u64 {
self.params.num_vals
}
@@ -304,15 +304,8 @@ impl Column<u128> for CompactSpaceDecompressor {
fn iter(&self) -> Box<dyn Iterator<Item = u128> + '_> {
Box::new(self.iter())
}
#[inline]
fn get_docids_for_value_range(
&self,
value_range: RangeInclusive<u128>,
positions_range: Range<u32>,
positions: &mut Vec<u32>,
) {
self.get_positions_for_value_range(value_range, positions_range, positions)
fn get_between_vals(&self, range: RangeInclusive<u128>) -> Vec<u64> {
self.get_between_vals(range)
}
}
@@ -347,19 +340,12 @@ impl CompactSpaceDecompressor {
/// Comparing on compact space: Real dataset 1.08 GElements/s
///
/// Comparing on original space: Real dataset .06 GElements/s (not completely optimized)
#[inline]
pub fn get_positions_for_value_range(
&self,
value_range: RangeInclusive<u128>,
position_range: Range<u32>,
positions: &mut Vec<u32>,
) {
if value_range.start() > value_range.end() {
return;
pub fn get_between_vals(&self, range: RangeInclusive<u128>) -> Vec<u64> {
if range.start() > range.end() {
return Vec::new();
}
let position_range = position_range.start..position_range.end.min(self.num_vals());
let from_value = *value_range.start();
let to_value = *value_range.end();
let from_value = *range.start();
let to_value = *range.end();
assert!(to_value >= from_value);
let compact_from = self.u128_to_compact(from_value);
let compact_to = self.u128_to_compact(to_value);
@@ -367,7 +353,7 @@ impl CompactSpaceDecompressor {
// Quick return, if both ranges fall into the same non-mapped space, the range can't cover
// any values, so we can early exit
match (compact_to, compact_from) {
(Err(pos1), Err(pos2)) if pos1 == pos2 => return,
(Err(pos1), Err(pos2)) if pos1 == pos2 => return Vec::new(),
_ => {}
}
@@ -389,28 +375,27 @@ impl CompactSpaceDecompressor {
});
let range = compact_from..=compact_to;
let scan_num_docs = position_range.end - position_range.start;
let mut positions = Vec::new();
let step_size = 4;
let cutoff = position_range.start + scan_num_docs - scan_num_docs % step_size;
let cutoff = self.params.num_vals - self.params.num_vals % step_size;
let mut push_if_in_range = |idx, val| {
if range.contains(&val) {
positions.push(idx);
}
};
let get_val = |idx| self.params.bit_unpacker.get(idx, &self.data);
let get_val = |idx| self.params.bit_unpacker.get(idx as u64, &self.data);
// unrolled loop
for idx in (position_range.start..cutoff).step_by(step_size as usize) {
for idx in (0..cutoff).step_by(step_size as usize) {
let idx1 = idx;
let idx2 = idx + 1;
let idx3 = idx + 2;
let idx4 = idx + 3;
let val1 = get_val(idx1 as u32);
let val2 = get_val(idx2 as u32);
let val3 = get_val(idx3 as u32);
let val4 = get_val(idx4 as u32);
let val1 = get_val(idx1);
let val2 = get_val(idx2);
let val3 = get_val(idx3);
let val4 = get_val(idx4);
push_if_in_range(idx1, val1);
push_if_in_range(idx2, val2);
push_if_in_range(idx3, val3);
@@ -418,15 +403,17 @@ impl CompactSpaceDecompressor {
}
// handle rest
for idx in cutoff..position_range.end {
push_if_in_range(idx, get_val(idx as u32));
for idx in cutoff..self.params.num_vals {
push_if_in_range(idx, get_val(idx));
}
positions
}
#[inline]
fn iter_compact(&self) -> impl Iterator<Item = u64> + '_ {
(0..self.params.num_vals)
.map(move |idx| self.params.bit_unpacker.get(idx, &self.data) as u64)
.map(move |idx| self.params.bit_unpacker.get(idx as u64, &self.data) as u64)
}
#[inline]
@@ -438,7 +425,7 @@ impl CompactSpaceDecompressor {
}
#[inline]
pub fn get(&self, idx: u32) -> u128 {
pub fn get(&self, idx: u64) -> u128 {
let compact = self.params.bit_unpacker.get(idx, &self.data);
self.compact_to_u128(compact)
}
@@ -456,9 +443,6 @@ impl CompactSpaceDecompressor {
mod tests {
use super::*;
use crate::format_version::read_format_version;
use crate::null_index_footer::read_null_index_footer;
use crate::serialize::U128Header;
use crate::{open_u128, serialize_u128};
#[test]
@@ -468,7 +452,7 @@ mod tests {
]
.into_iter()
.collect();
let compact_space = get_compact_space(ips, ips.len() as u32, 11);
let compact_space = get_compact_space(ips, ips.len() as u64, 11);
let amplitude = compact_space.amplitude_compact_space();
assert_eq!(amplitude, 17);
assert_eq!(1, compact_space.u128_to_compact(2).unwrap());
@@ -499,30 +483,24 @@ mod tests {
#[test]
fn compact_space_amplitude_test() {
let ips = &[100000u128, 1000000].into_iter().collect();
let compact_space = get_compact_space(ips, ips.len() as u32, 1);
let compact_space = get_compact_space(ips, ips.len() as u64, 1);
let amplitude = compact_space.amplitude_compact_space();
assert_eq!(amplitude, 2);
}
fn test_all(mut data: OwnedBytes, expected: &[u128]) {
let _header = U128Header::deserialize(&mut data);
fn test_all(data: OwnedBytes, expected: &[u128]) {
let decompressor = CompactSpaceDecompressor::open(data).unwrap();
for (idx, expected_val) in expected.iter().cloned().enumerate() {
let val = decompressor.get(idx as u32);
let val = decompressor.get(idx as u64);
assert_eq!(val, expected_val);
let test_range = |range: RangeInclusive<u128>| {
let expected_positions = expected
.iter()
.positions(|val| range.contains(val))
.map(|pos| pos as u32)
.map(|pos| pos as u64)
.collect::<Vec<_>>();
let mut positions = Vec::new();
decompressor.get_positions_for_value_range(
range,
0..decompressor.num_vals(),
&mut positions,
);
let positions = decompressor.get_between_vals(range);
assert_eq!(positions, expected_positions);
};
@@ -537,16 +515,13 @@ mod tests {
let mut out = Vec::new();
serialize_u128(
|| u128_vals.iter().cloned(),
u128_vals.len() as u32,
u128_vals.len() as u64,
&mut out,
)
.unwrap();
let data = OwnedBytes::new(out);
let (data, _format_version) = read_format_version(data).unwrap();
let (data, _null_index_footer) = read_null_index_footer(data).unwrap();
test_all(data.clone(), u128_vals);
data
}
@@ -563,111 +538,26 @@ mod tests {
4_000_211_222u128,
333u128,
];
let mut data = test_aux_vals(vals);
let _header = U128Header::deserialize(&mut data);
let data = test_aux_vals(vals);
let decomp = CompactSpaceDecompressor::open(data).unwrap();
let complete_range = 0..vals.len() as u32;
for (pos, val) in vals.iter().enumerate() {
let val = *val as u128;
let pos = pos as u32;
let mut positions = Vec::new();
decomp.get_positions_for_value_range(val..=val, pos..pos + 1, &mut positions);
assert_eq!(positions, vec![pos]);
}
// handle docid range out of bounds
let positions = get_positions_for_value_range_helper(&decomp, 0..=1, 1..u32::MAX);
assert_eq!(positions, vec![]);
let positions =
get_positions_for_value_range_helper(&decomp, 0..=1, complete_range.clone());
let positions = decomp.get_between_vals(0..=1);
assert_eq!(positions, vec![0]);
let positions =
get_positions_for_value_range_helper(&decomp, 0..=2, complete_range.clone());
let positions = decomp.get_between_vals(0..=2);
assert_eq!(positions, vec![0]);
let positions =
get_positions_for_value_range_helper(&decomp, 0..=3, complete_range.clone());
let positions = decomp.get_between_vals(0..=3);
assert_eq!(positions, vec![0, 2]);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
99999u128..=99999u128,
complete_range.clone()
),
vec![3]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
99999u128..=100000u128,
complete_range.clone()
),
vec![3, 4]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
99998u128..=100000u128,
complete_range.clone()
),
vec![3, 4]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
99998u128..=99999u128,
complete_range.clone()
),
vec![3]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
99998u128..=99998u128,
complete_range.clone()
),
vec![]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
333u128..=333u128,
complete_range.clone()
),
vec![8]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
332u128..=333u128,
complete_range.clone()
),
vec![8]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
332u128..=334u128,
complete_range.clone()
),
vec![8]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
333u128..=334u128,
complete_range.clone()
),
vec![8]
);
assert_eq!(decomp.get_between_vals(99999u128..=99999u128), vec![3]);
assert_eq!(decomp.get_between_vals(99999u128..=100000u128), vec![3, 4]);
assert_eq!(decomp.get_between_vals(99998u128..=100000u128), vec![3, 4]);
assert_eq!(decomp.get_between_vals(99998u128..=99999u128), vec![3]);
assert_eq!(decomp.get_between_vals(99998u128..=99998u128), vec![]);
assert_eq!(decomp.get_between_vals(333u128..=333u128), vec![8]);
assert_eq!(decomp.get_between_vals(332u128..=333u128), vec![8]);
assert_eq!(decomp.get_between_vals(332u128..=334u128), vec![8]);
assert_eq!(decomp.get_between_vals(333u128..=334u128), vec![8]);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
4_000_211_221u128..=5_000_000_000u128,
complete_range.clone()
),
decomp.get_between_vals(4_000_211_221u128..=5_000_000_000u128),
vec![6, 7]
);
}
@@ -690,32 +580,14 @@ mod tests {
4_000_211_222u128,
333u128,
];
let mut data = test_aux_vals(vals);
let _header = U128Header::deserialize(&mut data);
let data = test_aux_vals(vals);
let decomp = CompactSpaceDecompressor::open(data).unwrap();
let complete_range = 0..vals.len() as u32;
assert_eq!(
get_positions_for_value_range_helper(&decomp, 0..=5, complete_range.clone()),
vec![]
);
assert_eq!(
get_positions_for_value_range_helper(&decomp, 0..=100, complete_range.clone()),
vec![0]
);
assert_eq!(
get_positions_for_value_range_helper(&decomp, 0..=105, complete_range.clone()),
vec![0]
);
}
fn get_positions_for_value_range_helper<C: Column<T> + ?Sized, T: PartialOrd>(
column: &C,
value_range: RangeInclusive<T>,
doc_id_range: Range<u32>,
) -> Vec<u32> {
let mut positions = Vec::new();
column.get_docids_for_value_range(value_range, doc_id_range, &mut positions);
positions
let positions = decomp.get_between_vals(0..=5);
assert_eq!(positions, vec![]);
let positions = decomp.get_between_vals(0..=100);
assert_eq!(positions, vec![0]);
let positions = decomp.get_between_vals(0..=105);
assert_eq!(positions, vec![0]);
}
#[test]
@@ -736,33 +608,13 @@ mod tests {
5_000_000_000,
];
let mut out = Vec::new();
serialize_u128(|| vals.iter().cloned(), vals.len() as u32, &mut out).unwrap();
serialize_u128(|| vals.iter().cloned(), vals.len() as u64, &mut out).unwrap();
let decomp = open_u128::<u128>(OwnedBytes::new(out)).unwrap();
let complete_range = 0..vals.len() as u32;
assert_eq!(
get_positions_for_value_range_helper(&*decomp, 199..=200, complete_range.clone()),
vec![0]
);
assert_eq!(
get_positions_for_value_range_helper(&*decomp, 199..=201, complete_range.clone()),
vec![0, 1]
);
assert_eq!(
get_positions_for_value_range_helper(&*decomp, 200..=200, complete_range.clone()),
vec![0]
);
assert_eq!(
get_positions_for_value_range_helper(
&*decomp,
1_000_000..=1_000_000,
complete_range.clone()
),
vec![11]
);
assert_eq!(decomp.get_between_vals(199..=200), vec![0]);
assert_eq!(decomp.get_between_vals(199..=201), vec![0, 1]);
assert_eq!(decomp.get_between_vals(200..=200), vec![0]);
assert_eq!(decomp.get_between_vals(1_000_000..=1_000_000), vec![11]);
}
#[test]

View File

@@ -1,39 +0,0 @@
use std::io;
use common::BinarySerializable;
use ownedbytes::OwnedBytes;
/// Magic number written right after the format version byte by `append_format_version`
/// and verified by `read_format_version`.
const MAGIC_NUMBER: u16 = 4335u16;
/// Format version written by `append_format_version`. It is also the highest version
/// `read_format_version` accepts.
const FASTFIELD_FORMAT_VERSION: u8 = 1;
pub(crate) fn append_format_version(output: &mut impl io::Write) -> io::Result<()> {
FASTFIELD_FORMAT_VERSION.serialize(output)?;
MAGIC_NUMBER.serialize(output)?;
Ok(())
}
/// Splits the format trailer off the end of `data` and validates it.
///
/// The trailer is laid out as written by `append_format_version`: one format version
/// byte followed by the two-byte magic number.
///
/// Returns the remaining bytes (trailer removed) together with the format version.
///
/// # Errors
///
/// Returns an `io::ErrorKind::InvalidData` error if `data` is too short to contain the
/// trailer, if the magic number does not match, or if the format version is newer than
/// `FASTFIELD_FORMAT_VERSION`.
pub(crate) fn read_format_version(data: OwnedBytes) -> io::Result<(OwnedBytes, u8)> {
    const MAGIC_NUMBER_NUM_BYTES: usize = 2;
    const FORMAT_VERSION_NUM_BYTES: usize = 1;
    // Check the length up front so truncated input is reported as an error.
    // NOTE(review): assumes `rsplit` cannot split off more bytes than are available
    // (it would otherwise panic) -- confirm against `OwnedBytes`.
    if data.len() < MAGIC_NUMBER_NUM_BYTES + FORMAT_VERSION_NUM_BYTES {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            format!(
                "fastfield data too short to contain the format version: {} bytes",
                data.len()
            ),
        ));
    }
    let (data, magic_number_bytes) = data.rsplit(MAGIC_NUMBER_NUM_BYTES);
    let magic_number = u16::deserialize(&mut magic_number_bytes.as_slice())?;
    if magic_number != MAGIC_NUMBER {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            format!("magic number mismatch {} != {}", magic_number, MAGIC_NUMBER),
        ));
    }
    let (data, format_version_bytes) = data.rsplit(FORMAT_VERSION_NUM_BYTES);
    let format_version = u8::deserialize(&mut format_version_bytes.as_slice())?;
    // Older versions are accepted; only versions newer than this reader are rejected.
    if format_version > FASTFIELD_FORMAT_VERSION {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            format!(
                "Unsupported fastfield format version: {}. Max supported version: {}",
                format_version, FASTFIELD_FORMAT_VERSION
            ),
        ));
    }
    Ok((data, format_version))
}

View File

@@ -20,36 +20,28 @@ use std::sync::Arc;
use common::BinarySerializable;
use compact_space::CompactSpaceDecompressor;
use format_version::read_format_version;
use monotonic_mapping::{
StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
StrictlyMonotonicMappingToInternalBaseval, StrictlyMonotonicMappingToInternalGCDBaseval,
};
use null_index_footer::read_null_index_footer;
use ownedbytes::OwnedBytes;
use serialize::{Header, U128Header};
use serialize::Header;
mod bitpacked;
mod blockwise_linear;
mod compact_space;
mod format_version;
mod line;
mod linear;
mod monotonic_mapping;
mod monotonic_mapping_u128;
mod null_index;
mod null_index_footer;
mod column;
mod gcd;
mod serialize;
/// TODO: remove when codec is used
pub use null_index::*;
use self::bitpacked::BitpackedCodec;
use self::blockwise_linear::BlockwiseLinearCodec;
pub use self::column::{monotonic_map_column, Column, IterColumn, VecColumn};
pub use self::column::{monotonic_map_column, Column, VecColumn};
use self::linear::LinearCodec;
pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128;
@@ -100,49 +92,10 @@ impl FastFieldCodecType {
}
}
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
#[repr(u8)]
/// Available codecs to use to encode the u128 (via [`MonotonicallyMappableToU128`]) converted data.
///
/// The explicit discriminant of each variant is the one-byte code returned by `to_code`
/// and written when the type is serialized.
pub enum U128FastFieldCodecType {
/// This codec takes a large number space (u128) and reduces it to a compact number space, by
/// removing the holes.
CompactSpace = 1,
}
impl BinarySerializable for U128FastFieldCodecType {
    /// Writes the codec type as its one-byte code (see `to_code`).
    fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
        self.to_code().serialize(wrt)
    }

    /// Reads a one-byte code and maps it back to a codec type.
    ///
    /// # Errors
    ///
    /// Returns an `io::ErrorKind::InvalidData` error naming the offending code when it
    /// does not correspond to any known codec, and propagates read errors.
    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
        let code = u8::deserialize(reader)?;
        Self::from_code(code).ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidData,
                // `format!` is needed for `{code}` to be interpolated; a plain string
                // literal would emit the placeholder verbatim.
                format!("Unknown code `{code}.`"),
            )
        })
    }
}
impl U128FastFieldCodecType {
    /// Returns the one-byte code of this codec type (its enum discriminant).
    pub(crate) fn to_code(self) -> u8 {
        self as u8
    }

    /// Maps a one-byte code back to a codec type, or `None` if the code is unknown.
    pub(crate) fn from_code(code: u8) -> Option<Self> {
        if code != Self::CompactSpace.to_code() {
            return None;
        }
        Some(Self::CompactSpace)
    }
}
/// Returns the correct codec reader wrapped in the `Arc` for the data.
pub fn open_u128<Item: MonotonicallyMappableToU128>(
bytes: OwnedBytes,
) -> io::Result<Arc<dyn Column<Item>>> {
let (bytes, _format_version) = read_format_version(bytes)?;
let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
let header = U128Header::deserialize(&mut bytes)?;
assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace);
let reader = CompactSpaceDecompressor::open(bytes)?;
let inverted: StrictlyMonotonicMappingInverter<StrictlyMonotonicMappingToInternal<Item>> =
StrictlyMonotonicMappingToInternal::<Item>::new().into();
@@ -150,9 +103,9 @@ pub fn open_u128<Item: MonotonicallyMappableToU128>(
}
/// Returns the correct codec reader wrapped in the `Arc` for the data.
pub fn open<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::Result<Arc<dyn Column<T>>> {
let (bytes, _format_version) = read_format_version(bytes)?;
let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
pub fn open<T: MonotonicallyMappableToU64>(
mut bytes: OwnedBytes,
) -> io::Result<Arc<dyn Column<T>>> {
let header = Header::deserialize(&mut bytes)?;
match header.codec_type {
FastFieldCodecType::Bitpacked => open_specific_codec::<BitpackedCodec, _>(bytes, &header),
@@ -246,9 +199,9 @@ mod tests {
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
let reader = crate::open::<u64>(OwnedBytes::new(out)).unwrap();
assert_eq!(reader.num_vals(), data.len() as u32);
assert_eq!(reader.num_vals(), data.len() as u64);
for (doc, orig_val) in data.iter().copied().enumerate() {
let val = reader.get_val(doc as u32);
let val = reader.get_val(doc as u64);
assert_eq!(
val, orig_val,
"val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data \
@@ -258,18 +211,13 @@ mod tests {
if !data.is_empty() {
let test_rand_idx = rand::thread_rng().gen_range(0..=data.len() - 1);
let expected_positions: Vec<u32> = data
let expected_positions: Vec<u64> = data
.iter()
.enumerate()
.filter(|(_, el)| **el == data[test_rand_idx])
.map(|(pos, _)| pos as u32)
.map(|(pos, _)| pos as u64)
.collect();
let mut positions = Vec::new();
reader.get_docids_for_value_range(
data[test_rand_idx]..=data[test_rand_idx],
0..data.len() as u32,
&mut positions,
);
let positions = reader.get_between_vals(data[test_rand_idx]..=data[test_rand_idx]);
assert_eq!(expected_positions, positions);
}
Some((estimation, actual_compression))
@@ -481,7 +429,7 @@ mod bench {
b.iter(|| {
let mut sum = 0u64;
for pos in value_iter() {
let val = col.get_val(pos as u32);
let val = col.get_val(pos as u64);
sum = sum.wrapping_add(val);
}
sum
@@ -493,7 +441,7 @@ mod bench {
b.iter(|| {
let mut sum = 0u64;
for pos in value_iter() {
let val = col.get_val(pos as u32);
let val = col.get_val(pos as u64);
sum = sum.wrapping_add(val);
}
sum

View File

@@ -1,5 +1,5 @@
use std::io;
use std::num::NonZeroU32;
use std::num::NonZeroU64;
use common::{BinarySerializable, VInt};
@@ -29,7 +29,7 @@ pub struct Line {
/// compute_slope(y0, y1)
/// = compute_slope(y0 + X % 2^64, y1 + X % 2^64)
/// `
fn compute_slope(y0: u64, y1: u64, num_vals: NonZeroU32) -> u64 {
fn compute_slope(y0: u64, y1: u64, num_vals: NonZeroU64) -> u64 {
let dy = y1.wrapping_sub(y0);
let sign = dy <= (1 << 63);
let abs_dy = if sign {
@@ -43,7 +43,7 @@ fn compute_slope(y0: u64, y1: u64, num_vals: NonZeroU32) -> u64 {
return 0u64;
}
let abs_slope = (abs_dy << 32) / num_vals.get() as u64;
let abs_slope = (abs_dy << 32) / num_vals.get();
if sign {
abs_slope
} else {
@@ -62,8 +62,8 @@ fn compute_slope(y0: u64, y1: u64, num_vals: NonZeroU32) -> u64 {
impl Line {
#[inline(always)]
pub fn eval(&self, x: u32) -> u64 {
let linear_part = ((x as u64).wrapping_mul(self.slope) >> 32) as i32 as u64;
pub fn eval(&self, x: u64) -> u64 {
let linear_part = (x.wrapping_mul(self.slope) >> 32) as i32 as u64;
self.intercept.wrapping_add(linear_part)
}
@@ -75,7 +75,7 @@ impl Line {
Self::train_from(
first_val,
last_val,
num_vals as u32,
num_vals,
sample_positions_and_values.iter().cloned(),
)
}
@@ -84,11 +84,11 @@ impl Line {
fn train_from(
first_val: u64,
last_val: u64,
num_vals: u32,
num_vals: u64,
positions_and_values: impl Iterator<Item = (u64, u64)>,
) -> Self {
// TODO replace with let else
let idx_last_val = if let Some(idx_last_val) = NonZeroU32::new(num_vals - 1) {
let idx_last_val = if let Some(idx_last_val) = NonZeroU64::new(num_vals - 1) {
idx_last_val
} else {
return Line::default();
@@ -129,7 +129,7 @@ impl Line {
};
let heuristic_shift = y0.wrapping_sub(MID_POINT);
line.intercept = positions_and_values
.map(|(pos, y)| y.wrapping_sub(line.eval(pos as u32)))
.map(|(pos, y)| y.wrapping_sub(line.eval(pos)))
.min_by_key(|&val| val.wrapping_sub(heuristic_shift))
.unwrap_or(0u64); //< Never happens.
line
@@ -199,7 +199,7 @@ mod tests {
let line = Line::train(&VecColumn::from(&ys));
ys.iter()
.enumerate()
.map(|(x, y)| y.wrapping_sub(line.eval(x as u32)))
.map(|(x, y)| y.wrapping_sub(line.eval(x as u64)))
.max()
}

View File

@@ -19,7 +19,7 @@ pub struct LinearReader {
impl Column for LinearReader {
#[inline]
fn get_val(&self, doc: u32) -> u64 {
fn get_val(&self, doc: u64) -> u64 {
let interpoled_val: u64 = self.linear_params.line.eval(doc);
let bitpacked_diff = self.linear_params.bit_unpacker.get(doc, &self.data);
interpoled_val.wrapping_add(bitpacked_diff)
@@ -37,7 +37,7 @@ impl Column for LinearReader {
}
#[inline]
fn num_vals(&self) -> u32 {
fn num_vals(&self) -> u64 {
self.header.num_vals
}
}
@@ -93,7 +93,7 @@ impl FastFieldCodec for LinearCodec {
.iter()
.enumerate()
.map(|(pos, actual_value)| {
let calculated_value = line.eval(pos as u32);
let calculated_value = line.eval(pos as u64);
actual_value.wrapping_sub(calculated_value)
})
.max()
@@ -108,7 +108,7 @@ impl FastFieldCodec for LinearCodec {
let mut bit_packer = BitPacker::new();
for (pos, actual_value) in column.iter().enumerate() {
let calculated_value = line.eval(pos as u32);
let calculated_value = line.eval(pos as u64);
let offset = actual_value.wrapping_sub(calculated_value);
bit_packer.write(offset, num_bits, write)?;
}
@@ -140,7 +140,7 @@ impl FastFieldCodec for LinearCodec {
let estimated_bit_width = sample_positions_and_values
.into_iter()
.map(|(pos, actual_value)| {
let interpolated_val = line.eval(pos as u32);
let interpolated_val = line.eval(pos as u64);
actual_value.wrapping_sub(interpolated_val)
})
.map(|diff| ((diff as f32 * 1.5) * 2.0) as u64)

View File

@@ -90,7 +90,7 @@ fn bench_ip() {
{
let mut data = vec![];
for dataset in dataset.chunks(500_000) {
serialize_u128(|| dataset.iter().cloned(), dataset.len() as u32, &mut data).unwrap();
serialize_u128(|| dataset.iter().cloned(), dataset.len() as u64, &mut data).unwrap();
}
let compression = data.len() as f64 / (dataset.len() * 16) as f64;
println!("Compression 50_000 chunks {:.4}", compression);
@@ -103,7 +103,7 @@ fn bench_ip() {
let mut data = vec![];
{
print_time!("creation");
serialize_u128(|| dataset.iter().cloned(), dataset.len() as u32, &mut data).unwrap();
serialize_u128(|| dataset.iter().cloned(), dataset.len() as u64, &mut data).unwrap();
}
let compression = data.len() as f64 / (dataset.len() * 16) as f64;
@@ -115,15 +115,9 @@ fn bench_ip() {
let decompressor = open_u128::<u128>(OwnedBytes::new(data)).unwrap();
// Sample some ranges
let mut doc_values = Vec::new();
for value in dataset.iter().take(1110).skip(1100).cloned() {
doc_values.clear();
print_time!("get range");
decompressor.get_docids_for_value_range(
value..=value,
0..decompressor.num_vals(),
&mut doc_values,
);
let doc_values = decompressor.get_between_vals(value..=value);
println!("{:?}", doc_values.len());
}
}

View File

@@ -1,453 +0,0 @@
use std::convert::TryInto;
use std::io::{self, Write};
use common::BinarySerializable;
use itertools::Itertools;
use ownedbytes::OwnedBytes;
use super::{get_bit_at, set_bit_at};
/// Dense null index: `data` holds the encoded blocks.
///
/// Each block consists of [u8; 12]. The first 8 bytes are a bitvec for 64 elements.
/// The last 4 bytes are the offset, the number of set bits so far.
///
/// When translating the original index to a dense index, the correct block can be computed
/// directly `orig_idx/64`. Inside the block the position is `orig_idx%64`.
///
/// When translating a dense index to the original index, we can use the offset to find the correct
/// block. Direct computation is not possible, but we can employ a linear or binary search.
pub struct DenseCodec {
// data consists of blocks of 64 bits.
//
// The format is &[(u64, u32)]
// u64 is the bitvec
// u32 is the offset of the block, the number of set bits so far.
//
// At the end one block is appended, to store the number of values in the index in offset.
data: OwnedBytes,
}
/// Number of elements (bits) covered by one block.
const ELEMENTS_PER_BLOCK: u32 = 64;
/// Size in bytes of the bitvec part of a serialized block (one `u64`).
const BLOCK_BITVEC_SIZE: usize = 8;
/// Size in bytes of the offset part of a serialized block (one `u32`).
const BLOCK_OFFSET_SIZE: usize = 4;
/// Total size in bytes of one serialized block: the bitvec followed by the offset.
const SERIALIZED_BLOCK_SIZE: usize = BLOCK_BITVEC_SIZE + BLOCK_OFFSET_SIZE;
/// Counts the set bits of `bitvec` at positions `0..=pos_in_bitvec`.
///
/// Position 63 is handled separately because the mask `(1 << 64) - 1` cannot be
/// computed with a 64-bit shift.
#[inline]
fn count_ones(bitvec: u64, pos_in_bitvec: u32) -> u32 {
    match pos_in_bitvec {
        // Every bit of the word is in range.
        63 => bitvec.count_ones(),
        pos => {
            // Keep only the bits at positions `0..=pos`.
            let low_bits_mask = (1u64 << (pos + 1)) - 1;
            (bitvec & low_bits_mask).count_ones()
        }
    }
}
/// Decoded form of one serialized block of the dense index.
#[derive(Clone, Copy)]
struct DenseIndexBlock {
// One bit per element of the block; a set bit means the element has a value.
bitvec: u64,
// Number of set bits in all preceding blocks.
offset: u32,
}
impl From<[u8; SERIALIZED_BLOCK_SIZE]> for DenseIndexBlock {
fn from(data: [u8; SERIALIZED_BLOCK_SIZE]) -> Self {
let bitvec = u64::from_le_bytes(data[..BLOCK_BITVEC_SIZE].try_into().unwrap());
let offset = u32::from_le_bytes(data[BLOCK_BITVEC_SIZE..].try_into().unwrap());
Self { bitvec, offset }
}
}
impl DenseCodec {
/// Open the DenseCodec from OwnedBytes
pub fn open(data: OwnedBytes) -> Self {
Self { data }
}
#[inline]
/// Check if value at position is not null.
pub fn exists(&self, idx: u32) -> bool {
let block_pos = idx / ELEMENTS_PER_BLOCK;
let bitvec = self.dense_index_block(block_pos).bitvec;
let pos_in_bitvec = idx % ELEMENTS_PER_BLOCK;
get_bit_at(bitvec, pos_in_bitvec)
}
#[inline]
fn dense_index_block(&self, block_pos: u32) -> DenseIndexBlock {
dense_index_block(&self.data, block_pos)
}
/// Return the number of non-null values in an index
pub fn num_non_null_vals(&self) -> u32 {
let last_block = (self.data.len() / SERIALIZED_BLOCK_SIZE) - 1;
self.dense_index_block(last_block as u32).offset
}
#[inline]
/// Translate from the original index to the codec index.
pub fn translate_to_codec_idx(&self, idx: u32) -> Option<u32> {
let block_pos = idx / ELEMENTS_PER_BLOCK;
let index_block = self.dense_index_block(block_pos);
let pos_in_block_bit_vec = idx % ELEMENTS_PER_BLOCK;
let ones_in_block = count_ones(index_block.bitvec, pos_in_block_bit_vec);
if get_bit_at(index_block.bitvec, pos_in_block_bit_vec) {
// -1 is ok, since idx does exist, so there's at least one
Some(index_block.offset + ones_in_block - 1)
} else {
None
}
}
/// Translate positions from the codec index to the original index.
///
/// # Panics
///
/// May panic if any `idx` is greater than the column length.
pub fn translate_codec_idx_to_original_idx<'a>(
&'a self,
iter: impl Iterator<Item = u32> + 'a,
) -> impl Iterator<Item = u32> + 'a {
let mut block_pos = 0u32;
iter.map(move |dense_idx| {
// update block_pos to limit search scope
block_pos = find_block(dense_idx, block_pos, &self.data);
let index_block = self.dense_index_block(block_pos);
// The next offset is higher than dense_idx and therefore:
// dense_idx <= offset + num_set_bits in block
let mut num_set_bits = 0;
for idx_in_bitvec in 0..ELEMENTS_PER_BLOCK {
if get_bit_at(index_block.bitvec, idx_in_bitvec) {
num_set_bits += 1;
}
if num_set_bits == (dense_idx - index_block.offset + 1) {
let orig_idx = block_pos * ELEMENTS_PER_BLOCK + idx_in_bitvec as u32;
return orig_idx;
}
}
panic!("Internal Error: Offset calculation in dense idx seems to be wrong.");
})
}
}
#[inline]
fn dense_index_block(data: &[u8], block_pos: u32) -> DenseIndexBlock {
let data_start_pos = block_pos as usize * SERIALIZED_BLOCK_SIZE;
let block_data: [u8; SERIALIZED_BLOCK_SIZE] = data[data_start_pos..][..SERIALIZED_BLOCK_SIZE]
.try_into()
.unwrap();
block_data.into()
}
/// Finds the position of the block containing `dense_idx`, scanning forward from
/// `block_pos`.
///
/// # Correctness
/// `dense_idx` needs to be smaller than the number of values in the index.
///
/// The offset of the last block is equal to the number of values in the index, so the
/// scan stops at the latest on that block.
#[inline]
fn find_block(dense_idx: u32, mut block_pos: u32, data: &[u8]) -> u32 {
    // Advance to the first block whose offset exceeds `dense_idx`; the block just
    // before it is the one containing `dense_idx`.
    while dense_index_block(data, block_pos).offset <= dense_idx {
        block_pos += 1;
    }
    block_pos - 1
}
/// Serializes a dense null index to `out`.
///
/// `iter` yields one `bool` per value: `true` if the value is set, `false` otherwise.
/// Values are grouped into blocks of `ELEMENTS_PER_BLOCK`. Each block is written as its
/// bitvec followed by the number of set bits in all preceding blocks.
/// A final sentinel block with an empty bitvec carries the total number of set bits.
///
/// # Errors
///
/// Propagates any I/O error raised while writing to `out`.
pub fn serialize_dense_codec(
    iter: impl Iterator<Item = bool>,
    mut out: impl Write,
) -> io::Result<()> {
    let mut num_set_bits_so_far: u32 = 0;
    let chunks = iter.chunks(ELEMENTS_PER_BLOCK as usize);
    for chunk in &chunks {
        let mut bitvec: u64 = 0;
        chunk
            .enumerate()
            .filter(|&(_, is_bit_set)| is_bit_set)
            .for_each(|(pos, _)| set_bit_at(&mut bitvec, pos as u64));
        bitvec.serialize(&mut out)?;
        // The offset is written before this block's own bits are added to it.
        num_set_bits_so_far.serialize(&mut out)?;
        num_set_bits_so_far += bitvec.count_ones();
    }
    // Sentinel block: empty bitvec, offset equal to the total number of set bits.
    0u64.serialize(&mut out)?;
    num_set_bits_so_far.serialize(&mut out)
}
// Tests for the dense null index: round trips between original and codec indices,
// `exists` lookups, and the `count_ones` helper.
#[cfg(test)]
mod tests {
use proptest::prelude::{any, prop, *};
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest};
use super::*;
// Strategy producing bit vectors. The number before each `=>` is the weight of that
// alternative: vectors generated with probability 1.0 of `true`, vectors generated with
// probability 0.0 of `true`, a single arbitrary bool, and short arbitrary vectors.
fn random_bitvec() -> BoxedStrategy<Vec<bool>> {
prop_oneof![
1 => prop::collection::vec(proptest::bool::weighted(1.0), 0..100),
1 => prop::collection::vec(proptest::bool::weighted(1.0), 0..64),
1 => prop::collection::vec(proptest::bool::weighted(0.0), 0..100),
1 => prop::collection::vec(proptest::bool::weighted(0.0), 0..64),
8 => vec![any::<bool>()],
2 => prop::collection::vec(any::<bool>(), 0..50),
]
.boxed()
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(500))]
// Concatenates three generated bit vectors and runs the full round-trip check on the result.
#[test]
fn test_with_random_bitvecs(bitvec1 in random_bitvec(), bitvec2 in random_bitvec(), bitvec3 in random_bitvec()) {
let mut bitvec = Vec::new();
bitvec.extend_from_slice(&bitvec1);
bitvec.extend_from_slice(&bitvec2);
bitvec.extend_from_slice(&bitvec3);
test_null_index(bitvec);
}
}
// 64 unset values followed by one set value: the first block is entirely unset and the
// only set bit falls into the second block.
#[test]
fn dense_codec_test_one_block_false() {
let mut iter = vec![false; 64];
iter.push(true);
test_null_index(iter);
}
// Serializes `data`, reopens it, and checks that codec-to-original translation,
// original-to-codec translation and `exists` all agree with `data`.
fn test_null_index(data: Vec<bool>) {
let mut out = vec![];
serialize_dense_codec(data.iter().cloned(), &mut out).unwrap();
let null_index = DenseCodec::open(OwnedBytes::new(out));
// Original positions of all set values, in increasing order.
let orig_idx_with_value: Vec<u32> = data
.iter()
.enumerate()
.filter(|(_pos, val)| **val)
.map(|(pos, _val)| pos as u32)
.collect();
assert_eq!(
null_index
.translate_codec_idx_to_original_idx(0..orig_idx_with_value.len() as u32)
.collect_vec(),
orig_idx_with_value
);
for (dense_idx, orig_idx) in orig_idx_with_value.iter().enumerate() {
assert_eq!(
null_index.translate_to_codec_idx(*orig_idx),
Some(dense_idx as u32)
);
}
for (pos, value) in data.iter().enumerate() {
assert_eq!(null_index.exists(pos as u32), *value);
}
}
// Codec indices 0 and 1 map back to original positions 0 and 2.
#[test]
fn dense_codec_test_translation() {
let mut out = vec![];
let iter = ([true, false, true, false]).iter().cloned();
serialize_dense_codec(iter, &mut out).unwrap();
let null_index = DenseCodec::open(OwnedBytes::new(out));
assert_eq!(
null_index
.translate_codec_idx_to_original_idx(0..2)
.collect_vec(),
vec![0, 2]
);
}
// Original positions 0 and 2 map to codec indices 0 and 1.
#[test]
fn dense_codec_translate() {
let mut out = vec![];
let iter = ([true, false, true, false]).iter().cloned();
serialize_dense_codec(iter, &mut out).unwrap();
let null_index = DenseCodec::open(OwnedBytes::new(out));
assert_eq!(null_index.translate_to_codec_idx(0), Some(0));
assert_eq!(null_index.translate_to_codec_idx(2), Some(1));
}
// `exists` on a four-element index contained in a single block.
#[test]
fn dense_codec_test_small() {
let mut out = vec![];
let iter = ([true, false, true, false]).iter().cloned();
serialize_dense_codec(iter, &mut out).unwrap();
let null_index = DenseCodec::open(OwnedBytes::new(out));
assert!(null_index.exists(0));
assert!(!null_index.exists(1));
assert!(null_index.exists(2));
assert!(!null_index.exists(3));
}
// `exists` across many blocks: 1000 unset values followed by 1001 set values
// (positions 1000..=2000). Position 2001 lies past the data but still inside the last
// data block, so it reads as unset.
#[test]
fn dense_codec_test_large() {
let mut docs = vec![];
docs.extend((0..1000).map(|_idx| false));
docs.extend((0..=1000).map(|_idx| true));
let iter = docs.iter().cloned();
let mut out = vec![];
serialize_dense_codec(iter, &mut out).unwrap();
let null_index = DenseCodec::open(OwnedBytes::new(out));
assert!(!null_index.exists(0));
assert!(!null_index.exists(100));
assert!(!null_index.exists(999));
assert!(null_index.exists(1000));
assert!(null_index.exists(1999));
assert!(null_index.exists(2000));
assert!(!null_index.exists(2001));
}
// `count_ones` counts the set bits up to and including the given position.
#[test]
fn test_count_ones() {
let mut block = 0;
set_bit_at(&mut block, 0);
set_bit_at(&mut block, 2);
assert_eq!(count_ones(block, 0), 1);
assert_eq!(count_ones(block, 1), 1);
assert_eq!(count_ones(block, 2), 2);
}
}
// Benchmarks for index translation in both directions. Only compiled for tests with the
// `unstable` feature enabled.
#[cfg(all(test, feature = "unstable"))]
mod bench {
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use test::Bencher;
use super::*;
// Number of values in every generated index.
const TOTAL_NUM_VALUES: u32 = 1_000_000;
// Builds a codec over `TOTAL_NUM_VALUES` bools, each `true` with probability
// `fill_ratio`. The RNG seed is fixed, so the generated data is the same on every run.
fn gen_bools(fill_ratio: f64) -> DenseCodec {
let mut out = Vec::new();
let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
let bools: Vec<_> = (0..TOTAL_NUM_VALUES)
.map(|_| rng.gen_bool(fill_ratio))
.collect();
serialize_dense_codec(bools.into_iter(), &mut out).unwrap();
let codec = DenseCodec::open(OwnedBytes::new(out));
codec
}
// Yields increasing positions: each step advances by a random amount in
// `1..=step_size`, and iteration ends once `end` is reached. `start` itself is never
// yielded, because the first step is applied before the first item is returned.
fn random_range_iterator(start: u32, end: u32, step_size: u32) -> impl Iterator<Item = u32> {
let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
let mut current = start;
std::iter::from_fn(move || {
current += rng.gen_range(1..step_size + 1);
if current >= end {
None
} else {
Some(current)
}
})
}
// Translates original indices to codec indices at random strides of at most
// `max_step_size` over the whole value range.
fn walk_over_data(codec: &DenseCodec, max_step_size: u32) -> Option<u32> {
walk_over_data_from_positions(
codec,
random_range_iterator(0, TOTAL_NUM_VALUES, max_step_size),
)
}
// Translates every position in `positions` and returns the first `Some` result.
// Presumably the value is returned so the translations are not optimized away -- TODO confirm.
fn walk_over_data_from_positions(
codec: &DenseCodec,
positions: impl Iterator<Item = u32>,
) -> Option<u32> {
let mut dense_idx: Option<u32> = None;
for idx in positions {
dense_idx = dense_idx.or(codec.translate_to_codec_idx(idx));
}
dense_idx
}
// Original -> codec index, 90% fill ratio, random strides of up to 100.
#[bench]
fn bench_dense_codec_translate_orig_to_dense_90percent_filled_random_stride(
bench: &mut Bencher,
) {
let codec = gen_bools(0.9f64);
bench.iter(|| walk_over_data(&codec, 100));
}
// Original -> codec index, 50% fill ratio, random strides of up to 100.
#[bench]
fn bench_dense_codec_translate_orig_to_dense_50percent_filled_random_stride(
bench: &mut Bencher,
) {
let codec = gen_bools(0.5f64);
bench.iter(|| walk_over_data(&codec, 100));
}
// Original -> codec index, 10% fill ratio, every position visited.
#[bench]
fn bench_dense_codec_translate_orig_to_dense_full_scan_10percent(bench: &mut Bencher) {
let codec = gen_bools(0.1f64);
bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
}
// Original -> codec index, 90% fill ratio, every position visited.
#[bench]
fn bench_dense_codec_translate_orig_to_dense_full_scan_90percent(bench: &mut Bencher) {
let codec = gen_bools(0.9f64);
bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
}
// Original -> codec index, 10% fill ratio, random strides of up to 100.
#[bench]
fn bench_dense_codec_translate_orig_to_dense_10percent_filled_random_stride(
bench: &mut Bencher,
) {
let codec = gen_bools(0.1f64);
bench.iter(|| walk_over_data(&codec, 100));
}
// Codec -> original index, 90% fill ratio, random strides of up to 50_000.
#[bench]
fn bench_dense_codec_translate_dense_to_orig_90percent_filled_random_stride_big_step(
bench: &mut Bencher,
) {
let codec = gen_bools(0.9f64);
let num_vals = codec.num_non_null_vals();
bench.iter(|| {
codec
.translate_codec_idx_to_original_idx(random_range_iterator(0, num_vals, 50_000))
.last()
});
}
// Codec -> original index, 90% fill ratio, random strides of up to 100.
#[bench]
fn bench_dense_codec_translate_dense_to_orig_90percent_filled_random_stride(
bench: &mut Bencher,
) {
let codec = gen_bools(0.9f64);
let num_vals = codec.num_non_null_vals();
bench.iter(|| {
codec
.translate_codec_idx_to_original_idx(random_range_iterator(0, num_vals, 100))
.last()
});
}
// Codec -> original index, 90% fill ratio, every codec index visited.
#[bench]
fn bench_dense_codec_translate_dense_to_orig_90percent_filled_full_scan(bench: &mut Bencher) {
let codec = gen_bools(0.9f64);
let num_vals = codec.num_non_null_vals();
bench.iter(|| {
codec
.translate_codec_idx_to_original_idx(0..num_vals)
.last()
});
}
}

View File

@@ -1,13 +0,0 @@
pub use dense::{serialize_dense_codec, DenseCodec};
mod dense;
/// Returns `true` if bit `n` of `input` is set (bit 0 is the least significant bit).
#[inline]
fn get_bit_at(input: u64, n: u32) -> bool {
    let mask: u64 = 1 << n;
    (input & mask) != 0
}
/// Sets bit `n` of `input` (bit 0 is the least significant bit); other bits are left untouched.
#[inline]
fn set_bit_at(input: &mut u64, n: u64) {
    let mask = 1u64 << n;
    *input |= mask;
}

View File

@@ -1,144 +0,0 @@
use std::io::{self, Write};
use std::ops::Range;
use common::{BinarySerializable, CountingWriter, VInt};
use ownedbytes::OwnedBytes;
/// Cardinality of a fast field as recorded in the null index footer.
///
/// The explicit discriminant of each variant is the one-byte code returned by `to_code`
/// and written when the value is serialized.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub(crate) enum FastFieldCardinality {
// The only cardinality currently defined.
Single = 1,
}
impl BinarySerializable for FastFieldCardinality {
    /// Writes the cardinality as its one-byte code (see `to_code`).
    fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
        self.to_code().serialize(wrt)
    }

    /// Reads a one-byte code and maps it back to a cardinality.
    ///
    /// # Errors
    ///
    /// Returns an `io::ErrorKind::InvalidData` error naming the offending code when it
    /// does not correspond to any known cardinality, and propagates read errors.
    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
        let code = u8::deserialize(reader)?;
        Self::from_code(code).ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidData,
                // `format!` is needed for `{code}` to be interpolated; a plain string
                // literal would emit the placeholder verbatim.
                format!("Unknown code `{code}.`"),
            )
        })
    }
}
impl FastFieldCardinality {
    /// Returns the one-byte code of this cardinality (its enum discriminant).
    pub(crate) fn to_code(self) -> u8 {
        self as u8
    }

    /// Maps a one-byte code back to a cardinality, or `None` if the code is unknown.
    pub(crate) fn from_code(code: u8) -> Option<Self> {
        if code != Self::Single.to_code() {
            return None;
        }
        Some(Self::Single)
    }
}
/// Codec used for the null index, as recorded in the null index footer.
///
/// The explicit discriminant of each variant is the one-byte code returned by `to_code`
/// and written when the value is serialized.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum NullIndexCodec {
// The only codec currently defined.
// NOTE(review): presumably means that every value is present, so no null index data is
// stored -- confirm.
Full = 1,
}
impl BinarySerializable for NullIndexCodec {
    /// Writes the codec as its one-byte code (see `to_code`).
    fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
        self.to_code().serialize(wrt)
    }

    /// Reads a one-byte code and maps it back to a null index codec.
    ///
    /// # Errors
    ///
    /// Returns an `io::ErrorKind::InvalidData` error naming the offending code when it
    /// does not correspond to any known codec, and propagates read errors.
    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
        let code = u8::deserialize(reader)?;
        Self::from_code(code).ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidData,
                // `format!` is needed for `{code}` to be interpolated; a plain string
                // literal would emit the placeholder verbatim.
                format!("Unknown code `{code}.`"),
            )
        })
    }
}
impl NullIndexCodec {
    /// Returns the one-byte code of this codec (its enum discriminant).
    pub(crate) fn to_code(self) -> u8 {
        self as u8
    }

    /// Maps a one-byte code back to a codec, or `None` if the code is unknown.
    pub(crate) fn from_code(code: u8) -> Option<Self> {
        if code != Self::Full.to_code() {
            return None;
        }
        Some(Self::Full)
    }
}
/// Footer describing the null index of a fast field.
///
/// Serialized as: cardinality code, null index codec code, then the start and the length
/// of `null_index_byte_range`, both as `VInt`.
#[derive(Debug, Clone, Eq, PartialEq)]
pub(crate) struct NullIndexFooter {
// Cardinality of the fast field.
pub(crate) cardinality: FastFieldCardinality,
// Codec used to encode the null index.
pub(crate) null_index_codec: NullIndexCodec,
// Unused for NullIndexCodec::Full
pub(crate) null_index_byte_range: Range<u64>,
}
impl BinarySerializable for NullIndexFooter {
    /// Writes the cardinality, the null index codec, and then the byte range as two
    /// `VInt`s: its start followed by its length.
    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        self.cardinality.serialize(writer)?;
        self.null_index_codec.serialize(writer)?;
        VInt(self.null_index_byte_range.start).serialize(writer)?;
        VInt(self.null_index_byte_range.end - self.null_index_byte_range.start)
            .serialize(writer)?;
        Ok(())
    }

    /// Reads a footer in the layout produced by `serialize`.
    ///
    /// # Errors
    ///
    /// Propagates read and decoding errors of the individual fields, and returns an
    /// `io::ErrorKind::InvalidData` error if the decoded start plus length of the byte
    /// range does not fit in a `u64`.
    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
        let cardinality = FastFieldCardinality::deserialize(reader)?;
        let null_index_codec = NullIndexCodec::deserialize(reader)?;
        let null_index_byte_range_start = VInt::deserialize(reader)?.0;
        let null_index_byte_range_len = VInt::deserialize(reader)?.0;
        // Both values come from the serialized data; reject corrupt input instead of
        // overflowing when computing the end of the range.
        let null_index_byte_range_end = null_index_byte_range_start
            .checked_add(null_index_byte_range_len)
            .ok_or_else(|| {
                io::Error::new(
                    io::ErrorKind::InvalidData,
                    "null index byte range end overflows u64",
                )
            })?;
        Ok(Self {
            cardinality,
            null_index_codec,
            null_index_byte_range: null_index_byte_range_start..null_index_byte_range_end,
        })
    }
}
/// Appends `null_index_footer` to `output`, followed by the length in bytes of the
/// serialized footer as a `u16`.
///
/// The trailing length lets `read_null_index_footer` locate the footer from the end of
/// the data.
///
/// # Errors
///
/// Propagates any I/O error raised while writing to `output`.
pub(crate) fn append_null_index_footer(
    output: &mut impl io::Write,
    null_index_footer: NullIndexFooter,
) -> io::Result<()> {
    let mut counting_writer = CountingWriter::wrap(output);
    null_index_footer.serialize(&mut counting_writer)?;
    // Only the footer payload has been written so far, so the count is its length.
    let footer_payload_len = counting_writer.written_bytes() as u16;
    footer_payload_len.serialize(&mut counting_writer)
}
/// Splits the null index footer off the end of `data` and decodes it.
///
/// The expected layout is the one written by `append_null_index_footer`: the serialized
/// footer followed by its length in bytes as a `u16`.
///
/// Returns the remaining bytes (footer and length removed) together with the footer.
///
/// # Errors
///
/// Returns an `io::ErrorKind::InvalidData` error if `data` is too short to contain the
/// footer length or the footer itself, and propagates errors from decoding the footer.
pub(crate) fn read_null_index_footer(
    data: OwnedBytes,
) -> io::Result<(OwnedBytes, NullIndexFooter)> {
    const FOOTER_LENGTH_NUM_BYTES: usize = 2;
    // Check lengths before each split so truncated input is reported as an error.
    // NOTE(review): assumes `rsplit` cannot split off more bytes than are available
    // (it would otherwise panic) -- confirm against `OwnedBytes`.
    if data.len() < FOOTER_LENGTH_NUM_BYTES {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            format!(
                "data too short to contain the null index footer length: {} bytes",
                data.len()
            ),
        ));
    }
    let (data, null_footer_length_bytes) = data.rsplit(FOOTER_LENGTH_NUM_BYTES);
    let footer_length = u16::deserialize(&mut null_footer_length_bytes.as_slice())? as usize;
    if data.len() < footer_length {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            format!(
                "null index footer length {} exceeds the {} remaining bytes",
                footer_length,
                data.len()
            ),
        ));
    }
    let (data, null_index_footer_bytes) = data.rsplit(footer_length);
    let null_index_footer = NullIndexFooter::deserialize(&mut null_index_footer_bytes.as_ref())?;
    Ok((data, null_index_footer))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A footer survives a serialize/deserialize round trip unchanged.
    #[test]
    fn null_index_footer_deser_test() {
        let original = NullIndexFooter {
            cardinality: FastFieldCardinality::Single,
            null_index_codec: NullIndexCodec::Full,
            null_index_byte_range: 100..120,
        };
        let mut buffer = Vec::new();
        original.serialize(&mut buffer).unwrap();
        let decoded = NullIndexFooter::deserialize(&mut buffer.as_slice()).unwrap();
        assert_eq!(original, decoded);
    }
}

View File

@@ -28,18 +28,14 @@ use ownedbytes::OwnedBytes;
use crate::bitpacked::BitpackedCodec;
use crate::blockwise_linear::BlockwiseLinearCodec;
use crate::compact_space::CompactSpaceCompressor;
use crate::format_version::append_format_version;
use crate::linear::LinearCodec;
use crate::monotonic_mapping::{
StrictlyMonotonicFn, StrictlyMonotonicMappingToInternal,
StrictlyMonotonicMappingToInternalGCDBaseval,
};
use crate::null_index_footer::{
append_null_index_footer, FastFieldCardinality, NullIndexCodec, NullIndexFooter,
};
use crate::{
monotonic_map_column, Column, FastFieldCodec, FastFieldCodecType, MonotonicallyMappableToU64,
U128FastFieldCodecType, VecColumn, ALL_CODEC_TYPES,
VecColumn, ALL_CODEC_TYPES,
};
/// The normalized header gives some parameters after applying the following
@@ -50,14 +46,14 @@ use crate::{
#[derive(Debug, Copy, Clone)]
pub struct NormalizedHeader {
/// The number of values in the underlying column.
pub num_vals: u32,
pub num_vals: u64,
/// The max value of the underlying column.
pub max_value: u64,
}
#[derive(Debug, Copy, Clone)]
pub(crate) struct Header {
pub num_vals: u32,
pub num_vals: u64,
pub min_value: u64,
pub max_value: u64,
pub gcd: Option<NonZeroU64>,
@@ -102,29 +98,6 @@ impl Header {
}
}
/// Header describing a serialized u128 column: how many values it holds and
/// which codec was used to encode them.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub(crate) struct U128Header {
/// The number of values in the column.
pub num_vals: u32,
/// The codec used to encode the values.
pub codec_type: U128FastFieldCodecType,
}
impl BinarySerializable for U128Header {
    /// Writes `num_vals` as a `VInt`, followed by the codec type.
    fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
        VInt(u64::from(self.num_vals)).serialize(writer)?;
        self.codec_type.serialize(writer)?;
        Ok(())
    }

    /// Reads a header in the layout produced by `serialize`.
    ///
    /// # Errors
    /// Returns `InvalidData` if the stored value count does not fit in a
    /// `u32`, and forwards any error raised by the underlying reads.
    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
        let raw_num_vals = VInt::deserialize(reader)?.0;
        // An `as u32` cast would silently wrap an out-of-range count coming
        // from corrupt data; report it instead.
        let num_vals =
            <u32 as std::convert::TryFrom<u64>>::try_from(raw_num_vals).map_err(|_| {
                io::Error::new(
                    io::ErrorKind::InvalidData,
                    format!(
                        "u128 header declares {} values, which does not fit in a u32",
                        raw_num_vals
                    ),
                )
            })?;
        let codec_type = U128FastFieldCodecType::deserialize(reader)?;
        Ok(U128Header {
            num_vals,
            codec_type,
        })
    }
}
pub fn normalize_column<C: Column>(
from_column: C,
min_value: u64,
@@ -137,7 +110,7 @@ pub fn normalize_column<C: Column>(
impl BinarySerializable for Header {
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
VInt(self.num_vals as u64).serialize(writer)?;
VInt(self.num_vals).serialize(writer)?;
VInt(self.min_value).serialize(writer)?;
VInt(self.max_value - self.min_value).serialize(writer)?;
if let Some(gcd) = self.gcd {
@@ -150,7 +123,7 @@ impl BinarySerializable for Header {
}
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let num_vals = VInt::deserialize(reader)?.0 as u32;
let num_vals = VInt::deserialize(reader)?.0;
let min_value = VInt::deserialize(reader)?.0;
let amplitude = VInt::deserialize(reader)?.0;
let max_value = min_value + amplitude;
@@ -191,25 +164,13 @@ pub fn estimate<T: MonotonicallyMappableToU64>(
/// Serializes u128 values with the compact space codec.
pub fn serialize_u128<F: Fn() -> I, I: Iterator<Item = u128>>(
iter_gen: F,
num_vals: u32,
num_vals: u64,
output: &mut impl io::Write,
) -> io::Result<()> {
let header = U128Header {
num_vals,
codec_type: U128FastFieldCodecType::CompactSpace,
};
header.serialize(output)?;
// TODO write header, to later support more codecs
let compressor = CompactSpaceCompressor::train_from(iter_gen(), num_vals);
compressor.compress_into(iter_gen(), output).unwrap();
let null_index_footer = NullIndexFooter {
cardinality: FastFieldCardinality::Single,
null_index_codec: NullIndexCodec::Full,
null_index_byte_range: 0..0,
};
append_null_index_footer(output, null_index_footer)?;
append_format_version(output)?;
Ok(())
}
@@ -233,15 +194,6 @@ pub fn serialize<T: MonotonicallyMappableToU64>(
let normalized_column = header.normalize_column(column);
assert_eq!(normalized_column.min_value(), 0u64);
serialize_given_codec(normalized_column, header.codec_type, output)?;
let null_index_footer = NullIndexFooter {
cardinality: FastFieldCardinality::Single,
null_index_codec: NullIndexCodec::Full,
null_index_byte_range: 0..0,
};
append_null_index_footer(output, null_index_footer)?;
append_format_version(output)?;
Ok(())
}
@@ -306,18 +258,6 @@ pub fn serialize_and_load<T: MonotonicallyMappableToU64 + Ord + Default>(
mod tests {
use super::*;
/// A `U128Header` must come back unchanged after a serialize/deserialize round trip.
#[test]
fn test_serialize_deserialize_u128_header() {
    let header = U128Header {
        num_vals: 11,
        codec_type: U128FastFieldCodecType::CompactSpace,
    };

    let mut buffer: Vec<u8> = Vec::new();
    header.serialize(&mut buffer).unwrap();

    let mut reader: &[u8] = buffer.as_slice();
    let decoded = U128Header::deserialize(&mut reader).unwrap();
    assert_eq!(decoded, header);
}
#[test]
fn test_serialize_deserialize() {
let original = [1u64, 5u64, 10u64];
@@ -331,7 +271,7 @@ mod tests {
let col = VecColumn::from(&[false, true][..]);
serialize(col, &mut buffer, &ALL_CODEC_TYPES).unwrap();
// 5 bytes of header, 1 byte of value, 7 bytes of padding.
assert_eq!(buffer.len(), 3 + 5 + 8 + 4 + 2);
assert_eq!(buffer.len(), 5 + 8);
}
#[test]
@@ -340,7 +280,7 @@ mod tests {
let col = VecColumn::from(&[true][..]);
serialize(col, &mut buffer, &ALL_CODEC_TYPES).unwrap();
// 5 bytes of header, 0 bytes of value, 7 bytes of padding.
assert_eq!(buffer.len(), 3 + 5 + 7 + 4 + 2);
assert_eq!(buffer.len(), 5 + 7);
}
#[test]
@@ -350,6 +290,6 @@ mod tests {
let col = VecColumn::from(&vals[..]);
serialize(col, &mut buffer, &[FastFieldCodecType::Bitpacked]).unwrap();
// Values are stored over 3 bits.
assert_eq!(buffer.len(), 3 + 7 + (3 * 80 / 8) + 7 + 4 + 2);
assert_eq!(buffer.len(), 7 + (3 * 80 / 8) + 7);
}
}

View File

@@ -1,14 +1,10 @@
[package]
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
name = "ownedbytes"
version = "0.4.0"
version = "0.3.0"
edition = "2021"
description = "Expose data as static slice"
license = "MIT"
documentation = "https://docs.rs/ownedbytes/"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]

View File

@@ -80,21 +80,6 @@ impl OwnedBytes {
(left, right)
}
/// Splits the OwnedBytes into two OwnedBytes `(left, right)`.
///
/// Right will hold `split_len` bytes.
///
/// This operation is cheap and does not require to copy any memory.
/// On the other hand, both `left` and `right` retain a handle over
/// the entire slice of memory. In other words, the memory will only
/// be released when both left and right are dropped.
///
/// # Panics
///
/// Panics if `split_len` is greater than the number of bytes held.
#[inline]
#[must_use]
pub fn rsplit(self, split_len: usize) -> (OwnedBytes, OwnedBytes) {
    let data_len = self.data.len();
    // A bare `data_len - split_len` would wrap around in release builds when
    // `split_len` is too large; fail with an explicit message instead.
    let left_len = match data_len.checked_sub(split_len) {
        Some(left_len) => left_len,
        None => panic!(
            "rsplit: split_len ({}) exceeds data length ({})",
            split_len, data_len
        ),
    };
    self.split(left_len)
}
/// Splits the right part of the `OwnedBytes` at the given offset.
///
/// `self` is truncated to `split_len`, left with the remaining bytes.

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-query-grammar"
version = "0.19.0"
version = "0.18.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]

View File

@@ -5,8 +5,7 @@ use combine::parser::range::{take_while, take_while1};
use combine::parser::repeat::escaped;
use combine::parser::Parser;
use combine::{
attempt, between, choice, eof, many, many1, one_of, optional, parser, satisfy, sep_by,
skip_many1, value,
attempt, choice, eof, many, many1, one_of, optional, parser, satisfy, skip_many1, value,
};
use once_cell::sync::Lazy;
use regex::Regex;
@@ -63,20 +62,6 @@ fn word<'a>() -> impl Parser<&'a str, Output = String> {
})
}
// Variant of `word` that accepts a wider set of characters, e.g. for range
// queries, where no field specifier is allowed.
//
// A relaxed word is a run of characters that are neither whitespace nor one of
// `{ } " [ ] ( )`; in addition, its first character may not be a backtick.
fn relaxed_word<'a>() -> impl Parser<&'a str, Output = String> {
    let is_body_char =
        |c: char| !c.is_whitespace() && !['{', '}', '"', '[', ']', '(', ')'].contains(&c);
    let is_head_char = |c: char| {
        !c.is_whitespace() && !['`', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
    };
    (satisfy(is_head_char), many(satisfy(is_body_char))).map(|(head, tail): (char, String)| {
        let mut word = String::with_capacity(head.len_utf8() + tail.len());
        word.push(head);
        word.push_str(&tail);
        word
    })
}
/// Parses a date time according to rfc3339
/// 2015-08-02T18:54:42+02
/// 2021-04-13T19:46:26.266051969+00:00
@@ -196,8 +181,8 @@ fn spaces1<'a>() -> impl Parser<&'a str, Output = ()> {
fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
let range_term_val = || {
attempt(date_time())
.or(word())
.or(negative_number())
.or(relaxed_word())
.or(char('*').with(value("*".to_string())))
};
@@ -265,17 +250,6 @@ fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
})
}
/// Parses a set of terms out of a stream.
///
/// Supports sets like `IN [val1 val2 val3]`, optionally preceded by a field
/// name.
fn set<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
    let bracketed_terms = between(char('['), char(']'), sep_by(term_val(), spaces()));
    let in_clause = string("IN").skip(spaces()).with(bracketed_terms);
    let maybe_field = optional(attempt(field_name().skip(spaces())));
    (maybe_field, in_clause).map(|(field, elements)| UserInputLeaf::Set { field, elements })
}
/// Wraps `expr` with the `Occur::MustNot` occurrence.
fn negate(expr: UserInputAst) -> UserInputAst {
expr.unary(Occur::MustNot)
}
@@ -290,7 +264,6 @@ fn leaf<'a>() -> impl Parser<&'a str, Output = UserInputAst> {
string("NOT").skip(spaces1()).with(leaf()).map(negate),
))
.or(attempt(range().map(UserInputAst::from)))
.or(attempt(set().map(UserInputAst::from)))
.or(literal().map(UserInputAst::from))
.parse_stream(input)
.into_result()
@@ -676,34 +649,6 @@ mod test {
.expect("Cannot parse date range")
.0;
assert_eq!(res6, expected_flexible_dates);
// IP Range Unbounded
let expected_weight = UserInputLeaf::Range {
field: Some("ip".to_string()),
lower: UserInputBound::Inclusive("::1".to_string()),
upper: UserInputBound::Unbounded,
};
let res1 = range()
.parse("ip: >=::1")
.expect("Cannot parse ip v6 format")
.0;
let res2 = range()
.parse("ip:[::1 TO *}")
.expect("Cannot parse ip v6 format")
.0;
assert_eq!(res1, expected_weight);
assert_eq!(res2, expected_weight);
// IP Range Bounded
let expected_weight = UserInputLeaf::Range {
field: Some("ip".to_string()),
lower: UserInputBound::Inclusive("::0.0.0.50".to_string()),
upper: UserInputBound::Exclusive("::0.0.0.52".to_string()),
};
let res1 = range()
.parse("ip:[::0.0.0.50 TO ::0.0.0.52}")
.expect("Cannot parse ip v6 format")
.0;
assert_eq!(res1, expected_weight);
}
#[test]
@@ -760,14 +705,6 @@ mod test {
test_parse_query_to_ast_helper("+(a b) +d", "(+(*\"a\" *\"b\") +\"d\")");
}
/// Checks the debug rendering of `IN [...]` set queries, with and without a
/// field name, including the empty set.
#[test]
fn test_parse_test_query_set() {
    let cases = [
        ("abc: IN [a b c]", r#""abc": IN ["a" "b" "c"]"#),
        ("abc: IN [1]", r#""abc": IN ["1"]"#),
        ("abc: IN []", r#""abc": IN []"#),
        ("IN [1 2]", r#"IN ["1" "2"]"#),
    ];
    for &(query, expected_ast) in cases.iter() {
        test_parse_query_to_ast_helper(query, expected_ast);
    }
}
#[test]
fn test_parse_test_query_other() {
test_parse_query_to_ast_helper("(+a +b) d", "(*(+\"a\" +\"b\") *\"d\")");

View File

@@ -12,10 +12,6 @@ pub enum UserInputLeaf {
lower: UserInputBound,
upper: UserInputBound,
},
Set {
field: Option<String>,
elements: Vec<String>,
},
}
impl Debug for UserInputLeaf {
@@ -35,19 +31,6 @@ impl Debug for UserInputLeaf {
upper.display_upper(formatter)?;
Ok(())
}
UserInputLeaf::Set { field, elements } => {
if let Some(ref field) = field {
write!(formatter, "\"{}\": ", field)?;
}
write!(formatter, "IN [")?;
for (i, element) in elements.iter().enumerate() {
if i != 0 {
write!(formatter, " ")?;
}
write!(formatter, "\"{}\"", element)?;
}
write!(formatter, "]")
}
UserInputLeaf::All => write!(formatter, "*"),
}
}

View File

@@ -11,7 +11,7 @@ use super::bucket::{HistogramAggregation, RangeAggregation, TermsAggregation};
use super::metric::{AverageAggregation, StatsAggregation};
use super::segment_agg_result::BucketCount;
use super::VecWithNames;
use crate::fastfield::{type_and_cardinality, MultiValuedFastFieldReader};
use crate::fastfield::{type_and_cardinality, FastType, MultiValuedFastFieldReader};
use crate::schema::{Cardinality, Type};
use crate::{InvertedIndexReader, SegmentReader, TantivyError};
@@ -194,7 +194,13 @@ fn get_ff_reader_and_validate(
.ok_or_else(|| TantivyError::FieldNotFound(field_name.to_string()))?;
let field_type = reader.schema().get_field_entry(field).field_type();
if let Some((_ff_type, field_cardinality)) = type_and_cardinality(field_type) {
if let Some((ff_type, field_cardinality)) = type_and_cardinality(field_type) {
if ff_type == FastType::Date {
return Err(TantivyError::InvalidArgument(
"Unsupported field type date in aggregation".to_string(),
));
}
if cardinality != field_cardinality {
return Err(TantivyError::InvalidArgument(format!(
"Invalid field cardinality on field {} expected {:?}, but got {:?}",

View File

@@ -4,7 +4,9 @@
//! intermediate average results, which is the sum and the number of values. The actual average is
//! calculated on the step from intermediate to final aggregation result tree.
use rustc_hash::FxHashMap;
use std::collections::HashMap;
use fnv::FnvHashMap;
use serde::{Deserialize, Serialize};
use super::agg_req::BucketAggregationInternal;
@@ -12,12 +14,11 @@ use super::bucket::GetDocCount;
use super::intermediate_agg_result::{IntermediateBucketResult, IntermediateMetricResult};
use super::metric::{SingleMetricResult, Stats};
use super::Key;
use crate::schema::Schema;
use crate::TantivyError;
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
/// The final aggregation result.
pub struct AggregationResults(pub FxHashMap<String, AggregationResult>);
pub struct AggregationResults(pub HashMap<String, AggregationResult>);
impl AggregationResults {
pub(crate) fn get_value_from_aggregation(
@@ -130,12 +131,9 @@ pub enum BucketResult {
}
impl BucketResult {
pub(crate) fn empty_from_req(
req: &BucketAggregationInternal,
schema: &Schema,
) -> crate::Result<Self> {
pub(crate) fn empty_from_req(req: &BucketAggregationInternal) -> crate::Result<Self> {
let empty_bucket = IntermediateBucketResult::empty_from_req(&req.bucket_agg);
empty_bucket.into_final_bucket_result(req, schema)
empty_bucket.into_final_bucket_result(req)
}
}
@@ -147,7 +145,7 @@ pub enum BucketEntries<T> {
/// Vector format bucket entries
Vec(Vec<T>),
/// HashMap format bucket entries
HashMap(FxHashMap<String, T>),
HashMap(FnvHashMap<String, T>),
}
/// This is the default entry for a bucket, which contains a key, count, and optionally
@@ -178,9 +176,6 @@ pub enum BucketEntries<T> {
/// ```
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct BucketEntry {
#[serde(skip_serializing_if = "Option::is_none")]
/// The string representation of the bucket.
pub key_as_string: Option<String>,
/// The identifier of the bucket.
pub key: Key,
/// Number of documents in the bucket.
@@ -245,10 +240,4 @@ pub struct RangeBucketEntry {
/// The to range of the bucket. Equals `f64::MAX` when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub to: Option<f64>,
/// The optional string representation for the `from` range.
#[serde(skip_serializing_if = "Option::is_none")]
pub from_as_string: Option<String>,
/// The optional string representation for the `to` range.
#[serde(skip_serializing_if = "Option::is_none")]
pub to_as_string: Option<String>,
}

View File

@@ -10,12 +10,12 @@ use crate::aggregation::agg_req_with_accessor::{
AggregationsWithAccessor, BucketAggregationWithAccessor,
};
use crate::aggregation::agg_result::BucketEntry;
use crate::aggregation::f64_from_fastfield_u64;
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResults, IntermediateBucketResult, IntermediateHistogramBucketEntry,
};
use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector;
use crate::aggregation::{f64_from_fastfield_u64, format_date};
use crate::schema::{Schema, Type};
use crate::schema::Type;
use crate::{DocId, TantivyError};
/// Histogram is a bucket aggregation, where buckets are created dynamically for given `interval`.
@@ -331,10 +331,10 @@ impl SegmentHistogramCollector {
.expect("unexpected fast field cardinatility");
let mut iter = doc.chunks_exact(4);
for docs in iter.by_ref() {
let val0 = self.f64_from_fastfield_u64(accessor.get_val(docs[0]));
let val1 = self.f64_from_fastfield_u64(accessor.get_val(docs[1]));
let val2 = self.f64_from_fastfield_u64(accessor.get_val(docs[2]));
let val3 = self.f64_from_fastfield_u64(accessor.get_val(docs[3]));
let val0 = self.f64_from_fastfield_u64(accessor.get_val(docs[0] as u64));
let val1 = self.f64_from_fastfield_u64(accessor.get_val(docs[1] as u64));
let val2 = self.f64_from_fastfield_u64(accessor.get_val(docs[2] as u64));
let val3 = self.f64_from_fastfield_u64(accessor.get_val(docs[3] as u64));
let bucket_pos0 = get_bucket_num(val0);
let bucket_pos1 = get_bucket_num(val1);
@@ -371,7 +371,7 @@ impl SegmentHistogramCollector {
)?;
}
for &doc in iter.remainder() {
let val = f64_from_fastfield_u64(accessor.get_val(doc), &self.field_type);
let val = f64_from_fastfield_u64(accessor.get_val(doc as u64), &self.field_type);
if !bounds.contains(val) {
continue;
}
@@ -451,7 +451,6 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
buckets: Vec<IntermediateHistogramBucketEntry>,
histogram_req: &HistogramAggregation,
sub_aggregation: &AggregationsInternal,
schema: &Schema,
) -> crate::Result<Vec<BucketEntry>> {
// Generate the full list of buckets without gaps.
//
@@ -492,9 +491,7 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
sub_aggregation: empty_sub_aggregation.clone(),
},
})
.map(|intermediate_bucket| {
intermediate_bucket.into_final_bucket_entry(sub_aggregation, schema)
})
.map(|intermediate_bucket| intermediate_bucket.into_final_bucket_entry(sub_aggregation))
.collect::<crate::Result<Vec<_>>>()
}
@@ -503,43 +500,20 @@ pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
buckets: Vec<IntermediateHistogramBucketEntry>,
histogram_req: &HistogramAggregation,
sub_aggregation: &AggregationsInternal,
schema: &Schema,
) -> crate::Result<Vec<BucketEntry>> {
let mut buckets = if histogram_req.min_doc_count() == 0 {
if histogram_req.min_doc_count() == 0 {
// With min_doc_count == 0, we may need to add buckets, so that there are no
// gaps, since intermediate result does not contain empty buckets (filtered to
// reduce serialization size).
intermediate_buckets_to_final_buckets_fill_gaps(
buckets,
histogram_req,
sub_aggregation,
schema,
)?
intermediate_buckets_to_final_buckets_fill_gaps(buckets, histogram_req, sub_aggregation)
} else {
buckets
.into_iter()
.filter(|histogram_bucket| histogram_bucket.doc_count >= histogram_req.min_doc_count())
.map(|histogram_bucket| {
histogram_bucket.into_final_bucket_entry(sub_aggregation, schema)
})
.collect::<crate::Result<Vec<_>>>()?
};
// If we have a date type on the histogram buckets, we add the `key_as_string` field as rfc339
let field = schema
.get_field(&histogram_req.field)
.ok_or_else(|| TantivyError::FieldNotFound(histogram_req.field.to_string()))?;
if schema.get_field_entry(field).field_type().is_date() {
for bucket in buckets.iter_mut() {
if let crate::aggregation::Key::F64(val) = bucket.key {
let key_as_string = format_date(val as i64)?;
bucket.key_as_string = Some(key_as_string);
}
}
.map(|histogram_bucket| histogram_bucket.into_final_bucket_entry(sub_aggregation))
.collect::<crate::Result<Vec<_>>>()
}
Ok(buckets)
}
/// Applies req extended_bounds/hard_bounds on the min_max value
@@ -1398,63 +1372,6 @@ mod tests {
Ok(())
}
/// Runs the date histogram checks with `merge_segments` set to `true`.
#[test]
fn histogram_date_test_single_segment() -> crate::Result<()> {
histogram_date_test_with_opt(true)
}
/// Runs the date histogram checks with `merge_segments` set to `false`.
#[test]
fn histogram_date_test_multi_segment() -> crate::Result<()> {
histogram_date_test_with_opt(false)
}
/// Runs a one-day-interval histogram aggregation over the `date` field and
/// checks the numeric bucket keys, their `key_as_string` rendering and the
/// document counts in the JSON-serialized result.
///
/// `merge_segments` is forwarded to `get_test_index_2_segments`.
fn histogram_date_test_with_opt(merge_segments: bool) -> crate::Result<()> {
let index = get_test_index_2_segments(merge_segments)?;
let agg_req: Aggregations = vec![(
"histogram".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Histogram(HistogramAggregation {
field: "date".to_string(),
interval: 86400000000.0, // one day in microseconds
..Default::default()
}),
sub_aggregation: Default::default(),
}),
)]
.into_iter()
.collect();
let agg_res = exec_request(agg_req, &index)?;
// Round-trip through JSON so the assertions check the serialized output.
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
// Each numeric key is expected to render as midnight UTC of consecutive days.
assert_eq!(res["histogram"]["buckets"][0]["key"], 1546300800000000.0);
assert_eq!(
res["histogram"]["buckets"][0]["key_as_string"],
"2019-01-01T00:00:00Z"
);
assert_eq!(res["histogram"]["buckets"][0]["doc_count"], 1);
assert_eq!(res["histogram"]["buckets"][1]["key"], 1546387200000000.0);
assert_eq!(
res["histogram"]["buckets"][1]["key_as_string"],
"2019-01-02T00:00:00Z"
);
assert_eq!(res["histogram"]["buckets"][1]["doc_count"], 5);
assert_eq!(res["histogram"]["buckets"][2]["key"], 1546473600000000.0);
assert_eq!(
res["histogram"]["buckets"][2]["key_as_string"],
"2019-01-03T00:00:00Z"
);
// There must be no fourth bucket.
assert_eq!(res["histogram"]["buckets"][3], Value::Null);
Ok(())
}
#[test]
fn histogram_invalid_request() -> crate::Result<()> {
let index = get_test_index_2_segments(true)?;

View File

@@ -1,8 +1,7 @@
use std::fmt::Debug;
use std::ops::Range;
use fastfield_codecs::MonotonicallyMappableToU64;
use rustc_hash::FxHashMap;
use fnv::FnvHashMap;
use serde::{Deserialize, Serialize};
use crate::aggregation::agg_req_with_accessor::{
@@ -12,9 +11,7 @@ use crate::aggregation::intermediate_agg_result::{
IntermediateBucketResult, IntermediateRangeBucketEntry, IntermediateRangeBucketResult,
};
use crate::aggregation::segment_agg_result::{BucketCount, SegmentAggregationResultsCollector};
use crate::aggregation::{
f64_from_fastfield_u64, f64_to_fastfield_u64, format_date, Key, SerializedKey,
};
use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64, Key, SerializedKey};
use crate::schema::Type;
use crate::{DocId, TantivyError};
@@ -179,12 +176,12 @@ impl SegmentRangeCollector {
) -> crate::Result<IntermediateBucketResult> {
let field_type = self.field_type;
let buckets: FxHashMap<SerializedKey, IntermediateRangeBucketEntry> = self
let buckets: FnvHashMap<SerializedKey, IntermediateRangeBucketEntry> = self
.buckets
.into_iter()
.map(move |range_bucket| {
Ok((
range_to_string(&range_bucket.range, &field_type)?,
range_to_string(&range_bucket.range, &field_type),
range_bucket
.bucket
.into_intermediate_bucket_entry(&agg_with_accessor.sub_aggregation)?,
@@ -212,8 +209,8 @@ impl SegmentRangeCollector {
let key = range
.key
.clone()
.map(|key| Ok(Key::Str(key)))
.unwrap_or_else(|| range_to_key(&range.range, &field_type))?;
.map(Key::Str)
.unwrap_or_else(|| range_to_key(&range.range, &field_type));
let to = if range.range.end == u64::MAX {
None
} else {
@@ -231,7 +228,6 @@ impl SegmentRangeCollector {
sub_aggregation,
)?)
};
Ok(SegmentRangeAndBucketEntry {
range: range.range.clone(),
bucket: SegmentRangeBucketEntry {
@@ -267,10 +263,10 @@ impl SegmentRangeCollector {
.as_single()
.expect("unexpected fast field cardinality");
for docs in iter.by_ref() {
let val1 = accessor.get_val(docs[0]);
let val2 = accessor.get_val(docs[1]);
let val3 = accessor.get_val(docs[2]);
let val4 = accessor.get_val(docs[3]);
let val1 = accessor.get_val(docs[0] as u64);
let val2 = accessor.get_val(docs[1] as u64);
let val3 = accessor.get_val(docs[2] as u64);
let val4 = accessor.get_val(docs[3] as u64);
let bucket_pos1 = self.get_bucket_pos(val1);
let bucket_pos2 = self.get_bucket_pos(val2);
let bucket_pos3 = self.get_bucket_pos(val3);
@@ -282,7 +278,7 @@ impl SegmentRangeCollector {
self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation)?;
}
for &doc in iter.remainder() {
let val = accessor.get_val(doc);
let val = accessor.get_val(doc as u64);
let bucket_pos = self.get_bucket_pos(val);
self.increment_bucket(bucket_pos, doc, &bucket_with_accessor.sub_aggregation)?;
}
@@ -406,45 +402,34 @@ fn extend_validate_ranges(
Ok(converted_buckets)
}
pub(crate) fn range_to_string(range: &Range<u64>, field_type: &Type) -> crate::Result<String> {
pub(crate) fn range_to_string(range: &Range<u64>, field_type: &Type) -> String {
// is_start is there for malformed requests, e.g. if the user passes the range u64::MIN..0.0,
// it should be rendered as "*-0" and not "*-*"
let to_str = |val: u64, is_start: bool| {
if (is_start && val == u64::MIN) || (!is_start && val == u64::MAX) {
Ok("*".to_string())
} else if *field_type == Type::Date {
let val = i64::from_u64(val);
format_date(val)
"*".to_string()
} else {
Ok(f64_from_fastfield_u64(val, field_type).to_string())
f64_from_fastfield_u64(val, field_type).to_string()
}
};
Ok(format!(
"{}-{}",
to_str(range.start, true)?,
to_str(range.end, false)?
))
format!("{}-{}", to_str(range.start, true), to_str(range.end, false))
}
pub(crate) fn range_to_key(range: &Range<u64>, field_type: &Type) -> crate::Result<Key> {
Ok(Key::Str(range_to_string(range, field_type)?))
pub(crate) fn range_to_key(range: &Range<u64>, field_type: &Type) -> Key {
Key::Str(range_to_string(range, field_type))
}
#[cfg(test)]
mod tests {
use fastfield_codecs::MonotonicallyMappableToU64;
use serde_json::Value;
use super::*;
use crate::aggregation::agg_req::{
Aggregation, Aggregations, BucketAggregation, BucketAggregationType,
};
use crate::aggregation::tests::{
exec_request, exec_request_with_query, get_test_index_2_segments,
get_test_index_with_num_docs,
};
use crate::aggregation::tests::{exec_request_with_query, get_test_index_with_num_docs};
pub fn get_collector_from_ranges(
ranges: Vec<RangeAggregationRange>,
@@ -582,77 +567,6 @@ mod tests {
Ok(())
}
/// Runs the date range checks with `merge_segments` set to `true`.
#[test]
fn range_date_test_single_segment() -> crate::Result<()> {
range_date_test_with_opt(true)
}
/// Runs the date range checks with `merge_segments` set to `false`.
#[test]
fn range_date_test_multi_segment() -> crate::Result<()> {
range_date_test_with_opt(false)
}
/// Runs a range aggregation over the `date` field and checks the bucket key
/// and the `from_as_string` / `to_as_string` fields in the JSON-serialized
/// result.
///
/// `merge_segments` is forwarded to `get_test_index_2_segments`.
fn range_date_test_with_opt(merge_segments: bool) -> crate::Result<()> {
let index = get_test_index_2_segments(merge_segments)?;
let agg_req: Aggregations = vec![(
"date_ranges".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "date".to_string(),
ranges: vec![
// Open-ended lower range: everything before the first bound.
RangeAggregationRange {
key: None,
from: None,
to: Some(1546300800000000.0f64),
},
RangeAggregationRange {
key: None,
from: Some(1546300800000000.0f64),
to: Some(1546387200000000.0f64),
},
],
keyed: false,
}),
sub_aggregation: Default::default(),
}),
)]
.into_iter()
.collect();
let agg_res = exec_request(agg_req, &index)?;
// Round-trip through JSON so the assertions check the serialized output.
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
// The bucket without a lower bound has no `from_as_string`.
assert_eq!(
res["date_ranges"]["buckets"][0]["from_as_string"],
Value::Null
);
assert_eq!(
res["date_ranges"]["buckets"][0]["key"],
"*-2019-01-01T00:00:00Z"
);
assert_eq!(
res["date_ranges"]["buckets"][1]["from_as_string"],
"2019-01-01T00:00:00Z"
);
assert_eq!(
res["date_ranges"]["buckets"][1]["to_as_string"],
"2019-01-02T00:00:00Z"
);
// NOTE(review): only two ranges are requested, yet a third bucket is expected;
// presumably the aggregation appends an open-ended bucket after the last
// range -- confirm against the range collector.
assert_eq!(
res["date_ranges"]["buckets"][2]["from_as_string"],
"2019-01-02T00:00:00Z"
);
assert_eq!(
res["date_ranges"]["buckets"][2]["to_as_string"],
Value::Null
);
Ok(())
}
#[test]
fn range_custom_key_keyed_buckets_test() -> crate::Result<()> {
let index = get_test_index_with_num_docs(false, 100)?;

View File

@@ -1,7 +1,7 @@
use std::fmt::Debug;
use fnv::FnvHashMap;
use itertools::Itertools;
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use super::{CustomOrder, Order, OrderTarget};
@@ -199,7 +199,7 @@ impl TermsAggregationInternal {
#[derive(Clone, Debug, PartialEq)]
/// Container to store term_ids and their buckets.
struct TermBuckets {
pub(crate) entries: FxHashMap<u32, TermBucketEntry>,
pub(crate) entries: FnvHashMap<u32, TermBucketEntry>,
blueprint: Option<SegmentAggregationResultsCollector>,
}
@@ -397,7 +397,7 @@ impl SegmentTermCollector {
.expect("internal error: inverted index not loaded for term aggregation");
let term_dict = inverted_index.terms();
let mut dict: FxHashMap<String, IntermediateTermBucketEntry> = Default::default();
let mut dict: FnvHashMap<String, IntermediateTermBucketEntry> = Default::default();
let mut buffer = vec![];
for (term_id, entry) in entries {
term_dict
@@ -1129,9 +1129,9 @@ mod tests {
assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4);
assert_eq!(res["my_texts"]["buckets"][1]["key"], "termc");
assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 0);
assert_eq!(res["my_texts"]["buckets"][2]["key"], "termb");
assert_eq!(res["my_texts"]["buckets"][2]["key"], "termc");
assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 0);
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);

View File

@@ -7,7 +7,6 @@ use super::intermediate_agg_result::IntermediateAggregationResults;
use super::segment_agg_result::SegmentAggregationResultsCollector;
use crate::aggregation::agg_req_with_accessor::get_aggs_with_accessor_and_validate;
use crate::collector::{Collector, SegmentCollector};
use crate::schema::Schema;
use crate::{SegmentReader, TantivyError};
/// The default max bucket count, before the aggregation fails.
@@ -17,7 +16,6 @@ pub const MAX_BUCKET_COUNT: u32 = 65000;
///
/// The collector collects all aggregations by the underlying aggregation request.
pub struct AggregationCollector {
schema: Schema,
agg: Aggregations,
max_bucket_count: u32,
}
@@ -27,9 +25,8 @@ impl AggregationCollector {
///
/// Aggregation fails when the total bucket count is higher than max_bucket_count.
/// max_bucket_count will default to `MAX_BUCKET_COUNT` (65000) when unset
pub fn from_aggs(agg: Aggregations, max_bucket_count: Option<u32>, schema: Schema) -> Self {
pub fn from_aggs(agg: Aggregations, max_bucket_count: Option<u32>) -> Self {
Self {
schema,
agg,
max_bucket_count: max_bucket_count.unwrap_or(MAX_BUCKET_COUNT),
}
@@ -116,7 +113,7 @@ impl Collector for AggregationCollector {
segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
) -> crate::Result<Self::Fruit> {
let res = merge_fruits(segment_fruits)?;
res.into_final_bucket_result(self.agg.clone(), &self.schema)
res.into_final_bucket_result(self.agg.clone())
}
}

View File

@@ -1,18 +0,0 @@
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;
use crate::TantivyError;
/// Formats a timestamp as an RFC 3339 string.
///
/// `val` is multiplied by 1_000 and then interpreted as a number of
/// nanoseconds since the Unix epoch, i.e. it is treated as a microsecond
/// timestamp.
///
/// # Errors
/// Returns `TantivyError::InvalidArgument` if the timestamp cannot be turned
/// into an `OffsetDateTime`, or if the resulting date cannot be formatted.
pub(crate) fn format_date(val: i64) -> crate::Result<String> {
    let unix_nanos = i128::from(val) * 1_000;
    match OffsetDateTime::from_unix_timestamp_nanos(unix_nanos) {
        Ok(datetime) => datetime.format(&Rfc3339).map_err(|_err| {
            TantivyError::InvalidArgument("Could not serialize date".to_string())
        }),
        Err(err) => Err(TantivyError::InvalidArgument(format!(
            "Could not convert {:?} to OffsetDateTime, err {:?}",
            val, err
        ))),
    }
}

View File

@@ -3,14 +3,15 @@
//! indices.
use std::cmp::Ordering;
use std::collections::HashMap;
use fnv::FnvHashMap;
use itertools::Itertools;
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use super::agg_req::{
Aggregations, AggregationsInternal, BucketAggregationInternal, BucketAggregationType,
MetricAggregation, RangeAggregation,
MetricAggregation,
};
use super::agg_result::{AggregationResult, BucketResult, RangeBucketEntry};
use super::bucket::{
@@ -19,11 +20,9 @@ use super::bucket::{
};
use super::metric::{IntermediateAverage, IntermediateStats};
use super::segment_agg_result::SegmentMetricResultCollector;
use super::{format_date, Key, SerializedKey, VecWithNames};
use super::{Key, SerializedKey, VecWithNames};
use crate::aggregation::agg_result::{AggregationResults, BucketEntries, BucketEntry};
use crate::aggregation::bucket::TermsAggregationInternal;
use crate::schema::Schema;
use crate::TantivyError;
/// Contains the intermediate aggregation result, which is optimized to be merged with other
/// intermediate results.
@@ -37,12 +36,8 @@ pub struct IntermediateAggregationResults {
impl IntermediateAggregationResults {
/// Convert intermediate result and its aggregation request to the final result.
pub fn into_final_bucket_result(
self,
req: Aggregations,
schema: &Schema,
) -> crate::Result<AggregationResults> {
self.into_final_bucket_result_internal(&(req.into()), schema)
pub fn into_final_bucket_result(self, req: Aggregations) -> crate::Result<AggregationResults> {
self.into_final_bucket_result_internal(&(req.into()))
}
/// Convert intermediate result and its aggregation request to the final result.
@@ -52,19 +47,18 @@ impl IntermediateAggregationResults {
pub(crate) fn into_final_bucket_result_internal(
self,
req: &AggregationsInternal,
schema: &Schema,
) -> crate::Result<AggregationResults> {
// Important assumption:
// When the tree contains buckets/metric, we expect it to have all buckets/metrics from the
// request
let mut results: FxHashMap<String, AggregationResult> = FxHashMap::default();
let mut results: HashMap<String, AggregationResult> = HashMap::new();
if let Some(buckets) = self.buckets {
convert_and_add_final_buckets_to_result(&mut results, buckets, &req.buckets, schema)?
convert_and_add_final_buckets_to_result(&mut results, buckets, &req.buckets)?
} else {
// When there are no buckets, we create empty buckets, so that the serialized json
// format is constant
add_empty_final_buckets_to_result(&mut results, &req.buckets, schema)?
add_empty_final_buckets_to_result(&mut results, &req.buckets)?
};
if let Some(metrics) = self.metrics {
@@ -138,7 +132,7 @@ impl IntermediateAggregationResults {
}
fn convert_and_add_final_metrics_to_result(
results: &mut FxHashMap<String, AggregationResult>,
results: &mut HashMap<String, AggregationResult>,
metrics: VecWithNames<IntermediateMetricResult>,
) {
results.extend(
@@ -149,7 +143,7 @@ fn convert_and_add_final_metrics_to_result(
}
fn add_empty_final_metrics_to_result(
results: &mut FxHashMap<String, AggregationResult>,
results: &mut HashMap<String, AggregationResult>,
req_metrics: &VecWithNames<MetricAggregation>,
) -> crate::Result<()> {
results.extend(req_metrics.iter().map(|(key, req)| {
@@ -163,30 +157,27 @@ fn add_empty_final_metrics_to_result(
}
fn add_empty_final_buckets_to_result(
results: &mut FxHashMap<String, AggregationResult>,
results: &mut HashMap<String, AggregationResult>,
req_buckets: &VecWithNames<BucketAggregationInternal>,
schema: &Schema,
) -> crate::Result<()> {
let requested_buckets = req_buckets.iter();
for (key, req) in requested_buckets {
let empty_bucket =
AggregationResult::BucketResult(BucketResult::empty_from_req(req, schema)?);
let empty_bucket = AggregationResult::BucketResult(BucketResult::empty_from_req(req)?);
results.insert(key.to_string(), empty_bucket);
}
Ok(())
}
fn convert_and_add_final_buckets_to_result(
results: &mut FxHashMap<String, AggregationResult>,
results: &mut HashMap<String, AggregationResult>,
buckets: VecWithNames<IntermediateBucketResult>,
req_buckets: &VecWithNames<BucketAggregationInternal>,
schema: &Schema,
) -> crate::Result<()> {
assert_eq!(buckets.len(), req_buckets.len());
let buckets_with_request = buckets.into_iter().zip(req_buckets.values());
for ((key, bucket), req) in buckets_with_request {
let result = AggregationResult::BucketResult(bucket.into_final_bucket_result(req, schema)?);
let result = AggregationResult::BucketResult(bucket.into_final_bucket_result(req)?);
results.insert(key, result);
}
Ok(())
@@ -276,21 +267,13 @@ impl IntermediateBucketResult {
pub(crate) fn into_final_bucket_result(
self,
req: &BucketAggregationInternal,
schema: &Schema,
) -> crate::Result<BucketResult> {
match self {
IntermediateBucketResult::Range(range_res) => {
let mut buckets: Vec<RangeBucketEntry> = range_res
.buckets
.into_iter()
.map(|(_, bucket)| {
bucket.into_final_bucket_entry(
&req.sub_aggregation,
schema,
req.as_range()
.expect("unexpected aggregation, expected histogram aggregation"),
)
})
.map(|(_, bucket)| bucket.into_final_bucket_entry(&req.sub_aggregation))
.collect::<crate::Result<Vec<_>>>()?;
buckets.sort_by(|left, right| {
@@ -305,7 +288,7 @@ impl IntermediateBucketResult {
.keyed;
let buckets = if is_keyed {
let mut bucket_map =
FxHashMap::with_capacity_and_hasher(buckets.len(), Default::default());
FnvHashMap::with_capacity_and_hasher(buckets.len(), Default::default());
for bucket in buckets {
bucket_map.insert(bucket.key.to_string(), bucket);
}
@@ -321,12 +304,11 @@ impl IntermediateBucketResult {
req.as_histogram()
.expect("unexpected aggregation, expected histogram aggregation"),
&req.sub_aggregation,
schema,
)?;
let buckets = if req.as_histogram().unwrap().keyed {
let mut bucket_map =
FxHashMap::with_capacity_and_hasher(buckets.len(), Default::default());
FnvHashMap::with_capacity_and_hasher(buckets.len(), Default::default());
for bucket in buckets {
bucket_map.insert(bucket.key.to_string(), bucket);
}
@@ -340,7 +322,6 @@ impl IntermediateBucketResult {
req.as_term()
.expect("unexpected aggregation, expected term aggregation"),
&req.sub_aggregation,
schema,
),
}
}
@@ -415,13 +396,13 @@ impl IntermediateBucketResult {
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
/// Intermediate result of a range aggregation: the range buckets, keyed by their serialized key
pub struct IntermediateRangeBucketResult {
pub(crate) buckets: FxHashMap<SerializedKey, IntermediateRangeBucketEntry>,
pub(crate) buckets: FnvHashMap<SerializedKey, IntermediateRangeBucketEntry>,
}
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
/// Term aggregation including error counts
pub struct IntermediateTermBucketResult {
pub(crate) entries: FxHashMap<String, IntermediateTermBucketEntry>,
pub(crate) entries: FnvHashMap<String, IntermediateTermBucketEntry>,
pub(crate) sum_other_doc_count: u64,
pub(crate) doc_count_error_upper_bound: u64,
}
@@ -431,7 +412,6 @@ impl IntermediateTermBucketResult {
self,
req: &TermsAggregation,
sub_aggregation_req: &AggregationsInternal,
schema: &Schema,
) -> crate::Result<BucketResult> {
let req = TermsAggregationInternal::from_req(req);
let mut buckets: Vec<BucketEntry> = self
@@ -440,12 +420,11 @@ impl IntermediateTermBucketResult {
.filter(|bucket| bucket.1.doc_count >= req.min_doc_count)
.map(|(key, entry)| {
Ok(BucketEntry {
key_as_string: None,
key: Key::Str(key),
doc_count: entry.doc_count,
sub_aggregation: entry
.sub_aggregation
.into_final_bucket_result_internal(sub_aggregation_req, schema)?,
.into_final_bucket_result_internal(sub_aggregation_req)?,
})
})
.collect::<crate::Result<_>>()?;
@@ -520,8 +499,8 @@ trait MergeFruits {
}
fn merge_maps<V: MergeFruits + Clone>(
entries_left: &mut FxHashMap<SerializedKey, V>,
mut entries_right: FxHashMap<SerializedKey, V>,
entries_left: &mut FnvHashMap<SerializedKey, V>,
mut entries_right: FnvHashMap<SerializedKey, V>,
) {
for (name, entry_left) in entries_left.iter_mut() {
if let Some(entry_right) = entries_right.remove(name) {
@@ -550,15 +529,13 @@ impl IntermediateHistogramBucketEntry {
pub(crate) fn into_final_bucket_entry(
self,
req: &AggregationsInternal,
schema: &Schema,
) -> crate::Result<BucketEntry> {
Ok(BucketEntry {
key_as_string: None,
key: Key::F64(self.key),
doc_count: self.doc_count,
sub_aggregation: self
.sub_aggregation
.into_final_bucket_result_internal(req, schema)?,
.into_final_bucket_result_internal(req)?,
})
}
}
@@ -595,38 +572,16 @@ impl IntermediateRangeBucketEntry {
pub(crate) fn into_final_bucket_entry(
self,
req: &AggregationsInternal,
schema: &Schema,
range_req: &RangeAggregation,
) -> crate::Result<RangeBucketEntry> {
let mut range_bucket_entry = RangeBucketEntry {
Ok(RangeBucketEntry {
key: self.key,
doc_count: self.doc_count,
sub_aggregation: self
.sub_aggregation
.into_final_bucket_result_internal(req, schema)?,
.into_final_bucket_result_internal(req)?,
to: self.to,
from: self.from,
to_as_string: None,
from_as_string: None,
};
// If the range field is a date field, we additionally expose `to` and `from` as RFC 3339
// strings in `to_as_string` / `from_as_string`
let field = schema
.get_field(&range_req.field)
.ok_or_else(|| TantivyError::FieldNotFound(range_req.field.to_string()))?;
if schema.get_field_entry(field).field_type().is_date() {
if let Some(val) = range_bucket_entry.to {
let key_as_string = format_date(val as i64)?;
range_bucket_entry.to_as_string = Some(key_as_string);
}
if let Some(val) = range_bucket_entry.from {
let key_as_string = format_date(val as i64)?;
range_bucket_entry.from_as_string = Some(key_as_string);
}
}
Ok(range_bucket_entry)
})
}
}
@@ -671,7 +626,7 @@ mod tests {
fn get_sub_test_tree(data: &[(String, u64)]) -> IntermediateAggregationResults {
let mut map = HashMap::new();
let mut buckets = FxHashMap::default();
let mut buckets = FnvHashMap::default();
for (key, doc_count) in data {
buckets.insert(
key.to_string(),
@@ -698,7 +653,7 @@ mod tests {
data: &[(String, u64, String, u64)],
) -> IntermediateAggregationResults {
let mut map = HashMap::new();
let mut buckets: FxHashMap<_, _> = Default::default();
let mut buckets: FnvHashMap<_, _> = Default::default();
for (key, doc_count, sub_aggregation_key, sub_aggregation_count) in data {
buckets.insert(
key.to_string(),

View File

@@ -60,10 +60,10 @@ impl SegmentAverageCollector {
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column<u64>) {
let mut iter = doc.chunks_exact(4);
for docs in iter.by_ref() {
let val1 = field.get_val(docs[0]);
let val2 = field.get_val(docs[1]);
let val3 = field.get_val(docs[2]);
let val4 = field.get_val(docs[3]);
let val1 = field.get_val(docs[0] as u64);
let val2 = field.get_val(docs[1] as u64);
let val3 = field.get_val(docs[2] as u64);
let val4 = field.get_val(docs[3] as u64);
let val1 = f64_from_fastfield_u64(val1, &self.field_type);
let val2 = f64_from_fastfield_u64(val2, &self.field_type);
let val3 = f64_from_fastfield_u64(val3, &self.field_type);
@@ -74,7 +74,7 @@ impl SegmentAverageCollector {
self.data.collect(val4);
}
for &doc in iter.remainder() {
let val = field.get_val(doc);
let val = field.get_val(doc as u64);
let val = f64_from_fastfield_u64(val, &self.field_type);
self.data.collect(val);
}

View File

@@ -166,10 +166,10 @@ impl SegmentStatsCollector {
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column<u64>) {
let mut iter = doc.chunks_exact(4);
for docs in iter.by_ref() {
let val1 = field.get_val(docs[0]);
let val2 = field.get_val(docs[1]);
let val3 = field.get_val(docs[2]);
let val4 = field.get_val(docs[3]);
let val1 = field.get_val(docs[0] as u64);
let val2 = field.get_val(docs[1] as u64);
let val3 = field.get_val(docs[2] as u64);
let val4 = field.get_val(docs[3] as u64);
let val1 = f64_from_fastfield_u64(val1, &self.field_type);
let val2 = f64_from_fastfield_u64(val2, &self.field_type);
let val3 = f64_from_fastfield_u64(val3, &self.field_type);
@@ -180,7 +180,7 @@ impl SegmentStatsCollector {
self.stats.collect(val4);
}
for &doc in iter.remainder() {
let val = field.get_val(doc);
let val = field.get_val(doc as u64);
let val = f64_from_fastfield_u64(val, &self.field_type);
self.stats.collect(val);
}
@@ -222,7 +222,7 @@ mod tests {
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let collector = AggregationCollector::from_aggs(agg_req_1, None);
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -300,7 +300,7 @@ mod tests {
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let collector = AggregationCollector::from_aggs(agg_req_1, None);
let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();

View File

@@ -12,7 +12,7 @@
//!
//! ## Prerequisite
//! Currently aggregations work only on [fast fields](`crate::fastfield`). Single value fast fields
//! of type `u64`, `f64`, `i64`, `date` and fast fields on text fields.
//! of type `u64`, `f64`, `i64` and fast fields on text fields.
//!
//! ## Usage
//! To use aggregations, build an aggregation request by constructing
@@ -53,10 +53,9 @@
//! use tantivy::query::AllQuery;
//! use tantivy::aggregation::agg_result::AggregationResults;
//! use tantivy::IndexReader;
//! use tantivy::schema::Schema;
//!
//! # #[allow(dead_code)]
//! fn aggregate_on_index(reader: &IndexReader, schema: Schema) {
//! fn aggregate_on_index(reader: &IndexReader) {
//! let agg_req: Aggregations = vec![
//! (
//! "average".to_string(),
@@ -68,7 +67,7 @@
//! .into_iter()
//! .collect();
//!
//! let collector = AggregationCollector::from_aggs(agg_req, None, schema);
//! let collector = AggregationCollector::from_aggs(agg_req, None);
//!
//! let searcher = reader.searcher();
//! let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
@@ -158,7 +157,6 @@ mod agg_req_with_accessor;
pub mod agg_result;
pub mod bucket;
mod collector;
mod date;
pub mod intermediate_agg_result;
pub mod metric;
mod segment_agg_result;
@@ -169,7 +167,6 @@ pub use collector::{
AggregationCollector, AggregationSegmentCollector, DistributedAggregationCollector,
MAX_BUCKET_COUNT,
};
pub(crate) use date::format_date;
use fastfield_codecs::MonotonicallyMappableToU64;
use itertools::Itertools;
use serde::{Deserialize, Serialize};
@@ -286,11 +283,11 @@ impl Display for Key {
/// Inverse of `to_fastfield_u64`. Used to convert to `f64` for metrics.
///
/// # Panics
/// Only `u64`, `f64`, `date`, and `i64` are supported.
/// Only `u64`, `f64`, and `i64` are supported.
pub(crate) fn f64_from_fastfield_u64(val: u64, field_type: &Type) -> f64 {
match field_type {
Type::U64 => val as f64,
Type::I64 | Type::Date => i64::from_u64(val) as f64,
Type::I64 => i64::from_u64(val) as f64,
Type::F64 => f64::from_u64(val),
_ => {
panic!("unexpected type {:?}. This should not happen", field_type)
@@ -298,9 +295,10 @@ pub(crate) fn f64_from_fastfield_u64(val: u64, field_type: &Type) -> f64 {
}
}
/// Converts the `f64` value to fast field value space, which is always u64.
/// Converts the `f64` value to fast field value space.
///
/// If the fast field has `u64`, values are stored unchanged as `u64` in the fast field.
/// If the fast field has `u64`, values are stored as `u64` in the fast field.
/// A `f64` value of e.g. `2.0` therefore needs to be converted to `2u64`.
///
/// If the fast field has `f64` values are converted and stored to `u64` using a
/// monotonic mapping.
@@ -310,7 +308,7 @@ pub(crate) fn f64_from_fastfield_u64(val: u64, field_type: &Type) -> f64 {
pub(crate) fn f64_to_fastfield_u64(val: f64, field_type: &Type) -> Option<u64> {
match field_type {
Type::U64 => Some(val as u64),
Type::I64 | Type::Date => Some((val as i64).to_u64()),
Type::I64 => Some((val as i64).to_u64()),
Type::F64 => Some(val.to_u64()),
_ => None,
}
@@ -319,7 +317,6 @@ pub(crate) fn f64_to_fastfield_u64(val: f64, field_type: &Type) -> Option<u64> {
#[cfg(test)]
mod tests {
use serde_json::Value;
use time::OffsetDateTime;
use super::agg_req::{Aggregation, Aggregations, BucketAggregation};
use super::bucket::RangeAggregation;
@@ -335,7 +332,7 @@ mod tests {
use crate::aggregation::DistributedAggregationCollector;
use crate::query::{AllQuery, TermQuery};
use crate::schema::{Cardinality, IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
use crate::{DateTime, Index, Term};
use crate::{Index, Term};
fn get_avg_req(field_name: &str) -> Aggregation {
Aggregation::Metric(MetricAggregation::Average(
@@ -361,7 +358,7 @@ mod tests {
index: &Index,
query: Option<(&str, &str)>,
) -> crate::Result<Value> {
let collector = AggregationCollector::from_aggs(agg_req, None, index.schema());
let collector = AggregationCollector::from_aggs(agg_req, None);
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -555,10 +552,10 @@ mod tests {
let searcher = reader.searcher();
let intermediate_agg_result = searcher.search(&AllQuery, &collector).unwrap();
intermediate_agg_result
.into_final_bucket_result(agg_req, &index.schema())
.into_final_bucket_result(agg_req)
.unwrap()
} else {
let collector = AggregationCollector::from_aggs(agg_req, None, index.schema());
let collector = AggregationCollector::from_aggs(agg_req, None);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
@@ -651,7 +648,6 @@ mod tests {
.set_fast()
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let date_field = schema_builder.add_date_field("date", FAST);
schema_builder.add_text_field("dummy_text", STRING);
let score_fieldtype =
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
@@ -669,7 +665,6 @@ mod tests {
// writing the segment
index_writer.add_document(doc!(
text_field => "cool",
date_field => DateTime::from_utc(OffsetDateTime::from_unix_timestamp(1_546_300_800).unwrap()),
score_field => 1u64,
score_field_f64 => 1f64,
score_field_i64 => 1i64,
@@ -678,7 +673,6 @@ mod tests {
))?;
index_writer.add_document(doc!(
text_field => "cool",
date_field => DateTime::from_utc(OffsetDateTime::from_unix_timestamp(1_546_300_800 + 86400).unwrap()),
score_field => 3u64,
score_field_f64 => 3f64,
score_field_i64 => 3i64,
@@ -687,21 +681,18 @@ mod tests {
))?;
index_writer.add_document(doc!(
text_field => "cool",
date_field => DateTime::from_utc(OffsetDateTime::from_unix_timestamp(1_546_300_800 + 86400).unwrap()),
score_field => 5u64,
score_field_f64 => 5f64,
score_field_i64 => 5i64,
))?;
index_writer.add_document(doc!(
text_field => "nohit",
date_field => DateTime::from_utc(OffsetDateTime::from_unix_timestamp(1_546_300_800 + 86400).unwrap()),
score_field => 6u64,
score_field_f64 => 6f64,
score_field_i64 => 6i64,
))?;
index_writer.add_document(doc!(
text_field => "cool",
date_field => DateTime::from_utc(OffsetDateTime::from_unix_timestamp(1_546_300_800 + 86400).unwrap()),
score_field => 7u64,
score_field_f64 => 7f64,
score_field_i64 => 7i64,
@@ -709,14 +700,12 @@ mod tests {
index_writer.commit()?;
index_writer.add_document(doc!(
text_field => "cool",
date_field => DateTime::from_utc(OffsetDateTime::from_unix_timestamp(1_546_300_800 + 86400).unwrap()),
score_field => 11u64,
score_field_f64 => 11f64,
score_field_i64 => 11i64,
))?;
index_writer.add_document(doc!(
text_field => "cool",
date_field => DateTime::from_utc(OffsetDateTime::from_unix_timestamp(1_546_300_800 + 86400 + 86400).unwrap()),
score_field => 14u64,
score_field_f64 => 14f64,
score_field_i64 => 14i64,
@@ -724,7 +713,6 @@ mod tests {
index_writer.add_document(doc!(
text_field => "cool",
date_field => DateTime::from_utc(OffsetDateTime::from_unix_timestamp(1_546_300_800 + 86400 + 86400).unwrap()),
score_field => 44u64,
score_field_f64 => 44.5f64,
score_field_i64 => 44i64,
@@ -735,7 +723,6 @@ mod tests {
// no hits segment
index_writer.add_document(doc!(
text_field => "nohit",
date_field => DateTime::from_utc(OffsetDateTime::from_unix_timestamp(1_546_300_800 + 86400 + 86400).unwrap()),
score_field => 44u64,
score_field_f64 => 44.5f64,
score_field_i64 => 44i64,
@@ -808,7 +795,7 @@ mod tests {
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let collector = AggregationCollector::from_aggs(agg_req_1, None);
let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
@@ -1008,10 +995,9 @@ mod tests {
// Test de/serialization roundtrip on intermediate_agg_result
let res: IntermediateAggregationResults =
serde_json::from_str(&serde_json::to_string(&res).unwrap()).unwrap();
res.into_final_bucket_result(agg_req.clone(), &index.schema())
.unwrap()
res.into_final_bucket_result(agg_req.clone()).unwrap()
} else {
let collector = AggregationCollector::from_aggs(agg_req.clone(), None, index.schema());
let collector = AggregationCollector::from_aggs(agg_req.clone(), None);
let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap()
@@ -1069,7 +1055,7 @@ mod tests {
);
// Test empty result set
let collector = AggregationCollector::from_aggs(agg_req, None, index.schema());
let collector = AggregationCollector::from_aggs(agg_req, None);
let searcher = reader.searcher();
searcher.search(&query_with_no_hits, &collector).unwrap();
@@ -1134,7 +1120,7 @@ mod tests {
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let collector = AggregationCollector::from_aggs(agg_req_1, None);
let searcher = reader.searcher();
@@ -1247,7 +1233,7 @@ mod tests {
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let collector = AggregationCollector::from_aggs(agg_req_1, None);
let searcher = reader.searcher();
let agg_res: AggregationResults =
@@ -1278,7 +1264,7 @@ mod tests {
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let collector = AggregationCollector::from_aggs(agg_req_1, None);
let searcher = reader.searcher();
let agg_res: AggregationResults =
@@ -1309,7 +1295,7 @@ mod tests {
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let collector = AggregationCollector::from_aggs(agg_req_1, None);
let searcher = reader.searcher();
let agg_res: AggregationResults =
@@ -1348,7 +1334,7 @@ mod tests {
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let collector = AggregationCollector::from_aggs(agg_req_1, None);
let searcher = reader.searcher();
let agg_res: AggregationResults =
@@ -1377,7 +1363,7 @@ mod tests {
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req, None, index.schema());
let collector = AggregationCollector::from_aggs(agg_req, None);
let searcher = reader.searcher();
let agg_res: AggregationResults =
@@ -1406,7 +1392,7 @@ mod tests {
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req, None, index.schema());
let collector = AggregationCollector::from_aggs(agg_req, None);
let searcher = reader.searcher();
let agg_res: AggregationResults =
@@ -1443,7 +1429,7 @@ mod tests {
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let collector = AggregationCollector::from_aggs(agg_req_1, None);
let searcher = reader.searcher();
let agg_res: AggregationResults =
@@ -1478,7 +1464,7 @@ mod tests {
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let collector = AggregationCollector::from_aggs(agg_req_1, None);
let searcher = reader.searcher();
let agg_res: AggregationResults =
@@ -1517,7 +1503,7 @@ mod tests {
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let collector = AggregationCollector::from_aggs(agg_req_1, None);
let searcher = reader.searcher();
let agg_res: AggregationResults =
@@ -1547,7 +1533,7 @@ mod tests {
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let collector = AggregationCollector::from_aggs(agg_req_1, None);
let searcher = reader.searcher();
let agg_res: AggregationResults =
@@ -1604,7 +1590,7 @@ mod tests {
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let collector = AggregationCollector::from_aggs(agg_req_1, None);
let searcher = reader.searcher();
let agg_res: AggregationResults =

View File

@@ -616,7 +616,7 @@ mod tests {
.map(|mut doc| {
doc.add_facet(
facet_field,
&format!("/facet/{}", thread_rng().sample(uniform)),
&format!("/facet/{}", thread_rng().sample(&uniform)),
);
doc
})

View File

@@ -177,7 +177,7 @@ where
type Fruit = TSegmentCollector::Fruit;
fn collect(&mut self, doc: u32, score: Score) {
let value = self.fast_field_reader.get_val(doc);
let value = self.fast_field_reader.get_val(doc as u64);
if (self.predicate)(value) {
self.segment_collector.collect(doc, score)
}

View File

@@ -94,7 +94,7 @@ impl SegmentCollector for SegmentHistogramCollector {
type Fruit = Vec<u64>;
fn collect(&mut self, doc: DocId, _score: Score) {
let value = self.ff_reader.get_val(doc);
let value = self.ff_reader.get_val(doc as u64);
self.histogram_computer.add_value(value);
}

View File

@@ -172,33 +172,17 @@ pub trait Collector: Sync + Send {
) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
let mut segment_collector = self.for_segment(segment_ord as u32, reader)?;
match (reader.alive_bitset(), self.requires_scoring()) {
(Some(alive_bitset), true) => {
weight.for_each(reader, &mut |doc, score| {
if alive_bitset.is_alive(doc) {
segment_collector.collect(doc, score);
}
})?;
}
(Some(alive_bitset), false) => {
weight.for_each_no_score(reader, &mut |doc| {
if alive_bitset.is_alive(doc) {
segment_collector.collect(doc, 0.0);
}
})?;
}
(None, true) => {
weight.for_each(reader, &mut |doc, score| {
if let Some(alive_bitset) = reader.alive_bitset() {
weight.for_each(reader, &mut |doc, score| {
if alive_bitset.is_alive(doc) {
segment_collector.collect(doc, score);
})?;
}
(None, false) => {
weight.for_each_no_score(reader, &mut |doc| {
segment_collector.collect(doc, 0.0);
})?;
}
}
})?;
} else {
weight.for_each(reader, &mut |doc, score| {
segment_collector.collect(doc, score);
})?;
}
Ok(segment_collector.harvest())
}
}

View File

@@ -201,7 +201,7 @@ impl SegmentCollector for FastFieldSegmentCollector {
type Fruit = Vec<u64>;
fn collect(&mut self, doc: DocId, _score: Score) {
let val = self.reader.get_val(doc);
let val = self.reader.get_val(doc as u64);
self.vals.push(val);
}

View File

@@ -137,7 +137,7 @@ struct ScorerByFastFieldReader {
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
fn score(&mut self, doc: DocId) -> u64 {
self.ff_reader.get_val(doc)
self.ff_reader.get_val(doc as u64)
}
}
@@ -458,7 +458,7 @@ impl TopDocs {
///
/// // We can now define our actual scoring function
/// move |doc: DocId, original_score: Score| {
/// let popularity: u64 = popularity_reader.get_val(doc);
/// let popularity: u64 = popularity_reader.get_val(doc as u64);
/// // Well.. For the sake of the example we use a simple logarithm
/// // function.
/// let popularity_boost_score = ((2u64 + popularity) as Score).log2();
@@ -567,8 +567,8 @@ impl TopDocs {
///
/// // We can now define our actual scoring function
/// move |doc: DocId| {
/// let popularity: u64 = popularity_reader.get_val(doc);
/// let boosted: u64 = boosted_reader.get_val(doc);
/// let popularity: u64 = popularity_reader.get_val(doc as u64);
/// let boosted: u64 = boosted_reader.get_val(doc as u64);
/// // Score do not have to be `f64` in tantivy.
/// // Here we return a couple to get lexicographical order
/// // for free.

View File

@@ -149,8 +149,7 @@ impl IndexBuilder {
/// Creates a new index using the [`RamDirectory`].
///
/// The index will be allocated in anonymous memory.
/// This is useful for indexing small set of documents
/// for instances like unit test or temporary in memory index.
/// This should only be used for unit tests.
pub fn create_in_ram(self) -> Result<Index, TantivyError> {
let ram_directory = RamDirectory::create();
self.create(ram_directory)

View File

@@ -133,7 +133,7 @@ impl SegmentMeta {
/// associated with a segment component.
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
let mut path = self.id().uuid_string();
path.push_str(&match component {
path.push_str(&*match component {
SegmentComponent::Postings => ".idx".to_string(),
SegmentComponent::Positions => ".pos".to_string(),
SegmentComponent::Terms => ".term".to_string(),

View File

@@ -230,18 +230,6 @@ impl InvertedIndexReader {
Ok(())
}
/// Read the block postings for all terms, and optionally all positions.
///
/// The whole `postings_file_slice` is read asynchronously; when `with_positions` is `true`
/// the whole `positions_file_slice` is read afterwards as well. The bytes returned by both
/// reads are dropped immediately (presumably the reads only exist to warm a cache further
/// down the stack; confirm against the directory implementation).
///
/// This method is for an advanced usage only.
/// If you know which terms to pre-load, prefer using [`Self::warm_postings`] instead.
///
/// # Errors
/// Returns the error of the first read that fails. The positions are not read if reading
/// the postings failed.
pub async fn warm_postings_full(&self, with_positions: bool) -> crate::AsyncIoResult<()> {
self.postings_file_slice.read_bytes_async().await?;
if with_positions {
self.positions_file_slice.read_bytes_async().await?;
}
Ok(())
}
/// Returns the number of documents containing the term asynchronously.
pub async fn doc_freq_async(&self, term: &Term) -> crate::AsyncIoResult<u32> {
Ok(self

View File

@@ -4,7 +4,7 @@ use std::{fmt, io};
use crate::collector::Collector;
use crate::core::{Executor, SegmentReader};
use crate::query::{EnableScoring, Query};
use crate::query::Query;
use crate::schema::{Document, Schema, Term};
use crate::space_usage::SearcherSpaceUsage;
use crate::store::{CacheStats, StoreReader};
@@ -199,12 +199,7 @@ impl Searcher {
executor: &Executor,
) -> crate::Result<C::Fruit> {
let scoring_enabled = collector.requires_scoring();
let enabled_scoring = if scoring_enabled {
EnableScoring::Enabled(self)
} else {
EnableScoring::Disabled(self.schema())
};
let weight = query.weight(enabled_scoring)?;
let weight = query.weight(self, scoring_enabled)?;
let segment_readers = self.segment_readers();
let fruits = executor.map(
|(segment_ord, segment_reader)| {

View File

@@ -55,7 +55,7 @@ impl<T: Send + Sync + 'static> From<Box<T>> for DirectoryLock {
impl Drop for DirectoryLockGuard {
fn drop(&mut self) {
if let Err(e) = self.directory.delete(&self.path) {
if let Err(e) = self.directory.delete(&*self.path) {
error!("Failed to remove the lock file. {:?}", e);
}
}

View File

@@ -6,7 +6,7 @@ pub use self::writer::BytesFastFieldWriter;
#[cfg(test)]
mod tests {
use crate::query::{EnableScoring, TermQuery};
use crate::query::TermQuery;
use crate::schema::{BytesOptions, IndexRecordOption, Schema, Value, FAST, INDEXED, STORED};
use crate::{DocAddress, DocSet, Index, Searcher, Term};
@@ -82,7 +82,7 @@ mod tests {
let field = searcher.schema().get_field("string_bytes").unwrap();
let term = Term::from_field_bytes(field, b"lucene".as_ref());
let term_query = TermQuery::new(term, IndexRecordOption::Basic);
let term_weight = term_query.specialized_weight(EnableScoring::Enabled(&searcher))?;
let term_weight = term_query.specialized_weight(&searcher, true)?;
let term_scorer = term_weight.specialized_scorer(searcher.segment_reader(0), 1.0)?;
assert_eq!(term_scorer.doc(), 0u32);
Ok(())
@@ -95,8 +95,7 @@ mod tests {
let field = searcher.schema().get_field("string_bytes").unwrap();
let term = Term::from_field_bytes(field, b"lucene".as_ref());
let term_query = TermQuery::new(term, IndexRecordOption::Basic);
let term_weight_err =
term_query.specialized_weight(EnableScoring::Disabled(searcher.schema()));
let term_weight_err = term_query.specialized_weight(&searcher, false);
assert!(matches!(
term_weight_err,
Err(crate::TantivyError::SchemaError(_))

View File

@@ -1,9 +1,10 @@
use std::ops::Range;
use std::sync::Arc;
use fastfield_codecs::Column;
use crate::directory::{FileSlice, OwnedBytes};
use crate::fastfield::MultiValueIndex;
use crate::fastfield::MultiValueLength;
use crate::DocId;
/// Reader for byte array fast fields
@@ -18,7 +19,7 @@ use crate::DocId;
/// and the start index for the next document, and keeping the bytes in between.
#[derive(Clone)]
pub struct BytesFastFieldReader {
idx_reader: MultiValueIndex,
idx_reader: Arc<dyn Column<u64>>,
values: OwnedBytes,
}
@@ -28,31 +29,42 @@ impl BytesFastFieldReader {
values_file: FileSlice,
) -> crate::Result<BytesFastFieldReader> {
let values = values_file.read_bytes()?;
Ok(BytesFastFieldReader {
idx_reader: MultiValueIndex::new(idx_reader),
values,
})
Ok(BytesFastFieldReader { idx_reader, values })
}
/// returns the multivalue index
pub fn get_index_reader(&self) -> &MultiValueIndex {
&self.idx_reader
fn range(&self, doc: DocId) -> Range<u64> {
let idx = doc as u64;
let start = self.idx_reader.get_val(idx);
let end = self.idx_reader.get_val(idx + 1);
start..end
}
/// Returns the bytes associated with the given `doc`
pub fn get_bytes(&self, doc: DocId) -> &[u8] {
let range = self.idx_reader.range(doc);
let range = self.range(doc);
&self.values.as_slice()[range.start as usize..range.end as usize]
}
/// Returns the length of the bytes associated with the given `doc`
pub fn num_bytes(&self, doc: DocId) -> u64 {
let range = self.idx_reader.range(doc);
(range.end - range.start) as u64
let range = self.range(doc);
range.end - range.start
}
/// Returns the overall number of bytes in this bytes fast field.
pub fn total_num_bytes(&self) -> u32 {
self.values.len() as u32
pub fn total_num_bytes(&self) -> u64 {
self.values.len() as u64
}
}
/// `MultiValueLength` for the bytes fast field: every method forwards to the inherent method
/// of `BytesFastFieldReader` that computes the same quantity, so all lengths are in bytes.
impl MultiValueLength for BytesFastFieldReader {
// Start/end offsets of the document's bytes, as looked up in the index reader.
fn get_range(&self, doc_id: DocId) -> std::ops::Range<u64> {
self.range(doc_id)
}
// Number of bytes stored for the document (`end - start` of its range).
fn get_len(&self, doc_id: DocId) -> u64 {
self.num_bytes(doc_id)
}
// Length of the whole values buffer of this bytes fast field.
fn get_total_len(&self) -> u64 {
self.total_num_bytes()
}
}

View File

@@ -27,16 +27,16 @@ pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader;
pub(crate) use self::multivalued::{get_fastfield_codecs_for_multivalue, MultivalueStartIndex};
pub use self::multivalued::{
MultiValueIndex, MultiValueU128FastFieldWriter, MultiValuedFastFieldReader,
MultiValuedFastFieldWriter, MultiValuedU128FastFieldReader,
MultiValueU128FastFieldWriter, MultiValuedFastFieldReader, MultiValuedFastFieldWriter,
MultiValuedU128FastFieldReader,
};
pub(crate) use self::readers::type_and_cardinality;
pub use self::readers::FastFieldReaders;
pub(crate) use self::readers::{type_and_cardinality, FastType};
pub use self::serializer::{Column, CompositeFastFieldSerializer};
use self::writer::unexpected_value;
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
use crate::schema::{Type, Value};
use crate::DateTime;
use crate::{DateTime, DocId};
mod alive_bitset;
mod bytes;
@@ -47,6 +47,17 @@ mod readers;
mod serializer;
mod writer;
/// Length accessors shared by readers that store a variable amount of data per document.
///
/// Implemented by `BytesFastFieldReader` and `MultiValuedFastFieldReader` to return the length
/// of the data associated with a doc_id.
pub trait MultiValueLength {
/// Returns the `start..end` range of positions holding the data of `doc_id`.
fn get_range(&self, doc_id: DocId) -> std::ops::Range<u64>;
/// Returns the number of values associated with `doc_id`.
fn get_len(&self, doc_id: DocId) -> u64;
/// Returns the sum of the number of values over all doc_ids.
fn get_total_len(&self) -> u64;
}
/// Trait for types that are allowed for fast fields:
/// (u64, i64 and f64, bool, DateTime).
pub trait FastValue:
@@ -173,9 +184,9 @@ mod tests {
#[test]
pub fn test_fastfield() {
let test_fastfield = fastfield_codecs::serialize_and_load(&[100u64, 200u64, 300u64][..]);
assert_eq!(test_fastfield.get_val(0), 100);
assert_eq!(test_fastfield.get_val(1), 200);
assert_eq!(test_fastfield.get_val(2), 300);
assert_eq!(test_fastfield.get_val(0u64), 100);
assert_eq!(test_fastfield.get_val(1u64), 200);
assert_eq!(test_fastfield.get_val(2u64), 300);
}
#[test]
@@ -207,7 +218,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 34);
assert_eq!(file.len(), 25);
let composite_file = CompositeFile::open(&file)?;
let fast_field_bytes = composite_file.open_read(*FIELD).unwrap().read_bytes()?;
let fast_field_reader = open::<u64>(fast_field_bytes)?;
@@ -256,7 +267,7 @@ mod tests {
serializer.close()?;
}
let file = directory.open_read(path)?;
assert_eq!(file.len(), 62);
assert_eq!(file.len(), 53);
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite
@@ -297,7 +308,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 35);
assert_eq!(file.len(), 26);
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite
@@ -336,7 +347,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 80049);
assert_eq!(file.len(), 80040);
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite
@@ -378,7 +389,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 49_usize);
assert_eq!(file.len(), 40_usize);
{
let fast_fields_composite = CompositeFile::open(&file)?;
@@ -391,7 +402,7 @@ mod tests {
assert_eq!(fast_field_reader.min_value(), -100i64);
assert_eq!(fast_field_reader.max_value(), 9_999i64);
for (doc, i) in (-100i64..10_000i64).enumerate() {
assert_eq!(fast_field_reader.get_val(doc as u32), i);
assert_eq!(fast_field_reader.get_val(doc as u64), i);
}
let mut buffer = vec![0i64; 100];
fast_field_reader.get_range(53, &mut buffer[..]);
@@ -473,7 +484,7 @@ mod tests {
let fast_field_reader = open::<u64>(data)?;
for a in 0..n {
assert_eq!(fast_field_reader.get_val(a as u32), permutation[a as usize]);
assert_eq!(fast_field_reader.get_val(a as u64), permutation[a as usize]);
}
}
Ok(())
@@ -822,7 +833,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 33);
assert_eq!(file.len(), 24);
let composite_file = CompositeFile::open(&file)?;
let data = composite_file.open_read(field).unwrap().read_bytes()?;
let fast_field_reader = open::<bool>(data)?;
@@ -860,7 +871,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 45);
assert_eq!(file.len(), 36);
let composite_file = CompositeFile::open(&file)?;
let data = composite_file.open_read(field).unwrap().read_bytes()?;
let fast_field_reader = open::<bool>(data)?;
@@ -892,7 +903,7 @@ mod tests {
}
let file = directory.open_read(path).unwrap();
let composite_file = CompositeFile::open(&file)?;
assert_eq!(file.len(), 32);
assert_eq!(file.len(), 23);
let data = composite_file.open_read(field).unwrap().read_bytes()?;
let fast_field_reader = open::<bool>(data)?;
assert_eq!(fast_field_reader.get_val(0), false);
@@ -926,10 +937,10 @@ mod tests {
pub fn test_gcd_date() -> crate::Result<()> {
let size_prec_sec =
test_gcd_date_with_codec(FastFieldCodecType::Bitpacked, DatePrecision::Seconds)?;
assert_eq!(size_prec_sec, 5 + 4 + 28 + (1_000 * 13) / 8); // 13 bits per val = ceil(log_2(number of seconds in 2hours);
assert_eq!(size_prec_sec, 28 + (1_000 * 13) / 8); // 13 bits per val = ceil(log_2(number of seconds in 2hours);
let size_prec_micro =
test_gcd_date_with_codec(FastFieldCodecType::Bitpacked, DatePrecision::Microseconds)?;
assert_eq!(size_prec_micro, 5 + 4 + 26 + (1_000 * 33) / 8); // 33 bits per val = ceil(log_2(number of microsecsseconds in 2hours);
assert_eq!(size_prec_micro, 26 + (1_000 * 33) / 8); // 33 bits per val = ceil(log_2(number of microsecsseconds in 2hours);
Ok(())
}
@@ -965,7 +976,7 @@ mod tests {
let test_fastfield = open::<DateTime>(file.read_bytes()?)?;
for (i, time) in times.iter().enumerate() {
assert_eq!(test_fastfield.get_val(i as u32), time.truncate(precision));
assert_eq!(test_fastfield.get_val(i as u64), time.truncate(precision));
}
Ok(len)
}

View File

@@ -1,148 +0,0 @@
use std::ops::Range;
use std::sync::Arc;
use fastfield_codecs::Column;
use crate::DocId;
/// Index to resolve the value range of a given doc_id.
///
/// It wraps a column of offsets starting at 0: entry `doc` is the position of the first value
/// of `doc`, and entry `doc + 1` is the position right after its last value. The column
/// therefore holds one more entry than there are documents.
#[derive(Clone)]
pub struct MultiValueIndex {
    idx: Arc<dyn Column<u64>>,
}

impl MultiValueIndex {
    /// Wraps the given offsets column.
    pub(crate) fn new(idx: Arc<dyn Column<u64>>) -> Self {
        Self { idx }
    }

    /// Returns `[start, end)`, such that the values associated with
    /// the given document are `start..end`.
    #[inline]
    pub(crate) fn range(&self, doc: DocId) -> Range<u32> {
        let start = self.idx.get_val(doc) as u32;
        let end = self.idx.get_val(doc + 1) as u32;
        start..end
    }

    /// Given a range of documents, returns the range of value offsets for
    /// these documents.
    ///
    /// For instance, given `start_doc..end_doc`,
    /// if we assume documents #start_doc and #end_doc both
    /// have values, this function returns `start..end`
    /// such that `value_column.get(start)` is the first value of
    /// `start_doc` (well, if there is one), and `value_column.get(end - 1)`
    /// is the last value of `end_doc`.
    ///
    /// The passed end range is allowed to be out of bounds, in which case
    /// it will be clipped to make it valid. An index without any document
    /// yields the empty range `0..0`.
    ///
    /// # Panics
    ///
    /// Panics if the offsets read from the column are not ordered (`start > end`).
    #[inline]
    pub(crate) fn docid_range_to_position_range(&self, range: Range<DocId>) -> Range<u32> {
        // Without any document there is no last doc to clip against; `num_docs() - 1`
        // would underflow.
        let last_docid = match self.num_docs().checked_sub(1) {
            Some(last_docid) => last_docid,
            None => return 0..0,
        };
        let end_docid = range.end.min(last_docid) + 1;
        let start_docid = range.start.min(end_docid);

        let start = self.idx.get_val(start_docid) as u32;
        let end = self.idx.get_val(end_docid) as u32;
        assert!(start <= end);

        start..end
    }

    /// Returns the number of values associated with a doc_id.
    pub(crate) fn num_vals_for_doc(&self, doc: DocId) -> u32 {
        let range = self.range(doc);
        range.end - range.start
    }

    /// Returns the overall number of values in this field.
    #[inline]
    pub fn total_num_vals(&self) -> u32 {
        self.idx.max_value() as u32
    }

    /// Returns the number of documents in the index.
    ///
    /// The offsets column holds one entry more than there are documents. A column without
    /// any entry is reported as holding no document instead of underflowing.
    #[inline]
    pub fn num_docs(&self) -> u32 {
        self.idx.num_vals().saturating_sub(1)
    }

    /// Converts a list of positions of values in a 1:n index to the corresponding list of DocIds.
    /// Positions are converted inplace to docids.
    ///
    /// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the
    /// index, starting at `doc_id_range.start`.
    ///
    /// Correctness: positions needs to be sorted. idx_reader needs to contain monotonically
    /// increasing positions.
    ///
    /// # Panics
    ///
    /// Panics if the first position lies before the first value of `doc_id_range.start`.
    ///
    /// TODO: Instead of a linear scan we can employ an exponential search followed by a binary
    /// search to match a docid to its value position.
    pub(crate) fn positions_to_docids(&self, doc_id_range: Range<u32>, positions: &mut Vec<u32>) {
        if positions.is_empty() {
            return;
        }
        let mut cur_doc = doc_id_range.start;
        assert!(self.idx.get_val(cur_doc) as u32 <= positions[0]);

        for pos in positions.iter_mut() {
            // Advance to the document whose value range contains `pos`.
            while self.idx.get_val(cur_doc + 1) as u32 <= *pos {
                cur_doc += 1;
            }
            *pos = cur_doc;
        }
        // `cur_doc` never decreases, so several values of the same document end up as
        // consecutive duplicates.
        positions.dedup();
    }
}
#[cfg(test)]
mod tests {
    use std::ops::Range;
    use std::sync::Arc;

    use fastfield_codecs::IterColumn;

    use crate::fastfield::MultiValueIndex;

    /// Runs `positions_to_docids` on a copy of `positions` and returns the resulting docids.
    fn index_to_pos_helper(
        index: &MultiValueIndex,
        doc_id_range: Range<u32>,
        positions: &[u32],
    ) -> Vec<u32> {
        let mut docids = Vec::from(positions);
        index.positions_to_docids(doc_id_range, &mut docids);
        docids
    }

    #[test]
    fn test_positions_to_docid() {
        // docid values are [0..10, 10..12, 12..15, etc.]
        let offsets = vec![0, 10, 12, 15, 22, 23];
        let index = MultiValueIndex::new(Arc::new(IterColumn::from(offsets.into_iter())));
        assert_eq!(index.num_docs(), 5);

        // (scanned doc range, sorted value positions, expected docids)
        let cases: Vec<(Range<u32>, Vec<u32>, Vec<u32>)> = vec![
            (0..5, vec![10, 11, 15, 20, 21, 22], vec![1, 3, 4]),
            (1..5, vec![10, 11, 15, 20, 21, 22], vec![1, 3, 4]),
            (0..5, vec![9], vec![0]),
            (1..5, vec![10], vec![1]),
            (1..5, vec![11], vec![1]),
            (2..5, vec![12], vec![2]),
            (2..5, vec![12, 14], vec![2]),
            (2..5, vec![12, 14, 15], vec![2, 3]),
        ];
        for (doc_id_range, positions, expected_docids) in cases {
            assert_eq!(
                index_to_pos_helper(&index, doc_id_range, &positions),
                expected_docids
            );
        }
    }
}

View File

@@ -1,9 +1,7 @@
mod index;
mod reader;
mod writer;
use fastfield_codecs::FastFieldCodecType;
pub use index::MultiValueIndex;
pub use self::reader::{MultiValuedFastFieldReader, MultiValuedU128FastFieldReader};
pub(crate) use self::writer::MultivalueStartIndex;
@@ -517,7 +515,7 @@ mod bench {
for val in block {
doc.add_u64(field, *val);
}
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.add_document(&doc);
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
@@ -575,7 +573,7 @@ mod bench {
for val in block {
doc.add_u64(field, *val);
}
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.add_document(&doc);
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
@@ -608,7 +606,7 @@ mod bench {
for val in block {
doc.add_u64(field, *val);
}
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.add_document(&doc);
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), Some(&doc_id_mapping))

View File

@@ -3,8 +3,7 @@ use std::sync::Arc;
use fastfield_codecs::{Column, MonotonicallyMappableToU128};
use super::MultiValueIndex;
use crate::fastfield::FastValue;
use crate::fastfield::{FastValue, MultiValueLength};
use crate::DocId;
/// Reader for a multivalued `u64` fast field.
@@ -14,10 +13,9 @@ use crate::DocId;
/// The `vals_reader` will access the concatenated list of all
/// values for all reader.
/// The `idx_reader` associated, for each document, the index of its first value.
/// Stores the start position for each document.
#[derive(Clone)]
pub struct MultiValuedFastFieldReader<Item: FastValue> {
idx_reader: MultiValueIndex,
idx_reader: Arc<dyn Column<u64>>,
vals_reader: Arc<dyn Column<Item>>,
}
@@ -27,32 +25,36 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
vals_reader: Arc<dyn Column<Item>>,
) -> MultiValuedFastFieldReader<Item> {
MultiValuedFastFieldReader {
idx_reader: MultiValueIndex::new(idx_reader),
idx_reader,
vals_reader,
}
}
/// Returns `[start, end)`, such that the values associated with
/// the given document are `start..end`.
#[inline]
fn range(&self, doc: DocId) -> Range<u64> {
let idx = doc as u64;
let start = self.idx_reader.get_val(idx);
let end = self.idx_reader.get_val(idx + 1);
start..end
}
/// Returns the array of values associated with the given `doc`.
#[inline]
fn get_vals_for_range(&self, range: Range<u32>, vals: &mut Vec<Item>) {
fn get_vals_for_range(&self, range: Range<u64>, vals: &mut Vec<Item>) {
let len = (range.end - range.start) as usize;
vals.resize(len, Item::make_zero());
self.vals_reader
.get_range(range.start as u64, &mut vals[..]);
self.vals_reader.get_range(range.start, &mut vals[..]);
}
/// Returns the array of values associated with the given `doc`.
#[inline]
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
let range = self.idx_reader.range(doc);
let range = self.range(doc);
self.get_vals_for_range(range, vals);
}
/// returns the multivalue index
pub fn get_index_reader(&self) -> &MultiValueIndex {
&self.idx_reader
}
/// Returns the minimum value for this fast field.
///
/// The min value does not take in account of possible
@@ -73,14 +75,28 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
/// Returns the number of values associated with the document `DocId`.
#[inline]
pub fn num_vals(&self, doc: DocId) -> u32 {
self.idx_reader.num_vals_for_doc(doc)
pub fn num_vals(&self, doc: DocId) -> usize {
let range = self.range(doc);
(range.end - range.start) as usize
}
/// Returns the overall number of values in this field.
/// Returns the overall number of values in this field.
#[inline]
pub fn total_num_vals(&self) -> u32 {
self.idx_reader.total_num_vals()
pub fn total_num_vals(&self) -> u64 {
self.idx_reader.max_value()
}
}
/// Exposes the per-document value ranges of a `MultiValuedFastFieldReader` through the generic
/// `MultiValueLength` interface. Every method forwards to the matching inherent method.
impl<Item: FastValue> MultiValueLength for MultiValuedFastFieldReader<Item> {
/// Returns the `start..end` range of `doc_id`, as computed by `range`.
fn get_range(&self, doc_id: DocId) -> Range<u64> {
self.range(doc_id)
}
/// Returns the number of values of `doc_id`, widened to `u64`.
fn get_len(&self, doc_id: DocId) -> u64 {
self.num_vals(doc_id) as u64
}
/// Returns the overall number of values in this field.
fn get_total_len(&self) -> u64 {
self.total_num_vals() as u64
}
}
@@ -93,7 +109,7 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
/// The `idx_reader` associated, for each document, the index of its first value.
#[derive(Clone)]
pub struct MultiValuedU128FastFieldReader<T: MonotonicallyMappableToU128> {
idx_reader: MultiValueIndex,
idx_reader: Arc<dyn Column<u64>>,
vals_reader: Arc<dyn Column<T>>,
}
@@ -103,15 +119,24 @@ impl<T: MonotonicallyMappableToU128> MultiValuedU128FastFieldReader<T> {
vals_reader: Arc<dyn Column<T>>,
) -> MultiValuedU128FastFieldReader<T> {
Self {
idx_reader: MultiValueIndex::new(idx_reader),
idx_reader,
vals_reader,
}
}
/// Returns `[start, end)`, such that the values associated
/// to the given document are `start..end`.
#[inline]
fn range(&self, doc: DocId) -> Range<u64> {
let start = self.idx_reader.get_val(doc as u64);
let end = self.idx_reader.get_val(doc as u64 + 1);
start..end
}
/// Returns the array of values associated to the given `doc`.
#[inline]
pub fn get_first_val(&self, doc: DocId) -> Option<T> {
let range = self.idx_reader.range(doc);
let range = self.range(doc);
if range.is_empty() {
return None;
}
@@ -120,25 +145,26 @@ impl<T: MonotonicallyMappableToU128> MultiValuedU128FastFieldReader<T> {
/// Returns the array of values associated to the given `doc`.
#[inline]
fn get_vals_for_range(&self, range: Range<u32>, vals: &mut Vec<T>) {
fn get_vals_for_range(&self, range: Range<u64>, vals: &mut Vec<T>) {
let len = (range.end - range.start) as usize;
vals.resize(len, T::from_u128(0));
self.vals_reader
.get_range(range.start as u64, &mut vals[..]);
}
/// Returns the index reader
pub fn get_index_reader(&self) -> &MultiValueIndex {
&self.idx_reader
self.vals_reader.get_range(range.start, &mut vals[..]);
}
/// Returns the array of values associated to the given `doc`.
#[inline]
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<T>) {
let range = self.idx_reader.range(doc);
let range = self.range(doc);
self.get_vals_for_range(range, vals);
}
/// Returns all docids which are in the provided value range
pub fn get_between_vals(&self, range: RangeInclusive<T>) -> Vec<DocId> {
let positions = self.vals_reader.get_between_vals(range);
positions_to_docids(&positions, self.idx_reader.as_ref())
}
/// Iterates over all elements in the fast field
pub fn iter(&self) -> impl Iterator<Item = T> + '_ {
self.vals_reader.iter()
@@ -164,44 +190,85 @@ impl<T: MonotonicallyMappableToU128> MultiValuedU128FastFieldReader<T> {
/// Returns the number of values associated with the document `DocId`.
#[inline]
pub fn num_vals(&self, doc: DocId) -> u32 {
self.idx_reader.num_vals_for_doc(doc)
pub fn num_vals(&self, doc: DocId) -> usize {
let range = self.range(doc);
(range.end - range.start) as usize
}
/// Returns the overall number of values in this field. It does not include deletes.
/// Returns the overall number of values in this field.
#[inline]
pub fn total_num_vals(&self) -> u32 {
assert_eq!(
self.vals_reader.num_vals(),
self.get_index_reader().total_num_vals()
);
self.idx_reader.total_num_vals()
pub fn total_num_vals(&self) -> u64 {
self.idx_reader.max_value()
}
}
/// Exposes the per-document value ranges of a `MultiValuedU128FastFieldReader` through the
/// generic `MultiValueLength` interface. Every method forwards to the matching inherent method.
impl<T: MonotonicallyMappableToU128> MultiValueLength for MultiValuedU128FastFieldReader<T> {
/// Returns the `start..end` range of `doc_id`, as computed by `range`.
fn get_range(&self, doc_id: DocId) -> std::ops::Range<u64> {
self.range(doc_id)
}
/// Returns the number of values of `doc_id`, widened to `u64`.
fn get_len(&self, doc_id: DocId) -> u64 {
self.num_vals(doc_id) as u64
}
/// Returns the overall number of values in this field.
fn get_total_len(&self) -> u64 {
self.total_num_vals() as u64
}
}
/// Converts a list of positions of values in a 1:n index to the corresponding list of DocIds.
///
/// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the index.
///
/// Correctness: positions needs to be sorted. idx_reader needs to contain monotonically increasing
/// positions.
///
/// TODO: Instead of a linear scan we can employ an exponential search followed by a binary
/// search to match a docid to its value position.
fn positions_to_docids<C: Column + ?Sized>(positions: &[u64], idx_reader: &C) -> Vec<DocId> {
let mut docs = vec![];
let mut cur_doc = 0u32;
let mut last_doc = None;
for pos in positions {
loop {
let end = idx_reader.get_val(cur_doc as u64 + 1);
if end > *pos {
// avoid duplicates
if Some(cur_doc) == last_doc {
break;
}
docs.push(cur_doc);
last_doc = Some(cur_doc);
break;
}
cur_doc += 1;
}
}
/// Returns the docids matching given doc_id_range and value_range.
#[inline]
pub fn get_docids_for_value_range(
&self,
value_range: RangeInclusive<T>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
let position_range = self
.get_index_reader()
.docid_range_to_position_range(doc_id_range.clone());
self.vals_reader
.get_docids_for_value_range(value_range, position_range, positions);
self.idx_reader.positions_to_docids(doc_id_range, positions);
}
docs
}
#[cfg(test)]
mod tests {
use fastfield_codecs::VecColumn;
use crate::core::Index;
use crate::fastfield::multivalued::reader::positions_to_docids;
use crate::schema::{Cardinality, Facet, FacetOptions, NumericOptions, Schema};
#[test]
fn test_positions_to_docid() {
let positions = vec![10u64, 11, 15, 20, 21, 22];
let offsets = vec![0, 10, 12, 15, 22, 23];
{
let column = VecColumn::from(&offsets);
let docids = positions_to_docids(&positions, &column);
assert_eq!(docids, vec![1, 3, 4]);
}
}
#[test]
fn test_multifastfield_reader() -> crate::Result<()> {
let mut schema_builder = Schema::builder();

View File

@@ -3,7 +3,7 @@ use std::io;
use fastfield_codecs::{
Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64, VecColumn,
};
use rustc_hash::FxHashMap;
use fnv::FnvHashMap;
use super::get_fastfield_codecs_for_multivalue;
use crate::fastfield::writer::unexpected_value;
@@ -144,7 +144,7 @@ impl MultiValuedFastFieldWriter {
pub fn serialize(
mut self,
serializer: &mut CompositeFastFieldSerializer,
term_mapping_opt: Option<&FxHashMap<UnorderedTermId, TermOrdinal>>,
term_mapping_opt: Option<&FnvHashMap<UnorderedTermId, TermOrdinal>>,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
{
@@ -219,7 +219,7 @@ pub(crate) struct MultivalueStartIndex<'a, C: Column> {
impl<'a, C: Column> MultivalueStartIndex<'a, C> {
pub fn new(column: &'a C, doc_id_map: &'a DocIdMapping) -> Self {
assert_eq!(column.num_vals(), doc_id_map.num_old_doc_ids() as u32 + 1);
assert_eq!(column.num_vals(), doc_id_map.num_old_doc_ids() as u64 + 1);
let (min, max) =
tantivy_bitpacker::minmax(iter_remapped_multivalue_index(doc_id_map, column))
.unwrap_or((0u64, 0u64));
@@ -232,7 +232,7 @@ impl<'a, C: Column> MultivalueStartIndex<'a, C> {
}
}
impl<'a, C: Column> Column for MultivalueStartIndex<'a, C> {
fn get_val(&self, _idx: u32) -> u64 {
fn get_val(&self, _idx: u64) -> u64 {
unimplemented!()
}
@@ -244,8 +244,8 @@ impl<'a, C: Column> Column for MultivalueStartIndex<'a, C> {
self.max
}
fn num_vals(&self) -> u32 {
(self.doc_id_map.num_new_doc_ids() + 1) as u32
fn num_vals(&self) -> u64 {
(self.doc_id_map.num_new_doc_ids() + 1) as u64
}
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
@@ -262,7 +262,7 @@ fn iter_remapped_multivalue_index<'a, C: Column>(
) -> impl Iterator<Item = u64> + 'a {
let mut offset = 0;
std::iter::once(0).chain(doc_id_map.iter_old_doc_ids().map(move |old_doc| {
let num_vals_for_doc = column.get_val(old_doc + 1) - column.get_val(old_doc);
let num_vals_for_doc = column.get_val(old_doc as u64 + 1) - column.get_val(old_doc as u64);
offset += num_vals_for_doc;
offset as u64
}))
@@ -369,7 +369,7 @@ impl MultiValueU128FastFieldWriter {
serializer.create_u128_fast_field_with_idx(
self.field,
iter_gen,
self.vals.len() as u32,
self.vals.len() as u64,
1,
)?;
}

View File

@@ -90,7 +90,7 @@ impl CompositeFastFieldSerializer {
&mut self,
field: Field,
iter_gen: F,
num_vals: u32,
num_vals: u64,
idx: usize,
) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, idx);

View File

@@ -3,7 +3,7 @@ use std::io;
use common;
use fastfield_codecs::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
use rustc_hash::FxHashMap;
use fnv::FnvHashMap;
use tantivy_bitpacker::BlockedBitpacker;
use super::multivalued::{MultiValueU128FastFieldWriter, MultiValuedFastFieldWriter};
@@ -256,7 +256,7 @@ impl FastFieldsWriter {
pub fn serialize(
self,
serializer: &mut CompositeFastFieldSerializer,
mapping: &HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>>,
mapping: &HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
for field_writer in self.term_id_writers {
@@ -363,7 +363,7 @@ impl U128FastFieldWriter {
serializer.create_u128_fast_field_with_idx(
self.field,
iter_gen,
self.val_count as u32,
self.val_count as u64,
0,
)?;
} else {
@@ -371,7 +371,7 @@ impl U128FastFieldWriter {
serializer.create_u128_fast_field_with_idx(
self.field,
iter_gen,
self.val_count as u32,
self.val_count as u64,
0,
)?;
}
@@ -511,7 +511,7 @@ impl IntFastFieldWriter {
vals: &self.vals,
min_value: min,
max_value: max,
num_vals: self.val_count as u32,
num_vals: self.val_count as u64,
};
serializer.create_auto_detect_u64_fast_field(self.field, fastfield_accessor)?;
@@ -526,7 +526,7 @@ struct WriterFastFieldAccessProvider<'map, 'bitp> {
vals: &'bitp BlockedBitpacker,
min_value: u64,
max_value: u64,
num_vals: u32,
num_vals: u64,
}
impl<'map, 'bitp> Column for WriterFastFieldAccessProvider<'map, 'bitp> {
@@ -538,7 +538,7 @@ impl<'map, 'bitp> Column for WriterFastFieldAccessProvider<'map, 'bitp> {
/// # Panics
///
/// May panic if `doc` is greater than the index.
fn get_val(&self, _doc: u32) -> u64 {
fn get_val(&self, _doc: u64) -> u64 {
unimplemented!()
}
@@ -562,7 +562,7 @@ impl<'map, 'bitp> Column for WriterFastFieldAccessProvider<'map, 'bitp> {
self.max_value
}
fn num_vals(&self) -> u32 {
fn num_vals(&self) -> u64 {
self.num_vals
}
}

View File

@@ -34,7 +34,7 @@ mod tests {
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
use crate::fieldnorm::{FieldNormReader, FieldNormsSerializer, FieldNormsWriter};
use crate::query::{EnableScoring, Query, TermQuery};
use crate::query::{Query, TermQuery};
use crate::schema::{
Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, STORED, TEXT,
};
@@ -112,7 +112,7 @@ mod tests {
Term::from_field_text(text, "hello"),
IndexRecordOption::WithFreqs,
);
let weight = query.weight(EnableScoring::Enabled(&searcher))?;
let weight = query.weight(&searcher, true)?;
let mut scorer = weight.scorer(searcher.segment_reader(0), 1.0f32)?;
assert_eq!(scorer.doc(), 0);
assert!((scorer.score() - 0.22920431).abs() < 0.001f32);
@@ -141,7 +141,7 @@ mod tests {
Term::from_field_text(text, "hello"),
IndexRecordOption::WithFreqs,
);
let weight = query.weight(EnableScoring::Enabled(&searcher))?;
let weight = query.weight(&searcher, true)?;
let mut scorer = weight.scorer(searcher.segment_reader(0), 1.0f32)?;
assert_eq!(scorer.doc(), 0);
assert!((scorer.score() - 0.22920431).abs() < 0.001f32);

View File

@@ -9,7 +9,7 @@ use crate::DocId;
/// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte
/// of each document for each field with field norms.
///
/// `FieldNormsWriter` stores a `Vec<u8>` for each tracked field, using a
/// `FieldNormsWriter` stores a Vec<u8> for each tracked field, using a
/// byte per document per field.
pub struct FieldNormsWriter {
fieldnorms_buffers: Vec<Option<Vec<u8>>>,

View File

@@ -19,9 +19,9 @@ use crate::indexer::index_writer_status::IndexWriterStatus;
use crate::indexer::operation::DeleteOperation;
use crate::indexer::stamper::Stamper;
use crate::indexer::{MergePolicy, SegmentEntry, SegmentWriter};
use crate::query::{EnableScoring, Query, TermQuery};
use crate::query::{Query, TermQuery};
use crate::schema::{Document, IndexRecordOption, Term};
use crate::{FutureResult, Opstamp};
use crate::{FutureResult, IndexReader, Opstamp};
// Size of the margin for the `memory_arena`. A segment is closed when the remaining memory
// in the `memory_arena` goes below MARGIN_IN_BYTES.
@@ -57,6 +57,7 @@ pub struct IndexWriter {
_directory_lock: Option<DirectoryLock>,
index: Index,
index_reader: IndexReader,
memory_arena_in_bytes_per_thread: usize,
@@ -94,7 +95,7 @@ fn compute_deleted_bitset(
// document that were inserted before it.
delete_op
.target
.for_each_no_score(segment_reader, &mut |doc_matching_delete_query| {
.for_each(segment_reader, &mut |doc_matching_delete_query, _| {
if doc_opstamps.is_deleted(doc_matching_delete_query, delete_op.opstamp) {
alive_bitset.remove(doc_matching_delete_query);
might_have_changed = true;
@@ -297,6 +298,8 @@ impl IndexWriter {
memory_arena_in_bytes_per_thread,
index: index.clone(),
index_reader: index.reader()?,
index_writer_status: IndexWriterStatus::from(document_receiver),
operation_sender: document_sender,
@@ -678,7 +681,8 @@ impl IndexWriter {
/// only after calling `commit()`.
#[doc(hidden)]
pub fn delete_query(&self, query: Box<dyn Query>) -> crate::Result<Opstamp> {
let weight = query.weight(EnableScoring::Disabled(&self.index.schema()))?;
let weight = query.weight(&self.index_reader.searcher(), false)?;
let opstamp = self.stamper.stamp();
let delete_operation = DeleteOperation {
opstamp,
@@ -759,7 +763,8 @@ impl IndexWriter {
match user_op {
UserOperation::Delete(term) => {
let query = TermQuery::new(term, IndexRecordOption::Basic);
let weight = query.weight(EnableScoring::Disabled(&self.index.schema()))?;
let weight = query.weight(&self.index_reader.searcher(), false)?;
let delete_operation = DeleteOperation {
opstamp,
target: weight,
@@ -1467,7 +1472,7 @@ mod tests {
let fast_field_reader = segment_reader.fast_fields().u64(id_field)?;
let in_order_alive_ids: Vec<u64> = segment_reader
.doc_ids_alive()
.map(|doc| fast_field_reader.get_val(doc))
.map(|doc| fast_field_reader.get_val(doc as u64))
.collect();
assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 1, 0]);
Ok(())
@@ -1528,7 +1533,7 @@ mod tests {
let fast_field_reader = segment_reader.fast_fields().u64(id_field)?;
let in_order_alive_ids: Vec<u64> = segment_reader
.doc_ids_alive()
.map(|doc| fast_field_reader.get_val(doc))
.map(|doc| fast_field_reader.get_val(doc as u64))
.collect();
assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 2, 0]);
Ok(())
@@ -1586,25 +1591,6 @@ mod tests {
(existing_ids, deleted_ids)
}
/// Replays `ops` in order and returns the ids that are left: an `AddDoc` appends its id, while a
/// `DeleteDoc` or `DeleteDocQuery` drops every occurrence of its id collected so far. All other
/// operations are ignored.
fn get_id_list(ops: &[IndexingOp]) -> Vec<u64> {
    ops.iter().fold(Vec::new(), |mut remaining_ids, &op| {
        match op {
            IndexingOp::AddDoc { id } => remaining_ids.push(id),
            IndexingOp::DeleteDoc { id } | IndexingOp::DeleteDocQuery { id } => {
                remaining_ids.retain(|&kept| kept != id)
            }
            _ => {}
        }
        remaining_ids
    })
}
fn test_operation_strategy(
ops: &[IndexingOp],
sort_index: bool,
@@ -1614,9 +1600,7 @@ mod tests {
let ip_field = schema_builder.add_ip_addr_field("ip", FAST | INDEXED | STORED);
let ips_field = schema_builder.add_ip_addr_field(
"ips",
IpAddrOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed(),
IpAddrOptions::default().set_fast(Cardinality::MultiValues),
);
let id_field = schema_builder.add_u64_field("id", FAST | INDEXED | STORED);
let i64_field = schema_builder.add_i64_field("i64", INDEXED);
@@ -1635,7 +1619,6 @@ mod tests {
);
let large_text_field = schema_builder.add_text_field("large_text_field", TEXT | STORED);
let multi_text_fields = schema_builder.add_text_field("multi_text_fields", TEXT | STORED);
let multi_numbers = schema_builder.add_u64_field(
"multi_numbers",
@@ -1675,19 +1658,11 @@ mod tests {
let ip_exists = |id| id % 3 != 0; // 0 does not exist
let multi_text_field_text1 = "test1 test2 test3 test1 test2 test3";
// rotate left
let multi_text_field_text2 = "test2 test3 test1 test2 test3 test1";
// rotate right
let multi_text_field_text3 = "test3 test1 test2 test3 test1 test2";
let ip_from_id = |id| Ipv6Addr::from_u128(id as u128);
for &op in ops {
match op {
IndexingOp::AddDoc { id } => {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
let ip = ip_from_id(id);
let ip_from_id = Ipv6Addr::from_u128(id as u128);
if !ip_exists(id) {
// every 3rd doc has no ip field
@@ -1703,17 +1678,14 @@ mod tests {
multi_bools => (id % 2u64) == 0,
text_field => id.to_string(),
facet_field => facet,
large_text_field => LOREM,
multi_text_fields => multi_text_field_text1,
multi_text_fields => multi_text_field_text2,
multi_text_fields => multi_text_field_text3,
large_text_field=> LOREM
))?;
} else {
index_writer.add_document(doc!(id_field=>id,
bytes_field => id.to_le_bytes().as_slice(),
ip_field => ip,
ips_field => ip,
ips_field => ip,
ip_field => ip_from_id,
ips_field => ip_from_id,
ips_field => ip_from_id,
multi_numbers=> id,
multi_numbers => id,
bool_field => (id % 2u64) != 0,
@@ -1724,10 +1696,7 @@ mod tests {
multi_bools => (id % 2u64) == 0,
text_field => id.to_string(),
facet_field => facet,
large_text_field => LOREM,
multi_text_fields => multi_text_field_text1,
multi_text_fields => multi_text_field_text2,
multi_text_fields => multi_text_field_text3,
large_text_field=> LOREM
))?;
}
}
@@ -1756,7 +1725,6 @@ mod tests {
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let num_segments_before_merge = searcher.segment_readers().len();
if force_end_merge {
index_writer.wait_merging_threads()?;
let mut index_writer = index.writer_for_tests()?;
@@ -1768,7 +1736,6 @@ mod tests {
assert!(index_writer.wait_merging_threads().is_ok());
}
}
let num_segments_after_merge = searcher.segment_readers().len();
old_reader.reload()?;
let old_searcher = old_reader.searcher();
@@ -1780,7 +1747,7 @@ mod tests {
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
segment_reader
.doc_ids_alive()
.map(move |doc| ff_reader.get_val(doc))
.map(move |doc| ff_reader.get_val(doc as u64))
})
.collect();
@@ -1791,27 +1758,11 @@ mod tests {
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
segment_reader
.doc_ids_alive()
.map(move |doc| ff_reader.get_val(doc))
.map(move |doc| ff_reader.get_val(doc as u64))
})
.collect();
let (expected_ids_and_num_occurrences, deleted_ids) = expected_ids(ops);
let id_list = get_id_list(ops);
// multivalue fast field content
let mut all_ips = Vec::new();
let mut num_ips = 0;
for segment_reader in searcher.segment_readers().iter() {
let ip_reader = segment_reader.fast_fields().ip_addrs(ips_field).unwrap();
for doc in segment_reader.doc_ids_alive() {
let mut vals = vec![];
ip_reader.get_vals(doc, &mut vals);
all_ips.extend_from_slice(&vals);
}
num_ips += ip_reader.total_num_vals();
}
let num_docs_expected = expected_ids_and_num_occurrences
.iter()
.map(|(_, id_occurrences)| *id_occurrences as usize)
@@ -1833,30 +1784,6 @@ mod tests {
.collect::<HashSet<_>>()
);
if force_end_merge && num_segments_before_merge > 1 && num_segments_after_merge == 1 {
let mut expected_multi_ips: Vec<_> = id_list
.iter()
.filter(|id| ip_exists(**id))
.flat_map(|id| vec![ip_from_id(*id), ip_from_id(*id)])
.collect();
assert_eq!(num_ips, expected_multi_ips.len() as u32);
expected_multi_ips.sort();
all_ips.sort();
assert_eq!(expected_multi_ips, all_ips);
// Test fastfield num_docs
let num_docs: usize = searcher
.segment_readers()
.iter()
.map(|segment_reader| {
let ff_reader = segment_reader.fast_fields().ip_addrs(ips_field).unwrap();
ff_reader.get_index_reader().num_docs() as usize
})
.sum();
assert_eq!(num_docs, num_docs_expected);
}
// Load all ips addr
let ips: HashSet<Ipv6Addr> = searcher
.segment_readers()
@@ -1864,7 +1791,7 @@ mod tests {
.flat_map(|segment_reader| {
let ff_reader = segment_reader.fast_fields().ip_addr(ip_field).unwrap();
segment_reader.doc_ids_alive().flat_map(move |doc| {
let val = ff_reader.get_val(doc);
let val = ff_reader.get_val(doc as u64);
if val == Ipv6Addr::from_u128(0) {
// TODO Fix null handling
None
@@ -1921,7 +1848,7 @@ mod tests {
ff_reader.get_vals(doc, &mut vals);
assert_eq!(vals.len(), 2);
assert_eq!(vals[0], vals[1]);
assert_eq!(id_reader.get_val(doc), vals[0]);
assert_eq!(id_reader.get_val(doc as u64), vals[0]);
let mut bool_vals = vec![];
bool_ff_reader.get_vals(doc, &mut bool_vals);
@@ -1995,21 +1922,11 @@ mod tests {
for (existing_id, count) in &expected_ids_and_num_occurrences {
let (existing_id, count) = (*existing_id, *count);
let get_num_hits = |field| do_search(&existing_id.to_string(), field).len() as u64;
assert_eq!(get_num_hits(text_field), count);
assert_eq!(get_num_hits(i64_field), count);
assert_eq!(get_num_hits(f64_field), count);
assert_eq!(get_num_hits(id_field), count);
// Test multi text
assert_eq!(
do_search("\"test1 test2\"", multi_text_fields).len(),
num_docs_expected
);
assert_eq!(
do_search("\"test2 test3\"", multi_text_fields).len(),
num_docs_expected
);
let assert_field = |field| do_search(&existing_id.to_string(), field).len() as u64;
assert_eq!(assert_field(text_field), count);
assert_eq!(assert_field(i64_field), count);
assert_eq!(assert_field(f64_field), count);
assert_eq!(assert_field(id_field), count);
// Test bytes
let term = Term::from_field_bytes(bytes_field, existing_id.to_le_bytes().as_slice());
@@ -2060,51 +1977,6 @@ mod tests {
assert_eq!(do_search_ip_field(&format!("\"{}\"", ip_addr)), count);
}
}
// assert data is like expected
//
for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) {
let (existing_id, count) = (*existing_id, *count);
if !ip_exists(existing_id) {
continue;
}
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string())
};
let ip = ip_from_id(existing_id);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
// Range query on single value field
// let query = gen_query_inclusive("ip", ip, ip);
// assert_eq!(do_search_ip_field(&query), count);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip, ip);
assert_eq!(do_search_ip_field(&query), count);
}
// ip range query on fast field
//
for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) {
let (existing_id, count) = (*existing_id, *count);
if !ip_exists(existing_id) {
continue;
}
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string())
};
let ip = ip_from_id(existing_id);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
// Range query on single value field
// let query = gen_query_inclusive("ip", ip, ip);
// assert_eq!(do_search_ip_field(&query), count);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip, ip);
assert_eq!(do_search_ip_field(&query), count);
}
// test facets
for segment_reader in searcher.segment_readers().iter() {
let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();
@@ -2117,7 +1989,7 @@ mod tests {
facet_reader
.facet_from_ord(facet_ords[0], &mut facet)
.unwrap();
let id = ff_reader.get_val(doc_id);
let id = ff_reader.get_val(doc_id as u64);
let facet_expected = Facet::from(&("/cola/".to_string() + &id.to_string()));
assert_eq!(facet, facet_expected);
@@ -2126,40 +1998,6 @@ mod tests {
Ok(())
}
#[test]
fn test_ip_range_query_multivalue_bug() {
    // Two committed batches (the second one adds id 1 twice), then a merge.
    let ops = [
        IndexingOp::AddDoc { id: 2 },
        IndexingOp::Commit,
        IndexingOp::AddDoc { id: 1 },
        IndexingOp::AddDoc { id: 1 },
        IndexingOp::Commit,
        IndexingOp::Merge,
    ];
    let outcome = test_operation_strategy(&ops, true, false);
    assert!(outcome.is_ok());
}
#[test]
fn test_ff_num_ips_regression() {
    // First commit holds ids 13 and 1; the second one deletes id 13 by query
    // and adds id 1 again.
    let ops = [
        IndexingOp::AddDoc { id: 13 },
        IndexingOp::AddDoc { id: 1 },
        IndexingOp::Commit,
        IndexingOp::DeleteDocQuery { id: 13 },
        IndexingOp::AddDoc { id: 1 },
        IndexingOp::Commit,
    ];
    let outcome = test_operation_strategy(&ops, false, true);
    assert!(outcome.is_ok());
}
#[test]
fn test_minimal() {
assert!(test_operation_strategy(
@@ -2169,7 +2007,7 @@ mod tests {
IndexingOp::DeleteDoc { id: 13 }
],
true,
true
false
)
.is_ok());

View File

@@ -1,6 +1,6 @@
use fastfield_codecs::MonotonicallyMappableToU64;
use fnv::FnvHashMap;
use murmurhash32::murmurhash2;
use rustc_hash::FxHashMap;
use crate::fastfield::FastValue;
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
@@ -52,7 +52,7 @@ use crate::{DatePrecision, DateTime, DocId, Term};
/// path map to the same index position as long as the probability is relatively low.
#[derive(Default)]
struct IndexingPositionsPerPath {
positions_per_path: FxHashMap<u32, IndexingPosition>,
positions_per_path: FnvHashMap<u32, IndexingPosition>,
}
impl IndexingPositionsPerPath {
@@ -67,12 +67,11 @@ pub(crate) fn index_json_values<'a>(
doc: DocId,
json_values: impl Iterator<Item = crate::Result<&'a serde_json::Map<String, serde_json::Value>>>,
text_analyzer: &TextAnalyzer,
expand_dots_enabled: bool,
term_buffer: &mut Term,
postings_writer: &mut dyn PostingsWriter,
ctx: &mut IndexingContext,
) -> crate::Result<()> {
let mut json_term_writer = JsonTermWriter::wrap(term_buffer, expand_dots_enabled);
let mut json_term_writer = JsonTermWriter::wrap(term_buffer);
let mut positions_per_path: IndexingPositionsPerPath = Default::default();
for json_value_res in json_values {
let json_value = json_value_res?;
@@ -260,65 +259,29 @@ pub(crate) fn set_string_and_get_terms(
pub struct JsonTermWriter<'a> {
term_buffer: &'a mut Term,
path_stack: Vec<usize>,
expand_dots_enabled: bool,
}
/// Breaks a json path given to the query parser into its segments, treating
/// `\` as an escape character so that a literal `.` can live inside a segment.
///
/// For instance:
/// - `k8s.node` yields `["k8s", "node"]`.
/// - `k8s\.node` yields `["k8s.node"]`.
///
/// A backslash makes the character that follows it literal (whatever that
/// character is) and is itself discarded; a backslash with nothing after it is
/// discarded as well. The returned vector always holds at least one segment,
/// which may be empty.
fn split_json_path(json_path: &str) -> Vec<String> {
    let mut segments = Vec::new();
    let mut current = String::new();
    let mut chars = json_path.chars();
    while let Some(ch) = chars.next() {
        match ch {
            '\\' => {
                // Take the escaped character verbatim, if there is one.
                if let Some(escaped) = chars.next() {
                    current.push(escaped);
                }
            }
            // An unescaped dot closes the segment being built.
            '.' => segments.push(std::mem::take(&mut current)),
            other => current.push(other),
        }
    }
    // Whatever is left (possibly nothing) forms the last segment.
    segments.push(current);
    segments
}
impl<'a> JsonTermWriter<'a> {
pub fn from_field_and_json_path(
field: Field,
json_path: &str,
expand_dots_enabled: bool,
term_buffer: &'a mut Term,
) -> Self {
term_buffer.set_field_and_type(field, Type::Json);
let mut json_term_writer = Self::wrap(term_buffer, expand_dots_enabled);
for segment in split_json_path(json_path) {
json_term_writer.push_path_segment(&segment);
let mut json_term_writer = Self::wrap(term_buffer);
for segment in json_path.split('.') {
json_term_writer.push_path_segment(segment);
}
json_term_writer
}
pub fn wrap(term_buffer: &'a mut Term, expand_dots_enabled: bool) -> Self {
pub fn wrap(term_buffer: &'a mut Term) -> Self {
term_buffer.clear_with_type(Type::Json);
let mut path_stack = Vec::with_capacity(10);
path_stack.push(0);
Self {
term_buffer,
path_stack,
expand_dots_enabled,
}
}
@@ -340,24 +303,11 @@ impl<'a> JsonTermWriter<'a> {
self.trim_to_end_of_path();
let buffer = self.term_buffer.value_bytes_mut();
let buffer_len = buffer.len();
if self.path_stack.len() > 1 {
buffer[buffer_len - 1] = JSON_PATH_SEGMENT_SEP;
}
if self.expand_dots_enabled && segment.as_bytes().contains(&b'.') {
// We need to replace `.` by JSON_PATH_SEGMENT_SEP.
self.term_buffer
.append_bytes(segment.as_bytes())
.iter_mut()
.for_each(|byte| {
if *byte == b'.' {
*byte = JSON_PATH_SEGMENT_SEP;
}
});
} else {
self.term_buffer.append_bytes(segment.as_bytes());
}
self.term_buffer.push_byte(JSON_PATH_SEGMENT_SEP);
self.term_buffer.append_bytes(segment.as_bytes());
self.term_buffer.append_bytes(&[JSON_PATH_SEGMENT_SEP]);
self.path_stack.push(self.term_buffer.len_bytes());
}
@@ -400,7 +350,7 @@ impl<'a> JsonTermWriter<'a> {
#[cfg(test)]
mod tests {
use super::{split_json_path, JsonTermWriter};
use super::JsonTermWriter;
use crate::schema::{Field, Type};
use crate::Term;
@@ -408,7 +358,7 @@ mod tests {
fn test_json_writer() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
let mut json_writer = JsonTermWriter::wrap(&mut term);
json_writer.push_path_segment("attributes");
json_writer.push_path_segment("color");
json_writer.set_str("red");
@@ -442,7 +392,7 @@ mod tests {
fn test_string_term() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
let mut json_writer = JsonTermWriter::wrap(&mut term);
json_writer.push_path_segment("color");
json_writer.set_str("red");
assert_eq!(
@@ -455,7 +405,7 @@ mod tests {
fn test_i64_term() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
let mut json_writer = JsonTermWriter::wrap(&mut term);
json_writer.push_path_segment("color");
json_writer.set_fast_value(-4i64);
assert_eq!(
@@ -468,7 +418,7 @@ mod tests {
fn test_u64_term() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
let mut json_writer = JsonTermWriter::wrap(&mut term);
json_writer.push_path_segment("color");
json_writer.set_fast_value(4u64);
assert_eq!(
@@ -481,7 +431,7 @@ mod tests {
fn test_f64_term() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
let mut json_writer = JsonTermWriter::wrap(&mut term);
json_writer.push_path_segment("color");
json_writer.set_fast_value(4.0f64);
assert_eq!(
@@ -494,7 +444,7 @@ mod tests {
fn test_bool_term() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
let mut json_writer = JsonTermWriter::wrap(&mut term);
json_writer.push_path_segment("color");
json_writer.set_fast_value(true);
assert_eq!(
@@ -507,7 +457,7 @@ mod tests {
fn test_push_after_set_path_segment() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
let mut json_writer = JsonTermWriter::wrap(&mut term);
json_writer.push_path_segment("attribute");
json_writer.set_str("something");
json_writer.push_path_segment("color");
@@ -522,7 +472,7 @@ mod tests {
fn test_pop_segment() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
let mut json_writer = JsonTermWriter::wrap(&mut term);
json_writer.push_path_segment("color");
json_writer.push_path_segment("hue");
json_writer.pop_path_segment();
@@ -537,7 +487,7 @@ mod tests {
fn test_json_writer_path() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
let mut json_writer = JsonTermWriter::wrap(&mut term);
json_writer.push_path_segment("color");
assert_eq!(json_writer.path(), b"color");
json_writer.push_path_segment("hue");
@@ -545,79 +495,4 @@ mod tests {
json_writer.set_str("pink");
assert_eq!(json_writer.path(), b"color\x01hue");
}
#[test]
fn test_json_path_expand_dots_disabled() {
    // Writer wrapped with the expand-dots flag off: the dotted segment is
    // expected to come back from `path()` unchanged.
    let mut term = Term::with_type_and_field(Type::Json, Field::from_field_id(1));
    let mut writer = JsonTermWriter::wrap(&mut term, false);
    writer.push_path_segment("color.hue");
    assert_eq!(writer.path(), b"color.hue");
}
#[test]
fn test_json_path_expand_dots_enabled() {
    // Writer wrapped with the expand-dots flag on: the `.` in the pushed
    // segment is expected to show up as `\x01` in `path()`.
    let mut term = Term::with_type_and_field(Type::Json, Field::from_field_id(1));
    let mut writer = JsonTermWriter::wrap(&mut term, true);
    writer.push_path_segment("color.hue");
    assert_eq!(writer.path(), b"color\x01hue");
}
#[test]
fn test_json_path_expand_dots_enabled_pop_segment() {
    let mut term = Term::with_type_and_field(Type::Json, Field::from_field_id(1));
    let mut writer = JsonTermWriter::wrap(&mut term, true);

    writer.push_path_segment("hello");
    assert_eq!(writer.path(), b"hello");

    writer.push_path_segment("color.hue");
    assert_eq!(writer.path(), b"hello\x01color\x01hue");

    // A single pop is expected to remove the whole dotted segment pushed above.
    writer.pop_path_segment();
    assert_eq!(writer.path(), b"hello");
}
#[test]
fn test_split_json_path_simple() {
    // An unescaped dot separates two segments.
    assert_eq!(split_json_path("titi.toto"), ["titi", "toto"]);
}
#[test]
fn test_split_json_path_single_segment() {
    // Without any dot the whole path is one segment.
    assert_eq!(split_json_path("toto"), ["toto"]);
}
#[test]
fn test_split_json_path_trailing_dot() {
    // A dot at the very end leaves an empty last segment.
    assert_eq!(split_json_path("toto."), ["toto", ""]);
}
#[test]
fn test_split_json_path_heading_dot() {
    // A dot at the very start leaves an empty first segment.
    assert_eq!(split_json_path(".toto"), ["", "toto"]);
}
#[test]
fn test_split_json_path_escaped_dot() {
    // `\.` keeps the dot inside the segment instead of splitting on it.
    assert_eq!(split_json_path(r#"toto\.titi"#), ["toto.titi"]);
    assert_eq!(
        split_json_path(r#"k8s\.container\.name"#),
        ["k8s.container.name"]
    );
}
#[test]
fn test_split_json_path_escaped_backslash() {
    // `\\` produces a single literal backslash.
    assert_eq!(split_json_path(r#"toto\\titi"#), [r#"toto\titi"#]);
}
#[test]
fn test_split_json_path_escaped_normal_letter() {
    // Escaping an ordinary letter simply drops the backslash.
    assert_eq!(split_json_path(r#"toto\titi"#), [r#"tototiti"#]);
}
}

View File

@@ -13,7 +13,7 @@ use crate::docset::{DocSet, TERMINATED};
use crate::error::DataCorruption;
use crate::fastfield::{
get_fastfield_codecs_for_multivalue, AliveBitSet, Column, CompositeFastFieldSerializer,
MultiValueIndex, MultiValuedFastFieldReader, MultiValuedU128FastFieldReader,
MultiValueLength, MultiValuedFastFieldReader, MultiValuedU128FastFieldReader,
};
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
use crate::indexer::doc_id_mapping::{expect_field_id_for_sort_field, SegmentDocIdMapping};
@@ -348,29 +348,9 @@ impl IndexMerger {
field,
fast_field_serializer,
doc_id_mapping,
&segment_and_ff_readers
.iter()
.map(|(segment_reader, u64s_reader)| {
(*segment_reader, u64s_reader.get_index_reader())
})
.collect::<Vec<_>>(),
&segment_and_ff_readers,
)?;
let num_vals = segment_and_ff_readers
.iter()
.map(|(segment_reader, reader)| {
// TODO implement generic version, implement reverse scan, all - deletes
if let Some(alive_bitset) = segment_reader.alive_bitset() {
alive_bitset
.iter_alive()
.map(|doc| reader.num_vals(doc))
.sum()
} else {
reader.total_num_vals() as u32
}
})
.sum();
let fast_field_readers = segment_and_ff_readers
.into_iter()
.map(|(_, ff_reader)| ff_reader)
@@ -385,7 +365,12 @@ impl IndexMerger {
})
};
fast_field_serializer.create_u128_fast_field_with_idx(field, iter_gen, num_vals, 1)?;
fast_field_serializer.create_u128_fast_field_with_idx(
field,
iter_gen,
doc_id_mapping.len() as u64,
1,
)?;
Ok(())
}
@@ -412,13 +397,13 @@ impl IndexMerger {
let iter_gen = || {
doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| {
let fast_field_reader = &fast_field_readers[doc_addr.segment_ord as usize];
fast_field_reader.get_val(doc_addr.doc_id)
fast_field_reader.get_val(doc_addr.doc_id as u64)
})
};
fast_field_serializer.create_u128_fast_field_with_idx(
field,
iter_gen,
doc_id_mapping.len() as u32,
doc_id_mapping.len() as u64,
0,
)?;
Ok(())
@@ -525,8 +510,8 @@ impl IndexMerger {
doc_id_reader_pair
.into_iter()
.kmerge_by(|a, b| {
let val1 = a.2.get_val(a.0);
let val2 = b.2.get_val(b.0);
let val1 = a.2.get_val(a.0 as u64);
let val2 = b.2.get_val(b.0 as u64);
if sort_by_field.order == Order::Asc {
val1 < val2
} else {
@@ -544,11 +529,11 @@ impl IndexMerger {
// Creating the index file to point into the data, generic over `BytesFastFieldReader` and
// `MultiValuedFastFieldReader`
//
fn write_1_n_fast_field_idx_generic(
fn write_1_n_fast_field_idx_generic<T: MultiValueLength + Send + Sync>(
field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &SegmentDocIdMapping,
segment_and_ff_readers: &[(&SegmentReader, &MultiValueIndex)],
segment_and_ff_readers: &[(&SegmentReader, T)],
) -> crate::Result<()> {
let column =
RemappedDocIdMultiValueIndexColumn::new(segment_and_ff_readers, doc_id_mapping);
@@ -582,12 +567,7 @@ impl IndexMerger {
field,
fast_field_serializer,
doc_id_mapping,
&segment_and_ff_readers
.iter()
.map(|(segment_reader, u64s_reader)| {
(*segment_reader, u64s_reader.get_index_reader())
})
.collect::<Vec<_>>(),
&segment_and_ff_readers,
)
}
@@ -717,12 +697,7 @@ impl IndexMerger {
field,
fast_field_serializer,
doc_id_mapping,
&segment_and_ff_readers
.iter()
.map(|(segment_reader, u64s_reader)| {
(*segment_reader, u64s_reader.get_index_reader())
})
.collect::<Vec<_>>(),
&segment_and_ff_readers,
)?;
let mut serialize_vals = fast_field_serializer.new_bytes_fast_field(field);
@@ -829,7 +804,7 @@ impl IndexMerger {
// Let's compute the list of non-empty posting lists
for (segment_ord, term_info) in merged_terms.current_segment_ords_and_term_infos() {
let segment_reader = &self.readers[segment_ord];
let inverted_index: &InvertedIndexReader = &field_readers[segment_ord];
let inverted_index: &InvertedIndexReader = &*field_readers[segment_ord];
let segment_postings = inverted_index
.read_postings_from_terminfo(&term_info, segment_postings_option)?;
let alive_bitset_opt = segment_reader.alive_bitset();
@@ -1064,7 +1039,7 @@ mod tests {
};
use crate::collector::{Count, FacetCollector};
use crate::core::Index;
use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
use crate::query::{AllQuery, BooleanQuery, Scorer, TermQuery};
use crate::schema::{
Cardinality, Document, Facet, FacetOptions, IndexRecordOption, NumericOptions, Term,
TextFieldIndexing, INDEXED, TEXT,
@@ -1977,7 +1952,7 @@ mod tests {
let reader = index.reader()?;
let searcher = reader.searcher();
let mut term_scorer = term_query
.specialized_weight(EnableScoring::Enabled(&searcher))?
.specialized_weight(&searcher, true)?
.specialized_scorer(searcher.segment_reader(0u32), 1.0)?;
assert_eq!(term_scorer.doc(), 0);
assert_nearly_equals!(term_scorer.block_max_score(), 0.0079681855);
@@ -1992,7 +1967,7 @@ mod tests {
assert_eq!(searcher.segment_readers().len(), 2);
for segment_reader in searcher.segment_readers() {
let mut term_scorer = term_query
.specialized_weight(EnableScoring::Enabled(&searcher))?
.specialized_weight(&searcher, true)?
.specialized_scorer(segment_reader, 1.0)?;
// the difference compared to before is intrinsic to the bm25 formula. no worries
// there.
@@ -2017,7 +1992,7 @@ mod tests {
let segment_reader = searcher.segment_reader(0u32);
let mut term_scorer = term_query
.specialized_weight(EnableScoring::Enabled(&searcher))?
.specialized_weight(&searcher, true)?
.specialized_scorer(segment_reader, 1.0)?;
// the difference compared to before is intrinsic to the bm25 formula. no worries there.
for doc in segment_reader.doc_ids_alive() {

View File

@@ -190,13 +190,13 @@ mod tests {
assert_eq!(fast_field.get_val(4), 2u64);
assert_eq!(fast_field.get_val(3), 3u64);
if force_disjunct_segment_sort_values {
assert_eq!(fast_field.get_val(2), 20u64);
assert_eq!(fast_field.get_val(1), 100u64);
assert_eq!(fast_field.get_val(2u64), 20u64);
assert_eq!(fast_field.get_val(1u64), 100u64);
} else {
assert_eq!(fast_field.get_val(2), 10u64);
assert_eq!(fast_field.get_val(1), 20u64);
assert_eq!(fast_field.get_val(2u64), 10u64);
assert_eq!(fast_field.get_val(1u64), 20u64);
}
assert_eq!(fast_field.get_val(0), 1_000u64);
assert_eq!(fast_field.get_val(0u64), 1_000u64);
// test new field norm mapping
{
@@ -545,7 +545,7 @@ mod bench_sorted_index_merge {
// add values in order of the new doc_ids
let mut val = 0;
for (doc_id, _reader, field_reader) in sorted_doc_ids {
val = field_reader.get_val(doc_id);
val = field_reader.get_val(doc_id as u64);
}
val

View File

@@ -58,15 +58,13 @@ type AddBatchReceiver = channel::Receiver<AddBatch>;
#[cfg(feature = "mmap")]
#[cfg(test)]
mod tests_mmap {
use crate::collector::Count;
use crate::query::QueryParser;
use crate::schema::{JsonObjectOptions, Schema, TEXT};
use crate::schema::{self, Schema};
use crate::{Index, Term};
#[test]
fn test_advance_delete_bug() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_from_tempdir(schema_builder.build())?;
let mut index_writer = index.writer_for_tests()?;
// there must be one deleted document in the segment
@@ -77,48 +75,7 @@ mod tests_mmap {
index_writer.add_document(doc!(text_field=>"c"))?;
}
index_writer.commit()?;
index_writer.commit()?;
Ok(())
}
#[test]
fn test_json_field_expand_dots_disabled_dot_escaped_required() {
    // JSON field declared with plain `TEXT` options: the query addresses the
    // dotted key by escaping each of its dots.
    let mut schema_builder = Schema::builder();
    let json_field = schema_builder.add_json_field("json", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer_for_tests().unwrap();
    let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
    index_writer.add_document(doc!(json_field=>json)).unwrap();
    index_writer.commit().unwrap();

    let searcher = index.reader().unwrap().searcher();
    assert_eq!(searcher.num_docs(), 1);

    let query_parser = QueryParser::for_index(&index, Vec::new());
    let query = query_parser
        .parse_query(r#"json.k8s\.container\.name:prometheus"#)
        .unwrap();
    let hit_count = searcher.search(&query, &Count).unwrap();
    assert_eq!(hit_count, 1);
}
#[test]
fn test_json_field_expand_dots_enabled_dot_escape_not_required() {
    // JSON field with `set_expand_dots_enabled()`: the query addresses the
    // dotted key with plain, unescaped dots.
    let mut schema_builder = Schema::builder();
    let json_options: JsonObjectOptions =
        JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
    let json_field = schema_builder.add_json_field("json", json_options);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer_for_tests().unwrap();
    let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
    index_writer.add_document(doc!(json_field=>json)).unwrap();
    index_writer.commit().unwrap();

    let searcher = index.reader().unwrap().searcher();
    assert_eq!(searcher.num_docs(), 1);

    let query_parser = QueryParser::for_index(&index, Vec::new());
    let query = query_parser
        .parse_query(r#"json.k8s.container.name:prometheus"#)
        .unwrap();
    let hit_count = searcher.search(&query, &Count).unwrap();
    assert_eq!(hit_count, 1);
}
}

View File

@@ -447,8 +447,8 @@ impl SegmentUpdater {
let segment_entries = segment_updater.purge_deletes(opstamp)?;
segment_updater.segment_manager.commit(segment_entries);
segment_updater.save_metas(opstamp, payload)?;
// let _ = garbage_collect_files(segment_updater.clone());
// segment_updater.consider_merge_options();
let _ = garbage_collect_files(segment_updater.clone());
segment_updater.consider_merge_options();
Ok(opstamp)
})
}

View File

@@ -180,7 +180,7 @@ impl SegmentWriter {
self.per_field_postings_writers.get_for_field_mut(field);
term_buffer.clear_with_field_and_type(field_entry.field_type().value_type(), field);
match field_entry.field_type() {
match *field_entry.field_type() {
FieldType::Facet(_) => {
for value in values {
let facet = value.as_facet().ok_or_else(make_schema_error)?;
@@ -307,7 +307,7 @@ impl SegmentWriter {
self.fieldnorms_writer.record(doc_id, field, num_vals);
}
}
FieldType::JsonObject(json_options) => {
FieldType::JsonObject(_) => {
let text_analyzer = &self.per_field_text_analyzers[field.field_id() as usize];
let json_values_it =
values.map(|value| value.as_json().ok_or_else(make_schema_error));
@@ -315,7 +315,6 @@ impl SegmentWriter {
doc_id,
json_values_it,
text_analyzer,
json_options.is_expand_dots_enabled(),
term_buffer,
postings_writer,
ctx,
@@ -558,7 +557,7 @@ mod tests {
let mut term = Term::with_type_and_field(Type::Json, json_field);
let mut term_stream = term_dict.stream().unwrap();
let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
json_term_writer.push_path_segment("bool");
json_term_writer.set_fast_value(true);
@@ -649,7 +648,7 @@ mod tests {
let segment_reader = searcher.segment_reader(0u32);
let inv_index = segment_reader.inverted_index(json_field).unwrap();
let mut term = Term::with_type_and_field(Type::Json, json_field);
let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
json_term_writer.push_path_segment("mykey");
json_term_writer.set_str("token");
let term_info = inv_index
@@ -693,7 +692,7 @@ mod tests {
let segment_reader = searcher.segment_reader(0u32);
let inv_index = segment_reader.inverted_index(json_field).unwrap();
let mut term = Term::with_type_and_field(Type::Json, json_field);
let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
json_term_writer.push_path_segment("mykey");
json_term_writer.set_str("two tokens");
let term_info = inv_index
@@ -738,7 +737,7 @@ mod tests {
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let mut term = Term::with_type_and_field(Type::Json, json_field);
let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
json_term_writer.push_path_segment("mykey");
json_term_writer.push_path_segment("field");
json_term_writer.set_str("hello");
@@ -786,87 +785,4 @@ mod tests {
// On release this was [2, 1]. (< note the decreasing values)
assert_eq!(positions, &[2, 5]);
}
#[test]
fn test_multiple_field_value_and_long_tokens() {
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    // A somewhat contrived value: one token with a position length of 2.
    let rollercoaster = Token {
        offset_from: 0,
        offset_to: 14,
        position: 0,
        text: "rollercoaster".to_string(),
        position_length: 2,
    };
    let pre_tokenized = PreTokenizedString {
        text: "roller-coaster".to_string(),
        tokens: vec![rollercoaster],
    };
    // The same pre-tokenized value is added twice to the same field.
    let mut doc = Document::default();
    doc.add_pre_tokenized_text(text, pre_tokenized.clone());
    doc.add_pre_tokenized_text(text, pre_tokenized);

    let mut index_writer = index.writer_for_tests().unwrap();
    index_writer.add_document(doc).unwrap();
    index_writer.commit().unwrap();

    let searcher = index.reader().unwrap().searcher();
    let inv_index = searcher.segment_reader(0).inverted_index(text).unwrap();
    let term = Term::from_field_text(text, "rollercoaster");
    let mut postings = inv_index
        .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
        .unwrap()
        .unwrap();
    assert_eq!(postings.doc(), 0u32);

    let mut positions = Vec::new();
    postings.positions(&mut positions);
    // With a position length of 1 this would have been [0, 2].
    assert_eq!(positions, &[0, 3]);
}
#[test]
fn test_last_token_not_ending_last() {
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    // Contrived corner case (no real-life use case known): the first token is
    // not the last one, yet it ends after the last token does.
    let long_token = Token {
        offset_from: 0,
        offset_to: 14,
        position: 0,
        text: "long_token".to_string(),
        position_length: 3,
    };
    let short_token = Token {
        offset_from: 0,
        offset_to: 14,
        position: 1,
        text: "short".to_string(),
        position_length: 1,
    };
    let pre_tokenized = PreTokenizedString {
        text: "contrived-example".to_string(),
        tokens: vec![long_token, short_token],
    };
    let mut doc = Document::default();
    doc.add_pre_tokenized_text(text, pre_tokenized);
    // Second value of the same field; its position is what gets checked below.
    doc.add_text(text, "hello");

    let mut index_writer = index.writer_for_tests().unwrap();
    index_writer.add_document(doc).unwrap();
    index_writer.commit().unwrap();

    let searcher = index.reader().unwrap().searcher();
    let inv_index = searcher.segment_reader(0).inverted_index(text).unwrap();
    let term = Term::from_field_text(text, "hello");
    let mut postings = inv_index
        .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
        .unwrap()
        .unwrap();
    assert_eq!(postings.doc(), 0u32);

    let mut positions = Vec::new();
    postings.positions(&mut positions);
    // With a position length of 1 this would have been 3.
    assert_eq!(positions, &[4]);
}
}

View File

@@ -12,7 +12,7 @@ pub(crate) struct RemappedDocIdColumn<'a> {
fast_field_readers: Vec<Arc<dyn Column<u64>>>,
min_value: u64,
max_value: u64,
num_vals: u32,
num_vals: u64,
}
fn compute_min_max_val(
@@ -32,7 +32,7 @@ fn compute_min_max_val(
// we need to recompute the max / min
segment_reader
.doc_ids_alive()
.map(|doc_id| u64_reader.get_val(doc_id))
.map(|doc_id| u64_reader.get_val(doc_id as u64))
.minmax()
.into_option()
}
@@ -73,13 +73,13 @@ impl<'a> RemappedDocIdColumn<'a> {
fast_field_readers,
min_value,
max_value,
num_vals: doc_id_mapping.len() as u32,
num_vals: doc_id_mapping.len() as u64,
}
}
}
impl<'a> Column for RemappedDocIdColumn<'a> {
fn get_val(&self, _doc: u32) -> u64 {
fn get_val(&self, _doc: u64) -> u64 {
unimplemented!()
}
@@ -90,7 +90,7 @@ impl<'a> Column for RemappedDocIdColumn<'a> {
.map(|old_doc_addr| {
let fast_field_reader =
&self.fast_field_readers[old_doc_addr.segment_ord as usize];
fast_field_reader.get_val(old_doc_addr.doc_id)
fast_field_reader.get_val(old_doc_addr.doc_id as u64)
}),
)
}
@@ -102,7 +102,7 @@ impl<'a> Column for RemappedDocIdColumn<'a> {
self.max_value
}
fn num_vals(&self) -> u32 {
fn num_vals(&self) -> u64 {
self.num_vals
}
}

View File

@@ -3,7 +3,7 @@ use std::cmp;
use fastfield_codecs::Column;
use super::flat_map_with_buffer::FlatMapWithBufferIter;
use crate::fastfield::{MultiValueIndex, MultiValuedFastFieldReader};
use crate::fastfield::{MultiValueLength, MultiValuedFastFieldReader};
use crate::indexer::doc_id_mapping::SegmentDocIdMapping;
use crate::schema::Field;
use crate::{DocAddress, SegmentReader};
@@ -13,7 +13,7 @@ pub(crate) struct RemappedDocIdMultiValueColumn<'a> {
fast_field_readers: Vec<MultiValuedFastFieldReader<u64>>,
min_value: u64,
max_value: u64,
num_vals: u32,
num_vals: u64,
}
impl<'a> RemappedDocIdMultiValueColumn<'a> {
@@ -61,13 +61,13 @@ impl<'a> RemappedDocIdMultiValueColumn<'a> {
fast_field_readers,
min_value,
max_value,
num_vals: num_vals as u32,
num_vals: num_vals as u64,
}
}
}
impl<'a> Column for RemappedDocIdMultiValueColumn<'a> {
fn get_val(&self, _pos: u32) -> u64 {
fn get_val(&self, _pos: u64) -> u64 {
unimplemented!()
}
@@ -89,22 +89,22 @@ impl<'a> Column for RemappedDocIdMultiValueColumn<'a> {
self.max_value
}
fn num_vals(&self) -> u32 {
fn num_vals(&self) -> u64 {
self.num_vals
}
}
pub(crate) struct RemappedDocIdMultiValueIndexColumn<'a> {
pub(crate) struct RemappedDocIdMultiValueIndexColumn<'a, T: MultiValueLength> {
doc_id_mapping: &'a SegmentDocIdMapping,
multi_value_length_readers: Vec<&'a MultiValueIndex>,
multi_value_length_readers: Vec<&'a T>,
min_value: u64,
max_value: u64,
num_vals: u32,
num_vals: u64,
}
impl<'a> RemappedDocIdMultiValueIndexColumn<'a> {
impl<'a, T: MultiValueLength> RemappedDocIdMultiValueIndexColumn<'a, T> {
pub(crate) fn new(
segment_and_ff_readers: &'a [(&'a SegmentReader, &'a MultiValueIndex)],
segment_and_ff_readers: &'a [(&'a SegmentReader, T)],
doc_id_mapping: &'a SegmentDocIdMapping,
) -> Self {
// We go through a complete first pass to compute the minimum and the
@@ -115,19 +115,17 @@ impl<'a> RemappedDocIdMultiValueIndexColumn<'a> {
let mut multi_value_length_readers = Vec::with_capacity(segment_and_ff_readers.len());
for segment_and_ff_reader in segment_and_ff_readers {
let segment_reader = segment_and_ff_reader.0;
let multi_value_length_reader = segment_and_ff_reader.1;
let multi_value_length_reader = &segment_and_ff_reader.1;
if !segment_reader.has_deletes() {
max_value += multi_value_length_reader.total_num_vals() as u64;
max_value += multi_value_length_reader.get_total_len();
} else {
for doc in segment_reader.doc_ids_alive() {
max_value += multi_value_length_reader.num_vals_for_doc(doc) as u64;
max_value += multi_value_length_reader.get_len(doc);
}
}
num_vals += segment_reader.num_docs();
num_vals += segment_reader.num_docs() as u64;
multi_value_length_readers.push(multi_value_length_reader);
}
// The value range is always get_val(doc)..get_val(doc + 1)
num_vals += 1;
Self {
doc_id_mapping,
multi_value_length_readers,
@@ -138,8 +136,8 @@ impl<'a> RemappedDocIdMultiValueIndexColumn<'a> {
}
}
impl<'a> Column for RemappedDocIdMultiValueIndexColumn<'a> {
fn get_val(&self, _pos: u32) -> u64 {
impl<'a, T: MultiValueLength + Send + Sync> Column for RemappedDocIdMultiValueIndexColumn<'a, T> {
fn get_val(&self, _pos: u64) -> u64 {
unimplemented!()
}
@@ -150,8 +148,8 @@ impl<'a> Column for RemappedDocIdMultiValueIndexColumn<'a> {
move |old_doc_addr| {
let ff_reader =
&self.multi_value_length_readers[old_doc_addr.segment_ord as usize];
offset += ff_reader.num_vals_for_doc(old_doc_addr.doc_id);
offset as u64
offset += ff_reader.get_len(old_doc_addr.doc_id);
offset
},
)),
)
@@ -164,7 +162,7 @@ impl<'a> Column for RemappedDocIdMultiValueIndexColumn<'a> {
self.max_value
}
fn num_vals(&self) -> u32 {
fn num_vals(&self) -> u64 {
self.num_vals
}
}

View File

@@ -277,8 +277,6 @@ pub mod fastfield;
pub mod fieldnorm;
pub mod positions;
pub mod postings;
/// Module containing the different query implementations.
pub mod query;
pub mod schema;
pub mod space_usage;
@@ -313,7 +311,7 @@ pub use crate::postings::Postings;
pub use crate::schema::{DateOptions, DatePrecision, Document, Term};
/// Index format version.
const INDEX_FORMAT_VERSION: u32 = 5;
const INDEX_FORMAT_VERSION: u32 = 4;
/// Structure version for the index.
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -821,7 +819,7 @@ pub mod tests {
fn test_indexedfield_not_in_documents() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let absent_field = schema_builder.add_text_field("absent_text", TEXT);
let absent_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
@@ -1003,7 +1001,7 @@ pub mod tests {
let fast_field_signed = schema_builder.add_i64_field("signed", FAST);
let fast_field_float = schema_builder.add_f64_field("float", FAST);
let text_field = schema_builder.add_text_field("text", TEXT);
let stored_int_field = schema_builder.add_u64_field("stored_int", STORED);
let stored_int_field = schema_builder.add_u64_field("text", STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);

View File

@@ -3,7 +3,7 @@ use std::io;
use std::marker::PhantomData;
use std::ops::Range;
use rustc_hash::FxHashMap;
use fnv::FnvHashMap;
use super::stacker::Addr;
use crate::fastfield::MultiValuedFastFieldWriter;
@@ -56,12 +56,12 @@ pub(crate) fn serialize_postings(
doc_id_map: Option<&DocIdMapping>,
schema: &Schema,
serializer: &mut InvertedIndexSerializer,
) -> crate::Result<HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>>> {
) -> crate::Result<HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>> {
let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> =
Vec::with_capacity(ctx.term_index.len());
term_offsets.extend(ctx.term_index.iter());
term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone());
let mut unordered_term_mappings: HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>> =
let mut unordered_term_mappings: HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>> =
HashMap::new();
let field_offsets = make_field_partition(&term_offsets);
@@ -74,7 +74,7 @@ pub(crate) fn serialize_postings(
let unordered_term_ids = term_offsets[byte_offsets.clone()]
.iter()
.map(|&(_, _, bucket)| bucket);
let mapping: FxHashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
let mapping: FnvHashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
.enumerate()
.map(|(term_ord, unord_term_id)| {
(unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
@@ -170,7 +170,7 @@ pub(crate) trait PostingsWriter: Send + Sync {
term_buffer.truncate_value_bytes(end_of_path_idx);
term_buffer.append_bytes(token.text.as_bytes());
let start_position = indexing_position.end_position + token.position as u32;
end_position = end_position.max(start_position + token.position_length as u32);
end_position = start_position + token.position_length as u32;
let unordered_term_id = self.subscribe(doc_id, start_position, term_buffer, ctx);
if let Some(term_id_fast_field_writer) = term_id_fast_field_writer_opt.as_mut() {
term_id_fast_field_writer.add_val(unordered_term_id);

View File

@@ -1,8 +1,8 @@
use crate::core::SegmentReader;
use crate::core::{Searcher, SegmentReader};
use crate::docset::{DocSet, TERMINATED};
use crate::query::boost_query::BoostScorer;
use crate::query::explanation::does_not_match;
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
use crate::query::{Explanation, Query, Scorer, Weight};
use crate::{DocId, Score};
/// Query that matches all of the documents.
@@ -12,7 +12,7 @@ use crate::{DocId, Score};
pub struct AllQuery;
impl Query for AllQuery {
fn weight(&self, _: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
fn weight(&self, _: &Searcher, _: bool) -> crate::Result<Box<dyn Weight>> {
Ok(Box::new(AllWeight))
}
}
@@ -72,7 +72,7 @@ impl Scorer for AllScorer {
mod tests {
use super::AllQuery;
use crate::docset::TERMINATED;
use crate::query::{EnableScoring, Query};
use crate::query::Query;
use crate::schema::{Schema, TEXT};
use crate::Index;
@@ -95,7 +95,7 @@ mod tests {
let index = create_test_index()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let weight = AllQuery.weight(EnableScoring::Disabled(&index.schema()))?;
let weight = AllQuery.weight(&searcher, false)?;
{
let reader = searcher.segment_reader(0);
let mut scorer = weight.scorer(reader, 1.0)?;
@@ -118,7 +118,7 @@ mod tests {
let index = create_test_index()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let weight = AllQuery.weight(EnableScoring::Disabled(searcher.schema()))?;
let weight = AllQuery.weight(&searcher, false)?;
let reader = searcher.segment_reader(0);
{
let mut scorer = weight.scorer(reader, 2.0)?;

View File

@@ -33,7 +33,7 @@ where
&'a self,
term_dict: &'a TermDictionary,
) -> io::Result<TermStreamer<'a, &'a A>> {
let automaton: &A = &self.automaton;
let automaton: &A = &*self.automaton;
let term_stream_builder = term_dict.search(automaton);
term_stream_builder.into_stream()
}

View File

@@ -86,7 +86,10 @@ impl DocSet for BitSetDocSet {
self.doc
}
/// Returns the number of values set in the underlying bitset.
/// Returns half of the `max_doc`
/// This is quite a terrible heuristic,
/// but we don't have access to any better
/// value.
fn size_hint(&self) -> u32 {
self.docs.len() as u32
}

View File

@@ -1,6 +1,7 @@
use super::boolean_weight::BooleanWeight;
use crate::query::{EnableScoring, Occur, Query, SumWithCoordsCombiner, TermQuery, Weight};
use crate::query::{Occur, Query, SumWithCoordsCombiner, TermQuery, Weight};
use crate::schema::{IndexRecordOption, Term};
use crate::Searcher;
/// The boolean query returns a set of documents
/// that matches the Boolean combination of constituent subqueries.
@@ -142,15 +143,17 @@ impl From<Vec<(Occur, Box<dyn Query>)>> for BooleanQuery {
}
impl Query for BooleanQuery {
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> crate::Result<Box<dyn Weight>> {
let sub_weights = self
.subqueries
.iter()
.map(|&(ref occur, ref subquery)| Ok((*occur, subquery.weight(enable_scoring)?)))
.map(|&(ref occur, ref subquery)| {
Ok((*occur, subquery.weight(searcher, scoring_enabled)?))
})
.collect::<crate::Result<_>>()?;
Ok(Box::new(BooleanWeight::new(
sub_weights,
enable_scoring.is_scoring_enabled(),
scoring_enabled,
Box::new(SumWithCoordsCombiner::default),
)))
}

View File

@@ -5,7 +5,7 @@ use crate::postings::FreqReadingOption;
use crate::query::explanation::does_not_match;
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
use crate::query::term_query::TermScorer;
use crate::query::weight::{for_each_docset, for_each_pruning_scorer, for_each_scorer};
use crate::query::weight::{for_each_pruning_scorer, for_each_scorer};
use crate::query::{
intersect_scorers, EmptyScorer, Exclude, Explanation, Occur, RequiredOptionalScorer, Scorer,
Union, Weight,
@@ -219,24 +219,6 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
Ok(())
}
/// Iterates over the documents of `reader` matched by this weight and
/// invokes `callback` once per document.
///
/// The callback only receives the `DocId`: no score is handed to it.
/// The scorer is requested from `complex_scorer` with a boost of `1.0`
/// and a `DoNothingCombiner`.
///
/// Any error returned by `complex_scorer` is propagated to the caller.
fn for_each_no_score(
&self,
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId),
) -> crate::Result<()> {
let scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?;
match scorer {
SpecializedScorer::TermUnion(term_scorers) => {
// The specialized variant only carries the term scorers; they are
// assembled into a `Union` here before being iterated.
// NOTE(review): the union is built with `self.score_combiner_fn`
// while the scorer above was requested with `DoNothingCombiner` --
// confirm that using the scoring combiner here is intended.
let mut union_scorer = Union::build(term_scorers, &self.score_combiner_fn);
for_each_docset(&mut union_scorer, callback);
}
SpecializedScorer::Other(mut scorer) => {
// Generic case: iterate the boxed scorer directly.
for_each_docset(scorer.as_mut(), callback);
}
}
Ok(())
}
/// Calls `callback` with all of the `(doc, score)` for which score
/// is exceeding a given threshold.
///

View File

@@ -15,8 +15,7 @@ mod tests {
use crate::query::score_combiner::SumWithCoordsCombiner;
use crate::query::term_query::TermScorer;
use crate::query::{
EnableScoring, Intersection, Occur, Query, QueryParser, RequiredOptionalScorer, Scorer,
TermQuery,
Intersection, Occur, Query, QueryParser, RequiredOptionalScorer, Scorer, TermQuery,
};
use crate::schema::*;
use crate::{assert_nearly_equals, DocAddress, DocId, Index, Score};
@@ -55,7 +54,7 @@ mod tests {
let query_parser = QueryParser::for_index(&index, vec![text_field]);
let query = query_parser.parse_query("+a")?;
let searcher = index.reader()?.searcher();
let weight = query.weight(EnableScoring::Enabled(&searcher))?;
let weight = query.weight(&searcher, true)?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
assert!(scorer.is::<TermScorer>());
Ok(())
@@ -68,13 +67,13 @@ mod tests {
let searcher = index.reader()?.searcher();
{
let query = query_parser.parse_query("+a +b +c")?;
let weight = query.weight(EnableScoring::Enabled(&searcher))?;
let weight = query.weight(&searcher, true)?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
assert!(scorer.is::<Intersection<TermScorer>>());
}
{
let query = query_parser.parse_query("+a +(b c)")?;
let weight = query.weight(EnableScoring::Enabled(&searcher))?;
let weight = query.weight(&searcher, true)?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
assert!(scorer.is::<Intersection<Box<dyn Scorer>>>());
}
@@ -88,7 +87,7 @@ mod tests {
let searcher = index.reader()?.searcher();
{
let query = query_parser.parse_query("+a b")?;
let weight = query.weight(EnableScoring::Enabled(&searcher))?;
let weight = query.weight(&searcher, true)?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
assert!(scorer.is::<RequiredOptionalScorer<
Box<dyn Scorer>,
@@ -98,7 +97,7 @@ mod tests {
}
{
let query = query_parser.parse_query("+a b")?;
let weight = query.weight(EnableScoring::Disabled(searcher.schema()))?;
let weight = query.weight(&searcher, false)?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
assert!(scorer.is::<TermScorer>());
}
@@ -242,9 +241,7 @@ mod tests {
let searcher = reader.searcher();
let boolean_query =
BooleanQuery::new(vec![(Occur::Should, term_a), (Occur::Should, term_b)]);
let boolean_weight = boolean_query
.weight(EnableScoring::Enabled(&searcher))
.unwrap();
let boolean_weight = boolean_query.weight(&searcher, true).unwrap();
{
let mut boolean_scorer = boolean_weight.scorer(searcher.segment_reader(0u32), 1.0)?;
assert_eq!(boolean_scorer.doc(), 0u32);

View File

@@ -2,8 +2,8 @@ use std::fmt;
use crate::fastfield::AliveBitSet;
use crate::query::explanation::does_not_match;
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Score, SegmentReader, Term};
use crate::query::{Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Score, Searcher, SegmentReader, Term};
/// `BoostQuery` is a wrapper over a query used to boost its score.
///
@@ -38,9 +38,9 @@ impl fmt::Debug for BoostQuery {
}
impl Query for BoostQuery {
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
let weight_without_boost = self.query.weight(enable_scoring)?;
let boosted_weight = if enable_scoring.is_scoring_enabled() {
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> crate::Result<Box<dyn Weight>> {
let weight_without_boost = self.query.weight(searcher, scoring_enabled)?;
let boosted_weight = if scoring_enabled {
Box::new(BoostWeight::new(weight_without_boost, self.boost))
} else {
weight_without_boost

View File

@@ -1,7 +1,7 @@
use std::fmt;
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, Term};
use crate::query::{Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Score, Searcher, SegmentReader, TantivyError, Term};
/// `ConstScoreQuery` is a wrapper over a query to provide a constant score.
/// It can avoid unnecessary score computation on the wrapped query.
@@ -36,9 +36,9 @@ impl fmt::Debug for ConstScoreQuery {
}
impl Query for ConstScoreQuery {
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
let inner_weight = self.query.weight(enable_scoring)?;
Ok(if enable_scoring.is_scoring_enabled() {
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> crate::Result<Box<dyn Weight>> {
let inner_weight = self.query.weight(searcher, scoring_enabled)?;
Ok(if scoring_enabled {
Box::new(ConstWeight::new(inner_weight, self.score))
} else {
inner_weight

View File

@@ -1,7 +1,7 @@
use tantivy_query_grammar::Occur;
use crate::query::{BooleanWeight, DisjunctionMaxCombiner, EnableScoring, Query, Weight};
use crate::{Score, Term};
use crate::query::{BooleanWeight, DisjunctionMaxCombiner, Query, Weight};
use crate::{Score, Searcher, Term};
/// The disjunction max query returns documents matching one or more wrapped queries,
/// called query clauses or clauses.
@@ -91,16 +91,16 @@ impl Clone for DisjunctionMaxQuery {
}
impl Query for DisjunctionMaxQuery {
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> crate::Result<Box<dyn Weight>> {
let disjuncts = self
.disjuncts
.iter()
.map(|disjunct| Ok((Occur::Should, disjunct.weight(enable_scoring)?)))
.map(|disjunct| Ok((Occur::Should, disjunct.weight(searcher, scoring_enabled)?)))
.collect::<crate::Result<_>>()?;
let tie_breaker = self.tie_breaker;
Ok(Box::new(BooleanWeight::new(
disjuncts,
enable_scoring.is_scoring_enabled(),
scoring_enabled,
Box::new(move || DisjunctionMaxCombiner::with_tie_breaker(tie_breaker)),
)))
}

View File

@@ -1,7 +1,7 @@
use super::Scorer;
use crate::docset::TERMINATED;
use crate::query::explanation::does_not_match;
use crate::query::{EnableScoring, Explanation, Query, Weight};
use crate::query::{Explanation, Query, Weight};
use crate::{DocId, DocSet, Score, Searcher, SegmentReader};
/// `EmptyQuery` is a dummy `Query` in which no document matches.
@@ -11,7 +11,11 @@ use crate::{DocId, DocSet, Score, Searcher, SegmentReader};
pub struct EmptyQuery;
impl Query for EmptyQuery {
fn weight(&self, _enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
fn weight(
&self,
_searcher: &Searcher,
_scoring_enabled: bool,
) -> crate::Result<Box<dyn Weight>> {
Ok(Box::new(EmptyWeight))
}

View File

@@ -5,8 +5,9 @@ use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder, DFA};
use once_cell::sync::Lazy;
use tantivy_fst::Automaton;
use crate::query::{AutomatonWeight, EnableScoring, Query, Weight};
use crate::query::{AutomatonWeight, Query, Weight};
use crate::schema::Term;
use crate::Searcher;
use crate::TantivyError::InvalidArgument;
pub(crate) struct DfaWrapper(pub DFA);
@@ -157,7 +158,11 @@ impl FuzzyTermQuery {
}
impl Query for FuzzyTermQuery {
fn weight(&self, _enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
fn weight(
&self,
_searcher: &Searcher,
_scoring_enabled: bool,
) -> crate::Result<Box<dyn Weight>> {
Ok(Box::new(self.specialized_weight()?))
}
}

View File

@@ -1,3 +1,5 @@
//! Query Module
mod all_query;
mod automaton_weight;
mod bitset;
@@ -16,7 +18,6 @@ mod phrase_query;
mod query;
mod query_parser;
mod range_query;
mod range_query_ip_fastfield;
mod regex_query;
mod reqopt_scorer;
mod scorer;
@@ -49,7 +50,7 @@ pub use self::fuzzy_query::FuzzyTermQuery;
pub use self::intersection::{intersect_scorers, Intersection};
pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
pub use self::phrase_query::PhraseQuery;
pub use self::query::{EnableScoring, Query, QueryClone};
pub use self::query::{Query, QueryClone};
pub use self::query_parser::{QueryParser, QueryParserError};
pub use self::range_query::RangeQuery;
pub use self::regex_query::RegexQuery;

View File

@@ -1,6 +1,4 @@
mod more_like_this;
/// Module containing the different query implementations.
mod query;
pub use self::more_like_this::MoreLikeThis;

View File

@@ -1,7 +1,7 @@
use super::MoreLikeThis;
use crate::query::{EnableScoring, Query, Weight};
use crate::query::{Query, Weight};
use crate::schema::{Field, Value};
use crate::DocAddress;
use crate::{DocAddress, Result, Searcher};
/// A query that matches all of the documents similar to a document
/// or a set of field values provided.
@@ -42,23 +42,16 @@ impl MoreLikeThisQuery {
}
impl Query for MoreLikeThisQuery {
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
let searcher = match enable_scoring {
EnableScoring::Enabled(searcher) => searcher,
EnableScoring::Disabled(_) => {
let err = "MoreLikeThisQuery requires to enable scoring.".to_string();
return Err(crate::TantivyError::InvalidArgument(err));
}
};
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result<Box<dyn Weight>> {
match &self.target {
TargetDocument::DocumentAdress(doc_address) => self
.mlt
.query_with_document(searcher, *doc_address)?
.weight(enable_scoring),
.weight(searcher, scoring_enabled),
TargetDocument::DocumentFields(doc_fields) => self
.mlt
.query_with_document_fields(searcher, doc_fields)?
.weight(enable_scoring),
.weight(searcher, scoring_enabled),
}
}
}

View File

@@ -14,7 +14,7 @@ pub mod tests {
use super::*;
use crate::collector::tests::{TEST_COLLECTOR_WITHOUT_SCORE, TEST_COLLECTOR_WITH_SCORE};
use crate::core::Index;
use crate::query::{EnableScoring, QueryParser, Weight};
use crate::query::{QueryParser, Weight};
use crate::schema::{Schema, Term, TEXT};
use crate::{assert_nearly_equals, DocAddress, DocId, TERMINATED};
@@ -79,8 +79,7 @@ pub mod tests {
.map(|text| Term::from_field_text(text_field, text))
.collect();
let phrase_query = PhraseQuery::new(terms);
let phrase_weight =
phrase_query.phrase_weight(EnableScoring::Disabled(searcher.schema()))?;
let phrase_weight = phrase_query.phrase_weight(&searcher, false)?;
let mut phrase_scorer = phrase_weight.scorer(searcher.segment_reader(0), 1.0)?;
assert_eq!(phrase_scorer.doc(), 1);
assert_eq!(phrase_scorer.advance(), TERMINATED);
@@ -360,9 +359,7 @@ pub mod tests {
let matching_docs = |query: &str| {
let query_parser = QueryParser::for_index(&index, vec![json_field]);
let phrase_query = query_parser.parse_query(query).unwrap();
let phrase_weight = phrase_query
.weight(EnableScoring::Disabled(searcher.schema()))
.unwrap();
let phrase_weight = phrase_query.weight(&searcher, false).unwrap();
let mut phrase_scorer = phrase_weight
.scorer(searcher.segment_reader(0), 1.0f32)
.unwrap();

View File

@@ -1,6 +1,7 @@
use super::PhraseWeight;
use crate::core::searcher::Searcher;
use crate::query::bm25::Bm25Weight;
use crate::query::{EnableScoring, Query, Weight};
use crate::query::{Query, Weight};
use crate::schema::{Field, IndexRecordOption, Term};
/// `PhraseQuery` matches a specific sequence of words.
@@ -66,7 +67,7 @@ impl PhraseQuery {
/// Slop allowed for the phrase.
///
/// The query will match if its terms are separated by `slop` terms at most.
/// By default the slop is 0 meaning query terms need to be adjacent.
/// By default the slop is 0 meaning query terms need to be adjacent.
pub fn set_slop(&mut self, value: u32) {
self.slop = value;
}
@@ -90,9 +91,10 @@ impl PhraseQuery {
/// a specialized type [`PhraseWeight`] instead of a Boxed trait.
pub(crate) fn phrase_weight(
&self,
enable_scoring: EnableScoring<'_>,
searcher: &Searcher,
scoring_enabled: bool,
) -> crate::Result<PhraseWeight> {
let schema = enable_scoring.schema();
let schema = searcher.schema();
let field_entry = schema.get_field_entry(self.field);
let has_positions = field_entry
.field_type()
@@ -107,11 +109,8 @@ impl PhraseQuery {
)));
}
let terms = self.phrase_terms();
let bm25_weight_opt = match enable_scoring {
EnableScoring::Enabled(searcher) => Some(Bm25Weight::for_terms(searcher, &terms)?),
EnableScoring::Disabled(_) => None,
};
let mut weight = PhraseWeight::new(self.phrase_terms.clone(), bm25_weight_opt);
let bm25_weight = Bm25Weight::for_terms(searcher, &terms)?;
let mut weight = PhraseWeight::new(self.phrase_terms.clone(), bm25_weight, scoring_enabled);
if self.slop > 0 {
weight.slop(self.slop);
}
@@ -123,8 +122,8 @@ impl Query for PhraseQuery {
/// Create the weight associated with a query.
///
/// See [`Weight`].
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
let phrase_weight = self.phrase_weight(enable_scoring)?;
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> crate::Result<Box<dyn Weight>> {
let phrase_weight = self.phrase_weight(searcher, scoring_enabled)?;
Ok(Box::new(phrase_weight))
}

View File

@@ -50,7 +50,8 @@ pub struct PhraseScorer<TPostings: Postings> {
right: Vec<u32>,
phrase_count: u32,
fieldnorm_reader: FieldNormReader,
similarity_weight_opt: Option<Bm25Weight>,
similarity_weight: Bm25Weight,
scoring_enabled: bool,
slop: u32,
}
@@ -244,11 +245,11 @@ fn intersection_exists_with_slop(left: &[u32], right: &[u32], slop: u32) -> bool
}
impl<TPostings: Postings> PhraseScorer<TPostings> {
// If similarity_weight is None, then scoring is disabled.
pub fn new(
term_postings: Vec<(usize, TPostings)>,
similarity_weight_opt: Option<Bm25Weight>,
similarity_weight: Bm25Weight,
fieldnorm_reader: FieldNormReader,
scoring_enabled: bool,
slop: u32,
) -> PhraseScorer<TPostings> {
let max_offset = term_postings
@@ -269,8 +270,9 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
left: Vec::with_capacity(100),
right: Vec::with_capacity(100),
phrase_count: 0u32,
similarity_weight_opt,
similarity_weight,
fieldnorm_reader,
scoring_enabled,
slop,
};
if scorer.doc() != TERMINATED && !scorer.phrase_match() {
@@ -284,7 +286,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
}
fn phrase_match(&mut self) -> bool {
if self.similarity_weight_opt.is_some() {
if self.scoring_enabled {
let count = self.compute_phrase_count();
self.phrase_count = count;
count > 0u32
@@ -386,11 +388,8 @@ impl<TPostings: Postings> Scorer for PhraseScorer<TPostings> {
fn score(&mut self) -> Score {
let doc = self.doc();
let fieldnorm_id = self.fieldnorm_reader.fieldnorm_id(doc);
if let Some(similarity_weight) = self.similarity_weight_opt.as_ref() {
similarity_weight.score(fieldnorm_id, self.phrase_count)
} else {
1.0f32
}
self.similarity_weight
.score(fieldnorm_id, self.phrase_count)
}
}

View File

@@ -10,28 +10,30 @@ use crate::{DocId, DocSet, Score};
pub struct PhraseWeight {
phrase_terms: Vec<(usize, Term)>,
similarity_weight_opt: Option<Bm25Weight>,
similarity_weight: Bm25Weight,
scoring_enabled: bool,
slop: u32,
}
impl PhraseWeight {
/// Creates a new phrase weight.
/// If `similarity_weight_opt` is None, then scoring is disabled
pub fn new(
phrase_terms: Vec<(usize, Term)>,
similarity_weight_opt: Option<Bm25Weight>,
similarity_weight: Bm25Weight,
scoring_enabled: bool,
) -> PhraseWeight {
let slop = 0;
PhraseWeight {
phrase_terms,
similarity_weight_opt,
similarity_weight,
scoring_enabled,
slop,
}
}
fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result<FieldNormReader> {
let field = self.phrase_terms[0].1.field();
if self.similarity_weight_opt.is_some() {
if self.scoring_enabled {
if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(field)? {
return Ok(fieldnorm_reader);
}
@@ -44,10 +46,7 @@ impl PhraseWeight {
reader: &SegmentReader,
boost: Score,
) -> crate::Result<Option<PhraseScorer<SegmentPostings>>> {
let similarity_weight_opt = self
.similarity_weight_opt
.as_ref()
.map(|similarity_weight| similarity_weight.boost_by(boost));
let similarity_weight = self.similarity_weight.boost_by(boost);
let fieldnorm_reader = self.fieldnorm_reader(reader)?;
let mut term_postings_list = Vec::new();
if reader.has_deletes() {
@@ -75,8 +74,9 @@ impl PhraseWeight {
}
Ok(Some(PhraseScorer::new(
term_postings_list,
similarity_weight_opt,
similarity_weight,
fieldnorm_reader,
self.scoring_enabled,
self.slop,
)))
}
@@ -108,9 +108,7 @@ impl Weight for PhraseWeight {
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
let phrase_count = scorer.phrase_count();
let mut explanation = Explanation::new("Phrase Scorer", scorer.score());
if let Some(similarity_weight) = self.similarity_weight_opt.as_ref() {
explanation.add_detail(similarity_weight.explain(fieldnorm_id, phrase_count));
}
explanation.add_detail(self.similarity_weight.explain(fieldnorm_id, phrase_count));
Ok(explanation)
}
}
@@ -119,7 +117,7 @@ impl Weight for PhraseWeight {
mod tests {
use super::super::tests::create_index;
use crate::docset::TERMINATED;
use crate::query::{EnableScoring, PhraseQuery};
use crate::query::PhraseQuery;
use crate::{DocSet, Term};
#[test]
@@ -132,8 +130,7 @@ mod tests {
Term::from_field_text(text_field, "a"),
Term::from_field_text(text_field, "b"),
]);
let enable_scoring = EnableScoring::Enabled(&searcher);
let phrase_weight = phrase_query.phrase_weight(enable_scoring).unwrap();
let phrase_weight = phrase_query.phrase_weight(&searcher, true).unwrap();
let mut phrase_scorer = phrase_weight
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
.unwrap();

View File

@@ -5,37 +5,8 @@ use downcast_rs::impl_downcast;
use super::Weight;
use crate::core::searcher::Searcher;
use crate::query::Explanation;
use crate::schema::Schema;
use crate::{DocAddress, Term};
/// Argument used in `Query::weight(..)`
///
/// The variant both tells the query whether scores are needed and carries
/// the data available to build the weight: a full `Searcher` when scoring
/// is enabled, or only a `Schema` when it is disabled.
/// The type is `Copy`, so it can be passed by value to several sub-queries.
#[derive(Copy, Clone)]
pub enum EnableScoring<'a> {
/// Pass this to enable scoring.
/// The borrowed searcher is made available to the query.
Enabled(&'a Searcher),
/// Pass this to disable scoring.
/// This can improve performance.
/// Only the schema is made available to the query in that case.
Disabled(&'a Schema),
}
impl<'a> EnableScoring<'a> {
/// Returns the schema.
pub fn schema(&self) -> &Schema {
match self {
EnableScoring::Enabled(searcher) => searcher.schema(),
EnableScoring::Disabled(schema) => schema,
}
}
/// Returns true if the scoring is enabled.
pub fn is_scoring_enabled(&self) -> bool {
match self {
EnableScoring::Enabled(_) => true,
EnableScoring::Disabled(_) => false,
}
}
}
/// The `Query` trait defines a set of documents and a scoring method
/// for those documents.
///
@@ -77,18 +48,18 @@ pub trait Query: QueryClone + Send + Sync + downcast_rs::Downcast + fmt::Debug {
/// can increase performances.
///
/// See [`Weight`].
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>>;
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> crate::Result<Box<dyn Weight>>;
/// Returns an `Explanation` for the score of the document.
fn explain(&self, searcher: &Searcher, doc_address: DocAddress) -> crate::Result<Explanation> {
let weight = self.weight(EnableScoring::Enabled(searcher))?;
let reader = searcher.segment_reader(doc_address.segment_ord);
let weight = self.weight(searcher, true)?;
weight.explain(reader, doc_address.doc_id)
}
/// Returns the number of documents matching the query.
fn count(&self, searcher: &Searcher) -> crate::Result<usize> {
let weight = self.weight(EnableScoring::Disabled(searcher.schema()))?;
let weight = self.weight(searcher, false)?;
let mut result = 0;
for reader in searcher.segment_readers() {
result += weight.count(reader)? as usize;
@@ -122,8 +93,8 @@ where T: 'static + Query + Clone
}
impl Query for Box<dyn Query> {
fn weight(&self, enabled_scoring: EnableScoring) -> crate::Result<Box<dyn Weight>> {
self.as_ref().weight(enabled_scoring)
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> crate::Result<Box<dyn Weight>> {
self.as_ref().weight(searcher, scoring_enabled)
}
fn count(&self, searcher: &Searcher) -> crate::Result<usize> {

View File

@@ -15,11 +15,6 @@ pub enum LogicalLiteral {
lower: Bound<Term>,
upper: Bound<Term>,
},
Set {
field: Field,
value_type: Type,
elements: Vec<Term>,
},
All,
}
@@ -92,27 +87,6 @@ impl fmt::Debug for LogicalLiteral {
ref upper,
..
} => write!(formatter, "({:?} TO {:?})", lower, upper),
LogicalLiteral::Set { ref elements, .. } => {
const MAX_DISPLAYED: usize = 10;
write!(formatter, "IN [")?;
for (i, element) in elements.iter().enumerate() {
if i == 0 {
write!(formatter, "{:?}", element)?;
} else if i == MAX_DISPLAYED - 1 {
write!(
formatter,
", {:?}, ... ({} more)",
element,
elements.len() - i - 1
)?;
break;
} else {
write!(formatter, ", {:?}", element)?;
}
}
write!(formatter, "]")
}
LogicalLiteral::All => write!(formatter, "*"),
}
}

View File

@@ -13,11 +13,10 @@ use crate::indexer::{
};
use crate::query::{
AllQuery, BooleanQuery, BoostQuery, EmptyQuery, Occur, PhraseQuery, Query, RangeQuery,
TermQuery, TermSetQuery,
TermQuery,
};
use crate::schema::{
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
Schema, Term, Type,
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, Schema, Term, Type,
};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
@@ -183,6 +182,7 @@ pub struct QueryParser {
conjunction_by_default: bool,
tokenizer_manager: TokenizerManager,
boost: HashMap<Field, Score>,
field_names: HashMap<String, Field>,
}
fn all_negative(ast: &LogicalAst) -> bool {
@@ -195,6 +195,31 @@ fn all_negative(ast: &LogicalAst) -> bool {
}
}
// Collects the byte offsets of every '.' in `field_path` that is not
// preceded by an escaping backslash.
//
// A backslash escapes the byte that immediately follows it, so `\.` is not
// a splitting dot and `\\` is a literal backslash that escapes nothing else.
//
// The scan works on raw bytes rather than on chars. This is sound because in
// utf-8 the bytes of a multi-byte codepoint are never equal to an ASCII byte
// such as '.' or '\\'.
fn locate_splitting_dots(field_path: &str) -> Vec<usize> {
    let mut dot_offsets = Vec::new();
    let mut bytes = field_path.bytes().enumerate();
    while let Some((offset, byte)) = bytes.next() {
        match byte {
            b'\\' => {
                // Swallow the escaped byte without inspecting it.
                // A trailing backslash simply has nothing to swallow.
                bytes.next();
            }
            b'.' => dot_offsets.push(offset),
            _ => {}
        }
    }
    dot_offsets
}
impl QueryParser {
/// Creates a `QueryParser`, given
/// * schema - index Schema
@@ -204,19 +229,34 @@ impl QueryParser {
default_fields: Vec<Field>,
tokenizer_manager: TokenizerManager,
) -> QueryParser {
let field_names = schema
.fields()
.map(|(field, field_entry)| (field_entry.name().to_string(), field))
.collect();
QueryParser {
schema,
default_fields,
tokenizer_manager,
conjunction_by_default: false,
boost: Default::default(),
field_names,
}
}
// Splits a full_path as written in a query, into a field name and a
// json path.
pub(crate) fn split_full_path<'a>(&self, full_path: &'a str) -> Option<(Field, &'a str)> {
self.schema.find_field(full_path)
if let Some(field) = self.field_names.get(full_path) {
return Some((*field, ""));
}
let mut splitting_period_pos: Vec<usize> = locate_splitting_dots(full_path);
while let Some(pos) = splitting_period_pos.pop() {
let (prefix, suffix) = full_path.split_at(pos);
if let Some(field) = self.field_names.get(prefix) {
return Some((*field, &suffix[1..]));
}
}
None
}
/// Creates a `QueryParser`, given
@@ -293,9 +333,7 @@ impl QueryParser {
) -> Result<Term, QueryParserError> {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
let is_ip_and_fast = field_type.is_ip_addr() && field_type.is_fast();
if !field_type.is_indexed() && !is_ip_and_fast {
if !field_type.is_indexed() {
return Err(QueryParserError::FieldNotIndexed(
field_entry.name().to_string(),
));
@@ -442,14 +480,28 @@ impl QueryParser {
.into_iter()
.collect())
}
FieldType::JsonObject(ref json_options) => generate_literals_for_json_object(
field_name,
field,
json_path,
phrase,
&self.tokenizer_manager,
json_options,
),
FieldType::JsonObject(ref json_options) => {
let option = json_options.get_text_indexing_options().ok_or_else(|| {
// This should have been seen earlier really.
QueryParserError::FieldNotIndexed(field_name.to_string())
})?;
let text_analyzer =
self.tokenizer_manager
.get(option.tokenizer())
.ok_or_else(|| QueryParserError::UnknownTokenizer {
field: field_name.to_string(),
tokenizer: option.tokenizer().to_string(),
})?;
let index_record_option = option.index_option();
generate_literals_for_json_object(
field_name,
field,
json_path,
phrase,
&text_analyzer,
index_record_option,
)
}
FieldType::Facet(_) => match Facet::from_text(phrase) {
Ok(facet) => {
let facet_term = Term::from_facet(field, &facet);
@@ -631,31 +683,6 @@ impl QueryParser {
}));
Ok(logical_ast)
}
UserInputLeaf::Set {
field: full_field_opt,
elements,
} => {
let full_path = full_field_opt.ok_or_else(|| {
QueryParserError::UnsupportedQuery(
"Set query need to target a specific field.".to_string(),
)
})?;
let (field, json_path) = self
.split_full_path(&full_path)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone()))?;
let field_entry = self.schema.get_field_entry(field);
let value_type = field_entry.field_type().value_type();
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set {
elements: elements
.into_iter()
.map(|element| self.compute_boundary_term(field, json_path, &element))
.collect::<Result<Vec<_>, _>>()?,
field,
value_type,
}));
Ok(logical_ast)
}
}
}
}
@@ -674,7 +701,6 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<dyn Query> {
} => Box::new(RangeQuery::new_term_bounds(
field, value_type, &lower, &upper,
)),
LogicalLiteral::Set { elements, .. } => Box::new(TermSetQuery::new(elements)),
LogicalLiteral::All => Box::new(AllQuery),
}
}
@@ -713,32 +739,17 @@ fn generate_literals_for_json_object(
field: Field,
json_path: &str,
phrase: &str,
tokenizer_manager: &TokenizerManager,
json_options: &JsonObjectOptions,
text_analyzer: &TextAnalyzer,
index_record_option: IndexRecordOption,
) -> Result<Vec<LogicalLiteral>, QueryParserError> {
let text_options = json_options.get_text_indexing_options().ok_or_else(|| {
// This should have been seen earlier really.
QueryParserError::FieldNotIndexed(field_name.to_string())
})?;
let text_analyzer = tokenizer_manager
.get(text_options.tokenizer())
.ok_or_else(|| QueryParserError::UnknownTokenizer {
field: field_name.to_string(),
tokenizer: text_options.tokenizer().to_string(),
})?;
let index_record_option = text_options.index_option();
let mut logical_literals = Vec::new();
let mut term = Term::with_capacity(100);
let mut json_term_writer = JsonTermWriter::from_field_and_json_path(
field,
json_path,
json_options.is_expand_dots_enabled(),
&mut term,
);
let mut json_term_writer =
JsonTermWriter::from_field_and_json_path(field, json_path, &mut term);
if let Some(term) = convert_to_fast_value_and_get_term(&mut json_term_writer, phrase) {
logical_literals.push(LogicalLiteral::Term(term));
}
let terms = set_string_and_get_terms(&mut json_term_writer, phrase, &text_analyzer);
let terms = set_string_and_get_terms(&mut json_term_writer, phrase, text_analyzer);
drop(json_term_writer);
if terms.len() <= 1 {
for (_, term) in terms {
@@ -1049,28 +1060,6 @@ mod test {
);
}
fn extract_query_term_json_path(query: &str) -> String {
let LogicalAst::Leaf(literal) = parse_query_to_logical_ast(query, false).unwrap() else {
panic!();
};
let LogicalLiteral::Term(term) = *literal else {
panic!();
};
std::str::from_utf8(term.value_bytes()).unwrap().to_string()
}
#[test]
fn test_json_field_query_with_espaced_dot() {
assert_eq!(
extract_query_term_json_path(r#"json.k8s.node.name:hello"#),
"k8s\u{1}node\u{1}name\0shello"
);
assert_eq!(
extract_query_term_json_path(r#"json.k8s\.node\.name:hello"#),
"k8s.node.name\0shello"
);
}
#[test]
fn test_json_field_possibly_a_number() {
test_parse_query_to_logical_ast_helper(
@@ -1525,6 +1514,13 @@ mod test {
assert_eq!(query_parser.split_full_path("firsty"), None);
}
#[test]
fn test_locate_splitting_dots() {
assert_eq!(&super::locate_splitting_dots("a.b.c"), &[1, 3]);
assert_eq!(&super::locate_splitting_dots(r#"a\.b.c"#), &[4]);
assert_eq!(&super::locate_splitting_dots(r#"a\..b.c"#), &[3, 5]);
}
#[test]
pub fn test_phrase_slop() {
test_parse_query_to_logical_ast_helper(
@@ -1543,29 +1539,4 @@ mod test {
false,
);
}
#[test]
pub fn test_term_set_query() {
test_parse_query_to_logical_ast_helper(
"title: IN [a b cd]",
r#"IN [Term(type=Str, field=0, "a"), Term(type=Str, field=0, "b"), Term(type=Str, field=0, "cd")]"#,
false,
);
test_parse_query_to_logical_ast_helper(
"bytes: IN [AA== ABA= ABCD]",
r#"IN [Term(type=Bytes, field=12, [0]), Term(type=Bytes, field=12, [0, 16]), Term(type=Bytes, field=12, [0, 16, 131])]"#,
false,
);
test_parse_query_to_logical_ast_helper(
"signed: IN [1 2 -3]",
r#"IN [Term(type=I64, field=2, 1), Term(type=I64, field=2, 2), Term(type=I64, field=2, -3)]"#,
false,
);
test_parse_query_to_logical_ast_helper(
"float: IN [1.1 2.2 -3.3]",
r#"IN [Term(type=F64, field=10, 1.1), Term(type=F64, field=10, 2.2), Term(type=F64, field=10, -3.3)]"#,
false,
);
}
}

View File

@@ -3,16 +3,15 @@ use std::ops::{Bound, Range};
use common::BitSet;
use crate::core::SegmentReader;
use crate::core::{Searcher, SegmentReader};
use crate::error::TantivyError;
use crate::query::explanation::does_not_match;
use crate::query::range_query_ip_fastfield::IPFastFieldRangeWeight;
use crate::query::{BitSetDocSet, ConstScorer, EnableScoring, Explanation, Query, Scorer, Weight};
use crate::query::{BitSetDocSet, ConstScorer, Explanation, Query, Scorer, Weight};
use crate::schema::{Field, IndexRecordOption, Term, Type};
use crate::termdict::{TermDictionary, TermStreamer};
use crate::{DocId, Score};
pub(crate) fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
bound: &Bound<TFrom>,
transform: &Transform,
) -> Bound<TTo> {
@@ -30,17 +29,8 @@ pub(crate) fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
///
/// # Implementation
///
/// ## Default
/// The default implementation collects all documents _upfront_ into a `BitSet`.
/// This is done by iterating over the terms within the range and loading all docs for each
/// `TermInfo` from the inverted index (posting list) and put them into a `BitSet`.
/// Depending on the number of terms matched, this is a potentially expensive operation.
///
/// ## IP fast field
/// For IP fast fields a custom variant is used, by scanning the fast field. Unlike the default
/// variant we can walk in a lazy fashion over it, since the fastfield is implicitly ordered by
/// DocId.
///
/// The current implementation iterates over the terms within the range
/// and adds every document it comes across to a `BitSet`.
///
/// # Example
///
@@ -253,10 +243,13 @@ impl RangeQuery {
}
impl Query for RangeQuery {
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
let schema = enable_scoring.schema();
let field_type = schema.get_field_entry(self.field).field_type();
let value_type = field_type.value_type();
fn weight(
&self,
searcher: &Searcher,
_scoring_enabled: bool,
) -> crate::Result<Box<dyn Weight>> {
let schema = searcher.schema();
let value_type = schema.get_field_entry(self.field).field_type().value_type();
if value_type != self.value_type {
let err_msg = format!(
"Create a range query of the type {:?}, when the field given was of type {:?}",
@@ -264,20 +257,11 @@ impl Query for RangeQuery {
);
return Err(TantivyError::SchemaError(err_msg));
}
if field_type.is_ip_addr() && field_type.is_fast() {
Ok(Box::new(IPFastFieldRangeWeight::new(
self.field,
&self.left_bound,
&self.right_bound,
)))
} else {
Ok(Box::new(RangeWeight {
field: self.field,
left_bound: self.left_bound.clone(),
right_bound: self.right_bound.clone(),
}))
}
Ok(Box::new(RangeWeight {
field: self.field,
left_bound: self.left_bound.clone(),
right_bound: self.right_bound.clone(),
}))
}
}
@@ -351,7 +335,7 @@ mod tests {
use super::RangeQuery;
use crate::collector::{Count, TopDocs};
use crate::query::QueryParser;
use crate::schema::{Document, Field, IntoIpv6Addr, Schema, FAST, INDEXED, STORED, TEXT};
use crate::schema::{Document, Field, IntoIpv6Addr, Schema, INDEXED, STORED, TEXT};
use crate::{doc, Index};
#[test]
@@ -525,24 +509,10 @@ mod tests {
Ok(())
}
#[test]
fn search_ip_range_test_posting_list() {
search_ip_range_test_opt(false);
}
#[test]
fn search_ip_range_test() {
search_ip_range_test_opt(true);
}
fn search_ip_range_test_opt(with_fast_field: bool) {
let mut schema_builder = Schema::builder();
let ip_field = if with_fast_field {
schema_builder.add_ip_addr_field("ip", INDEXED | STORED | FAST)
} else {
schema_builder.add_ip_addr_field("ip", INDEXED | STORED)
};
let text_field = schema_builder.add_text_field("text", TEXT | STORED);
let ip_field = schema_builder.add_ip_addr_field("ip", INDEXED | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let ip_addr_1 = IpAddr::from_str("127.0.0.10").unwrap().into_ipv6_addr();
@@ -550,22 +520,16 @@ mod tests {
{
let mut index_writer = index.writer(3_000_000).unwrap();
for _ in 0..1_000 {
index_writer
.add_document(doc!(
ip_field => ip_addr_1,
text_field => "BLUBBER"
))
.unwrap();
}
for _ in 0..1_000 {
index_writer
.add_document(doc!(
ip_field => ip_addr_2,
text_field => "BLOBBER"
))
.unwrap();
}
index_writer
.add_document(doc!(
ip_field => ip_addr_1
))
.unwrap();
index_writer
.add_document(doc!(
ip_field => ip_addr_2
))
.unwrap();
index_writer.commit().unwrap();
}
@@ -579,25 +543,24 @@ mod tests {
count
};
let query_from_text = |text: &str| {
QueryParser::for_index(&index, vec![])
QueryParser::for_index(&index, vec![ip_field])
.parse_query(text)
.unwrap()
};
// Inclusive range
assert_eq!(
get_num_hits(query_from_text("ip:[127.0.0.1 TO 127.0.0.20]")),
2000
2
);
assert_eq!(
get_num_hits(query_from_text("ip:[127.0.0.10 TO 127.0.0.20]")),
2000
2
);
assert_eq!(
get_num_hits(query_from_text("ip:[127.0.0.11 TO 127.0.0.20]")),
1000
1
);
assert_eq!(
@@ -605,84 +568,9 @@ mod tests {
0
);
assert_eq!(get_num_hits(query_from_text("ip:[127.0.0.11 TO *]")), 1000);
assert_eq!(get_num_hits(query_from_text("ip:[127.0.0.11 TO *]")), 1);
assert_eq!(get_num_hits(query_from_text("ip:[127.0.0.21 TO *]")), 0);
assert_eq!(get_num_hits(query_from_text("ip:[* TO 127.0.0.9]")), 0);
assert_eq!(get_num_hits(query_from_text("ip:[* TO 127.0.0.10]")), 1000);
// Exclusive range
assert_eq!(
get_num_hits(query_from_text("ip:{127.0.0.1 TO 127.0.0.20}")),
1000
);
assert_eq!(
get_num_hits(query_from_text("ip:{127.0.0.1 TO 127.0.0.21}")),
2000
);
assert_eq!(
get_num_hits(query_from_text("ip:{127.0.0.10 TO 127.0.0.20}")),
0
);
assert_eq!(
get_num_hits(query_from_text("ip:{127.0.0.11 TO 127.0.0.20}")),
0
);
assert_eq!(
get_num_hits(query_from_text("ip:{127.0.0.11 TO 127.0.0.19}")),
0
);
assert_eq!(get_num_hits(query_from_text("ip:{127.0.0.11 TO *}")), 1000);
assert_eq!(get_num_hits(query_from_text("ip:{127.0.0.10 TO *}")), 1000);
assert_eq!(get_num_hits(query_from_text("ip:{127.0.0.21 TO *}")), 0);
assert_eq!(get_num_hits(query_from_text("ip:{127.0.0.20 TO *}")), 0);
assert_eq!(get_num_hits(query_from_text("ip:{127.0.0.19 TO *}")), 1000);
assert_eq!(get_num_hits(query_from_text("ip:{* TO 127.0.0.9}")), 0);
assert_eq!(get_num_hits(query_from_text("ip:{* TO 127.0.0.10}")), 0);
assert_eq!(get_num_hits(query_from_text("ip:{* TO 127.0.0.11}")), 1000);
// Inclusive/Exclusive range
assert_eq!(
get_num_hits(query_from_text("ip:[127.0.0.1 TO 127.0.0.20}")),
1000
);
assert_eq!(
get_num_hits(query_from_text("ip:{127.0.0.1 TO 127.0.0.20]")),
2000
);
// Intersection
assert_eq!(
get_num_hits(query_from_text(
"text:BLUBBER AND ip:[127.0.0.10 TO 127.0.0.10]"
)),
1000
);
assert_eq!(
get_num_hits(query_from_text(
"text:BLOBBER AND ip:[127.0.0.10 TO 127.0.0.10]"
)),
0
);
assert_eq!(
get_num_hits(query_from_text(
"text:BLOBBER AND ip:[127.0.0.20 TO 127.0.0.20]"
)),
1000
);
assert_eq!(
get_num_hits(query_from_text(
"text:BLUBBER AND ip:[127.0.0.20 TO 127.0.0.20]"
)),
0
);
assert_eq!(get_num_hits(query_from_text("ip:[* TO 127.0.0.10]")), 1);
}
}

View File

@@ -1,728 +0,0 @@
//! IP Fastfields support efficient scanning for range queries.
//! We use this variant only if the fastfield exists, otherwise the default in `range_query` is
//! used, which uses the term dictionary + postings.
use std::net::Ipv6Addr;
use std::ops::{Bound, RangeInclusive};
use std::sync::Arc;
use common::BinarySerializable;
use fastfield_codecs::{Column, MonotonicallyMappableToU128};
use super::range_query::map_bound;
use super::{ConstScorer, Explanation, Scorer, Weight};
use crate::fastfield::MultiValuedU128FastFieldReader;
use crate::schema::{Cardinality, Field};
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, TERMINATED};
/// `IPFastFieldRangeWeight` uses the ip address fast field to execute range queries.
pub struct IPFastFieldRangeWeight {
    field: Field,
    left_bound: Bound<Ipv6Addr>,
    right_bound: Bound<Ipv6Addr>,
}

impl IPFastFieldRangeWeight {
    /// Builds the weight for `field` from bounds given as raw bytes.
    ///
    /// Each bound payload is deserialized as a `u128`, passed through
    /// `u128::from_be` and turned into an `Ipv6Addr`.
    ///
    /// # Panics
    ///
    /// Panics if a bound payload cannot be deserialized as a `u128`.
    pub fn new(field: Field, left_bound: &Bound<Vec<u8>>, right_bound: &Bound<Vec<u8>>) -> Self {
        let decode_ip = |raw_bytes: &Vec<u8>| {
            let mut reader: &[u8] = &raw_bytes[..];
            let serialized: u128 = BinarySerializable::deserialize(&mut reader).unwrap();
            Ipv6Addr::from_u128(u128::from_be(serialized))
        };
        Self {
            field,
            left_bound: map_bound(left_bound, &decode_ip),
            right_bound: map_bound(right_bound, &decode_ip),
        }
    }
}
impl Weight for IPFastFieldRangeWeight {
    /// Builds a constant-score scorer over the docs of `reader` whose ip fast field
    /// value lies within the configured bounds.
    ///
    /// Open bounds are replaced by the min / max value reported by the fast field.
    ///
    /// # Panics
    ///
    /// Panics if the field reports no fast field cardinality.
    fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
        let field_type = reader.schema().get_field_entry(self.field).field_type();
        // Both cardinalities go through the same steps; only the reader type differs.
        let (min_value, max_value, fast_field) = match field_type.fastfield_cardinality().unwrap()
        {
            Cardinality::SingleValue => {
                let column = reader.fast_fields().ip_addr(self.field)?;
                (
                    column.min_value(),
                    column.max_value(),
                    IpFastFieldCardinality::SingleValue(column),
                )
            }
            Cardinality::MultiValues => {
                let multi_reader = reader.fast_fields().ip_addrs(self.field)?;
                (
                    multi_reader.min_value(),
                    multi_reader.max_value(),
                    IpFastFieldCardinality::MultiValue(multi_reader),
                )
            }
        };
        let value_range =
            bound_to_value_range(&self.left_bound, &self.right_bound, min_value, max_value);
        let docset = IpRangeDocSet::new(value_range, fast_field);
        Ok(Box::new(ConstScorer::new(docset, boost)))
    }

    /// Explains the (constant) score of `doc`, or returns an `InvalidArgument` error
    /// when seeking to `doc` does not land on it.
    fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
        let mut scorer = self.scorer(reader, 1.0)?;
        if scorer.seek(doc) == doc {
            Ok(Explanation::new("Const", scorer.score()))
        } else {
            Err(TantivyError::InvalidArgument(format!(
                "Document #({}) does not match",
                doc
            )))
        }
    }
}
/// Converts a pair of bounds into the equivalent inclusive range of ip addresses.
///
/// * `Included(ip)` is used as is.
/// * `Excluded(ip)` is moved one address inward (`ip + 1` on the left, `ip - 1` on the right).
/// * `Unbounded` falls back to `min_value` (left) or `max_value` (right).
///
/// An exclusive lower bound on the highest address, or an exclusive upper bound on
/// the lowest address, cannot match anything. The inward step would overflow `u128`
/// there (a panic in debug builds, a wrap-around to a match-everything range in
/// release builds), so an empty range (`start > end`) is returned instead.
//
// NOTE(review): consumers of the returned range are assumed to treat a range with
// `start > end` as matching nothing -- confirm for `get_docids_for_value_range`.
fn bound_to_value_range(
    left_bound: &Bound<Ipv6Addr>,
    right_bound: &Bound<Ipv6Addr>,
    min_value: Ipv6Addr,
    max_value: Ipv6Addr,
) -> RangeInclusive<Ipv6Addr> {
    // Canonical empty range: its start is strictly greater than its end.
    let empty_range = || Ipv6Addr::from(1u128)..=Ipv6Addr::from(0u128);

    let start_value = match left_bound {
        Bound::Included(ip_addr) => *ip_addr,
        Bound::Excluded(ip_addr) => match u128::from(*ip_addr).checked_add(1) {
            Some(next) => Ipv6Addr::from(next),
            // Nothing is strictly greater than the highest address.
            None => return empty_range(),
        },
        Bound::Unbounded => min_value,
    };
    let end_value = match right_bound {
        Bound::Included(ip_addr) => *ip_addr,
        Bound::Excluded(ip_addr) => match u128::from(*ip_addr).checked_sub(1) {
            Some(previous) => Ipv6Addr::from(previous),
            // Nothing is strictly smaller than the lowest address.
            None => return empty_range(),
        },
        Bound::Unbounded => max_value,
    };
    start_value..=end_value
}
/// A read cursor over a reusable buffer of doc ids.
struct VecCursor {
    docs: Vec<u32>,
    current_pos: usize,
}

impl VecCursor {
    /// Creates an empty cursor; the buffer starts with room for 32 doc ids.
    fn new() -> Self {
        VecCursor {
            docs: Vec::with_capacity(32),
            current_pos: 0,
        }
    }

    /// Moves one position forward and returns the doc id found there, if any.
    fn next(&mut self) -> Option<u32> {
        self.current_pos += 1;
        self.current()
    }

    /// Returns the doc id under the cursor, or `None` once the cursor is past the end.
    #[inline]
    fn current(&self) -> Option<u32> {
        self.docs.get(self.current_pos).copied()
    }

    /// Empties the buffer, rewinds the cursor and hands the buffer out for refilling.
    fn get_cleared_data(&mut self) -> &mut Vec<u32> {
        self.current_pos = 0;
        self.docs.clear();
        &mut self.docs
    }

    /// Returns the last doc id stored in the buffer, wherever the cursor stands.
    fn last_value(&self) -> Option<u32> {
        self.docs.last().copied()
    }

    /// Returns true when there is no doc id at or after the cursor position.
    fn is_empty(&self) -> bool {
        self.docs.len() <= self.current_pos
    }
}
/// The ip fast field reader backing a range scan, for either cardinality.
pub(crate) enum IpFastFieldCardinality {
    SingleValue(Arc<dyn Column<Ipv6Addr>>),
    MultiValue(MultiValuedU128FastFieldReader<Ipv6Addr>),
}

impl IpFastFieldCardinality {
    /// Number of docs covered by the underlying reader: the number of values for a
    /// single-valued column, the doc count of the index reader for a multi-valued one.
    fn num_docs(&self) -> u32 {
        match self {
            Self::SingleValue(column) => column.num_vals(),
            Self::MultiValue(reader) => reader.get_index_reader().num_docs(),
        }
    }
}
/// Doc set over the docs whose ip fast field value lies in `value_range`.
///
/// Matching doc ids are not computed upfront: they are loaded batch by batch
/// into `loaded_docs`, starting from `next_fetch_start`.
struct IpRangeDocSet {
/// The range filter on the values.
value_range: RangeInclusive<Ipv6Addr>,
/// Fast field reader (single or multi valued) the values are read from.
ip_addr_fast_field: IpFastFieldCardinality,
/// The next docid start range to fetch (inclusive).
next_fetch_start: u32,
/// Number of docs range checked in a batch.
///
/// There are two patterns.
/// - We do a full scan. => We can load large chunks. We don't know in advance if seek call
/// will come, so we start with small chunks
/// - We load docs, interspersed with seek calls. When there are big jumps in the seek, we
/// should load small chunks. When the seeks are small, we can employ the same strategy as on a
/// full scan.
fetch_horizon: u32,
/// Current batch of loaded docs.
loaded_docs: VecCursor,
/// Target of the most recent `seek` call, `None` until the first seek.
last_seek_pos_opt: Option<u32>,
}
/// Initial number of docs checked per batch; also the value the horizon is reset to.
const DEFAULT_FETCH_HORIZON: u32 = 128;
impl IpRangeDocSet {
/// Creates the doc set and eagerly loads the first batch of matching docs, so that
/// the cursor is positioned on the first match (if any was found).
fn new(
value_range: RangeInclusive<Ipv6Addr>,
ip_addr_fast_field: IpFastFieldCardinality,
) -> Self {
let mut ip_range_docset = Self {
value_range,
ip_addr_fast_field,
loaded_docs: VecCursor::new(),
next_fetch_start: 0,
fetch_horizon: DEFAULT_FETCH_HORIZON,
last_seek_pos_opt: None,
};
ip_range_docset.reset_fetch_range();
ip_range_docset.fetch_block();
ip_range_docset
}
/// Resets the batch size to `DEFAULT_FETCH_HORIZON`.
fn reset_fetch_range(&mut self) {
self.fetch_horizon = DEFAULT_FETCH_HORIZON;
}
/// Fetches batches until at least one unread matching doc is loaded, or until the
/// end of the fast field has been reached.
///
/// The horizon is doubled (capped at `MAX_HORIZON`) after every batch that did not
/// reach the end. This function does not return a value.
fn fetch_block(&mut self) {
const MAX_HORIZON: u32 = 100_000;
while self.loaded_docs.is_empty() {
let finished_to_end = self.fetch_horizon(self.fetch_horizon);
if finished_to_end {
break;
}
// Fetch more data, increase horizon. Horizon only gets reset when doing a seek.
self.fetch_horizon = (self.fetch_horizon * 2).min(MAX_HORIZON);
}
}
/// check if the distance between the seek calls is large
///
/// Returns true when `new_seek` is at least 128 docs past the previous seek target,
/// and also when there was no previous seek.
fn is_last_seek_distance_large(&self, new_seek: DocId) -> bool {
if let Some(last_seek_pos) = self.last_seek_pos_opt {
// NOTE(review): unsigned subtraction; relies on seek targets never decreasing -- confirm.
(new_seek - last_seek_pos) >= 128
} else {
true
}
}
/// Fetches a block for docid range [next_fetch_start .. next_fetch_start + HORIZON]
///
/// The end of the range is clamped to the number of docs. Returns true when the
/// clamping happened, i.e. the fetched range reached the end of the fast field.
/// `next_fetch_start` is moved to the end of the fetched range.
fn fetch_horizon(&mut self, horizon: u32) -> bool {
let mut finished_to_end = false;
let limit = self.ip_addr_fast_field.num_docs();
let mut end = self.next_fetch_start + horizon;
if end >= limit {
end = limit;
finished_to_end = true;
}
match &self.ip_addr_fast_field {
IpFastFieldCardinality::MultiValue(multi) => {
// Must be read before the buffer is cleared by `get_cleared_data` below.
let last_value = self.loaded_docs.last_value();
multi.get_docids_for_value_range(
self.value_range.clone(),
self.next_fetch_start..end,
self.loaded_docs.get_cleared_data(),
);
// In case of multivalues, we may have an overlap of the same docid between fetching
// blocks
if let Some(last_value) = last_value {
while self.loaded_docs.current() == Some(last_value) {
self.loaded_docs.next();
}
}
}
IpFastFieldCardinality::SingleValue(single) => {
single.get_docids_for_value_range(
self.value_range.clone(),
self.next_fetch_start..end,
self.loaded_docs.get_cleared_data(),
);
}
}
self.next_fetch_start = end;
finished_to_end
}
}
impl DocSet for IpRangeDocSet {
    /// Moves to the next matching doc, loading a new batch when the current one is
    /// used up. Returns [`TERMINATED`] when no matching doc is left.
    #[inline]
    fn advance(&mut self) -> DocId {
        if let Some(next_doc) = self.loaded_docs.next() {
            return next_doc;
        }
        // The current batch is exhausted: stop if the whole fast field has been scanned.
        if self.next_fetch_start >= self.ip_addr_fast_field.num_docs() {
            return TERMINATED;
        }
        self.fetch_block();
        self.loaded_docs.current().unwrap_or(TERMINATED)
    }

    /// Returns the doc under the cursor, or [`TERMINATED`] when there is none.
    #[inline]
    fn doc(&self) -> DocId {
        self.loaded_docs.current().unwrap_or(TERMINATED)
    }

    /// Advances the `DocSet` forward until reaching the target, or going to the
    /// lowest [`DocId`] greater than the target.
    ///
    /// If the end of the `DocSet` is reached, [`TERMINATED`] is returned.
    ///
    /// Calling `.seek(target)` on a terminated `DocSet` is legal. Implementation
    /// of `DocSet` should support it.
    ///
    /// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a `DocSet`.
    fn seek(&mut self, target: DocId) -> DocId {
        // A big jump since the previous seek: go back to small batches.
        if self.is_last_seek_distance_large(target) {
            self.reset_fetch_range();
        }
        // Docs before `target` are of no interest anymore, do not fetch them.
        self.next_fetch_start = self.next_fetch_start.max(target);
        let mut current = self.doc();
        debug_assert!(current <= target);
        while current < target {
            current = self.advance();
        }
        self.last_seek_pos_opt = Some(target);
        current
    }

    fn size_hint(&self) -> u32 {
        0 // heuristic possible by checking number of hits when fetching a block
    }
}
// Tests comparing the number of hits of ip range queries (on a single valued and on a
// multi valued ip fast field) against a count computed directly from the indexed docs.
#[cfg(test)]
mod tests {
use proptest::prelude::ProptestConfig;
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest};
use super::*;
use crate::collector::Count;
use crate::query::QueryParser;
use crate::schema::{IpAddrOptions, Schema, FAST, STORED, STRING};
use crate::Index;
/// A test document: a textual id and an ip address.
#[derive(Clone, Debug)]
pub struct Doc {
pub id: String,
pub ip: Ipv6Addr,
}
/// Generates docs using either of the two constructors below.
fn operation_strategy() -> impl Strategy<Value = Doc> {
prop_oneof![
(0u64..10_000u64).prop_map(doc_from_id_1),
(1u64..10_000u64).prop_map(doc_from_id_2),
]
}
/// Builds a doc whose id string and ip are both derived from `id * 1000`.
pub fn doc_from_id_1(id: u64) -> Doc {
let id = id * 1000;
Doc {
// here the id string is the decimal form of the ip's numeric value
id: id.to_string(),
ip: Ipv6Addr::from_u128(id as u128),
}
}
/// Builds a doc whose ip is `id * 1000` and whose id string is one less than that.
fn doc_from_id_2(id: u64) -> Doc {
let id = id * 1000;
Doc {
// ip != id
id: (id - 1).to_string(),
ip: Ipv6Addr::from_u128(id as u128),
}
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(10))]
#[test]
fn test_ip_range_for_docs_prop(ops in proptest::collection::vec(operation_strategy(), 1..1000)) {
assert!(test_ip_range_for_docs(ops).is_ok());
}
}
#[test]
fn ip_range_regression1_test() {
let ops = vec![doc_from_id_1(0)];
assert!(test_ip_range_for_docs(ops).is_ok());
}
#[test]
fn ip_range_regression2_test() {
let ops = vec![
doc_from_id_1(52),
doc_from_id_1(63),
doc_from_id_1(12),
doc_from_id_2(91),
doc_from_id_2(33),
];
assert!(test_ip_range_for_docs(ops).is_ok());
}
#[test]
fn ip_range_regression3_test() {
let ops = vec![doc_from_id_1(1), doc_from_id_1(2), doc_from_id_1(3)];
assert!(test_ip_range_for_docs(ops).is_ok());
}
/// Creates an in-RAM index with a single valued fast field `ip`, a multi valued fast
/// field `ips` (the doc's ip is added to it twice) and a string field `id`.
pub fn create_index_from_docs(docs: &[Doc]) -> Index {
let mut schema_builder = Schema::builder();
let ip_field = schema_builder.add_ip_addr_field("ip", STORED | FAST);
let ips_field = schema_builder.add_ip_addr_field(
"ips",
IpAddrOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed(),
);
let text_field = schema_builder.add_text_field("id", STRING | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer(3_000_000).unwrap();
for doc in docs.iter() {
index_writer
.add_document(doc!(
ips_field => doc.ip,
ips_field => doc.ip,
ip_field => doc.ip,
text_field => doc.id.to_string(),
))
.unwrap();
}
index_writer.commit().unwrap();
}
index
}
/// Indexes `docs` and checks inclusive ip range queries built from pairs of them,
/// alone and intersected with an `id` term, on both the `ip` and the `ips` field.
///
/// `docs` must not be empty: the first doc is always used as a sample.
fn test_ip_range_for_docs(docs: Vec<Doc>) -> crate::Result<()> {
let index = create_index_from_docs(&docs);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let get_num_hits = |query| searcher.search(&query, &(Count)).unwrap();
let query_from_text = |text: &str| {
QueryParser::for_index(&index, vec![])
.parse_query(text)
.unwrap()
};
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string())
};
// `sample_docs` is expected to hold exactly two docs; their sorted ips are the
// bounds of the queried range.
let test_sample = |sample_docs: Vec<Doc>| {
let mut ips: Vec<Ipv6Addr> = sample_docs.iter().map(|doc| doc.ip).collect();
ips.sort();
let expected_num_hits = docs
.iter()
.filter(|doc| (ips[0]..=ips[1]).contains(&doc.ip))
.count();
let query = gen_query_inclusive("ip", ips[0], ips[1]);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
let query = gen_query_inclusive("ips", ips[0], ips[1]);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
// Intersection search
let id_filter = sample_docs[0].id.to_string();
let expected_num_hits = docs
.iter()
.filter(|doc| (ips[0]..=ips[1]).contains(&doc.ip) && doc.id == id_filter)
.count();
let query = format!(
"{} AND id:{}",
gen_query_inclusive("ip", ips[0], ips[1]),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
// Intersection search on multivalue ip field
let id_filter = sample_docs[0].id.to_string();
let query = format!(
"{} AND id:{}",
gen_query_inclusive("ips", ips[0], ips[1]),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
};
test_sample(vec![docs[0].clone(), docs[0].clone()]);
if docs.len() > 1 {
test_sample(vec![docs[0].clone(), docs[1].clone()]);
test_sample(vec![docs[1].clone(), docs[1].clone()]);
}
if docs.len() > 2 {
test_sample(vec![docs[1].clone(), docs[2].clone()]);
}
Ok(())
}
}
// Benchmarks for ip range queries, alone and intersected with an `id` term, on the
// single valued (`ip`) and multi valued (`ips`) fast fields.
#[cfg(all(test, feature = "unstable"))]
mod bench {
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use test::Bencher;
use super::tests::*;
use super::*;
use crate::collector::Count;
use crate::query::QueryParser;
use crate::Index;
/// Builds an index of 100_000 docs from a fixed seed. Ips are one of the 100 values
/// `0, 1000, ..., 99_000`; ids are "veryfew", "few" or "many".
fn get_index_0_to_100() -> Index {
let mut rng = StdRng::from_seed([1u8; 32]);
let num_vals = 100_000;
let docs: Vec<_> = (0..num_vals)
.map(|_i| {
let id = if rng.gen_bool(0.01) {
"veryfew".to_string() // 1%
} else if rng.gen_bool(0.1) {
"few".to_string() // 9%
} else {
"many".to_string() // 90%
};
Doc {
id,
// Multiply by 1000, so that we create many buckets in the compact space
// The benches depend on this range to select n-percent of elements with the
// methods below.
ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
}
})
.collect();
let index = create_index_from_docs(&docs);
index
}
/// Ip range `0..=90_000`.
fn get_90_percent() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(90 * 1000);
start..=end
}
/// Ip range `0..=10_000`.
fn get_10_percent() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(10 * 1000);
start..=end
}
/// Ip range holding the single value `10_000`.
fn get_1_percent() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(10 * 1000);
let end = Ipv6Addr::from_u128(10 * 1000);
start..=end
}
/// Parses and runs the query `"{field}:[{start} TO {end}] {suffix}"` against `index`
/// and returns the number of hits.
fn excute_query(
field: &str,
ip_range: RangeInclusive<Ipv6Addr>,
suffix: &str,
index: &Index,
) -> usize {
let gen_query_inclusive = |from: &Ipv6Addr, to: &Ipv6Addr| {
format!(
"{}:[{} TO {}] {}",
field,
&from.to_string(),
&to.to_string(),
suffix
)
};
let query = gen_query_inclusive(ip_range.start(), ip_range.end());
let query_from_text = |text: &str| {
QueryParser::for_index(&index, vec![])
.parse_query(text)
.unwrap()
};
let query = query_from_text(&query);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
searcher.search(&query, &(Count)).unwrap()
}
// Single valued field (`ip`).
#[bench]
fn bench_ip_range_hit_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_90_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_10_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_1_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_10_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_1_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_1_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_1_percent(), "AND id:veryfew", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_10_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_90_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_90_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_90_percent(), "AND id:veryfew", &index));
}
// Multi valued field (`ips`).
#[bench]
fn bench_ip_range_hit_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_90_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_10_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_1_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_10_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_1_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_1_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_1_percent(), "AND id:veryfew", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_10_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_90_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_90_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_90_percent(), "AND id:veryfew", &index));
}
}

Some files were not shown because too many files have changed in this diff Show More