Compare commits


1 commit

Author: Paul Masurel
SHA1: 727d024a23
Date: 2022-10-20 10:19:41 +09:00

Bugfix: broken positions.

For a Field with several FieldValues, when one value contained no token at all, the token position was reinitialized to 0.

As a result, PhraseQueries could return false positives. In addition, after the computation of the position delta, the u32 could underflow, yielding a gigantic delta.

We haven't been able to fully explain the bug reported in #1629, but it is assumed that in some corner case these deltas can cause a panic.

Closes #1629
70 changed files with 596 additions and 2309 deletions
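The failure mode described in the commit message can be illustrated with a minimal sketch (hypothetical names and structure, not tantivy's actual indexing code):

```rust
/// Hypothetical sketch of the bug: positions of later field values are shifted
/// by a running offset, and a value with no token wrongly resets that offset to
/// 0, so a later absolute position can be smaller than the previous one and the
/// delta computation underflows u32.
fn encode_position_deltas(field_values: &[Vec<u32>]) -> Vec<u32> {
    let mut deltas = Vec::new();
    let mut offset = 0u32;   // shift applied to positions of the current value
    let mut prev_pos = 0u32; // last absolute position emitted
    for token_positions in field_values {
        for &pos in token_positions {
            let abs_pos = offset + pos;
            // After the faulty reset below, `abs_pos` can be < `prev_pos`:
            // wrapping_sub then yields a gigantic delta (and a plain `-`
            // panics when overflow checks are enabled).
            deltas.push(abs_pos.wrapping_sub(prev_pos));
            prev_pos = abs_pos;
        }
        // Bug analogue: a value with no token resets the offset to 0 instead
        // of carrying the running position over to the next value.
        offset = token_positions.last().map(|p| offset + p + 1).unwrap_or(0);
    }
    deltas
}
```

For instance, `encode_position_deltas(&[vec![0, 1, 2], vec![], vec![0, 1]])` emits a wrapped-around delta for the third value, which is the kind of gigantic delta the fix guards against.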


@@ -1,8 +1,6 @@
Tantivy 0.19 Tantivy 0.19
================================ ================================
- Skip score calculation, when no scoring is required [#1646](https://github.com/quickwit-oss/tantivy/pull/1646) (@PSeitz)
- Limit fast fields to u32 (`get_val(u32)`) [#1644](https://github.com/quickwit-oss/tantivy/pull/1644) (@PSeitz)
- Major bugfix: Fix missing fieldnorms for u64, i64, f64, bool, bytes and date [#1620](https://github.com/quickwit-oss/tantivy/pull/1620) (@PSeitz) - Major bugfix: Fix missing fieldnorms for u64, i64, f64, bool, bytes and date [#1620](https://github.com/quickwit-oss/tantivy/pull/1620) (@PSeitz)
- Updated [Date Field Type](https://github.com/quickwit-oss/tantivy/pull/1396) - Updated [Date Field Type](https://github.com/quickwit-oss/tantivy/pull/1396)
The `DateTime` type has been updated to hold timestamps with microseconds precision. The `DateTime` type has been updated to hold timestamps with microseconds precision.
@@ -29,6 +27,7 @@ Tantivy 0.19
- [#1582](https://github.com/quickwit-oss/tantivy/pull/1582 (@PSeitz) - [#1582](https://github.com/quickwit-oss/tantivy/pull/1582 (@PSeitz)
- [#1611](https://github.com/quickwit-oss/tantivy/pull/1611 (@PSeitz) - [#1611](https://github.com/quickwit-oss/tantivy/pull/1611 (@PSeitz)
Tantivy 0.18 Tantivy 0.18
================================ ================================
@@ -45,10 +44,6 @@ Tantivy 0.18
- Add terms aggregation (@PSeitz) - Add terms aggregation (@PSeitz)
- Add support for zstd compression (@kryesh) - Add support for zstd compression (@kryesh)
Tantivy 0.18.1
================================
- Hotfix: positions computation. #1629 (@fmassot, @fulmicoton, @PSeitz)
Tantivy 0.17 Tantivy 0.17
================================ ================================


@@ -1,6 +1,6 @@
[package] [package]
name = "tantivy" name = "tantivy"
version = "0.19.0-dev" version = "0.18.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"] authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT" license = "MIT"
categories = ["database-implementations", "data-structures"] categories = ["database-implementations", "data-structures"]
@@ -14,13 +14,12 @@ edition = "2021"
rust-version = "1.62" rust-version = "1.62"
[dependencies] [dependencies]
oneshot = "0.1.5" oneshot = "0.1.3"
base64 = "0.13.0" base64 = "0.13.0"
byteorder = "1.4.3" byteorder = "1.4.3"
crc32fast = "1.3.2" crc32fast = "1.3.2"
once_cell = "1.10.0" once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] } regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
aho-corasick = "0.7"
tantivy-fst = "0.4.0" tantivy-fst = "0.4.0"
memmap2 = { version = "0.5.3", optional = true } memmap2 = { version = "0.5.3", optional = true }
lz4_flex = { version = "0.9.2", default-features = false, features = ["checked-decode"], optional = true } lz4_flex = { version = "0.9.2", default-features = false, features = ["checked-decode"], optional = true }
@@ -46,7 +45,7 @@ rust-stemmers = "1.2.0"
downcast-rs = "1.2.0" downcast-rs = "1.2.0"
bitpacking = { version = "0.8.4", default-features = false, features = ["bitpacker4x"] } bitpacking = { version = "0.8.4", default-features = false, features = ["bitpacker4x"] }
census = "0.4.0" census = "0.4.0"
rustc-hash = "1.1.0" fnv = "1.0.7"
thiserror = "1.0.30" thiserror = "1.0.30"
htmlescape = "0.3.1" htmlescape = "0.3.1"
fail = "0.5.0" fail = "0.5.0"
@@ -71,10 +70,10 @@ maplit = "1.0.2"
matches = "0.1.9" matches = "0.1.9"
pretty_assertions = "1.2.1" pretty_assertions = "1.2.1"
proptest = "1.0.0" proptest = "1.0.0"
criterion = "0.4" criterion = "0.3.5"
test-log = "0.2.10" test-log = "0.2.10"
env_logger = "0.9.0" env_logger = "0.9.0"
pprof = { version = "0.11.0", features = ["flamegraph", "criterion"] } pprof = { version = "0.10.0", features = ["flamegraph", "criterion"] }
futures = "0.3.21" futures = "0.3.21"
[dev-dependencies.fail] [dev-dependencies.fail]


@@ -87,15 +87,15 @@ impl BitUnpacker {
} }
#[inline] #[inline]
pub fn get(&self, idx: u32, data: &[u8]) -> u64 { pub fn get(&self, idx: u64, data: &[u8]) -> u64 {
if self.num_bits == 0 { if self.num_bits == 0 {
return 0u64; return 0u64;
} }
let addr_in_bits = idx * self.num_bits as u32; let addr_in_bits = idx * self.num_bits;
let addr = addr_in_bits >> 3; let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7; let bit_shift = addr_in_bits & 7;
debug_assert!( debug_assert!(
addr + 8 <= data.len() as u32, addr + 8 <= data.len() as u64,
"The fast field field should have been padded with 7 bytes." "The fast field field should have been padded with 7 bytes."
); );
let bytes: [u8; 8] = (&data[(addr as usize)..(addr as usize) + 8]) let bytes: [u8; 8] = (&data[(addr as usize)..(addr as usize) + 8])
@@ -130,7 +130,7 @@ mod test {
fn test_bitpacker_util(len: usize, num_bits: u8) { fn test_bitpacker_util(len: usize, num_bits: u8) {
let (bitunpacker, vals, data) = create_fastfield_bitpacker(len, num_bits); let (bitunpacker, vals, data) = create_fastfield_bitpacker(len, num_bits);
for (i, val) in vals.iter().enumerate() { for (i, val) in vals.iter().enumerate() {
assert_eq!(bitunpacker.get(i as u32, &data), *val); assert_eq!(bitunpacker.get(i as u64, &data), *val);
} }
} }
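As a point of reference, the address arithmetic in `get` can be restated as a standalone sketch (assuming, as the `debug_assert` above requires, that the buffer carries 7 trailing padding bytes, and that `num_bits` is small enough for the value to fit in the 8-byte window; this is only an illustration, not the crate's implementation):

```rust
// Illustrative restatement of the arithmetic in BitUnpacker::get (assumes
// num_bits <= 56 so the value always fits in the 8-byte window read below,
// and that `data` is padded with 7 trailing bytes).
fn unpack(idx: u64, num_bits: u8, data: &[u8]) -> u64 {
    if num_bits == 0 {
        return 0;
    }
    let addr_in_bits = idx * num_bits as u64;  // first bit of value `idx`
    let addr = (addr_in_bits >> 3) as usize;   // byte containing that bit
    let bit_shift = (addr_in_bits & 7) as u32; // offset of the bit within that byte
    // Read an 8-byte little-endian window starting at `addr` (hence the padding),
    // shift the wanted bits down and mask them out.
    let bytes: [u8; 8] = data[addr..addr + 8].try_into().unwrap();
    let mask = (1u64 << num_bits) - 1;
    (u64::from_le_bytes(bytes) >> bit_shift) & mask
}
```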


@@ -130,7 +130,7 @@ impl BlockedBitpacker {
let pos_in_block = idx % BLOCK_SIZE as usize; let pos_in_block = idx % BLOCK_SIZE as usize;
if let Some(metadata) = self.offset_and_bits.get(metadata_pos) { if let Some(metadata) = self.offset_and_bits.get(metadata_pos) {
let unpacked = BitUnpacker::new(metadata.num_bits()).get( let unpacked = BitUnpacker::new(metadata.num_bits()).get(
pos_in_block as u32, pos_in_block as u64,
&self.compressed_blocks[metadata.offset() as usize..], &self.compressed_blocks[metadata.offset() as usize..],
); );
unpacked + metadata.base_value() unpacked + metadata.base_value()


@@ -157,7 +157,7 @@ fn vint_len(data: &[u8]) -> usize {
/// If the buffer does not start by a valid /// If the buffer does not start by a valid
/// vint payload /// vint payload
pub fn read_u32_vint(data: &mut &[u8]) -> u32 { pub fn read_u32_vint(data: &mut &[u8]) -> u32 {
let (result, vlen) = read_u32_vint_no_advance(data); let (result, vlen) = read_u32_vint_no_advance(*data);
*data = &data[vlen..]; *data = &data[vlen..];
result result
} }


@@ -105,7 +105,7 @@ impl SegmentCollector for StatsSegmentCollector {
type Fruit = Option<Stats>; type Fruit = Option<Stats>;
fn collect(&mut self, doc: u32, _score: Score) { fn collect(&mut self, doc: u32, _score: Score) {
let value = self.fast_field_reader.get_val(doc) as f64; let value = self.fast_field_reader.get_val(doc as u64) as f64;
self.stats.count += 1; self.stats.count += 1;
self.stats.sum += value; self.stats.sum += value;
self.stats.squared_sum += value * value; self.stats.squared_sum += value * value;
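The three accumulators kept by `collect` above (count, sum, squared sum) are the usual sufficient statistics; as a hedged sketch, they are typically finalized like this (the `Stats` struct below is a standalone illustration, not necessarily the example's own definition):

```rust
// Illustrative: deriving mean and variance from the accumulated fields.
struct Stats {
    count: u64,
    sum: f64,
    squared_sum: f64,
}

impl Stats {
    fn mean(&self) -> f64 {
        self.sum / self.count as f64
    }
    fn variance(&self) -> f64 {
        // E[X^2] - E[X]^2, the standard shortcut given sum and squared_sum.
        self.squared_sum / self.count as f64 - self.mean() * self.mean()
    }
}
```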


@@ -51,7 +51,7 @@ impl Warmer for DynamicPriceColumn {
let product_id_reader = segment.fast_fields().u64(self.field)?; let product_id_reader = segment.fast_fields().u64(self.field)?;
let product_ids: Vec<ProductId> = segment let product_ids: Vec<ProductId> = segment
.doc_ids_alive() .doc_ids_alive()
.map(|doc| product_id_reader.get_val(doc)) .map(|doc| product_id_reader.get_val(doc as u64))
.collect(); .collect();
let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter(); let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
let mut price_vals: Vec<Price> = Vec::new(); let mut price_vals: Vec<Price> = Vec::new();


@@ -65,7 +65,7 @@ mod tests {
b.iter(|| { b.iter(|| {
let mut a = 0u64; let mut a = 0u64;
for _ in 0..n { for _ in 0..n {
a = column.get_val(a as u32); a = column.get_val(a as u64);
} }
a a
}); });
@@ -101,7 +101,7 @@ mod tests {
fn get_u128_column_from_data(data: &[u128]) -> Arc<dyn Column<u128>> { fn get_u128_column_from_data(data: &[u128]) -> Arc<dyn Column<u128>> {
let mut out = vec![]; let mut out = vec![];
let iter_gen = || data.iter().cloned(); let iter_gen = || data.iter().cloned();
serialize_u128(iter_gen, data.len() as u32, &mut out).unwrap(); serialize_u128(iter_gen, data.len() as u64, &mut out).unwrap();
let out = OwnedBytes::new(out); let out = OwnedBytes::new(out);
open_u128::<u128>(out).unwrap() open_u128::<u128>(out).unwrap()
} }
@@ -111,15 +111,7 @@ mod tests {
let (major_item, _minor_item, data) = get_data_50percent_item(); let (major_item, _minor_item, data) = get_data_50percent_item();
let column = get_u128_column_from_data(&data); let column = get_u128_column_from_data(&data);
b.iter(|| { b.iter(|| column.get_between_vals(major_item..=major_item));
let mut positions = Vec::new();
column.get_docids_for_value_range(
major_item..=major_item,
0..data.len() as u32,
&mut positions,
);
positions
});
} }
#[bench] #[bench]
@@ -127,15 +119,7 @@ mod tests {
let (_major_item, minor_item, data) = get_data_50percent_item(); let (_major_item, minor_item, data) = get_data_50percent_item();
let column = get_u128_column_from_data(&data); let column = get_u128_column_from_data(&data);
b.iter(|| { b.iter(|| column.get_between_vals(minor_item..=minor_item));
let mut positions = Vec::new();
column.get_docids_for_value_range(
minor_item..=minor_item,
0..data.len() as u32,
&mut positions,
);
positions
});
} }
#[bench] #[bench]
@@ -143,11 +127,7 @@ mod tests {
let (_major_item, _minor_item, data) = get_data_50percent_item(); let (_major_item, _minor_item, data) = get_data_50percent_item();
let column = get_u128_column_from_data(&data); let column = get_u128_column_from_data(&data);
b.iter(|| { b.iter(|| column.get_between_vals(0..=u128::MAX));
let mut positions = Vec::new();
column.get_docids_for_value_range(0..=u128::MAX, 0..data.len() as u32, &mut positions);
positions
});
} }
#[bench] #[bench]
@@ -157,7 +137,7 @@ mod tests {
b.iter(|| { b.iter(|| {
let mut a = 0u128; let mut a = 0u128;
for i in 0u64..column.num_vals() as u64 { for i in 0u64..column.num_vals() as u64 {
a += column.get_val(i as u32); a += column.get_val(i);
} }
a a
}); });
@@ -171,7 +151,7 @@ mod tests {
let n = column.num_vals(); let n = column.num_vals();
let mut a = 0u128; let mut a = 0u128;
for i in (0..n / 5).map(|val| val * 5) { for i in (0..n / 5).map(|val| val * 5) {
a += column.get_val(i); a += column.get_val(i as u64);
} }
a a
}); });
@@ -196,9 +176,9 @@ mod tests {
let n = permutation.len(); let n = permutation.len();
let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation); let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation);
b.iter(|| { b.iter(|| {
let mut a = 0; let mut a = 0u64;
for i in (0..n / 7).map(|val| val * 7) { for i in (0..n / 7).map(|val| val * 7) {
a += column.get_val(i as u32); a += column.get_val(i as u64);
} }
a a
}); });
@@ -211,7 +191,7 @@ mod tests {
let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation); let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation);
b.iter(|| { b.iter(|| {
let mut a = 0u64; let mut a = 0u64;
for i in 0u32..n as u32 { for i in 0u64..n as u64 {
a += column.get_val(i); a += column.get_val(i);
} }
a a
@@ -225,8 +205,8 @@ mod tests {
let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation); let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation);
b.iter(|| { b.iter(|| {
let mut a = 0u64; let mut a = 0u64;
for i in 0..n { for i in 0..n as u64 {
a += column.get_val(i as u32); a += column.get_val(i);
} }
a a
}); });


@@ -17,7 +17,7 @@ pub struct BitpackedReader {
impl Column for BitpackedReader { impl Column for BitpackedReader {
#[inline] #[inline]
fn get_val(&self, doc: u32) -> u64 { fn get_val(&self, doc: u64) -> u64 {
self.bit_unpacker.get(doc, &self.data) self.bit_unpacker.get(doc, &self.data)
} }
#[inline] #[inline]
@@ -30,7 +30,7 @@ impl Column for BitpackedReader {
self.normalized_header.max_value self.normalized_header.max_value
} }
#[inline] #[inline]
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.normalized_header.num_vals self.normalized_header.num_vals
} }
} }


@@ -36,7 +36,7 @@ impl BinarySerializable for Block {
} }
} }
fn compute_num_blocks(num_vals: u32) -> usize { fn compute_num_blocks(num_vals: u64) -> usize {
(num_vals as usize + CHUNK_SIZE - 1) / CHUNK_SIZE (num_vals as usize + CHUNK_SIZE - 1) / CHUNK_SIZE
} }
@@ -72,13 +72,13 @@ impl FastFieldCodec for BlockwiseLinearCodec {
// Estimate first_chunk and extrapolate // Estimate first_chunk and extrapolate
fn estimate(column: &dyn crate::Column) -> Option<f32> { fn estimate(column: &dyn crate::Column) -> Option<f32> {
if column.num_vals() < 10 * CHUNK_SIZE as u32 { if column.num_vals() < 10 * CHUNK_SIZE as u64 {
return None; return None;
} }
let mut first_chunk: Vec<u64> = column.iter().take(CHUNK_SIZE as usize).collect(); let mut first_chunk: Vec<u64> = column.iter().take(CHUNK_SIZE as usize).collect();
let line = Line::train(&VecColumn::from(&first_chunk)); let line = Line::train(&VecColumn::from(&first_chunk));
for (i, buffer_val) in first_chunk.iter_mut().enumerate() { for (i, buffer_val) in first_chunk.iter_mut().enumerate() {
let interpolated_val = line.eval(i as u32); let interpolated_val = line.eval(i as u64);
*buffer_val = buffer_val.wrapping_sub(interpolated_val); *buffer_val = buffer_val.wrapping_sub(interpolated_val);
} }
let estimated_bit_width = first_chunk let estimated_bit_width = first_chunk
@@ -95,7 +95,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
}; };
let num_bits = estimated_bit_width as u64 * column.num_vals() as u64 let num_bits = estimated_bit_width as u64 * column.num_vals() as u64
// function metadata per block // function metadata per block
+ metadata_per_block as u64 * (column.num_vals() as u64 / CHUNK_SIZE as u64); + metadata_per_block as u64 * (column.num_vals() / CHUNK_SIZE as u64);
let num_bits_uncompressed = 64 * column.num_vals(); let num_bits_uncompressed = 64 * column.num_vals();
Some(num_bits as f32 / num_bits_uncompressed as f32) Some(num_bits as f32 / num_bits_uncompressed as f32)
} }
@@ -121,7 +121,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
assert!(!buffer.is_empty()); assert!(!buffer.is_empty());
for (i, buffer_val) in buffer.iter_mut().enumerate() { for (i, buffer_val) in buffer.iter_mut().enumerate() {
let interpolated_val = line.eval(i as u32); let interpolated_val = line.eval(i as u64);
*buffer_val = buffer_val.wrapping_sub(interpolated_val); *buffer_val = buffer_val.wrapping_sub(interpolated_val);
} }
let bit_width = buffer.iter().copied().map(compute_num_bits).max().unwrap(); let bit_width = buffer.iter().copied().map(compute_num_bits).max().unwrap();
@@ -161,9 +161,9 @@ pub struct BlockwiseLinearReader {
impl Column for BlockwiseLinearReader { impl Column for BlockwiseLinearReader {
#[inline(always)] #[inline(always)]
fn get_val(&self, idx: u32) -> u64 { fn get_val(&self, idx: u64) -> u64 {
let block_id = (idx / CHUNK_SIZE as u32) as usize; let block_id = (idx / CHUNK_SIZE as u64) as usize;
let idx_within_block = idx % (CHUNK_SIZE as u32); let idx_within_block = idx % (CHUNK_SIZE as u64);
let block = &self.blocks[block_id]; let block = &self.blocks[block_id];
let interpoled_val: u64 = block.line.eval(idx_within_block); let interpoled_val: u64 = block.line.eval(idx_within_block);
let block_bytes = &self.data[block.data_start_offset..]; let block_bytes = &self.data[block.data_start_offset..];
@@ -180,7 +180,7 @@ impl Column for BlockwiseLinearReader {
self.normalized_header.max_value self.normalized_header.max_value
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.normalized_header.num_vals self.normalized_header.num_vals
} }
} }


@@ -1,5 +1,5 @@
use std::marker::PhantomData; use std::marker::PhantomData;
use std::ops::{Range, RangeInclusive}; use std::ops::RangeInclusive;
use tantivy_bitpacker::minmax; use tantivy_bitpacker::minmax;
@@ -14,7 +14,7 @@ pub trait Column<T: PartialOrd = u64>: Send + Sync {
/// # Panics /// # Panics
/// ///
/// May panic if `idx` is greater than the column length. /// May panic if `idx` is greater than the column length.
fn get_val(&self, idx: u32) -> T; fn get_val(&self, idx: u64) -> T;
/// Fills an output buffer with the fast field values /// Fills an output buffer with the fast field values
/// associated with the `DocId` going from /// associated with the `DocId` going from
@@ -27,28 +27,21 @@ pub trait Column<T: PartialOrd = u64>: Send + Sync {
#[inline] #[inline]
fn get_range(&self, start: u64, output: &mut [T]) { fn get_range(&self, start: u64, output: &mut [T]) {
for (out, idx) in output.iter_mut().zip(start..) { for (out, idx) in output.iter_mut().zip(start..) {
*out = self.get_val(idx as u32); *out = self.get_val(idx);
} }
} }
/// Get the positions of values which are in the provided value range. /// Return the positions of values which are in the provided range.
///
/// Note that position == docid for single value fast fields
#[inline] #[inline]
fn get_docids_for_value_range( fn get_between_vals(&self, range: RangeInclusive<T>) -> Vec<u64> {
&self, let mut vals = Vec::new();
value_range: RangeInclusive<T>, for idx in 0..self.num_vals() {
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
let doc_id_range = doc_id_range.start..doc_id_range.end.min(self.num_vals());
for idx in doc_id_range.start..doc_id_range.end {
let val = self.get_val(idx); let val = self.get_val(idx);
if value_range.contains(&val) { if range.contains(&val) {
positions.push(idx); vals.push(idx);
} }
} }
vals
} }
/// Returns the minimum value for this fast field. /// Returns the minimum value for this fast field.
@@ -68,12 +61,7 @@ pub trait Column<T: PartialOrd = u64>: Send + Sync {
fn max_value(&self) -> T; fn max_value(&self) -> T;
/// The number of values in the column. /// The number of values in the column.
fn num_vals(&self) -> u32; fn num_vals(&self) -> u64;
/// The number of docs in the column. For single value columns this equals num_vals.
fn num_docs(&self) -> u32 {
self.num_vals()
}
/// Returns a iterator over the data /// Returns a iterator over the data
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = T> + 'a> { fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = T> + 'a> {
@@ -89,7 +77,7 @@ pub struct VecColumn<'a, T = u64> {
} }
impl<'a, C: Column<T>, T: Copy + PartialOrd> Column<T> for &'a C { impl<'a, C: Column<T>, T: Copy + PartialOrd> Column<T> for &'a C {
fn get_val(&self, idx: u32) -> T { fn get_val(&self, idx: u64) -> T {
(*self).get_val(idx) (*self).get_val(idx)
} }
@@ -101,7 +89,7 @@ impl<'a, C: Column<T>, T: Copy + PartialOrd> Column<T> for &'a C {
(*self).max_value() (*self).max_value()
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
(*self).num_vals() (*self).num_vals()
} }
@@ -115,7 +103,7 @@ impl<'a, C: Column<T>, T: Copy + PartialOrd> Column<T> for &'a C {
} }
impl<'a, T: Copy + PartialOrd + Send + Sync> Column<T> for VecColumn<'a, T> { impl<'a, T: Copy + PartialOrd + Send + Sync> Column<T> for VecColumn<'a, T> {
fn get_val(&self, position: u32) -> T { fn get_val(&self, position: u64) -> T {
self.values[position as usize] self.values[position as usize]
} }
@@ -131,8 +119,8 @@ impl<'a, T: Copy + PartialOrd + Send + Sync> Column<T> for VecColumn<'a, T> {
self.max_value self.max_value
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.values.len() as u32 self.values.len() as u64
} }
fn get_range(&self, start: u64, output: &mut [T]) { fn get_range(&self, start: u64, output: &mut [T]) {
@@ -168,7 +156,7 @@ struct MonotonicMappingColumn<C, T, Input> {
/// monotonic_mapping.inverse(monotonic_mapping.mapping(el)) == el /// monotonic_mapping.inverse(monotonic_mapping.mapping(el)) == el
/// ///
/// The inverse of the mapping is required for: /// The inverse of the mapping is required for:
/// `fn get_positions_for_value_range(&self, range: RangeInclusive<T>) -> Vec<u64> ` /// `fn get_between_vals(&self, range: RangeInclusive<T>) -> Vec<u64> `
/// The user provides the original value range and we need to monotonic map them in the same way the /// The user provides the original value range and we need to monotonic map them in the same way the
/// serialization does before calling the underlying column. /// serialization does before calling the underlying column.
/// ///
@@ -200,7 +188,7 @@ where
Output: PartialOrd + Send + Sync + Clone, Output: PartialOrd + Send + Sync + Clone,
{ {
#[inline] #[inline]
fn get_val(&self, idx: u32) -> Output { fn get_val(&self, idx: u64) -> Output {
let from_val = self.from_column.get_val(idx); let from_val = self.from_column.get_val(idx);
self.monotonic_mapping.mapping(from_val) self.monotonic_mapping.mapping(from_val)
} }
@@ -215,7 +203,7 @@ where
self.monotonic_mapping.mapping(from_max_value) self.monotonic_mapping.mapping(from_max_value)
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.from_column.num_vals() self.from_column.num_vals()
} }
@@ -227,17 +215,10 @@ where
) )
} }
fn get_docids_for_value_range( fn get_between_vals(&self, range: RangeInclusive<Output>) -> Vec<u64> {
&self, self.from_column.get_between_vals(
range: RangeInclusive<Output>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
self.from_column.get_docids_for_value_range(
self.monotonic_mapping.inverse(range.start().clone()) self.monotonic_mapping.inverse(range.start().clone())
..=self.monotonic_mapping.inverse(range.end().clone()), ..=self.monotonic_mapping.inverse(range.end().clone()),
doc_id_range,
positions,
) )
} }
@@ -245,7 +226,6 @@ where
// and we do not have any specialized implementation anyway. // and we do not have any specialized implementation anyway.
} }
/// Wraps an iterator into a `Column`.
pub struct IterColumn<T>(T); pub struct IterColumn<T>(T);
impl<T> From<T> for IterColumn<T> impl<T> From<T> for IterColumn<T>
@@ -261,7 +241,7 @@ where
T: Iterator + Clone + ExactSizeIterator + Send + Sync, T: Iterator + Clone + ExactSizeIterator + Send + Sync,
T::Item: PartialOrd, T::Item: PartialOrd,
{ {
fn get_val(&self, idx: u32) -> T::Item { fn get_val(&self, idx: u64) -> T::Item {
self.0.clone().nth(idx as usize).unwrap() self.0.clone().nth(idx as usize).unwrap()
} }
@@ -273,8 +253,8 @@ where
self.0.clone().last().unwrap() self.0.clone().last().unwrap()
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.0.len() as u32 self.0.len() as u64
} }
fn iter(&self) -> Box<dyn Iterator<Item = T::Item> + '_> { fn iter(&self) -> Box<dyn Iterator<Item = T::Item> + '_> {
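As a usage sketch of the `get_between_vals` form of the trait shown in this hunk, a range scan can be composed from the methods above (the helper is illustrative, not part of the crate):

```rust
use std::ops::RangeInclusive;

// Illustrative helper: get_between_vals returns the positions whose value lies
// in `range`; get_val then reads each matching value back.
fn sum_values_in_range(column: &dyn Column<u64>, range: RangeInclusive<u64>) -> u64 {
    column
        .get_between_vals(range)
        .into_iter()
        .map(|pos| column.get_val(pos))
        .sum()
}
```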


@@ -57,7 +57,7 @@ fn num_bits(val: u128) -> u8 {
/// metadata. /// metadata.
pub fn get_compact_space( pub fn get_compact_space(
values_deduped_sorted: &BTreeSet<u128>, values_deduped_sorted: &BTreeSet<u128>,
total_num_values: u32, total_num_values: u64,
cost_per_blank: usize, cost_per_blank: usize,
) -> CompactSpace { ) -> CompactSpace {
let mut compact_space_builder = CompactSpaceBuilder::new(); let mut compact_space_builder = CompactSpaceBuilder::new();


@@ -14,7 +14,7 @@ use std::{
cmp::Ordering, cmp::Ordering,
collections::BTreeSet, collections::BTreeSet,
io::{self, Write}, io::{self, Write},
ops::{Range, RangeInclusive}, ops::RangeInclusive,
}; };
use common::{BinarySerializable, CountingWriter, VInt, VIntU128}; use common::{BinarySerializable, CountingWriter, VInt, VIntU128};
@@ -165,13 +165,13 @@ pub struct IPCodecParams {
bit_unpacker: BitUnpacker, bit_unpacker: BitUnpacker,
min_value: u128, min_value: u128,
max_value: u128, max_value: u128,
num_vals: u32, num_vals: u64,
num_bits: u8, num_bits: u8,
} }
impl CompactSpaceCompressor { impl CompactSpaceCompressor {
/// Taking the vals as Vec may cost a lot of memory. It is used to sort the vals. /// Taking the vals as Vec may cost a lot of memory. It is used to sort the vals.
pub fn train_from(iter: impl Iterator<Item = u128>, num_vals: u32) -> Self { pub fn train_from(iter: impl Iterator<Item = u128>, num_vals: u64) -> Self {
let mut values_sorted = BTreeSet::new(); let mut values_sorted = BTreeSet::new();
values_sorted.extend(iter); values_sorted.extend(iter);
let total_num_values = num_vals; let total_num_values = num_vals;
@@ -200,7 +200,7 @@ impl CompactSpaceCompressor {
bit_unpacker: BitUnpacker::new(num_bits), bit_unpacker: BitUnpacker::new(num_bits),
min_value, min_value,
max_value, max_value,
num_vals: total_num_values, num_vals: total_num_values as u64,
num_bits, num_bits,
}, },
} }
@@ -267,7 +267,7 @@ impl BinarySerializable for IPCodecParams {
let _header_flags = u64::deserialize(reader)?; let _header_flags = u64::deserialize(reader)?;
let min_value = VIntU128::deserialize(reader)?.0; let min_value = VIntU128::deserialize(reader)?.0;
let max_value = VIntU128::deserialize(reader)?.0; let max_value = VIntU128::deserialize(reader)?.0;
let num_vals = VIntU128::deserialize(reader)?.0 as u32; let num_vals = VIntU128::deserialize(reader)?.0 as u64;
let num_bits = u8::deserialize(reader)?; let num_bits = u8::deserialize(reader)?;
let compact_space = CompactSpace::deserialize(reader)?; let compact_space = CompactSpace::deserialize(reader)?;
@@ -284,7 +284,7 @@ impl BinarySerializable for IPCodecParams {
impl Column<u128> for CompactSpaceDecompressor { impl Column<u128> for CompactSpaceDecompressor {
#[inline] #[inline]
fn get_val(&self, doc: u32) -> u128 { fn get_val(&self, doc: u64) -> u128 {
self.get(doc) self.get(doc)
} }
@@ -296,7 +296,7 @@ impl Column<u128> for CompactSpaceDecompressor {
self.max_value() self.max_value()
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.params.num_vals self.params.num_vals
} }
@@ -304,15 +304,8 @@ impl Column<u128> for CompactSpaceDecompressor {
fn iter(&self) -> Box<dyn Iterator<Item = u128> + '_> { fn iter(&self) -> Box<dyn Iterator<Item = u128> + '_> {
Box::new(self.iter()) Box::new(self.iter())
} }
fn get_between_vals(&self, range: RangeInclusive<u128>) -> Vec<u64> {
#[inline] self.get_between_vals(range)
fn get_docids_for_value_range(
&self,
value_range: RangeInclusive<u128>,
positions_range: Range<u32>,
positions: &mut Vec<u32>,
) {
self.get_positions_for_value_range(value_range, positions_range, positions)
} }
} }
@@ -347,19 +340,12 @@ impl CompactSpaceDecompressor {
/// Comparing on compact space: Real dataset 1.08 GElements/s /// Comparing on compact space: Real dataset 1.08 GElements/s
/// ///
/// Comparing on original space: Real dataset .06 GElements/s (not completely optimized) /// Comparing on original space: Real dataset .06 GElements/s (not completely optimized)
#[inline] pub fn get_between_vals(&self, range: RangeInclusive<u128>) -> Vec<u64> {
pub fn get_positions_for_value_range( if range.start() > range.end() {
&self, return Vec::new();
value_range: RangeInclusive<u128>,
position_range: Range<u32>,
positions: &mut Vec<u32>,
) {
if value_range.start() > value_range.end() {
return;
} }
let position_range = position_range.start..position_range.end.min(self.num_vals()); let from_value = *range.start();
let from_value = *value_range.start(); let to_value = *range.end();
let to_value = *value_range.end();
assert!(to_value >= from_value); assert!(to_value >= from_value);
let compact_from = self.u128_to_compact(from_value); let compact_from = self.u128_to_compact(from_value);
let compact_to = self.u128_to_compact(to_value); let compact_to = self.u128_to_compact(to_value);
@@ -367,7 +353,7 @@ impl CompactSpaceDecompressor {
// Quick return, if both ranges fall into the same non-mapped space, the range can't cover // Quick return, if both ranges fall into the same non-mapped space, the range can't cover
// any values, so we can early exit // any values, so we can early exit
match (compact_to, compact_from) { match (compact_to, compact_from) {
(Err(pos1), Err(pos2)) if pos1 == pos2 => return, (Err(pos1), Err(pos2)) if pos1 == pos2 => return Vec::new(),
_ => {} _ => {}
} }
@@ -389,28 +375,27 @@ impl CompactSpaceDecompressor {
}); });
let range = compact_from..=compact_to; let range = compact_from..=compact_to;
let mut positions = Vec::new();
let scan_num_docs = position_range.end - position_range.start;
let step_size = 4; let step_size = 4;
let cutoff = position_range.start + scan_num_docs - scan_num_docs % step_size; let cutoff = self.params.num_vals - self.params.num_vals % step_size;
let mut push_if_in_range = |idx, val| { let mut push_if_in_range = |idx, val| {
if range.contains(&val) { if range.contains(&val) {
positions.push(idx); positions.push(idx);
} }
}; };
let get_val = |idx| self.params.bit_unpacker.get(idx, &self.data); let get_val = |idx| self.params.bit_unpacker.get(idx as u64, &self.data);
// unrolled loop // unrolled loop
for idx in (position_range.start..cutoff).step_by(step_size as usize) { for idx in (0..cutoff).step_by(step_size as usize) {
let idx1 = idx; let idx1 = idx;
let idx2 = idx + 1; let idx2 = idx + 1;
let idx3 = idx + 2; let idx3 = idx + 2;
let idx4 = idx + 3; let idx4 = idx + 3;
let val1 = get_val(idx1 as u32); let val1 = get_val(idx1);
let val2 = get_val(idx2 as u32); let val2 = get_val(idx2);
let val3 = get_val(idx3 as u32); let val3 = get_val(idx3);
let val4 = get_val(idx4 as u32); let val4 = get_val(idx4);
push_if_in_range(idx1, val1); push_if_in_range(idx1, val1);
push_if_in_range(idx2, val2); push_if_in_range(idx2, val2);
push_if_in_range(idx3, val3); push_if_in_range(idx3, val3);
@@ -418,15 +403,17 @@ impl CompactSpaceDecompressor {
} }
// handle rest // handle rest
for idx in cutoff..position_range.end { for idx in cutoff..self.params.num_vals {
push_if_in_range(idx, get_val(idx as u32)); push_if_in_range(idx, get_val(idx));
} }
positions
} }
#[inline] #[inline]
fn iter_compact(&self) -> impl Iterator<Item = u64> + '_ { fn iter_compact(&self) -> impl Iterator<Item = u64> + '_ {
(0..self.params.num_vals) (0..self.params.num_vals)
.map(move |idx| self.params.bit_unpacker.get(idx, &self.data) as u64) .map(move |idx| self.params.bit_unpacker.get(idx as u64, &self.data) as u64)
} }
#[inline] #[inline]
@@ -438,7 +425,7 @@ impl CompactSpaceDecompressor {
} }
#[inline] #[inline]
pub fn get(&self, idx: u32) -> u128 { pub fn get(&self, idx: u64) -> u128 {
let compact = self.params.bit_unpacker.get(idx, &self.data); let compact = self.params.bit_unpacker.get(idx, &self.data);
self.compact_to_u128(compact) self.compact_to_u128(compact)
} }
@@ -465,7 +452,7 @@ mod tests {
] ]
.into_iter() .into_iter()
.collect(); .collect();
let compact_space = get_compact_space(ips, ips.len() as u32, 11); let compact_space = get_compact_space(ips, ips.len() as u64, 11);
let amplitude = compact_space.amplitude_compact_space(); let amplitude = compact_space.amplitude_compact_space();
assert_eq!(amplitude, 17); assert_eq!(amplitude, 17);
assert_eq!(1, compact_space.u128_to_compact(2).unwrap()); assert_eq!(1, compact_space.u128_to_compact(2).unwrap());
@@ -496,7 +483,7 @@ mod tests {
#[test] #[test]
fn compact_space_amplitude_test() { fn compact_space_amplitude_test() {
let ips = &[100000u128, 1000000].into_iter().collect(); let ips = &[100000u128, 1000000].into_iter().collect();
let compact_space = get_compact_space(ips, ips.len() as u32, 1); let compact_space = get_compact_space(ips, ips.len() as u64, 1);
let amplitude = compact_space.amplitude_compact_space(); let amplitude = compact_space.amplitude_compact_space();
assert_eq!(amplitude, 2); assert_eq!(amplitude, 2);
} }
@@ -504,21 +491,16 @@ mod tests {
fn test_all(data: OwnedBytes, expected: &[u128]) { fn test_all(data: OwnedBytes, expected: &[u128]) {
let decompressor = CompactSpaceDecompressor::open(data).unwrap(); let decompressor = CompactSpaceDecompressor::open(data).unwrap();
for (idx, expected_val) in expected.iter().cloned().enumerate() { for (idx, expected_val) in expected.iter().cloned().enumerate() {
let val = decompressor.get(idx as u32); let val = decompressor.get(idx as u64);
assert_eq!(val, expected_val); assert_eq!(val, expected_val);
let test_range = |range: RangeInclusive<u128>| { let test_range = |range: RangeInclusive<u128>| {
let expected_positions = expected let expected_positions = expected
.iter() .iter()
.positions(|val| range.contains(val)) .positions(|val| range.contains(val))
.map(|pos| pos as u32) .map(|pos| pos as u64)
.collect::<Vec<_>>(); .collect::<Vec<_>>();
let mut positions = Vec::new(); let positions = decompressor.get_between_vals(range);
decompressor.get_positions_for_value_range(
range,
0..decompressor.num_vals(),
&mut positions,
);
assert_eq!(positions, expected_positions); assert_eq!(positions, expected_positions);
}; };
@@ -533,7 +515,7 @@ mod tests {
let mut out = Vec::new(); let mut out = Vec::new();
serialize_u128( serialize_u128(
|| u128_vals.iter().cloned(), || u128_vals.iter().cloned(),
u128_vals.len() as u32, u128_vals.len() as u64,
&mut out, &mut out,
) )
.unwrap(); .unwrap();
@@ -558,107 +540,24 @@ mod tests {
]; ];
let data = test_aux_vals(vals); let data = test_aux_vals(vals);
let decomp = CompactSpaceDecompressor::open(data).unwrap(); let decomp = CompactSpaceDecompressor::open(data).unwrap();
let complete_range = 0..vals.len() as u32; let positions = decomp.get_between_vals(0..=1);
for (pos, val) in vals.iter().enumerate() {
let val = *val as u128;
let pos = pos as u32;
let mut positions = Vec::new();
decomp.get_positions_for_value_range(val..=val, pos..pos + 1, &mut positions);
assert_eq!(positions, vec![pos]);
}
// handle docid range out of bounds
let positions = get_positions_for_value_range_helper(&decomp, 0..=1, 1..u32::MAX);
assert_eq!(positions, vec![]);
let positions =
get_positions_for_value_range_helper(&decomp, 0..=1, complete_range.clone());
assert_eq!(positions, vec![0]); assert_eq!(positions, vec![0]);
let positions = let positions = decomp.get_between_vals(0..=2);
get_positions_for_value_range_helper(&decomp, 0..=2, complete_range.clone());
assert_eq!(positions, vec![0]); assert_eq!(positions, vec![0]);
let positions = let positions = decomp.get_between_vals(0..=3);
get_positions_for_value_range_helper(&decomp, 0..=3, complete_range.clone());
assert_eq!(positions, vec![0, 2]); assert_eq!(positions, vec![0, 2]);
assert_eq!( assert_eq!(decomp.get_between_vals(99999u128..=99999u128), vec![3]);
get_positions_for_value_range_helper( assert_eq!(decomp.get_between_vals(99999u128..=100000u128), vec![3, 4]);
&decomp, assert_eq!(decomp.get_between_vals(99998u128..=100000u128), vec![3, 4]);
99999u128..=99999u128, assert_eq!(decomp.get_between_vals(99998u128..=99999u128), vec![3]);
complete_range.clone() assert_eq!(decomp.get_between_vals(99998u128..=99998u128), vec![]);
), assert_eq!(decomp.get_between_vals(333u128..=333u128), vec![8]);
vec![3] assert_eq!(decomp.get_between_vals(332u128..=333u128), vec![8]);
); assert_eq!(decomp.get_between_vals(332u128..=334u128), vec![8]);
assert_eq!( assert_eq!(decomp.get_between_vals(333u128..=334u128), vec![8]);
get_positions_for_value_range_helper(
&decomp,
99999u128..=100000u128,
complete_range.clone()
),
vec![3, 4]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
99998u128..=100000u128,
complete_range.clone()
),
vec![3, 4]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
99998u128..=99999u128,
complete_range.clone()
),
vec![3]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
99998u128..=99998u128,
complete_range.clone()
),
vec![]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
333u128..=333u128,
complete_range.clone()
),
vec![8]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
332u128..=333u128,
complete_range.clone()
),
vec![8]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
332u128..=334u128,
complete_range.clone()
),
vec![8]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
333u128..=334u128,
complete_range.clone()
),
vec![8]
);
assert_eq!( assert_eq!(
get_positions_for_value_range_helper( decomp.get_between_vals(4_000_211_221u128..=5_000_000_000u128),
&decomp,
4_000_211_221u128..=5_000_000_000u128,
complete_range.clone()
),
vec![6, 7] vec![6, 7]
); );
} }
@@ -683,29 +582,12 @@ mod tests {
]; ];
let data = test_aux_vals(vals); let data = test_aux_vals(vals);
let decomp = CompactSpaceDecompressor::open(data).unwrap(); let decomp = CompactSpaceDecompressor::open(data).unwrap();
let complete_range = 0..vals.len() as u32; let positions = decomp.get_between_vals(0..=5);
assert_eq!( assert_eq!(positions, vec![]);
get_positions_for_value_range_helper(&decomp, 0..=5, complete_range.clone()), let positions = decomp.get_between_vals(0..=100);
vec![] assert_eq!(positions, vec![0]);
); let positions = decomp.get_between_vals(0..=105);
assert_eq!( assert_eq!(positions, vec![0]);
get_positions_for_value_range_helper(&decomp, 0..=100, complete_range.clone()),
vec![0]
);
assert_eq!(
get_positions_for_value_range_helper(&decomp, 0..=105, complete_range.clone()),
vec![0]
);
}
fn get_positions_for_value_range_helper<C: Column<T> + ?Sized, T: PartialOrd>(
column: &C,
value_range: RangeInclusive<T>,
doc_id_range: Range<u32>,
) -> Vec<u32> {
let mut positions = Vec::new();
column.get_docids_for_value_range(value_range, doc_id_range, &mut positions);
positions
} }
#[test] #[test]
@@ -726,33 +608,13 @@ mod tests {
5_000_000_000, 5_000_000_000,
]; ];
let mut out = Vec::new(); let mut out = Vec::new();
serialize_u128(|| vals.iter().cloned(), vals.len() as u32, &mut out).unwrap(); serialize_u128(|| vals.iter().cloned(), vals.len() as u64, &mut out).unwrap();
let decomp = open_u128::<u128>(OwnedBytes::new(out)).unwrap(); let decomp = open_u128::<u128>(OwnedBytes::new(out)).unwrap();
let complete_range = 0..vals.len() as u32;
assert_eq!( assert_eq!(decomp.get_between_vals(199..=200), vec![0]);
get_positions_for_value_range_helper(&*decomp, 199..=200, complete_range.clone()), assert_eq!(decomp.get_between_vals(199..=201), vec![0, 1]);
vec![0] assert_eq!(decomp.get_between_vals(200..=200), vec![0]);
); assert_eq!(decomp.get_between_vals(1_000_000..=1_000_000), vec![11]);
assert_eq!(
get_positions_for_value_range_helper(&*decomp, 199..=201, complete_range.clone()),
vec![0, 1]
);
assert_eq!(
get_positions_for_value_range_helper(&*decomp, 200..=200, complete_range.clone()),
vec![0]
);
assert_eq!(
get_positions_for_value_range_helper(
&*decomp,
1_000_000..=1_000_000,
complete_range.clone()
),
vec![11]
);
} }
#[test] #[test]


@@ -41,7 +41,7 @@ mod serialize;
use self::bitpacked::BitpackedCodec; use self::bitpacked::BitpackedCodec;
use self::blockwise_linear::BlockwiseLinearCodec; use self::blockwise_linear::BlockwiseLinearCodec;
pub use self::column::{monotonic_map_column, Column, IterColumn, VecColumn}; pub use self::column::{monotonic_map_column, Column, VecColumn};
use self::linear::LinearCodec; use self::linear::LinearCodec;
pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn}; pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128; pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128;
@@ -199,9 +199,9 @@ mod tests {
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0); let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
let reader = crate::open::<u64>(OwnedBytes::new(out)).unwrap(); let reader = crate::open::<u64>(OwnedBytes::new(out)).unwrap();
assert_eq!(reader.num_vals(), data.len() as u32); assert_eq!(reader.num_vals(), data.len() as u64);
for (doc, orig_val) in data.iter().copied().enumerate() { for (doc, orig_val) in data.iter().copied().enumerate() {
let val = reader.get_val(doc as u32); let val = reader.get_val(doc as u64);
assert_eq!( assert_eq!(
val, orig_val, val, orig_val,
"val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data \ "val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data \
@@ -211,18 +211,13 @@ mod tests {
if !data.is_empty() { if !data.is_empty() {
let test_rand_idx = rand::thread_rng().gen_range(0..=data.len() - 1); let test_rand_idx = rand::thread_rng().gen_range(0..=data.len() - 1);
let expected_positions: Vec<u32> = data let expected_positions: Vec<u64> = data
.iter() .iter()
.enumerate() .enumerate()
.filter(|(_, el)| **el == data[test_rand_idx]) .filter(|(_, el)| **el == data[test_rand_idx])
.map(|(pos, _)| pos as u32) .map(|(pos, _)| pos as u64)
.collect(); .collect();
let mut positions = Vec::new(); let positions = reader.get_between_vals(data[test_rand_idx]..=data[test_rand_idx]);
reader.get_docids_for_value_range(
data[test_rand_idx]..=data[test_rand_idx],
0..data.len() as u32,
&mut positions,
);
assert_eq!(expected_positions, positions); assert_eq!(expected_positions, positions);
} }
Some((estimation, actual_compression)) Some((estimation, actual_compression))
@@ -434,7 +429,7 @@ mod bench {
b.iter(|| { b.iter(|| {
let mut sum = 0u64; let mut sum = 0u64;
for pos in value_iter() { for pos in value_iter() {
let val = col.get_val(pos as u32); let val = col.get_val(pos as u64);
sum = sum.wrapping_add(val); sum = sum.wrapping_add(val);
} }
sum sum
@@ -446,7 +441,7 @@ mod bench {
b.iter(|| { b.iter(|| {
let mut sum = 0u64; let mut sum = 0u64;
for pos in value_iter() { for pos in value_iter() {
let val = col.get_val(pos as u32); let val = col.get_val(pos as u64);
sum = sum.wrapping_add(val); sum = sum.wrapping_add(val);
} }
sum sum


@@ -1,5 +1,5 @@
use std::io; use std::io;
use std::num::NonZeroU32; use std::num::NonZeroU64;
use common::{BinarySerializable, VInt}; use common::{BinarySerializable, VInt};
@@ -29,7 +29,7 @@ pub struct Line {
/// compute_slope(y0, y1) /// compute_slope(y0, y1)
/// = compute_slope(y0 + X % 2^64, y1 + X % 2^64) /// = compute_slope(y0 + X % 2^64, y1 + X % 2^64)
/// ` /// `
fn compute_slope(y0: u64, y1: u64, num_vals: NonZeroU32) -> u64 { fn compute_slope(y0: u64, y1: u64, num_vals: NonZeroU64) -> u64 {
let dy = y1.wrapping_sub(y0); let dy = y1.wrapping_sub(y0);
let sign = dy <= (1 << 63); let sign = dy <= (1 << 63);
let abs_dy = if sign { let abs_dy = if sign {
@@ -43,7 +43,7 @@ fn compute_slope(y0: u64, y1: u64, num_vals: NonZeroU32) -> u64 {
return 0u64; return 0u64;
} }
let abs_slope = (abs_dy << 32) / num_vals.get() as u64; let abs_slope = (abs_dy << 32) / num_vals.get();
if sign { if sign {
abs_slope abs_slope
} else { } else {
@@ -62,8 +62,8 @@ fn compute_slope(y0: u64, y1: u64, num_vals: NonZeroU32) -> u64 {
impl Line { impl Line {
#[inline(always)] #[inline(always)]
pub fn eval(&self, x: u32) -> u64 { pub fn eval(&self, x: u64) -> u64 {
let linear_part = ((x as u64).wrapping_mul(self.slope) >> 32) as i32 as u64; let linear_part = (x.wrapping_mul(self.slope) >> 32) as i32 as u64;
self.intercept.wrapping_add(linear_part) self.intercept.wrapping_add(linear_part)
} }
@@ -75,7 +75,7 @@ impl Line {
Self::train_from( Self::train_from(
first_val, first_val,
last_val, last_val,
num_vals as u32, num_vals,
sample_positions_and_values.iter().cloned(), sample_positions_and_values.iter().cloned(),
) )
} }
@@ -84,11 +84,11 @@ impl Line {
fn train_from( fn train_from(
first_val: u64, first_val: u64,
last_val: u64, last_val: u64,
num_vals: u32, num_vals: u64,
positions_and_values: impl Iterator<Item = (u64, u64)>, positions_and_values: impl Iterator<Item = (u64, u64)>,
) -> Self { ) -> Self {
// TODO replace with let else // TODO replace with let else
let idx_last_val = if let Some(idx_last_val) = NonZeroU32::new(num_vals - 1) { let idx_last_val = if let Some(idx_last_val) = NonZeroU64::new(num_vals - 1) {
idx_last_val idx_last_val
} else { } else {
return Line::default(); return Line::default();
@@ -129,7 +129,7 @@ impl Line {
}; };
let heuristic_shift = y0.wrapping_sub(MID_POINT); let heuristic_shift = y0.wrapping_sub(MID_POINT);
line.intercept = positions_and_values line.intercept = positions_and_values
.map(|(pos, y)| y.wrapping_sub(line.eval(pos as u32))) .map(|(pos, y)| y.wrapping_sub(line.eval(pos)))
.min_by_key(|&val| val.wrapping_sub(heuristic_shift)) .min_by_key(|&val| val.wrapping_sub(heuristic_shift))
.unwrap_or(0u64); //< Never happens. .unwrap_or(0u64); //< Never happens.
line line
@@ -199,7 +199,7 @@ mod tests {
let line = Line::train(&VecColumn::from(&ys)); let line = Line::train(&VecColumn::from(&ys));
ys.iter() ys.iter()
.enumerate() .enumerate()
.map(|(x, y)| y.wrapping_sub(line.eval(x as u32))) .map(|(x, y)| y.wrapping_sub(line.eval(x as u64)))
.max() .max()
} }
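`compute_slope` stores `(abs_dy << 32) / num_vals` and `eval` multiplies by `x` and shifts right by 32, so the slope is effectively a 32.32 fixed-point number; a tiny worked example (illustrative values only):

```rust
fn main() {
    // A slope of 1.5 encoded in 32.32 fixed point, as compute_slope would produce it.
    let slope: u64 = (3u64 << 32) / 2; // 1.5 * 2^32
    let intercept: u64 = 100;
    let x: u64 = 4;
    // Same arithmetic as Line::eval above: multiply, then drop the 32 fractional bits.
    let linear_part = (x.wrapping_mul(slope) >> 32) as i32 as u64;
    assert_eq!(intercept.wrapping_add(linear_part), 106); // 100 + 1.5 * 4
}
```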


@@ -19,7 +19,7 @@ pub struct LinearReader {
impl Column for LinearReader { impl Column for LinearReader {
#[inline] #[inline]
fn get_val(&self, doc: u32) -> u64 { fn get_val(&self, doc: u64) -> u64 {
let interpoled_val: u64 = self.linear_params.line.eval(doc); let interpoled_val: u64 = self.linear_params.line.eval(doc);
let bitpacked_diff = self.linear_params.bit_unpacker.get(doc, &self.data); let bitpacked_diff = self.linear_params.bit_unpacker.get(doc, &self.data);
interpoled_val.wrapping_add(bitpacked_diff) interpoled_val.wrapping_add(bitpacked_diff)
@@ -37,7 +37,7 @@ impl Column for LinearReader {
} }
#[inline] #[inline]
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.header.num_vals self.header.num_vals
} }
} }
@@ -93,7 +93,7 @@ impl FastFieldCodec for LinearCodec {
.iter() .iter()
.enumerate() .enumerate()
.map(|(pos, actual_value)| { .map(|(pos, actual_value)| {
let calculated_value = line.eval(pos as u32); let calculated_value = line.eval(pos as u64);
actual_value.wrapping_sub(calculated_value) actual_value.wrapping_sub(calculated_value)
}) })
.max() .max()
@@ -108,7 +108,7 @@ impl FastFieldCodec for LinearCodec {
let mut bit_packer = BitPacker::new(); let mut bit_packer = BitPacker::new();
for (pos, actual_value) in column.iter().enumerate() { for (pos, actual_value) in column.iter().enumerate() {
let calculated_value = line.eval(pos as u32); let calculated_value = line.eval(pos as u64);
let offset = actual_value.wrapping_sub(calculated_value); let offset = actual_value.wrapping_sub(calculated_value);
bit_packer.write(offset, num_bits, write)?; bit_packer.write(offset, num_bits, write)?;
} }
@@ -140,7 +140,7 @@ impl FastFieldCodec for LinearCodec {
let estimated_bit_width = sample_positions_and_values let estimated_bit_width = sample_positions_and_values
.into_iter() .into_iter()
.map(|(pos, actual_value)| { .map(|(pos, actual_value)| {
let interpolated_val = line.eval(pos as u32); let interpolated_val = line.eval(pos as u64);
actual_value.wrapping_sub(interpolated_val) actual_value.wrapping_sub(interpolated_val)
}) })
.map(|diff| ((diff as f32 * 1.5) * 2.0) as u64) .map(|diff| ((diff as f32 * 1.5) * 2.0) as u64)


@@ -90,7 +90,7 @@ fn bench_ip() {
{ {
let mut data = vec![]; let mut data = vec![];
for dataset in dataset.chunks(500_000) { for dataset in dataset.chunks(500_000) {
serialize_u128(|| dataset.iter().cloned(), dataset.len() as u32, &mut data).unwrap(); serialize_u128(|| dataset.iter().cloned(), dataset.len() as u64, &mut data).unwrap();
} }
let compression = data.len() as f64 / (dataset.len() * 16) as f64; let compression = data.len() as f64 / (dataset.len() * 16) as f64;
println!("Compression 50_000 chunks {:.4}", compression); println!("Compression 50_000 chunks {:.4}", compression);
@@ -103,7 +103,7 @@ fn bench_ip() {
let mut data = vec![]; let mut data = vec![];
{ {
print_time!("creation"); print_time!("creation");
serialize_u128(|| dataset.iter().cloned(), dataset.len() as u32, &mut data).unwrap(); serialize_u128(|| dataset.iter().cloned(), dataset.len() as u64, &mut data).unwrap();
} }
let compression = data.len() as f64 / (dataset.len() * 16) as f64; let compression = data.len() as f64 / (dataset.len() * 16) as f64;
@@ -115,15 +115,9 @@ fn bench_ip() {
let decompressor = open_u128::<u128>(OwnedBytes::new(data)).unwrap(); let decompressor = open_u128::<u128>(OwnedBytes::new(data)).unwrap();
// Sample some ranges // Sample some ranges
let mut doc_values = Vec::new();
for value in dataset.iter().take(1110).skip(1100).cloned() { for value in dataset.iter().take(1110).skip(1100).cloned() {
doc_values.clear();
print_time!("get range"); print_time!("get range");
decompressor.get_docids_for_value_range( let doc_values = decompressor.get_between_vals(value..=value);
value..=value,
0..decompressor.num_vals(),
&mut doc_values,
);
println!("{:?}", doc_values.len()); println!("{:?}", doc_values.len());
} }
} }


@@ -46,14 +46,14 @@ use crate::{
#[derive(Debug, Copy, Clone)] #[derive(Debug, Copy, Clone)]
pub struct NormalizedHeader { pub struct NormalizedHeader {
/// The number of values in the underlying column. /// The number of values in the underlying column.
pub num_vals: u32, pub num_vals: u64,
/// The max value of the underlying column. /// The max value of the underlying column.
pub max_value: u64, pub max_value: u64,
} }
#[derive(Debug, Copy, Clone)] #[derive(Debug, Copy, Clone)]
pub(crate) struct Header { pub(crate) struct Header {
pub num_vals: u32, pub num_vals: u64,
pub min_value: u64, pub min_value: u64,
pub max_value: u64, pub max_value: u64,
pub gcd: Option<NonZeroU64>, pub gcd: Option<NonZeroU64>,
@@ -110,7 +110,7 @@ pub fn normalize_column<C: Column>(
impl BinarySerializable for Header { impl BinarySerializable for Header {
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> { fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
VInt(self.num_vals as u64).serialize(writer)?; VInt(self.num_vals).serialize(writer)?;
VInt(self.min_value).serialize(writer)?; VInt(self.min_value).serialize(writer)?;
VInt(self.max_value - self.min_value).serialize(writer)?; VInt(self.max_value - self.min_value).serialize(writer)?;
if let Some(gcd) = self.gcd { if let Some(gcd) = self.gcd {
@@ -123,7 +123,7 @@ impl BinarySerializable for Header {
} }
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> { fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let num_vals = VInt::deserialize(reader)?.0 as u32; let num_vals = VInt::deserialize(reader)?.0;
let min_value = VInt::deserialize(reader)?.0; let min_value = VInt::deserialize(reader)?.0;
let amplitude = VInt::deserialize(reader)?.0; let amplitude = VInt::deserialize(reader)?.0;
let max_value = min_value + amplitude; let max_value = min_value + amplitude;
@@ -164,7 +164,7 @@ pub fn estimate<T: MonotonicallyMappableToU64>(
/// Serializes u128 values with the compact space codec. /// Serializes u128 values with the compact space codec.
pub fn serialize_u128<F: Fn() -> I, I: Iterator<Item = u128>>( pub fn serialize_u128<F: Fn() -> I, I: Iterator<Item = u128>>(
iter_gen: F, iter_gen: F,
num_vals: u32, num_vals: u64,
output: &mut impl io::Write, output: &mut impl io::Write,
) -> io::Result<()> { ) -> io::Result<()> {
// TODO write header, to later support more codecs // TODO write header, to later support more codecs


@@ -62,20 +62,6 @@ fn word<'a>() -> impl Parser<&'a str, Output = String> {
}) })
} }
// word variant that allows more characters, e.g. for range queries that don't allow field
// specifier
fn relaxed_word<'a>() -> impl Parser<&'a str, Output = String> {
(
satisfy(|c: char| {
!c.is_whitespace() && !['`', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
}),
many(satisfy(|c: char| {
!c.is_whitespace() && !['{', '}', '"', '[', ']', '(', ')'].contains(&c)
})),
)
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
}
/// Parses a date time according to rfc3339 /// Parses a date time according to rfc3339
/// 2015-08-02T18:54:42+02 /// 2015-08-02T18:54:42+02
/// 2021-04-13T19:46:26.266051969+00:00 /// 2021-04-13T19:46:26.266051969+00:00
@@ -195,8 +181,8 @@ fn spaces1<'a>() -> impl Parser<&'a str, Output = ()> {
fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> { fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
let range_term_val = || { let range_term_val = || {
attempt(date_time()) attempt(date_time())
.or(word())
.or(negative_number()) .or(negative_number())
.or(relaxed_word())
.or(char('*').with(value("*".to_string()))) .or(char('*').with(value("*".to_string())))
}; };
@@ -663,34 +649,6 @@ mod test {
.expect("Cannot parse date range") .expect("Cannot parse date range")
.0; .0;
assert_eq!(res6, expected_flexible_dates); assert_eq!(res6, expected_flexible_dates);
// IP Range Unbounded
let expected_weight = UserInputLeaf::Range {
field: Some("ip".to_string()),
lower: UserInputBound::Inclusive("::1".to_string()),
upper: UserInputBound::Unbounded,
};
let res1 = range()
.parse("ip: >=::1")
.expect("Cannot parse ip v6 format")
.0;
let res2 = range()
.parse("ip:[::1 TO *}")
.expect("Cannot parse ip v6 format")
.0;
assert_eq!(res1, expected_weight);
assert_eq!(res2, expected_weight);
// IP Range Bounded
let expected_weight = UserInputLeaf::Range {
field: Some("ip".to_string()),
lower: UserInputBound::Inclusive("::0.0.0.50".to_string()),
upper: UserInputBound::Exclusive("::0.0.0.52".to_string()),
};
let res1 = range()
.parse("ip:[::0.0.0.50 TO ::0.0.0.52}")
.expect("Cannot parse ip v6 format")
.0;
assert_eq!(res1, expected_weight);
} }
#[test] #[test]


@@ -4,7 +4,9 @@
//! intermediate average results, which is the sum and the number of values. The actual average is //! intermediate average results, which is the sum and the number of values. The actual average is
//! calculated on the step from intermediate to final aggregation result tree. //! calculated on the step from intermediate to final aggregation result tree.
use rustc_hash::FxHashMap; use std::collections::HashMap;
use fnv::FnvHashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::agg_req::BucketAggregationInternal; use super::agg_req::BucketAggregationInternal;
@@ -16,7 +18,7 @@ use crate::TantivyError;
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)] #[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
/// The final aggegation result. /// The final aggegation result.
pub struct AggregationResults(pub FxHashMap<String, AggregationResult>); pub struct AggregationResults(pub HashMap<String, AggregationResult>);
impl AggregationResults { impl AggregationResults {
pub(crate) fn get_value_from_aggregation( pub(crate) fn get_value_from_aggregation(
@@ -143,7 +145,7 @@ pub enum BucketEntries<T> {
/// Vector format bucket entries /// Vector format bucket entries
Vec(Vec<T>), Vec(Vec<T>),
/// HashMap format bucket entries /// HashMap format bucket entries
HashMap(FxHashMap<String, T>), HashMap(FnvHashMap<String, T>),
} }
/// This is the default entry for a bucket, which contains a key, count, and optionally /// This is the default entry for a bucket, which contains a key, count, and optionally


@@ -331,10 +331,10 @@ impl SegmentHistogramCollector {
.expect("unexpected fast field cardinatility"); .expect("unexpected fast field cardinatility");
let mut iter = doc.chunks_exact(4); let mut iter = doc.chunks_exact(4);
for docs in iter.by_ref() { for docs in iter.by_ref() {
let val0 = self.f64_from_fastfield_u64(accessor.get_val(docs[0])); let val0 = self.f64_from_fastfield_u64(accessor.get_val(docs[0] as u64));
let val1 = self.f64_from_fastfield_u64(accessor.get_val(docs[1])); let val1 = self.f64_from_fastfield_u64(accessor.get_val(docs[1] as u64));
let val2 = self.f64_from_fastfield_u64(accessor.get_val(docs[2])); let val2 = self.f64_from_fastfield_u64(accessor.get_val(docs[2] as u64));
let val3 = self.f64_from_fastfield_u64(accessor.get_val(docs[3])); let val3 = self.f64_from_fastfield_u64(accessor.get_val(docs[3] as u64));
let bucket_pos0 = get_bucket_num(val0); let bucket_pos0 = get_bucket_num(val0);
let bucket_pos1 = get_bucket_num(val1); let bucket_pos1 = get_bucket_num(val1);
@@ -371,7 +371,7 @@ impl SegmentHistogramCollector {
)?; )?;
} }
for &doc in iter.remainder() { for &doc in iter.remainder() {
let val = f64_from_fastfield_u64(accessor.get_val(doc), &self.field_type); let val = f64_from_fastfield_u64(accessor.get_val(doc as u64), &self.field_type);
if !bounds.contains(val) { if !bounds.contains(val) {
continue; continue;
} }


@@ -1,7 +1,7 @@
use std::fmt::Debug; use std::fmt::Debug;
use std::ops::Range; use std::ops::Range;
use rustc_hash::FxHashMap; use fnv::FnvHashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use crate::aggregation::agg_req_with_accessor::{ use crate::aggregation::agg_req_with_accessor::{
@@ -176,7 +176,7 @@ impl SegmentRangeCollector {
) -> crate::Result<IntermediateBucketResult> { ) -> crate::Result<IntermediateBucketResult> {
let field_type = self.field_type; let field_type = self.field_type;
let buckets: FxHashMap<SerializedKey, IntermediateRangeBucketEntry> = self let buckets: FnvHashMap<SerializedKey, IntermediateRangeBucketEntry> = self
.buckets .buckets
.into_iter() .into_iter()
.map(move |range_bucket| { .map(move |range_bucket| {
@@ -263,10 +263,10 @@ impl SegmentRangeCollector {
.as_single() .as_single()
.expect("unexpected fast field cardinality"); .expect("unexpected fast field cardinality");
for docs in iter.by_ref() { for docs in iter.by_ref() {
let val1 = accessor.get_val(docs[0]); let val1 = accessor.get_val(docs[0] as u64);
let val2 = accessor.get_val(docs[1]); let val2 = accessor.get_val(docs[1] as u64);
let val3 = accessor.get_val(docs[2]); let val3 = accessor.get_val(docs[2] as u64);
let val4 = accessor.get_val(docs[3]); let val4 = accessor.get_val(docs[3] as u64);
let bucket_pos1 = self.get_bucket_pos(val1); let bucket_pos1 = self.get_bucket_pos(val1);
let bucket_pos2 = self.get_bucket_pos(val2); let bucket_pos2 = self.get_bucket_pos(val2);
let bucket_pos3 = self.get_bucket_pos(val3); let bucket_pos3 = self.get_bucket_pos(val3);
@@ -278,7 +278,7 @@ impl SegmentRangeCollector {
self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation)?; self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation)?;
} }
for &doc in iter.remainder() { for &doc in iter.remainder() {
let val = accessor.get_val(doc); let val = accessor.get_val(doc as u64);
let bucket_pos = self.get_bucket_pos(val); let bucket_pos = self.get_bucket_pos(val);
self.increment_bucket(bucket_pos, doc, &bucket_with_accessor.sub_aggregation)?; self.increment_bucket(bucket_pos, doc, &bucket_with_accessor.sub_aggregation)?;
} }

View File

@@ -1,7 +1,7 @@
use std::fmt::Debug; use std::fmt::Debug;
use fnv::FnvHashMap;
use itertools::Itertools; use itertools::Itertools;
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::{CustomOrder, Order, OrderTarget}; use super::{CustomOrder, Order, OrderTarget};
@@ -199,7 +199,7 @@ impl TermsAggregationInternal {
#[derive(Clone, Debug, PartialEq)] #[derive(Clone, Debug, PartialEq)]
/// Container to store term_ids and their buckets. /// Container to store term_ids and their buckets.
struct TermBuckets { struct TermBuckets {
pub(crate) entries: FxHashMap<u32, TermBucketEntry>, pub(crate) entries: FnvHashMap<u32, TermBucketEntry>,
blueprint: Option<SegmentAggregationResultsCollector>, blueprint: Option<SegmentAggregationResultsCollector>,
} }
@@ -397,7 +397,7 @@ impl SegmentTermCollector {
.expect("internal error: inverted index not loaded for term aggregation"); .expect("internal error: inverted index not loaded for term aggregation");
let term_dict = inverted_index.terms(); let term_dict = inverted_index.terms();
let mut dict: FxHashMap<String, IntermediateTermBucketEntry> = Default::default(); let mut dict: FnvHashMap<String, IntermediateTermBucketEntry> = Default::default();
let mut buffer = vec![]; let mut buffer = vec![];
for (term_id, entry) in entries { for (term_id, entry) in entries {
term_dict term_dict
@@ -1129,9 +1129,9 @@ mod tests {
assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma"); assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4); assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4);
assert_eq!(res["my_texts"]["buckets"][1]["key"], "termc"); assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 0); assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 0);
assert_eq!(res["my_texts"]["buckets"][2]["key"], "termb"); assert_eq!(res["my_texts"]["buckets"][2]["key"], "termc");
assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 0); assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 0);
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0); assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0); assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);

View File

@@ -3,9 +3,10 @@
//! indices. //! indices.
use std::cmp::Ordering; use std::cmp::Ordering;
use std::collections::HashMap;
use fnv::FnvHashMap;
use itertools::Itertools; use itertools::Itertools;
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::agg_req::{ use super::agg_req::{
@@ -50,7 +51,7 @@ impl IntermediateAggregationResults {
// Important assumption: // Important assumption:
// When the tree contains buckets/metric, we expect it to have all buckets/metrics from the // When the tree contains buckets/metric, we expect it to have all buckets/metrics from the
// request // request
let mut results: FxHashMap<String, AggregationResult> = FxHashMap::default(); let mut results: HashMap<String, AggregationResult> = HashMap::new();
if let Some(buckets) = self.buckets { if let Some(buckets) = self.buckets {
convert_and_add_final_buckets_to_result(&mut results, buckets, &req.buckets)? convert_and_add_final_buckets_to_result(&mut results, buckets, &req.buckets)?
@@ -131,7 +132,7 @@ impl IntermediateAggregationResults {
} }
fn convert_and_add_final_metrics_to_result( fn convert_and_add_final_metrics_to_result(
results: &mut FxHashMap<String, AggregationResult>, results: &mut HashMap<String, AggregationResult>,
metrics: VecWithNames<IntermediateMetricResult>, metrics: VecWithNames<IntermediateMetricResult>,
) { ) {
results.extend( results.extend(
@@ -142,7 +143,7 @@ fn convert_and_add_final_metrics_to_result(
} }
fn add_empty_final_metrics_to_result( fn add_empty_final_metrics_to_result(
results: &mut FxHashMap<String, AggregationResult>, results: &mut HashMap<String, AggregationResult>,
req_metrics: &VecWithNames<MetricAggregation>, req_metrics: &VecWithNames<MetricAggregation>,
) -> crate::Result<()> { ) -> crate::Result<()> {
results.extend(req_metrics.iter().map(|(key, req)| { results.extend(req_metrics.iter().map(|(key, req)| {
@@ -156,7 +157,7 @@ fn add_empty_final_metrics_to_result(
} }
fn add_empty_final_buckets_to_result( fn add_empty_final_buckets_to_result(
results: &mut FxHashMap<String, AggregationResult>, results: &mut HashMap<String, AggregationResult>,
req_buckets: &VecWithNames<BucketAggregationInternal>, req_buckets: &VecWithNames<BucketAggregationInternal>,
) -> crate::Result<()> { ) -> crate::Result<()> {
let requested_buckets = req_buckets.iter(); let requested_buckets = req_buckets.iter();
@@ -168,7 +169,7 @@ fn add_empty_final_buckets_to_result(
} }
fn convert_and_add_final_buckets_to_result( fn convert_and_add_final_buckets_to_result(
results: &mut FxHashMap<String, AggregationResult>, results: &mut HashMap<String, AggregationResult>,
buckets: VecWithNames<IntermediateBucketResult>, buckets: VecWithNames<IntermediateBucketResult>,
req_buckets: &VecWithNames<BucketAggregationInternal>, req_buckets: &VecWithNames<BucketAggregationInternal>,
) -> crate::Result<()> { ) -> crate::Result<()> {
@@ -287,7 +288,7 @@ impl IntermediateBucketResult {
.keyed; .keyed;
let buckets = if is_keyed { let buckets = if is_keyed {
let mut bucket_map = let mut bucket_map =
FxHashMap::with_capacity_and_hasher(buckets.len(), Default::default()); FnvHashMap::with_capacity_and_hasher(buckets.len(), Default::default());
for bucket in buckets { for bucket in buckets {
bucket_map.insert(bucket.key.to_string(), bucket); bucket_map.insert(bucket.key.to_string(), bucket);
} }
@@ -307,7 +308,7 @@ impl IntermediateBucketResult {
let buckets = if req.as_histogram().unwrap().keyed { let buckets = if req.as_histogram().unwrap().keyed {
let mut bucket_map = let mut bucket_map =
FxHashMap::with_capacity_and_hasher(buckets.len(), Default::default()); FnvHashMap::with_capacity_and_hasher(buckets.len(), Default::default());
for bucket in buckets { for bucket in buckets {
bucket_map.insert(bucket.key.to_string(), bucket); bucket_map.insert(bucket.key.to_string(), bucket);
} }
@@ -395,13 +396,13 @@ impl IntermediateBucketResult {
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)] #[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
/// Range aggregation including error counts /// Range aggregation including error counts
pub struct IntermediateRangeBucketResult { pub struct IntermediateRangeBucketResult {
pub(crate) buckets: FxHashMap<SerializedKey, IntermediateRangeBucketEntry>, pub(crate) buckets: FnvHashMap<SerializedKey, IntermediateRangeBucketEntry>,
} }
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)] #[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
/// Term aggregation including error counts /// Term aggregation including error counts
pub struct IntermediateTermBucketResult { pub struct IntermediateTermBucketResult {
pub(crate) entries: FxHashMap<String, IntermediateTermBucketEntry>, pub(crate) entries: FnvHashMap<String, IntermediateTermBucketEntry>,
pub(crate) sum_other_doc_count: u64, pub(crate) sum_other_doc_count: u64,
pub(crate) doc_count_error_upper_bound: u64, pub(crate) doc_count_error_upper_bound: u64,
} }
@@ -498,8 +499,8 @@ trait MergeFruits {
} }
fn merge_maps<V: MergeFruits + Clone>( fn merge_maps<V: MergeFruits + Clone>(
entries_left: &mut FxHashMap<SerializedKey, V>, entries_left: &mut FnvHashMap<SerializedKey, V>,
mut entries_right: FxHashMap<SerializedKey, V>, mut entries_right: FnvHashMap<SerializedKey, V>,
) { ) {
for (name, entry_left) in entries_left.iter_mut() { for (name, entry_left) in entries_left.iter_mut() {
if let Some(entry_right) = entries_right.remove(name) { if let Some(entry_right) = entries_right.remove(name) {
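
`merge_maps` folds the intermediate buckets of one segment into another by key, combining entries present on both sides and keeping the rest. A toy sketch of that merge over plain `HashMap`s, with `Bucket` as a stand-in for the real intermediate entry types:

```rust
use std::collections::HashMap;

// Toy intermediate bucket entry; merging adds doc counts, mirroring how
// per-segment intermediate aggregation results are combined.
#[derive(Clone, Debug, PartialEq)]
struct Bucket {
    doc_count: u64,
}

fn merge_maps(left: &mut HashMap<String, Bucket>, mut right: HashMap<String, Bucket>) {
    // Merge buckets present on both sides ...
    for (key, entry_left) in left.iter_mut() {
        if let Some(entry_right) = right.remove(key) {
            entry_left.doc_count += entry_right.doc_count;
        }
    }
    // ... then move over the buckets that only the right side produced.
    left.extend(right);
}

fn main() {
    let mut left = HashMap::from([("a".to_string(), Bucket { doc_count: 2 })]);
    let right = HashMap::from([
        ("a".to_string(), Bucket { doc_count: 3 }),
        ("b".to_string(), Bucket { doc_count: 1 }),
    ]);
    merge_maps(&mut left, right);
    assert_eq!(left["a"].doc_count, 5);
    assert_eq!(left["b"].doc_count, 1);
}
```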
@@ -625,7 +626,7 @@ mod tests {
fn get_sub_test_tree(data: &[(String, u64)]) -> IntermediateAggregationResults { fn get_sub_test_tree(data: &[(String, u64)]) -> IntermediateAggregationResults {
let mut map = HashMap::new(); let mut map = HashMap::new();
let mut buckets = FxHashMap::default(); let mut buckets = FnvHashMap::default();
for (key, doc_count) in data { for (key, doc_count) in data {
buckets.insert( buckets.insert(
key.to_string(), key.to_string(),
@@ -652,7 +653,7 @@ mod tests {
data: &[(String, u64, String, u64)], data: &[(String, u64, String, u64)],
) -> IntermediateAggregationResults { ) -> IntermediateAggregationResults {
let mut map = HashMap::new(); let mut map = HashMap::new();
let mut buckets: FxHashMap<_, _> = Default::default(); let mut buckets: FnvHashMap<_, _> = Default::default();
for (key, doc_count, sub_aggregation_key, sub_aggregation_count) in data { for (key, doc_count, sub_aggregation_key, sub_aggregation_count) in data {
buckets.insert( buckets.insert(
key.to_string(), key.to_string(),

View File

@@ -60,10 +60,10 @@ impl SegmentAverageCollector {
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column<u64>) { pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column<u64>) {
let mut iter = doc.chunks_exact(4); let mut iter = doc.chunks_exact(4);
for docs in iter.by_ref() { for docs in iter.by_ref() {
let val1 = field.get_val(docs[0]); let val1 = field.get_val(docs[0] as u64);
let val2 = field.get_val(docs[1]); let val2 = field.get_val(docs[1] as u64);
let val3 = field.get_val(docs[2]); let val3 = field.get_val(docs[2] as u64);
let val4 = field.get_val(docs[3]); let val4 = field.get_val(docs[3] as u64);
let val1 = f64_from_fastfield_u64(val1, &self.field_type); let val1 = f64_from_fastfield_u64(val1, &self.field_type);
let val2 = f64_from_fastfield_u64(val2, &self.field_type); let val2 = f64_from_fastfield_u64(val2, &self.field_type);
let val3 = f64_from_fastfield_u64(val3, &self.field_type); let val3 = f64_from_fastfield_u64(val3, &self.field_type);
@@ -74,7 +74,7 @@ impl SegmentAverageCollector {
self.data.collect(val4); self.data.collect(val4);
} }
for &doc in iter.remainder() { for &doc in iter.remainder() {
let val = field.get_val(doc); let val = field.get_val(doc as u64);
let val = f64_from_fastfield_u64(val, &self.field_type); let val = f64_from_fastfield_u64(val, &self.field_type);
self.data.collect(val); self.data.collect(val);
} }

View File

@@ -166,10 +166,10 @@ impl SegmentStatsCollector {
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column<u64>) { pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column<u64>) {
let mut iter = doc.chunks_exact(4); let mut iter = doc.chunks_exact(4);
for docs in iter.by_ref() { for docs in iter.by_ref() {
let val1 = field.get_val(docs[0]); let val1 = field.get_val(docs[0] as u64);
let val2 = field.get_val(docs[1]); let val2 = field.get_val(docs[1] as u64);
let val3 = field.get_val(docs[2]); let val3 = field.get_val(docs[2] as u64);
let val4 = field.get_val(docs[3]); let val4 = field.get_val(docs[3] as u64);
let val1 = f64_from_fastfield_u64(val1, &self.field_type); let val1 = f64_from_fastfield_u64(val1, &self.field_type);
let val2 = f64_from_fastfield_u64(val2, &self.field_type); let val2 = f64_from_fastfield_u64(val2, &self.field_type);
let val3 = f64_from_fastfield_u64(val3, &self.field_type); let val3 = f64_from_fastfield_u64(val3, &self.field_type);
@@ -180,7 +180,7 @@ impl SegmentStatsCollector {
self.stats.collect(val4); self.stats.collect(val4);
} }
for &doc in iter.remainder() { for &doc in iter.remainder() {
let val = field.get_val(doc); let val = field.get_val(doc as u64);
let val = f64_from_fastfield_u64(val, &self.field_type); let val = f64_from_fastfield_u64(val, &self.field_type);
self.stats.collect(val); self.stats.collect(val);
} }

View File

@@ -616,7 +616,7 @@ mod tests {
.map(|mut doc| { .map(|mut doc| {
doc.add_facet( doc.add_facet(
facet_field, facet_field,
&format!("/facet/{}", thread_rng().sample(uniform)), &format!("/facet/{}", thread_rng().sample(&uniform)),
); );
doc doc
}) })

View File

@@ -177,7 +177,7 @@ where
type Fruit = TSegmentCollector::Fruit; type Fruit = TSegmentCollector::Fruit;
fn collect(&mut self, doc: u32, score: Score) { fn collect(&mut self, doc: u32, score: Score) {
let value = self.fast_field_reader.get_val(doc); let value = self.fast_field_reader.get_val(doc as u64);
if (self.predicate)(value) { if (self.predicate)(value) {
self.segment_collector.collect(doc, score) self.segment_collector.collect(doc, score)
} }

View File

@@ -94,7 +94,7 @@ impl SegmentCollector for SegmentHistogramCollector {
type Fruit = Vec<u64>; type Fruit = Vec<u64>;
fn collect(&mut self, doc: DocId, _score: Score) { fn collect(&mut self, doc: DocId, _score: Score) {
let value = self.ff_reader.get_val(doc); let value = self.ff_reader.get_val(doc as u64);
self.histogram_computer.add_value(value); self.histogram_computer.add_value(value);
} }

View File

@@ -172,33 +172,17 @@ pub trait Collector: Sync + Send {
) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> { ) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
let mut segment_collector = self.for_segment(segment_ord as u32, reader)?; let mut segment_collector = self.for_segment(segment_ord as u32, reader)?;
match (reader.alive_bitset(), self.requires_scoring()) { if let Some(alive_bitset) = reader.alive_bitset() {
(Some(alive_bitset), true) => { weight.for_each(reader, &mut |doc, score| {
weight.for_each(reader, &mut |doc, score| { if alive_bitset.is_alive(doc) {
if alive_bitset.is_alive(doc) {
segment_collector.collect(doc, score);
}
})?;
}
(Some(alive_bitset), false) => {
weight.for_each_no_score(reader, &mut |doc| {
if alive_bitset.is_alive(doc) {
segment_collector.collect(doc, 0.0);
}
})?;
}
(None, true) => {
weight.for_each(reader, &mut |doc, score| {
segment_collector.collect(doc, score); segment_collector.collect(doc, score);
})?; }
} })?;
(None, false) => { } else {
weight.for_each_no_score(reader, &mut |doc| { weight.for_each(reader, &mut |doc, score| {
segment_collector.collect(doc, 0.0); segment_collector.collect(doc, score);
})?; })?;
}
} }
Ok(segment_collector.harvest()) Ok(segment_collector.harvest())
} }
} }
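
On the left-hand (0.19-dev) side, `collect_segment` dispatches on both the alive bitset and `requires_scoring()`, so `for_each_no_score` can be used when the collector does not need scores. A minimal sketch of that four-way dispatch, with plain slices and closures standing in for `Weight::for_each` / `for_each_no_score`:

```rust
// Stand-ins for tantivy's AliveBitset and Weight; only the dispatch logic of the
// new collect_segment is sketched here.
struct AliveBitset(Vec<bool>);

impl AliveBitset {
    fn is_alive(&self, doc: u32) -> bool {
        self.0[doc as usize]
    }
}

fn collect_segment(
    alive_bitset: Option<&AliveBitset>,
    requires_scoring: bool,
    docs_and_scores: &[(u32, f32)],
    collect: &mut dyn FnMut(u32, f32),
) {
    match (alive_bitset, requires_scoring) {
        // Scoring required: drive the scorer (here: reuse the precomputed score).
        (Some(alive), true) => {
            for &(doc, score) in docs_and_scores {
                if alive.is_alive(doc) {
                    collect(doc, score);
                }
            }
        }
        // No scoring required: skip score computation and pass a constant 0.0.
        (Some(alive), false) => {
            for &(doc, _) in docs_and_scores {
                if alive.is_alive(doc) {
                    collect(doc, 0.0);
                }
            }
        }
        (None, true) => {
            for &(doc, score) in docs_and_scores {
                collect(doc, score);
            }
        }
        (None, false) => {
            for &(doc, _) in docs_and_scores {
                collect(doc, 0.0);
            }
        }
    }
}

fn main() {
    let alive = AliveBitset(vec![true, false, true]);
    let mut collected = Vec::new();
    collect_segment(
        Some(&alive),
        false,
        &[(0, 1.5), (1, 2.0), (2, 0.5)],
        &mut |doc, score| collected.push((doc, score)),
    );
    assert_eq!(collected, vec![(0, 0.0), (2, 0.0)]);
}
```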

View File

@@ -201,7 +201,7 @@ impl SegmentCollector for FastFieldSegmentCollector {
type Fruit = Vec<u64>; type Fruit = Vec<u64>;
fn collect(&mut self, doc: DocId, _score: Score) { fn collect(&mut self, doc: DocId, _score: Score) {
let val = self.reader.get_val(doc); let val = self.reader.get_val(doc as u64);
self.vals.push(val); self.vals.push(val);
} }

View File

@@ -137,7 +137,7 @@ struct ScorerByFastFieldReader {
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader { impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
fn score(&mut self, doc: DocId) -> u64 { fn score(&mut self, doc: DocId) -> u64 {
self.ff_reader.get_val(doc) self.ff_reader.get_val(doc as u64)
} }
} }
@@ -458,7 +458,7 @@ impl TopDocs {
/// ///
/// // We can now define our actual scoring function /// // We can now define our actual scoring function
/// move |doc: DocId, original_score: Score| { /// move |doc: DocId, original_score: Score| {
/// let popularity: u64 = popularity_reader.get_val(doc); /// let popularity: u64 = popularity_reader.get_val(doc as u64);
/// // Well.. For the sake of the example we use a simple logarithm /// // Well.. For the sake of the example we use a simple logarithm
/// // function. /// // function.
/// let popularity_boost_score = ((2u64 + popularity) as Score).log2(); /// let popularity_boost_score = ((2u64 + popularity) as Score).log2();
@@ -567,8 +567,8 @@ impl TopDocs {
/// ///
/// // We can now define our actual scoring function /// // We can now define our actual scoring function
/// move |doc: DocId| { /// move |doc: DocId| {
/// let popularity: u64 = popularity_reader.get_val(doc); /// let popularity: u64 = popularity_reader.get_val(doc as u64);
/// let boosted: u64 = boosted_reader.get_val(doc); /// let boosted: u64 = boosted_reader.get_val(doc as u64);
/// // Score do not have to be `f64` in tantivy. /// // Score do not have to be `f64` in tantivy.
/// // Here we return a couple to get lexicographical order /// // Here we return a couple to get lexicographical order
/// // for free. /// // for free.

View File

@@ -133,7 +133,7 @@ impl SegmentMeta {
/// associated with a segment component. /// associated with a segment component.
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf { pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
let mut path = self.id().uuid_string(); let mut path = self.id().uuid_string();
path.push_str(&match component { path.push_str(&*match component {
SegmentComponent::Postings => ".idx".to_string(), SegmentComponent::Postings => ".idx".to_string(),
SegmentComponent::Positions => ".pos".to_string(), SegmentComponent::Positions => ".pos".to_string(),
SegmentComponent::Terms => ".term".to_string(), SegmentComponent::Terms => ".term".to_string(),
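
The hunk above only swaps `&*match` for `&match`; the method itself appends a component-specific extension to the segment's uuid string. A tiny sketch of that naming scheme (the real `SegmentComponent` has more variants):

```rust
use std::path::PathBuf;

// Hypothetical subset of SegmentComponent, enough to show the naming scheme.
enum SegmentComponent {
    Postings,
    Positions,
    Terms,
}

fn relative_path(uuid: &str, component: SegmentComponent) -> PathBuf {
    let mut path = uuid.to_string();
    path.push_str(match component {
        SegmentComponent::Postings => ".idx",
        SegmentComponent::Positions => ".pos",
        SegmentComponent::Terms => ".term",
    });
    PathBuf::from(path)
}

fn main() {
    let path = relative_path("5c2c3b3d", SegmentComponent::Terms);
    assert_eq!(path, PathBuf::from("5c2c3b3d.term"));
}
```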

View File

@@ -55,7 +55,7 @@ impl<T: Send + Sync + 'static> From<Box<T>> for DirectoryLock {
impl Drop for DirectoryLockGuard { impl Drop for DirectoryLockGuard {
fn drop(&mut self) { fn drop(&mut self) {
if let Err(e) = self.directory.delete(&self.path) { if let Err(e) = self.directory.delete(&*self.path) {
error!("Failed to remove the lock file. {:?}", e); error!("Failed to remove the lock file. {:?}", e);
} }
} }
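
The guard's whole job is to remove its lock file when it goes out of scope, logging rather than panicking if the deletion fails. A minimal sketch of the same RAII pattern over `std::fs` instead of the `Directory` abstraction:

```rust
use std::fs;
use std::path::PathBuf;

// Sketch: the real guard deletes through a Directory and logs with error!.
struct DirectoryLockGuard {
    path: PathBuf,
}

impl Drop for DirectoryLockGuard {
    fn drop(&mut self) {
        if let Err(e) = fs::remove_file(&self.path) {
            eprintln!("Failed to remove the lock file. {:?}", e);
        }
    }
}

fn main() -> std::io::Result<()> {
    let path = PathBuf::from("test.lock");
    fs::write(&path, b"")?;
    {
        let _guard = DirectoryLockGuard { path: path.clone() };
        // ... exclusive section ...
    } // lock file removed here
    assert!(!path.exists());
    Ok(())
}
```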

View File

@@ -1,9 +1,10 @@
use std::ops::Range;
use std::sync::Arc; use std::sync::Arc;
use fastfield_codecs::Column; use fastfield_codecs::Column;
use crate::directory::{FileSlice, OwnedBytes}; use crate::directory::{FileSlice, OwnedBytes};
use crate::fastfield::MultiValueIndex; use crate::fastfield::MultiValueLength;
use crate::DocId; use crate::DocId;
/// Reader for byte array fast fields /// Reader for byte array fast fields
@@ -18,7 +19,7 @@ use crate::DocId;
/// and the start index for the next document, and keeping the bytes in between. /// and the start index for the next document, and keeping the bytes in between.
#[derive(Clone)] #[derive(Clone)]
pub struct BytesFastFieldReader { pub struct BytesFastFieldReader {
idx_reader: MultiValueIndex, idx_reader: Arc<dyn Column<u64>>,
values: OwnedBytes, values: OwnedBytes,
} }
@@ -28,27 +29,26 @@ impl BytesFastFieldReader {
values_file: FileSlice, values_file: FileSlice,
) -> crate::Result<BytesFastFieldReader> { ) -> crate::Result<BytesFastFieldReader> {
let values = values_file.read_bytes()?; let values = values_file.read_bytes()?;
Ok(BytesFastFieldReader { Ok(BytesFastFieldReader { idx_reader, values })
idx_reader: MultiValueIndex::new(idx_reader),
values,
})
} }
/// returns the multivalue index fn range(&self, doc: DocId) -> Range<u64> {
pub fn get_index_reader(&self) -> &MultiValueIndex { let idx = doc as u64;
&self.idx_reader let start = self.idx_reader.get_val(idx);
let end = self.idx_reader.get_val(idx + 1);
start..end
} }
/// Returns the bytes associated with the given `doc` /// Returns the bytes associated with the given `doc`
pub fn get_bytes(&self, doc: DocId) -> &[u8] { pub fn get_bytes(&self, doc: DocId) -> &[u8] {
let range = self.idx_reader.range(doc); let range = self.range(doc);
&self.values.as_slice()[range.start as usize..range.end as usize] &self.values.as_slice()[range.start as usize..range.end as usize]
} }
/// Returns the length of the bytes associated with the given `doc` /// Returns the length of the bytes associated with the given `doc`
pub fn num_bytes(&self, doc: DocId) -> u64 { pub fn num_bytes(&self, doc: DocId) -> u64 {
let range = self.idx_reader.range(doc); let range = self.range(doc);
(range.end - range.start) as u64 range.end - range.start
} }
/// Returns the overall number of bytes in this bytes fast field. /// Returns the overall number of bytes in this bytes fast field.
@@ -56,3 +56,15 @@ impl BytesFastFieldReader {
self.values.len() as u64 self.values.len() as u64
} }
} }
impl MultiValueLength for BytesFastFieldReader {
fn get_range(&self, doc_id: DocId) -> std::ops::Range<u64> {
self.range(doc_id)
}
fn get_len(&self, doc_id: DocId) -> u64 {
self.num_bytes(doc_id)
}
fn get_total_len(&self) -> u64 {
self.total_num_bytes()
}
}
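
On the right-hand (0.18) side the bytes fast field is addressed through a start-offset column (one entry per document plus a final sentinel) over one concatenated byte buffer; `get_bytes` slices that buffer and `num_bytes` is just the width of the range. A self-contained toy version of that layout:

```rust
// Toy bytes fast field: an offsets column (start per doc, one sentinel at the
// end) plus one concatenated byte buffer.
struct BytesColumn {
    offsets: Vec<u64>,
    values: Vec<u8>,
}

impl BytesColumn {
    fn get_bytes(&self, doc: u32) -> &[u8] {
        let start = self.offsets[doc as usize] as usize;
        let end = self.offsets[doc as usize + 1] as usize;
        &self.values[start..end]
    }
    fn num_bytes(&self, doc: u32) -> u64 {
        self.offsets[doc as usize + 1] - self.offsets[doc as usize]
    }
}

fn main() {
    let column = BytesColumn {
        offsets: vec![0, 3, 3, 7],
        values: b"foobaaz".to_vec(),
    };
    assert_eq!(column.get_bytes(0), b"foo");
    assert_eq!(column.get_bytes(1), b"");
    assert_eq!(column.get_bytes(2), b"baaz");
    assert_eq!(column.num_bytes(2), 4);
}
```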

View File

@@ -27,8 +27,8 @@ pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader; pub use self::facet_reader::FacetReader;
pub(crate) use self::multivalued::{get_fastfield_codecs_for_multivalue, MultivalueStartIndex}; pub(crate) use self::multivalued::{get_fastfield_codecs_for_multivalue, MultivalueStartIndex};
pub use self::multivalued::{ pub use self::multivalued::{
MultiValueIndex, MultiValueU128FastFieldWriter, MultiValuedFastFieldReader, MultiValueU128FastFieldWriter, MultiValuedFastFieldReader, MultiValuedFastFieldWriter,
MultiValuedFastFieldWriter, MultiValuedU128FastFieldReader, MultiValuedU128FastFieldReader,
}; };
pub use self::readers::FastFieldReaders; pub use self::readers::FastFieldReaders;
pub(crate) use self::readers::{type_and_cardinality, FastType}; pub(crate) use self::readers::{type_and_cardinality, FastType};
@@ -36,7 +36,7 @@ pub use self::serializer::{Column, CompositeFastFieldSerializer};
use self::writer::unexpected_value; use self::writer::unexpected_value;
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter}; pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
use crate::schema::{Type, Value}; use crate::schema::{Type, Value};
use crate::DateTime; use crate::{DateTime, DocId};
mod alive_bitset; mod alive_bitset;
mod bytes; mod bytes;
@@ -47,6 +47,17 @@ mod readers;
mod serializer; mod serializer;
mod writer; mod writer;
/// Trait for `BytesFastFieldReader` and `MultiValuedFastFieldReader` to return the length of data
/// for a doc_id
pub trait MultiValueLength {
/// returns the positions for a docid
fn get_range(&self, doc_id: DocId) -> std::ops::Range<u64>;
/// returns the num of values associated with a doc_id
fn get_len(&self, doc_id: DocId) -> u64;
/// returns the sum of num values for all doc_ids
fn get_total_len(&self) -> u64;
}
/// Trait for types that are allowed for fast fields: /// Trait for types that are allowed for fast fields:
/// (u64, i64 and f64, bool, DateTime). /// (u64, i64 and f64, bool, DateTime).
pub trait FastValue: pub trait FastValue:
@@ -173,9 +184,9 @@ mod tests {
#[test] #[test]
pub fn test_fastfield() { pub fn test_fastfield() {
let test_fastfield = fastfield_codecs::serialize_and_load(&[100u64, 200u64, 300u64][..]); let test_fastfield = fastfield_codecs::serialize_and_load(&[100u64, 200u64, 300u64][..]);
assert_eq!(test_fastfield.get_val(0), 100); assert_eq!(test_fastfield.get_val(0u64), 100);
assert_eq!(test_fastfield.get_val(1), 200); assert_eq!(test_fastfield.get_val(1u64), 200);
assert_eq!(test_fastfield.get_val(2), 300); assert_eq!(test_fastfield.get_val(2u64), 300);
} }
#[test] #[test]
@@ -391,7 +402,7 @@ mod tests {
assert_eq!(fast_field_reader.min_value(), -100i64); assert_eq!(fast_field_reader.min_value(), -100i64);
assert_eq!(fast_field_reader.max_value(), 9_999i64); assert_eq!(fast_field_reader.max_value(), 9_999i64);
for (doc, i) in (-100i64..10_000i64).enumerate() { for (doc, i) in (-100i64..10_000i64).enumerate() {
assert_eq!(fast_field_reader.get_val(doc as u32), i); assert_eq!(fast_field_reader.get_val(doc as u64), i);
} }
let mut buffer = vec![0i64; 100]; let mut buffer = vec![0i64; 100];
fast_field_reader.get_range(53, &mut buffer[..]); fast_field_reader.get_range(53, &mut buffer[..]);
@@ -473,7 +484,7 @@ mod tests {
let fast_field_reader = open::<u64>(data)?; let fast_field_reader = open::<u64>(data)?;
for a in 0..n { for a in 0..n {
assert_eq!(fast_field_reader.get_val(a as u32), permutation[a as usize]); assert_eq!(fast_field_reader.get_val(a as u64), permutation[a as usize]);
} }
} }
Ok(()) Ok(())
@@ -965,7 +976,7 @@ mod tests {
let test_fastfield = open::<DateTime>(file.read_bytes()?)?; let test_fastfield = open::<DateTime>(file.read_bytes()?)?;
for (i, time) in times.iter().enumerate() { for (i, time) in times.iter().enumerate() {
assert_eq!(test_fastfield.get_val(i as u32), time.truncate(precision)); assert_eq!(test_fastfield.get_val(i as u64), time.truncate(precision));
} }
Ok(len) Ok(len)
} }
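
The `MultiValueLength` trait above (kept on the 0.18 side, superseded by `MultiValueIndex` on the 0.19-dev side) abstracts exactly that offsets layout: a per-document value range, its length, and the total number of values. A hedged toy implementation over a plain offsets vector:

```rust
use std::ops::Range;

// Restatement of the trait from the hunk above, with DocId spelled as u32 so
// the sketch stays self-contained.
trait MultiValueLength {
    fn get_range(&self, doc_id: u32) -> Range<u64>;
    fn get_len(&self, doc_id: u32) -> u64;
    fn get_total_len(&self) -> u64;
}

// offsets[d] holds the first value position of document d; the last entry is a sentinel.
struct Offsets(Vec<u64>);

impl MultiValueLength for Offsets {
    fn get_range(&self, doc_id: u32) -> Range<u64> {
        self.0[doc_id as usize]..self.0[doc_id as usize + 1]
    }
    fn get_len(&self, doc_id: u32) -> u64 {
        let range = self.get_range(doc_id);
        range.end - range.start
    }
    fn get_total_len(&self) -> u64 {
        *self.0.last().unwrap()
    }
}

fn main() {
    let offsets = Offsets(vec![0, 10, 12, 15, 22, 23]);
    assert_eq!(offsets.get_range(1), 10..12);
    assert_eq!(offsets.get_len(3), 7);
    assert_eq!(offsets.get_total_len(), 23);
}
```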

View File

@@ -1,140 +0,0 @@
use std::ops::Range;
use std::sync::Arc;
use fastfield_codecs::Column;
use crate::DocId;
#[derive(Clone)]
/// Index to resolve value range for given doc_id.
/// Starts at 0.
pub struct MultiValueIndex {
idx: Arc<dyn Column<u64>>,
}
impl MultiValueIndex {
pub(crate) fn new(idx: Arc<dyn Column<u64>>) -> Self {
Self { idx }
}
/// Returns `[start, end)`, such that the values associated with
/// the given document are `start..end`.
#[inline]
pub(crate) fn range(&self, doc: DocId) -> Range<u32> {
let start = self.idx.get_val(doc) as u32;
let end = self.idx.get_val(doc + 1) as u32;
start..end
}
/// Returns `[start, end)`, such that the values associated with
/// the given documents are `start..end`.
///
/// The passed end range is allowed to be out of bounds.
#[inline]
pub(crate) fn docid_range_to_position_range(&self, range: Range<DocId>) -> Range<u32> {
let end_docid = range.end.min(self.num_docs() - 1) + 1;
let start_docid = range.start.min(end_docid);
let start = self.idx.get_val(start_docid) as u32;
let end = self.idx.get_val(end_docid) as u32;
assert!(start <= end);
start..end
}
/// returns the num of values associated with a doc_id
pub(crate) fn num_vals_for_doc(&self, doc: DocId) -> u32 {
let range = self.range(doc);
range.end - range.start
}
/// Returns the overall number of values in this field.
#[inline]
pub fn total_num_vals(&self) -> u64 {
self.idx.max_value()
}
/// Returns the number of documents in the index.
#[inline]
pub fn num_docs(&self) -> u32 {
self.idx.num_vals() - 1
}
/// Converts a list of positions of values in a 1:n index to the corresponding list of DocIds.
/// Positions are converted inplace to docids.
///
/// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the
/// index.
///
/// Correctness: positions needs to be sorted. idx_reader needs to contain monotonically
/// increasing positions.
///
///
/// TODO: Instead of a linear scan we can employ an exponential search into a binary search to
/// match a docid to its value position.
pub(crate) fn positions_to_docids(&self, doc_id_range: Range<u32>, positions: &mut Vec<u32>) {
if positions.is_empty() {
return;
}
let mut cur_doc = doc_id_range.start;
let mut last_doc = None;
assert!(self.idx.get_val(doc_id_range.start) as u32 <= positions[0]);
let mut write_doc_pos = 0;
for i in 0..positions.len() {
let pos = positions[i];
loop {
let end = self.idx.get_val(cur_doc + 1) as u32;
if end > pos {
positions[write_doc_pos] = cur_doc;
write_doc_pos += if last_doc == Some(cur_doc) { 0 } else { 1 };
last_doc = Some(cur_doc);
break;
}
cur_doc += 1;
}
}
positions.truncate(write_doc_pos);
}
}
#[cfg(test)]
mod tests {
use std::ops::Range;
use std::sync::Arc;
use fastfield_codecs::IterColumn;
use crate::fastfield::MultiValueIndex;
fn index_to_pos_helper(
index: &MultiValueIndex,
doc_id_range: Range<u32>,
positions: &[u32],
) -> Vec<u32> {
let mut positions = positions.to_vec();
index.positions_to_docids(doc_id_range, &mut positions);
positions
}
#[test]
fn test_positions_to_docid() {
let offsets = vec![0, 10, 12, 15, 22, 23]; // docid values are [0..10, 10..12, 12..15, etc.]
let column = IterColumn::from(offsets.into_iter());
let index = MultiValueIndex::new(Arc::new(column));
assert_eq!(index.num_docs(), 5);
{
let positions = vec![10u32, 11, 15, 20, 21, 22];
assert_eq!(index_to_pos_helper(&index, 0..5, &positions), vec![1, 3, 4]);
assert_eq!(index_to_pos_helper(&index, 1..5, &positions), vec![1, 3, 4]);
assert_eq!(index_to_pos_helper(&index, 0..5, &[9]), vec![0]);
assert_eq!(index_to_pos_helper(&index, 1..5, &[10]), vec![1]);
assert_eq!(index_to_pos_helper(&index, 1..5, &[11]), vec![1]);
assert_eq!(index_to_pos_helper(&index, 2..5, &[12]), vec![2]);
assert_eq!(index_to_pos_helper(&index, 2..5, &[12, 14]), vec![2]);
assert_eq!(index_to_pos_helper(&index, 2..5, &[12, 14, 15]), vec![2, 3]);
}
}
}
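
The removed `MultiValueIndex` (left column) maps value positions back to doc ids with a single forward scan over the offsets column, deduplicating doc ids in place. A self-contained sketch of that scan over a plain offsets slice, using the same data as the test above:

```rust
// offsets[d] is the first value position of doc d; one sentinel entry at the end.
// Converts sorted value positions into the doc ids that own them, in place.
fn positions_to_docids(offsets: &[u64], positions: &mut Vec<u32>) {
    if positions.is_empty() {
        return;
    }
    let mut cur_doc = 0u32;
    let mut last_doc = None;
    let mut write = 0;
    for i in 0..positions.len() {
        let pos = positions[i];
        loop {
            // First value position past cur_doc's range.
            let end = offsets[cur_doc as usize + 1] as u32;
            if end > pos {
                positions[write] = cur_doc;
                write += if last_doc == Some(cur_doc) { 0 } else { 1 };
                last_doc = Some(cur_doc);
                break;
            }
            cur_doc += 1;
        }
    }
    positions.truncate(write);
}

fn main() {
    // Doc value ranges: 0..10, 10..12, 12..15, 15..22, 22..23.
    let offsets = [0u64, 10, 12, 15, 22, 23];
    let mut positions = vec![10u32, 11, 15, 20, 21, 22];
    positions_to_docids(&offsets, &mut positions);
    assert_eq!(positions, vec![1, 3, 4]);
}
```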

View File

@@ -1,9 +1,7 @@
mod index;
mod reader; mod reader;
mod writer; mod writer;
use fastfield_codecs::FastFieldCodecType; use fastfield_codecs::FastFieldCodecType;
pub use index::MultiValueIndex;
pub use self::reader::{MultiValuedFastFieldReader, MultiValuedU128FastFieldReader}; pub use self::reader::{MultiValuedFastFieldReader, MultiValuedU128FastFieldReader};
pub(crate) use self::writer::MultivalueStartIndex; pub(crate) use self::writer::MultivalueStartIndex;
@@ -517,7 +515,7 @@ mod bench {
for val in block { for val in block {
doc.add_u64(field, *val); doc.add_u64(field, *val);
} }
fast_field_writers.add_document(&doc).unwrap(); fast_field_writers.add_document(&doc);
} }
fast_field_writers fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None) .serialize(&mut serializer, &HashMap::new(), None)
@@ -575,7 +573,7 @@ mod bench {
for val in block { for val in block {
doc.add_u64(field, *val); doc.add_u64(field, *val);
} }
fast_field_writers.add_document(&doc).unwrap(); fast_field_writers.add_document(&doc);
} }
fast_field_writers fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None) .serialize(&mut serializer, &HashMap::new(), None)
@@ -608,7 +606,7 @@ mod bench {
for val in block { for val in block {
doc.add_u64(field, *val); doc.add_u64(field, *val);
} }
fast_field_writers.add_document(&doc).unwrap(); fast_field_writers.add_document(&doc);
} }
fast_field_writers fast_field_writers
.serialize(&mut serializer, &HashMap::new(), Some(&doc_id_mapping)) .serialize(&mut serializer, &HashMap::new(), Some(&doc_id_mapping))

View File

@@ -3,8 +3,7 @@ use std::sync::Arc;
use fastfield_codecs::{Column, MonotonicallyMappableToU128}; use fastfield_codecs::{Column, MonotonicallyMappableToU128};
use super::MultiValueIndex; use crate::fastfield::{FastValue, MultiValueLength};
use crate::fastfield::FastValue;
use crate::DocId; use crate::DocId;
/// Reader for a multivalued `u64` fast field. /// Reader for a multivalued `u64` fast field.
@@ -14,10 +13,9 @@ use crate::DocId;
/// The `vals_reader` will access the concatenated list of all /// The `vals_reader` will access the concatenated list of all
/// values for all reader. /// values for all reader.
/// The `idx_reader` associated, for each document, the index of its first value. /// The `idx_reader` associated, for each document, the index of its first value.
/// Stores the start position for each document.
#[derive(Clone)] #[derive(Clone)]
pub struct MultiValuedFastFieldReader<Item: FastValue> { pub struct MultiValuedFastFieldReader<Item: FastValue> {
idx_reader: MultiValueIndex, idx_reader: Arc<dyn Column<u64>>,
vals_reader: Arc<dyn Column<Item>>, vals_reader: Arc<dyn Column<Item>>,
} }
@@ -27,32 +25,36 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
vals_reader: Arc<dyn Column<Item>>, vals_reader: Arc<dyn Column<Item>>,
) -> MultiValuedFastFieldReader<Item> { ) -> MultiValuedFastFieldReader<Item> {
MultiValuedFastFieldReader { MultiValuedFastFieldReader {
idx_reader: MultiValueIndex::new(idx_reader), idx_reader,
vals_reader, vals_reader,
} }
} }
/// Returns `[start, end)`, such that the values associated with
/// the given document are `start..end`.
#[inline]
fn range(&self, doc: DocId) -> Range<u64> {
let idx = doc as u64;
let start = self.idx_reader.get_val(idx);
let end = self.idx_reader.get_val(idx + 1);
start..end
}
/// Returns the array of values associated with the given `doc`. /// Returns the array of values associated with the given `doc`.
#[inline] #[inline]
fn get_vals_for_range(&self, range: Range<u32>, vals: &mut Vec<Item>) { fn get_vals_for_range(&self, range: Range<u64>, vals: &mut Vec<Item>) {
let len = (range.end - range.start) as usize; let len = (range.end - range.start) as usize;
vals.resize(len, Item::make_zero()); vals.resize(len, Item::make_zero());
self.vals_reader self.vals_reader.get_range(range.start, &mut vals[..]);
.get_range(range.start as u64, &mut vals[..]);
} }
/// Returns the array of values associated with the given `doc`. /// Returns the array of values associated with the given `doc`.
#[inline] #[inline]
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) { pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
let range = self.idx_reader.range(doc); let range = self.range(doc);
self.get_vals_for_range(range, vals); self.get_vals_for_range(range, vals);
} }
/// returns the multivalue index
pub fn get_index_reader(&self) -> &MultiValueIndex {
&self.idx_reader
}
/// Returns the minimum value for this fast field. /// Returns the minimum value for this fast field.
/// ///
/// The min value does not take in account of possible /// The min value does not take in account of possible
@@ -73,14 +75,28 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
/// Returns the number of values associated with the document `DocId`. /// Returns the number of values associated with the document `DocId`.
#[inline] #[inline]
pub fn num_vals(&self, doc: DocId) -> u32 { pub fn num_vals(&self, doc: DocId) -> usize {
self.idx_reader.num_vals_for_doc(doc) let range = self.range(doc);
(range.end - range.start) as usize
} }
/// Returns the overall number of values in this field. /// Returns the overall number of values in this field .
#[inline] #[inline]
pub fn total_num_vals(&self) -> u64 { pub fn total_num_vals(&self) -> u64 {
self.idx_reader.total_num_vals() self.idx_reader.max_value()
}
}
impl<Item: FastValue> MultiValueLength for MultiValuedFastFieldReader<Item> {
fn get_range(&self, doc_id: DocId) -> Range<u64> {
self.range(doc_id)
}
fn get_len(&self, doc_id: DocId) -> u64 {
self.num_vals(doc_id) as u64
}
fn get_total_len(&self) -> u64 {
self.total_num_vals() as u64
} }
} }
@@ -93,7 +109,7 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
/// The `idx_reader` associated, for each document, the index of its first value. /// The `idx_reader` associated, for each document, the index of its first value.
#[derive(Clone)] #[derive(Clone)]
pub struct MultiValuedU128FastFieldReader<T: MonotonicallyMappableToU128> { pub struct MultiValuedU128FastFieldReader<T: MonotonicallyMappableToU128> {
idx_reader: MultiValueIndex, idx_reader: Arc<dyn Column<u64>>,
vals_reader: Arc<dyn Column<T>>, vals_reader: Arc<dyn Column<T>>,
} }
@@ -103,31 +119,24 @@ impl<T: MonotonicallyMappableToU128> MultiValuedU128FastFieldReader<T> {
vals_reader: Arc<dyn Column<T>>, vals_reader: Arc<dyn Column<T>>,
) -> MultiValuedU128FastFieldReader<T> { ) -> MultiValuedU128FastFieldReader<T> {
Self { Self {
idx_reader: MultiValueIndex::new(idx_reader), idx_reader,
vals_reader, vals_reader,
} }
} }
/// Returns `[start, end)`, such that the values associated
/// to the given document are `start..end`.
#[inline] #[inline]
fn get_docids_for_value_range( fn range(&self, doc: DocId) -> Range<u64> {
&self, let start = self.idx_reader.get_val(doc as u64);
value_range: RangeInclusive<T>, let end = self.idx_reader.get_val(doc as u64 + 1);
doc_id_range: Range<u32>, start..end
positions: &mut Vec<u32>,
) {
let position_range = self
.get_index_reader()
.docid_range_to_position_range(doc_id_range.clone());
self.vals_reader
.get_docids_for_value_range(value_range, position_range, positions);
self.idx_reader.positions_to_docids(doc_id_range, positions);
} }
/// Returns the array of values associated to the given `doc`. /// Returns the array of values associated to the given `doc`.
#[inline] #[inline]
pub fn get_first_val(&self, doc: DocId) -> Option<T> { pub fn get_first_val(&self, doc: DocId) -> Option<T> {
let range = self.idx_reader.range(doc); let range = self.range(doc);
if range.is_empty() { if range.is_empty() {
return None; return None;
} }
@@ -136,25 +145,26 @@ impl<T: MonotonicallyMappableToU128> MultiValuedU128FastFieldReader<T> {
/// Returns the array of values associated to the given `doc`. /// Returns the array of values associated to the given `doc`.
#[inline] #[inline]
fn get_vals_for_range(&self, range: Range<u32>, vals: &mut Vec<T>) { fn get_vals_for_range(&self, range: Range<u64>, vals: &mut Vec<T>) {
let len = (range.end - range.start) as usize; let len = (range.end - range.start) as usize;
vals.resize(len, T::from_u128(0)); vals.resize(len, T::from_u128(0));
self.vals_reader self.vals_reader.get_range(range.start, &mut vals[..]);
.get_range(range.start as u64, &mut vals[..]);
}
/// Returns the index reader
pub fn get_index_reader(&self) -> &MultiValueIndex {
&self.idx_reader
} }
/// Returns the array of values associated to the given `doc`. /// Returns the array of values associated to the given `doc`.
#[inline] #[inline]
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<T>) { pub fn get_vals(&self, doc: DocId, vals: &mut Vec<T>) {
let range = self.idx_reader.range(doc); let range = self.range(doc);
self.get_vals_for_range(range, vals); self.get_vals_for_range(range, vals);
} }
/// Returns all docids which are in the provided value range
pub fn get_between_vals(&self, range: RangeInclusive<T>) -> Vec<DocId> {
let positions = self.vals_reader.get_between_vals(range);
positions_to_docids(&positions, self.idx_reader.as_ref())
}
/// Iterates over all elements in the fast field /// Iterates over all elements in the fast field
pub fn iter(&self) -> impl Iterator<Item = T> + '_ { pub fn iter(&self) -> impl Iterator<Item = T> + '_ {
self.vals_reader.iter() self.vals_reader.iter()
@@ -180,27 +190,85 @@ impl<T: MonotonicallyMappableToU128> MultiValuedU128FastFieldReader<T> {
/// Returns the number of values associated with the document `DocId`. /// Returns the number of values associated with the document `DocId`.
#[inline] #[inline]
pub fn num_vals(&self, doc: DocId) -> u32 { pub fn num_vals(&self, doc: DocId) -> usize {
self.idx_reader.num_vals_for_doc(doc) let range = self.range(doc);
(range.end - range.start) as usize
} }
/// Returns the overall number of values in this field. It does not include deletes. /// Returns the overall number of values in this field.
#[inline] #[inline]
pub fn total_num_vals(&self) -> u64 { pub fn total_num_vals(&self) -> u64 {
assert_eq!( self.idx_reader.max_value()
self.vals_reader.num_vals() as u64,
self.get_index_reader().total_num_vals()
);
self.idx_reader.total_num_vals()
} }
} }
impl<T: MonotonicallyMappableToU128> MultiValueLength for MultiValuedU128FastFieldReader<T> {
fn get_range(&self, doc_id: DocId) -> std::ops::Range<u64> {
self.range(doc_id)
}
fn get_len(&self, doc_id: DocId) -> u64 {
self.num_vals(doc_id) as u64
}
fn get_total_len(&self) -> u64 {
self.total_num_vals() as u64
}
}
/// Converts a list of positions of values in a 1:n index to the corresponding list of DocIds.
///
/// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the index.
///
/// Correctness: positions needs to be sorted. idx_reader needs to contain monotonically increasing
/// positions.
///
/// TODO: Instead of a linear scan we can employ an exponential search into a binary search to match a
/// docid to its value position.
fn positions_to_docids<C: Column + ?Sized>(positions: &[u64], idx_reader: &C) -> Vec<DocId> {
let mut docs = vec![];
let mut cur_doc = 0u32;
let mut last_doc = None;
for pos in positions {
loop {
let end = idx_reader.get_val(cur_doc as u64 + 1);
if end > *pos {
// avoid duplicates
if Some(cur_doc) == last_doc {
break;
}
docs.push(cur_doc);
last_doc = Some(cur_doc);
break;
}
cur_doc += 1;
}
}
docs
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use fastfield_codecs::VecColumn;
use crate::core::Index; use crate::core::Index;
use crate::fastfield::multivalued::reader::positions_to_docids;
use crate::schema::{Cardinality, Facet, FacetOptions, NumericOptions, Schema}; use crate::schema::{Cardinality, Facet, FacetOptions, NumericOptions, Schema};
#[test]
fn test_positions_to_docid() {
let positions = vec![10u64, 11, 15, 20, 21, 22];
let offsets = vec![0, 10, 12, 15, 22, 23];
{
let column = VecColumn::from(&offsets);
let docids = positions_to_docids(&positions, &column);
assert_eq!(docids, vec![1, 3, 4]);
}
}
#[test] #[test]
fn test_multifastfield_reader() -> crate::Result<()> { fn test_multifastfield_reader() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();

View File

@@ -3,7 +3,7 @@ use std::io;
use fastfield_codecs::{ use fastfield_codecs::{
Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64, VecColumn, Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64, VecColumn,
}; };
use rustc_hash::FxHashMap; use fnv::FnvHashMap;
use super::get_fastfield_codecs_for_multivalue; use super::get_fastfield_codecs_for_multivalue;
use crate::fastfield::writer::unexpected_value; use crate::fastfield::writer::unexpected_value;
@@ -144,7 +144,7 @@ impl MultiValuedFastFieldWriter {
pub fn serialize( pub fn serialize(
mut self, mut self,
serializer: &mut CompositeFastFieldSerializer, serializer: &mut CompositeFastFieldSerializer,
term_mapping_opt: Option<&FxHashMap<UnorderedTermId, TermOrdinal>>, term_mapping_opt: Option<&FnvHashMap<UnorderedTermId, TermOrdinal>>,
doc_id_map: Option<&DocIdMapping>, doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> { ) -> io::Result<()> {
{ {
@@ -219,7 +219,7 @@ pub(crate) struct MultivalueStartIndex<'a, C: Column> {
impl<'a, C: Column> MultivalueStartIndex<'a, C> { impl<'a, C: Column> MultivalueStartIndex<'a, C> {
pub fn new(column: &'a C, doc_id_map: &'a DocIdMapping) -> Self { pub fn new(column: &'a C, doc_id_map: &'a DocIdMapping) -> Self {
assert_eq!(column.num_vals(), doc_id_map.num_old_doc_ids() as u32 + 1); assert_eq!(column.num_vals(), doc_id_map.num_old_doc_ids() as u64 + 1);
let (min, max) = let (min, max) =
tantivy_bitpacker::minmax(iter_remapped_multivalue_index(doc_id_map, column)) tantivy_bitpacker::minmax(iter_remapped_multivalue_index(doc_id_map, column))
.unwrap_or((0u64, 0u64)); .unwrap_or((0u64, 0u64));
@@ -232,7 +232,7 @@ impl<'a, C: Column> MultivalueStartIndex<'a, C> {
} }
} }
impl<'a, C: Column> Column for MultivalueStartIndex<'a, C> { impl<'a, C: Column> Column for MultivalueStartIndex<'a, C> {
fn get_val(&self, _idx: u32) -> u64 { fn get_val(&self, _idx: u64) -> u64 {
unimplemented!() unimplemented!()
} }
@@ -244,8 +244,8 @@ impl<'a, C: Column> Column for MultivalueStartIndex<'a, C> {
self.max self.max
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
(self.doc_id_map.num_new_doc_ids() + 1) as u32 (self.doc_id_map.num_new_doc_ids() + 1) as u64
} }
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> { fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
@@ -262,7 +262,7 @@ fn iter_remapped_multivalue_index<'a, C: Column>(
) -> impl Iterator<Item = u64> + 'a { ) -> impl Iterator<Item = u64> + 'a {
let mut offset = 0; let mut offset = 0;
std::iter::once(0).chain(doc_id_map.iter_old_doc_ids().map(move |old_doc| { std::iter::once(0).chain(doc_id_map.iter_old_doc_ids().map(move |old_doc| {
let num_vals_for_doc = column.get_val(old_doc + 1) - column.get_val(old_doc); let num_vals_for_doc = column.get_val(old_doc as u64 + 1) - column.get_val(old_doc as u64);
offset += num_vals_for_doc; offset += num_vals_for_doc;
offset as u64 offset as u64
})) }))
@@ -369,7 +369,7 @@ impl MultiValueU128FastFieldWriter {
serializer.create_u128_fast_field_with_idx( serializer.create_u128_fast_field_with_idx(
self.field, self.field,
iter_gen, iter_gen,
self.vals.len() as u32, self.vals.len() as u64,
1, 1,
)?; )?;
} }
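
`iter_remapped_multivalue_index` rebuilds the start-offset column for a re-sorted segment: it walks the new doc order, looks up each old document's value count, and emits the running sum with a leading 0. A small sketch of that remapping over plain slices:

```rust
// new_to_old[i] is the old doc id that becomes new doc id i; old_offsets carries
// one sentinel entry at the end, as in the multivalue start index.
fn remap_start_offsets(old_offsets: &[u64], new_to_old: &[u32]) -> Vec<u64> {
    let mut offsets = Vec::with_capacity(new_to_old.len() + 1);
    let mut offset = 0u64;
    offsets.push(0);
    for &old_doc in new_to_old {
        let num_vals = old_offsets[old_doc as usize + 1] - old_offsets[old_doc as usize];
        offset += num_vals;
        offsets.push(offset);
    }
    offsets
}

fn main() {
    // Old documents own 2, 0 and 3 values respectively.
    let old_offsets = [0u64, 2, 2, 5];
    // New order: old doc 2 first, then doc 0, then doc 1.
    assert_eq!(remap_start_offsets(&old_offsets, &[2, 0, 1]), vec![0, 3, 5, 5]);
}
```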

View File

@@ -90,7 +90,7 @@ impl CompositeFastFieldSerializer {
&mut self, &mut self,
field: Field, field: Field,
iter_gen: F, iter_gen: F,
num_vals: u32, num_vals: u64,
idx: usize, idx: usize,
) -> io::Result<()> { ) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, idx); let field_write = self.composite_write.for_field_with_idx(field, idx);

View File

@@ -3,7 +3,7 @@ use std::io;
use common; use common;
use fastfield_codecs::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64}; use fastfield_codecs::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
use rustc_hash::FxHashMap; use fnv::FnvHashMap;
use tantivy_bitpacker::BlockedBitpacker; use tantivy_bitpacker::BlockedBitpacker;
use super::multivalued::{MultiValueU128FastFieldWriter, MultiValuedFastFieldWriter}; use super::multivalued::{MultiValueU128FastFieldWriter, MultiValuedFastFieldWriter};
@@ -256,7 +256,7 @@ impl FastFieldsWriter {
pub fn serialize( pub fn serialize(
self, self,
serializer: &mut CompositeFastFieldSerializer, serializer: &mut CompositeFastFieldSerializer,
mapping: &HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>>, mapping: &HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>,
doc_id_map: Option<&DocIdMapping>, doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> { ) -> io::Result<()> {
for field_writer in self.term_id_writers { for field_writer in self.term_id_writers {
@@ -363,7 +363,7 @@ impl U128FastFieldWriter {
serializer.create_u128_fast_field_with_idx( serializer.create_u128_fast_field_with_idx(
self.field, self.field,
iter_gen, iter_gen,
self.val_count as u32, self.val_count as u64,
0, 0,
)?; )?;
} else { } else {
@@ -371,7 +371,7 @@ impl U128FastFieldWriter {
serializer.create_u128_fast_field_with_idx( serializer.create_u128_fast_field_with_idx(
self.field, self.field,
iter_gen, iter_gen,
self.val_count as u32, self.val_count as u64,
0, 0,
)?; )?;
} }
@@ -511,7 +511,7 @@ impl IntFastFieldWriter {
vals: &self.vals, vals: &self.vals,
min_value: min, min_value: min,
max_value: max, max_value: max,
num_vals: self.val_count as u32, num_vals: self.val_count as u64,
}; };
serializer.create_auto_detect_u64_fast_field(self.field, fastfield_accessor)?; serializer.create_auto_detect_u64_fast_field(self.field, fastfield_accessor)?;
@@ -526,7 +526,7 @@ struct WriterFastFieldAccessProvider<'map, 'bitp> {
vals: &'bitp BlockedBitpacker, vals: &'bitp BlockedBitpacker,
min_value: u64, min_value: u64,
max_value: u64, max_value: u64,
num_vals: u32, num_vals: u64,
} }
impl<'map, 'bitp> Column for WriterFastFieldAccessProvider<'map, 'bitp> { impl<'map, 'bitp> Column for WriterFastFieldAccessProvider<'map, 'bitp> {
@@ -538,7 +538,7 @@ impl<'map, 'bitp> Column for WriterFastFieldAccessProvider<'map, 'bitp> {
/// # Panics /// # Panics
/// ///
/// May panic if `doc` is greater than the index. /// May panic if `doc` is greater than the index.
fn get_val(&self, _doc: u32) -> u64 { fn get_val(&self, _doc: u64) -> u64 {
unimplemented!() unimplemented!()
} }
@@ -562,7 +562,7 @@ impl<'map, 'bitp> Column for WriterFastFieldAccessProvider<'map, 'bitp> {
self.max_value self.max_value
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.num_vals self.num_vals
} }
} }

View File

@@ -95,7 +95,7 @@ fn compute_deleted_bitset(
// document that were inserted before it. // document that were inserted before it.
delete_op delete_op
.target .target
.for_each_no_score(segment_reader, &mut |doc_matching_delete_query| { .for_each(segment_reader, &mut |doc_matching_delete_query, _| {
if doc_opstamps.is_deleted(doc_matching_delete_query, delete_op.opstamp) { if doc_opstamps.is_deleted(doc_matching_delete_query, delete_op.opstamp) {
alive_bitset.remove(doc_matching_delete_query); alive_bitset.remove(doc_matching_delete_query);
might_have_changed = true; might_have_changed = true;
@@ -805,7 +805,7 @@ mod tests {
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
use std::net::Ipv6Addr; use std::net::Ipv6Addr;
use fastfield_codecs::{Column, MonotonicallyMappableToU128}; use fastfield_codecs::MonotonicallyMappableToU128;
use proptest::prelude::*; use proptest::prelude::*;
use proptest::prop_oneof; use proptest::prop_oneof;
use proptest::strategy::Strategy; use proptest::strategy::Strategy;
@@ -1472,7 +1472,7 @@ mod tests {
let fast_field_reader = segment_reader.fast_fields().u64(id_field)?; let fast_field_reader = segment_reader.fast_fields().u64(id_field)?;
let in_order_alive_ids: Vec<u64> = segment_reader let in_order_alive_ids: Vec<u64> = segment_reader
.doc_ids_alive() .doc_ids_alive()
.map(|doc| fast_field_reader.get_val(doc)) .map(|doc| fast_field_reader.get_val(doc as u64))
.collect(); .collect();
assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 1, 0]); assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 1, 0]);
Ok(()) Ok(())
@@ -1533,7 +1533,7 @@ mod tests {
let fast_field_reader = segment_reader.fast_fields().u64(id_field)?; let fast_field_reader = segment_reader.fast_fields().u64(id_field)?;
let in_order_alive_ids: Vec<u64> = segment_reader let in_order_alive_ids: Vec<u64> = segment_reader
.doc_ids_alive() .doc_ids_alive()
.map(|doc| fast_field_reader.get_val(doc)) .map(|doc| fast_field_reader.get_val(doc as u64))
.collect(); .collect();
assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 2, 0]); assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 2, 0]);
Ok(()) Ok(())
@@ -1591,25 +1591,6 @@ mod tests {
(existing_ids, deleted_ids) (existing_ids, deleted_ids)
} }
fn get_id_list(ops: &[IndexingOp]) -> Vec<u64> {
let mut id_list = Vec::new();
for &op in ops {
match op {
IndexingOp::AddDoc { id } => {
id_list.push(id);
}
IndexingOp::DeleteDoc { id } => {
id_list.retain(|el| *el != id);
}
IndexingOp::DeleteDocQuery { id } => {
id_list.retain(|el| *el != id);
}
_ => {}
}
}
id_list
}
fn test_operation_strategy( fn test_operation_strategy(
ops: &[IndexingOp], ops: &[IndexingOp],
sort_index: bool, sort_index: bool,
@@ -1619,9 +1600,7 @@ mod tests {
let ip_field = schema_builder.add_ip_addr_field("ip", FAST | INDEXED | STORED); let ip_field = schema_builder.add_ip_addr_field("ip", FAST | INDEXED | STORED);
let ips_field = schema_builder.add_ip_addr_field( let ips_field = schema_builder.add_ip_addr_field(
"ips", "ips",
IpAddrOptions::default() IpAddrOptions::default().set_fast(Cardinality::MultiValues),
.set_fast(Cardinality::MultiValues)
.set_indexed(),
); );
let id_field = schema_builder.add_u64_field("id", FAST | INDEXED | STORED); let id_field = schema_builder.add_u64_field("id", FAST | INDEXED | STORED);
let i64_field = schema_builder.add_i64_field("i64", INDEXED); let i64_field = schema_builder.add_i64_field("i64", INDEXED);
@@ -1640,7 +1619,6 @@ mod tests {
); );
let large_text_field = schema_builder.add_text_field("large_text_field", TEXT | STORED); let large_text_field = schema_builder.add_text_field("large_text_field", TEXT | STORED);
let multi_text_fields = schema_builder.add_text_field("multi_text_fields", TEXT | STORED);
let multi_numbers = schema_builder.add_u64_field( let multi_numbers = schema_builder.add_u64_field(
"multi_numbers", "multi_numbers",
@@ -1680,19 +1658,11 @@ mod tests {
let ip_exists = |id| id % 3 != 0; // 0 does not exist let ip_exists = |id| id % 3 != 0; // 0 does not exist
let multi_text_field_text1 = "test1 test2 test3 test1 test2 test3";
// rotate left
let multi_text_field_text2 = "test2 test3 test1 test2 test3 test1";
// rotate right
let multi_text_field_text3 = "test3 test1 test2 test3 test1 test2";
let ip_from_id = |id| Ipv6Addr::from_u128(id as u128);
for &op in ops { for &op in ops {
match op { match op {
IndexingOp::AddDoc { id } => { IndexingOp::AddDoc { id } => {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string())); let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
let ip = ip_from_id(id); let ip_from_id = Ipv6Addr::from_u128(id as u128);
if !ip_exists(id) { if !ip_exists(id) {
// every 3rd doc has no ip field // every 3rd doc has no ip field
@@ -1708,17 +1678,14 @@ mod tests {
multi_bools => (id % 2u64) == 0, multi_bools => (id % 2u64) == 0,
text_field => id.to_string(), text_field => id.to_string(),
facet_field => facet, facet_field => facet,
large_text_field => LOREM, large_text_field=> LOREM
multi_text_fields => multi_text_field_text1,
multi_text_fields => multi_text_field_text2,
multi_text_fields => multi_text_field_text3,
))?; ))?;
} else { } else {
index_writer.add_document(doc!(id_field=>id, index_writer.add_document(doc!(id_field=>id,
bytes_field => id.to_le_bytes().as_slice(), bytes_field => id.to_le_bytes().as_slice(),
ip_field => ip, ip_field => ip_from_id,
ips_field => ip, ips_field => ip_from_id,
ips_field => ip, ips_field => ip_from_id,
multi_numbers=> id, multi_numbers=> id,
multi_numbers => id, multi_numbers => id,
bool_field => (id % 2u64) != 0, bool_field => (id % 2u64) != 0,
@@ -1729,10 +1696,7 @@ mod tests {
multi_bools => (id % 2u64) == 0, multi_bools => (id % 2u64) == 0,
text_field => id.to_string(), text_field => id.to_string(),
facet_field => facet, facet_field => facet,
large_text_field => LOREM, large_text_field=> LOREM
multi_text_fields => multi_text_field_text1,
multi_text_fields => multi_text_field_text2,
multi_text_fields => multi_text_field_text3,
))?; ))?;
} }
} }
@@ -1761,7 +1725,6 @@ mod tests {
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let num_segments_before_merge = searcher.segment_readers().len();
if force_end_merge { if force_end_merge {
index_writer.wait_merging_threads()?; index_writer.wait_merging_threads()?;
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
@@ -1773,7 +1736,6 @@ mod tests {
assert!(index_writer.wait_merging_threads().is_ok()); assert!(index_writer.wait_merging_threads().is_ok());
} }
} }
let num_segments_after_merge = searcher.segment_readers().len();
old_reader.reload()?; old_reader.reload()?;
let old_searcher = old_reader.searcher(); let old_searcher = old_reader.searcher();
@@ -1785,7 +1747,7 @@ mod tests {
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap(); let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
segment_reader segment_reader
.doc_ids_alive() .doc_ids_alive()
.map(move |doc| ff_reader.get_val(doc)) .map(move |doc| ff_reader.get_val(doc as u64))
}) })
.collect(); .collect();
@@ -1796,27 +1758,11 @@ mod tests {
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap(); let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
segment_reader segment_reader
.doc_ids_alive() .doc_ids_alive()
.map(move |doc| ff_reader.get_val(doc)) .map(move |doc| ff_reader.get_val(doc as u64))
}) })
.collect(); .collect();
let (expected_ids_and_num_occurrences, deleted_ids) = expected_ids(ops); let (expected_ids_and_num_occurrences, deleted_ids) = expected_ids(ops);
let id_list = get_id_list(ops);
// multivalue fast field content
let mut all_ips = Vec::new();
let mut num_ips = 0;
for segment_reader in searcher.segment_readers().iter() {
let ip_reader = segment_reader.fast_fields().ip_addrs(ips_field).unwrap();
for doc in segment_reader.doc_ids_alive() {
let mut vals = vec![];
ip_reader.get_vals(doc, &mut vals);
all_ips.extend_from_slice(&vals);
}
num_ips += ip_reader.total_num_vals();
}
let num_docs_expected = expected_ids_and_num_occurrences let num_docs_expected = expected_ids_and_num_occurrences
.iter() .iter()
.map(|(_, id_occurrences)| *id_occurrences as usize) .map(|(_, id_occurrences)| *id_occurrences as usize)
@@ -1838,30 +1784,6 @@ mod tests {
.collect::<HashSet<_>>() .collect::<HashSet<_>>()
); );
if force_end_merge && num_segments_before_merge > 1 && num_segments_after_merge == 1 {
let mut expected_multi_ips: Vec<_> = id_list
.iter()
.filter(|id| ip_exists(**id))
.flat_map(|id| vec![ip_from_id(*id), ip_from_id(*id)])
.collect();
assert_eq!(num_ips, expected_multi_ips.len() as u64);
expected_multi_ips.sort();
all_ips.sort();
assert_eq!(expected_multi_ips, all_ips);
// Test fastfield num_docs
let num_docs: usize = searcher
.segment_readers()
.iter()
.map(|segment_reader| {
let ff_reader = segment_reader.fast_fields().ip_addrs(ips_field).unwrap();
ff_reader.num_docs() as usize
})
.sum();
assert_eq!(num_docs, num_docs_expected);
}
// Load all ips addr // Load all ips addr
let ips: HashSet<Ipv6Addr> = searcher let ips: HashSet<Ipv6Addr> = searcher
.segment_readers() .segment_readers()
@@ -1869,7 +1791,7 @@ mod tests {
.flat_map(|segment_reader| { .flat_map(|segment_reader| {
let ff_reader = segment_reader.fast_fields().ip_addr(ip_field).unwrap(); let ff_reader = segment_reader.fast_fields().ip_addr(ip_field).unwrap();
segment_reader.doc_ids_alive().flat_map(move |doc| { segment_reader.doc_ids_alive().flat_map(move |doc| {
let val = ff_reader.get_val(doc); let val = ff_reader.get_val(doc as u64);
if val == Ipv6Addr::from_u128(0) { if val == Ipv6Addr::from_u128(0) {
// TODO Fix null handling // TODO Fix null handling
None None
@@ -1926,7 +1848,7 @@ mod tests {
ff_reader.get_vals(doc, &mut vals); ff_reader.get_vals(doc, &mut vals);
assert_eq!(vals.len(), 2); assert_eq!(vals.len(), 2);
assert_eq!(vals[0], vals[1]); assert_eq!(vals[0], vals[1]);
assert_eq!(id_reader.get_val(doc), vals[0]); assert_eq!(id_reader.get_val(doc as u64), vals[0]);
let mut bool_vals = vec![]; let mut bool_vals = vec![];
bool_ff_reader.get_vals(doc, &mut bool_vals); bool_ff_reader.get_vals(doc, &mut bool_vals);
@@ -2000,21 +1922,11 @@ mod tests {
for (existing_id, count) in &expected_ids_and_num_occurrences { for (existing_id, count) in &expected_ids_and_num_occurrences {
let (existing_id, count) = (*existing_id, *count); let (existing_id, count) = (*existing_id, *count);
let get_num_hits = |field| do_search(&existing_id.to_string(), field).len() as u64; let assert_field = |field| do_search(&existing_id.to_string(), field).len() as u64;
assert_eq!(get_num_hits(text_field), count); assert_eq!(assert_field(text_field), count);
assert_eq!(get_num_hits(i64_field), count); assert_eq!(assert_field(i64_field), count);
assert_eq!(get_num_hits(f64_field), count); assert_eq!(assert_field(f64_field), count);
assert_eq!(get_num_hits(id_field), count); assert_eq!(assert_field(id_field), count);
// Test multi text
assert_eq!(
do_search("\"test1 test2\"", multi_text_fields).len(),
num_docs_expected
);
assert_eq!(
do_search("\"test2 test3\"", multi_text_fields).len(),
num_docs_expected
);
// Test bytes // Test bytes
let term = Term::from_field_bytes(bytes_field, existing_id.to_le_bytes().as_slice()); let term = Term::from_field_bytes(bytes_field, existing_id.to_le_bytes().as_slice());
@@ -2065,51 +1977,6 @@ mod tests {
assert_eq!(do_search_ip_field(&format!("\"{}\"", ip_addr)), count); assert_eq!(do_search_ip_field(&format!("\"{}\"", ip_addr)), count);
} }
} }
// assert the data matches expectations
//
for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) {
let (existing_id, count) = (*existing_id, *count);
if !ip_exists(existing_id) {
continue;
}
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string())
};
let ip = ip_from_id(existing_id);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
// Range query on single value field
// let query = gen_query_inclusive("ip", ip, ip);
// assert_eq!(do_search_ip_field(&query), count);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip, ip);
assert_eq!(do_search_ip_field(&query), count);
}
// ip range query on fast field
//
for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) {
let (existing_id, count) = (*existing_id, *count);
if !ip_exists(existing_id) {
continue;
}
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string())
};
let ip = ip_from_id(existing_id);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
// Range query on single value field
// let query = gen_query_inclusive("ip", ip, ip);
// assert_eq!(do_search_ip_field(&query), count);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip, ip);
assert_eq!(do_search_ip_field(&query), count);
}
// test facets // test facets
for segment_reader in searcher.segment_readers().iter() { for segment_reader in searcher.segment_readers().iter() {
let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap(); let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();
@@ -2122,7 +1989,7 @@ mod tests {
facet_reader facet_reader
.facet_from_ord(facet_ords[0], &mut facet) .facet_from_ord(facet_ords[0], &mut facet)
.unwrap(); .unwrap();
let id = ff_reader.get_val(doc_id); let id = ff_reader.get_val(doc_id as u64);
let facet_expected = Facet::from(&("/cola/".to_string() + &id.to_string())); let facet_expected = Facet::from(&("/cola/".to_string() + &id.to_string()));
assert_eq!(facet, facet_expected); assert_eq!(facet, facet_expected);
@@ -2131,40 +1998,6 @@ mod tests {
Ok(()) Ok(())
} }
#[test]
fn test_ip_range_query_multivalue_bug() {
assert!(test_operation_strategy(
&[
IndexingOp::AddDoc { id: 2 },
IndexingOp::Commit,
IndexingOp::AddDoc { id: 1 },
IndexingOp::AddDoc { id: 1 },
IndexingOp::Commit,
IndexingOp::Merge
],
true,
false
)
.is_ok());
}
#[test]
fn test_ff_num_ips_regression() {
assert!(test_operation_strategy(
&[
IndexingOp::AddDoc { id: 13 },
IndexingOp::AddDoc { id: 1 },
IndexingOp::Commit,
IndexingOp::DeleteDocQuery { id: 13 },
IndexingOp::AddDoc { id: 1 },
IndexingOp::Commit,
],
false,
true
)
.is_ok());
}
#[test] #[test]
fn test_minimal() { fn test_minimal() {
assert!(test_operation_strategy( assert!(test_operation_strategy(
@@ -2174,7 +2007,7 @@ mod tests {
IndexingOp::DeleteDoc { id: 13 } IndexingOp::DeleteDoc { id: 13 }
], ],
true, true,
true false
) )
.is_ok()); .is_ok());
View File
@@ -1,6 +1,6 @@
use fastfield_codecs::MonotonicallyMappableToU64; use fastfield_codecs::MonotonicallyMappableToU64;
use fnv::FnvHashMap;
use murmurhash32::murmurhash2; use murmurhash32::murmurhash2;
use rustc_hash::FxHashMap;
use crate::fastfield::FastValue; use crate::fastfield::FastValue;
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter}; use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
@@ -52,7 +52,7 @@ use crate::{DatePrecision, DateTime, DocId, Term};
/// path map to the same index position as long as the probability is relatively low. /// path map to the same index position as long as the probability is relatively low.
#[derive(Default)] #[derive(Default)]
struct IndexingPositionsPerPath { struct IndexingPositionsPerPath {
positions_per_path: FxHashMap<u32, IndexingPosition>, positions_per_path: FnvHashMap<u32, IndexingPosition>,
} }
impl IndexingPositionsPerPath { impl IndexingPositionsPerPath {
View File
@@ -13,7 +13,7 @@ use crate::docset::{DocSet, TERMINATED};
use crate::error::DataCorruption; use crate::error::DataCorruption;
use crate::fastfield::{ use crate::fastfield::{
get_fastfield_codecs_for_multivalue, AliveBitSet, Column, CompositeFastFieldSerializer, get_fastfield_codecs_for_multivalue, AliveBitSet, Column, CompositeFastFieldSerializer,
MultiValueIndex, MultiValuedFastFieldReader, MultiValuedU128FastFieldReader, MultiValueLength, MultiValuedFastFieldReader, MultiValuedU128FastFieldReader,
}; };
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter}; use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
use crate::indexer::doc_id_mapping::{expect_field_id_for_sort_field, SegmentDocIdMapping}; use crate::indexer::doc_id_mapping::{expect_field_id_for_sort_field, SegmentDocIdMapping};
@@ -348,29 +348,9 @@ impl IndexMerger {
field, field,
fast_field_serializer, fast_field_serializer,
doc_id_mapping, doc_id_mapping,
&segment_and_ff_readers &segment_and_ff_readers,
.iter()
.map(|(segment_reader, u64s_reader)| {
(*segment_reader, u64s_reader.get_index_reader())
})
.collect::<Vec<_>>(),
)?; )?;
let num_vals = segment_and_ff_readers
.iter()
.map(|(segment_reader, reader)| {
// TODO implement generic version, implement reverse scan, all - deletes
if let Some(alive_bitset) = segment_reader.alive_bitset() {
alive_bitset
.iter_alive()
.map(|doc| reader.num_vals(doc))
.sum()
} else {
reader.total_num_vals() as u32
}
})
.sum();
let fast_field_readers = segment_and_ff_readers let fast_field_readers = segment_and_ff_readers
.into_iter() .into_iter()
.map(|(_, ff_reader)| ff_reader) .map(|(_, ff_reader)| ff_reader)
@@ -385,7 +365,12 @@ impl IndexMerger {
}) })
}; };
fast_field_serializer.create_u128_fast_field_with_idx(field, iter_gen, num_vals, 1)?; fast_field_serializer.create_u128_fast_field_with_idx(
field,
iter_gen,
doc_id_mapping.len() as u64,
1,
)?;
Ok(()) Ok(())
} }
@@ -412,13 +397,13 @@ impl IndexMerger {
let iter_gen = || { let iter_gen = || {
doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| { doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| {
let fast_field_reader = &fast_field_readers[doc_addr.segment_ord as usize]; let fast_field_reader = &fast_field_readers[doc_addr.segment_ord as usize];
fast_field_reader.get_val(doc_addr.doc_id) fast_field_reader.get_val(doc_addr.doc_id as u64)
}) })
}; };
fast_field_serializer.create_u128_fast_field_with_idx( fast_field_serializer.create_u128_fast_field_with_idx(
field, field,
iter_gen, iter_gen,
doc_id_mapping.len() as u32, doc_id_mapping.len() as u64,
0, 0,
)?; )?;
Ok(()) Ok(())
@@ -525,8 +510,8 @@ impl IndexMerger {
doc_id_reader_pair doc_id_reader_pair
.into_iter() .into_iter()
.kmerge_by(|a, b| { .kmerge_by(|a, b| {
let val1 = a.2.get_val(a.0); let val1 = a.2.get_val(a.0 as u64);
let val2 = b.2.get_val(b.0); let val2 = b.2.get_val(b.0 as u64);
if sort_by_field.order == Order::Asc { if sort_by_field.order == Order::Asc {
val1 < val2 val1 < val2
} else { } else {
@@ -544,11 +529,11 @@ impl IndexMerger {
// Creating the index file to point into the data, generic over `BytesFastFieldReader` and // Creating the index file to point into the data, generic over `BytesFastFieldReader` and
// `MultiValuedFastFieldReader` // `MultiValuedFastFieldReader`
// //
fn write_1_n_fast_field_idx_generic( fn write_1_n_fast_field_idx_generic<T: MultiValueLength + Send + Sync>(
field: Field, field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer, fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &SegmentDocIdMapping, doc_id_mapping: &SegmentDocIdMapping,
segment_and_ff_readers: &[(&SegmentReader, &MultiValueIndex)], segment_and_ff_readers: &[(&SegmentReader, T)],
) -> crate::Result<()> { ) -> crate::Result<()> {
let column = let column =
RemappedDocIdMultiValueIndexColumn::new(segment_and_ff_readers, doc_id_mapping); RemappedDocIdMultiValueIndexColumn::new(segment_and_ff_readers, doc_id_mapping);
@@ -582,12 +567,7 @@ impl IndexMerger {
field, field,
fast_field_serializer, fast_field_serializer,
doc_id_mapping, doc_id_mapping,
&segment_and_ff_readers &segment_and_ff_readers,
.iter()
.map(|(segment_reader, u64s_reader)| {
(*segment_reader, u64s_reader.get_index_reader())
})
.collect::<Vec<_>>(),
) )
} }
@@ -717,12 +697,7 @@ impl IndexMerger {
field, field,
fast_field_serializer, fast_field_serializer,
doc_id_mapping, doc_id_mapping,
&segment_and_ff_readers &segment_and_ff_readers,
.iter()
.map(|(segment_reader, u64s_reader)| {
(*segment_reader, u64s_reader.get_index_reader())
})
.collect::<Vec<_>>(),
)?; )?;
let mut serialize_vals = fast_field_serializer.new_bytes_fast_field(field); let mut serialize_vals = fast_field_serializer.new_bytes_fast_field(field);
@@ -829,7 +804,7 @@ impl IndexMerger {
// Let's compute the list of non-empty posting lists // Let's compute the list of non-empty posting lists
for (segment_ord, term_info) in merged_terms.current_segment_ords_and_term_infos() { for (segment_ord, term_info) in merged_terms.current_segment_ords_and_term_infos() {
let segment_reader = &self.readers[segment_ord]; let segment_reader = &self.readers[segment_ord];
let inverted_index: &InvertedIndexReader = &field_readers[segment_ord]; let inverted_index: &InvertedIndexReader = &*field_readers[segment_ord];
let segment_postings = inverted_index let segment_postings = inverted_index
.read_postings_from_terminfo(&term_info, segment_postings_option)?; .read_postings_from_terminfo(&term_info, segment_postings_option)?;
let alive_bitset_opt = segment_reader.alive_bitset(); let alive_bitset_opt = segment_reader.alive_bitset();
View File
@@ -190,13 +190,13 @@ mod tests {
assert_eq!(fast_field.get_val(4), 2u64); assert_eq!(fast_field.get_val(4), 2u64);
assert_eq!(fast_field.get_val(3), 3u64); assert_eq!(fast_field.get_val(3), 3u64);
if force_disjunct_segment_sort_values { if force_disjunct_segment_sort_values {
assert_eq!(fast_field.get_val(2), 20u64); assert_eq!(fast_field.get_val(2u64), 20u64);
assert_eq!(fast_field.get_val(1), 100u64); assert_eq!(fast_field.get_val(1u64), 100u64);
} else { } else {
assert_eq!(fast_field.get_val(2), 10u64); assert_eq!(fast_field.get_val(2u64), 10u64);
assert_eq!(fast_field.get_val(1), 20u64); assert_eq!(fast_field.get_val(1u64), 20u64);
} }
assert_eq!(fast_field.get_val(0), 1_000u64); assert_eq!(fast_field.get_val(0u64), 1_000u64);
// test new field norm mapping // test new field norm mapping
{ {
@@ -545,7 +545,7 @@ mod bench_sorted_index_merge {
// add values in order of the new doc_ids // add values in order of the new doc_ids
let mut val = 0; let mut val = 0;
for (doc_id, _reader, field_reader) in sorted_doc_ids { for (doc_id, _reader, field_reader) in sorted_doc_ids {
val = field_reader.get_val(doc_id); val = field_reader.get_val(doc_id as u64);
} }
val val
View File
@@ -785,87 +785,4 @@ mod tests {
// On release this was [2, 1]. (< note the decreasing values) // On release this was [2, 1]. (< note the decreasing values)
assert_eq!(positions, &[2, 5]); assert_eq!(positions, &[2, 5]);
} }
#[test]
fn test_multiple_field_value_and_long_tokens() {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let mut doc = Document::default();
// This is a bit of a contrived example.
let tokens = PreTokenizedString {
text: "roller-coaster".to_string(),
tokens: vec![Token {
offset_from: 0,
offset_to: 14,
position: 0,
text: "rollercoaster".to_string(),
position_length: 2,
}],
};
doc.add_pre_tokenized_text(text, tokens.clone());
doc.add_pre_tokenized_text(text, tokens);
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let seg_reader = searcher.segment_reader(0);
let inv_index = seg_reader.inverted_index(text).unwrap();
let term = Term::from_field_text(text, "rollercoaster");
let mut postings = inv_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
assert_eq!(postings.doc(), 0u32);
let mut positions = Vec::new();
postings.positions(&mut positions);
assert_eq!(positions, &[0, 3]); //< as opposed to 0, 2 if we had a position length of 1.
}
#[test]
fn test_last_token_not_ending_last() {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let mut doc = Document::default();
// This is a bit of a contrived example.
let tokens = PreTokenizedString {
text: "contrived-example".to_string(), //< I can't think of a use case where this corner case happens in real life.
tokens: vec![Token { // Not the last token, yet ends after the last token.
offset_from: 0,
offset_to: 14,
position: 0,
text: "long_token".to_string(),
position_length: 3,
},
Token {
offset_from: 0,
offset_to: 14,
position: 1,
text: "short".to_string(),
position_length: 1,
}],
};
doc.add_pre_tokenized_text(text, tokens);
doc.add_text(text, "hello");
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let seg_reader = searcher.segment_reader(0);
let inv_index = seg_reader.inverted_index(text).unwrap();
let term = Term::from_field_text(text, "hello");
let mut postings = inv_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
assert_eq!(postings.doc(), 0u32);
let mut positions = Vec::new();
postings.positions(&mut positions);
assert_eq!(positions, &[4]); //< as opposed to 3 if we had a position length of 1.
}
} }
View File
@@ -12,7 +12,7 @@ pub(crate) struct RemappedDocIdColumn<'a> {
fast_field_readers: Vec<Arc<dyn Column<u64>>>, fast_field_readers: Vec<Arc<dyn Column<u64>>>,
min_value: u64, min_value: u64,
max_value: u64, max_value: u64,
num_vals: u32, num_vals: u64,
} }
fn compute_min_max_val( fn compute_min_max_val(
@@ -32,7 +32,7 @@ fn compute_min_max_val(
// we need to recompute the max / min // we need to recompute the max / min
segment_reader segment_reader
.doc_ids_alive() .doc_ids_alive()
.map(|doc_id| u64_reader.get_val(doc_id)) .map(|doc_id| u64_reader.get_val(doc_id as u64))
.minmax() .minmax()
.into_option() .into_option()
} }
@@ -73,13 +73,13 @@ impl<'a> RemappedDocIdColumn<'a> {
fast_field_readers, fast_field_readers,
min_value, min_value,
max_value, max_value,
num_vals: doc_id_mapping.len() as u32, num_vals: doc_id_mapping.len() as u64,
} }
} }
} }
impl<'a> Column for RemappedDocIdColumn<'a> { impl<'a> Column for RemappedDocIdColumn<'a> {
fn get_val(&self, _doc: u32) -> u64 { fn get_val(&self, _doc: u64) -> u64 {
unimplemented!() unimplemented!()
} }
@@ -90,7 +90,7 @@ impl<'a> Column for RemappedDocIdColumn<'a> {
.map(|old_doc_addr| { .map(|old_doc_addr| {
let fast_field_reader = let fast_field_reader =
&self.fast_field_readers[old_doc_addr.segment_ord as usize]; &self.fast_field_readers[old_doc_addr.segment_ord as usize];
fast_field_reader.get_val(old_doc_addr.doc_id) fast_field_reader.get_val(old_doc_addr.doc_id as u64)
}), }),
) )
} }
@@ -102,7 +102,7 @@ impl<'a> Column for RemappedDocIdColumn<'a> {
self.max_value self.max_value
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.num_vals self.num_vals
} }
} }
View File
@@ -3,7 +3,7 @@ use std::cmp;
use fastfield_codecs::Column; use fastfield_codecs::Column;
use super::flat_map_with_buffer::FlatMapWithBufferIter; use super::flat_map_with_buffer::FlatMapWithBufferIter;
use crate::fastfield::{MultiValueIndex, MultiValuedFastFieldReader}; use crate::fastfield::{MultiValueLength, MultiValuedFastFieldReader};
use crate::indexer::doc_id_mapping::SegmentDocIdMapping; use crate::indexer::doc_id_mapping::SegmentDocIdMapping;
use crate::schema::Field; use crate::schema::Field;
use crate::{DocAddress, SegmentReader}; use crate::{DocAddress, SegmentReader};
@@ -13,7 +13,7 @@ pub(crate) struct RemappedDocIdMultiValueColumn<'a> {
fast_field_readers: Vec<MultiValuedFastFieldReader<u64>>, fast_field_readers: Vec<MultiValuedFastFieldReader<u64>>,
min_value: u64, min_value: u64,
max_value: u64, max_value: u64,
num_vals: u32, num_vals: u64,
} }
impl<'a> RemappedDocIdMultiValueColumn<'a> { impl<'a> RemappedDocIdMultiValueColumn<'a> {
@@ -61,13 +61,13 @@ impl<'a> RemappedDocIdMultiValueColumn<'a> {
fast_field_readers, fast_field_readers,
min_value, min_value,
max_value, max_value,
num_vals: num_vals as u32, num_vals: num_vals as u64,
} }
} }
} }
impl<'a> Column for RemappedDocIdMultiValueColumn<'a> { impl<'a> Column for RemappedDocIdMultiValueColumn<'a> {
fn get_val(&self, _pos: u32) -> u64 { fn get_val(&self, _pos: u64) -> u64 {
unimplemented!() unimplemented!()
} }
@@ -89,22 +89,22 @@ impl<'a> Column for RemappedDocIdMultiValueColumn<'a> {
self.max_value self.max_value
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.num_vals self.num_vals
} }
} }
pub(crate) struct RemappedDocIdMultiValueIndexColumn<'a> { pub(crate) struct RemappedDocIdMultiValueIndexColumn<'a, T: MultiValueLength> {
doc_id_mapping: &'a SegmentDocIdMapping, doc_id_mapping: &'a SegmentDocIdMapping,
multi_value_length_readers: Vec<&'a MultiValueIndex>, multi_value_length_readers: Vec<&'a T>,
min_value: u64, min_value: u64,
max_value: u64, max_value: u64,
num_vals: u32, num_vals: u64,
} }
impl<'a> RemappedDocIdMultiValueIndexColumn<'a> { impl<'a, T: MultiValueLength> RemappedDocIdMultiValueIndexColumn<'a, T> {
pub(crate) fn new( pub(crate) fn new(
segment_and_ff_readers: &'a [(&'a SegmentReader, &'a MultiValueIndex)], segment_and_ff_readers: &'a [(&'a SegmentReader, T)],
doc_id_mapping: &'a SegmentDocIdMapping, doc_id_mapping: &'a SegmentDocIdMapping,
) -> Self { ) -> Self {
// We go through a complete first pass to compute the minimum and the // We go through a complete first pass to compute the minimum and the
@@ -115,19 +115,17 @@ impl<'a> RemappedDocIdMultiValueIndexColumn<'a> {
let mut multi_value_length_readers = Vec::with_capacity(segment_and_ff_readers.len()); let mut multi_value_length_readers = Vec::with_capacity(segment_and_ff_readers.len());
for segment_and_ff_reader in segment_and_ff_readers { for segment_and_ff_reader in segment_and_ff_readers {
let segment_reader = segment_and_ff_reader.0; let segment_reader = segment_and_ff_reader.0;
let multi_value_length_reader = segment_and_ff_reader.1; let multi_value_length_reader = &segment_and_ff_reader.1;
if !segment_reader.has_deletes() { if !segment_reader.has_deletes() {
max_value += multi_value_length_reader.total_num_vals(); max_value += multi_value_length_reader.get_total_len();
} else { } else {
for doc in segment_reader.doc_ids_alive() { for doc in segment_reader.doc_ids_alive() {
max_value += multi_value_length_reader.num_vals_for_doc(doc) as u64; max_value += multi_value_length_reader.get_len(doc);
} }
} }
num_vals += segment_reader.num_docs(); num_vals += segment_reader.num_docs() as u64;
multi_value_length_readers.push(multi_value_length_reader); multi_value_length_readers.push(multi_value_length_reader);
} }
// The value range is always get_val(doc)..get_val(doc + 1)
num_vals += 1;
Self { Self {
doc_id_mapping, doc_id_mapping,
multi_value_length_readers, multi_value_length_readers,
@@ -138,8 +136,8 @@ impl<'a> RemappedDocIdMultiValueIndexColumn<'a> {
} }
} }
impl<'a> Column for RemappedDocIdMultiValueIndexColumn<'a> { impl<'a, T: MultiValueLength + Send + Sync> Column for RemappedDocIdMultiValueIndexColumn<'a, T> {
fn get_val(&self, _pos: u32) -> u64 { fn get_val(&self, _pos: u64) -> u64 {
unimplemented!() unimplemented!()
} }
@@ -150,8 +148,8 @@ impl<'a> Column for RemappedDocIdMultiValueIndexColumn<'a> {
move |old_doc_addr| { move |old_doc_addr| {
let ff_reader = let ff_reader =
&self.multi_value_length_readers[old_doc_addr.segment_ord as usize]; &self.multi_value_length_readers[old_doc_addr.segment_ord as usize];
offset += ff_reader.num_vals_for_doc(old_doc_addr.doc_id); offset += ff_reader.get_len(old_doc_addr.doc_id);
offset as u64 offset
}, },
)), )),
) )
@@ -164,7 +162,7 @@ impl<'a> Column for RemappedDocIdMultiValueIndexColumn<'a> {
self.max_value self.max_value
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.num_vals self.num_vals
} }
} }
View File
@@ -311,7 +311,7 @@ pub use crate::postings::Postings;
pub use crate::schema::{DateOptions, DatePrecision, Document, Term}; pub use crate::schema::{DateOptions, DatePrecision, Document, Term};
/// Index format version. /// Index format version.
const INDEX_FORMAT_VERSION: u32 = 5; const INDEX_FORMAT_VERSION: u32 = 4;
/// Structure version for the index. /// Structure version for the index.
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)] #[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -819,7 +819,7 @@ pub mod tests {
fn test_indexedfield_not_in_documents() -> crate::Result<()> { fn test_indexedfield_not_in_documents() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let absent_field = schema_builder.add_text_field("absent_text", TEXT); let absent_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
@@ -1001,7 +1001,7 @@ pub mod tests {
let fast_field_signed = schema_builder.add_i64_field("signed", FAST); let fast_field_signed = schema_builder.add_i64_field("signed", FAST);
let fast_field_float = schema_builder.add_f64_field("float", FAST); let fast_field_float = schema_builder.add_f64_field("float", FAST);
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let stored_int_field = schema_builder.add_u64_field("stored_int", STORED); let stored_int_field = schema_builder.add_u64_field("text", STORED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
View File
@@ -3,7 +3,7 @@ use std::io;
use std::marker::PhantomData; use std::marker::PhantomData;
use std::ops::Range; use std::ops::Range;
use rustc_hash::FxHashMap; use fnv::FnvHashMap;
use super::stacker::Addr; use super::stacker::Addr;
use crate::fastfield::MultiValuedFastFieldWriter; use crate::fastfield::MultiValuedFastFieldWriter;
@@ -56,12 +56,12 @@ pub(crate) fn serialize_postings(
doc_id_map: Option<&DocIdMapping>, doc_id_map: Option<&DocIdMapping>,
schema: &Schema, schema: &Schema,
serializer: &mut InvertedIndexSerializer, serializer: &mut InvertedIndexSerializer,
) -> crate::Result<HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>>> { ) -> crate::Result<HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>> {
let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> = let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> =
Vec::with_capacity(ctx.term_index.len()); Vec::with_capacity(ctx.term_index.len());
term_offsets.extend(ctx.term_index.iter()); term_offsets.extend(ctx.term_index.iter());
term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone()); term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone());
let mut unordered_term_mappings: HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>> = let mut unordered_term_mappings: HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>> =
HashMap::new(); HashMap::new();
let field_offsets = make_field_partition(&term_offsets); let field_offsets = make_field_partition(&term_offsets);
@@ -74,7 +74,7 @@ pub(crate) fn serialize_postings(
let unordered_term_ids = term_offsets[byte_offsets.clone()] let unordered_term_ids = term_offsets[byte_offsets.clone()]
.iter() .iter()
.map(|&(_, _, bucket)| bucket); .map(|&(_, _, bucket)| bucket);
let mapping: FxHashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids let mapping: FnvHashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
.enumerate() .enumerate()
.map(|(term_ord, unord_term_id)| { .map(|(term_ord, unord_term_id)| {
(unord_term_id as UnorderedTermId, term_ord as TermOrdinal) (unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
@@ -170,7 +170,7 @@ pub(crate) trait PostingsWriter: Send + Sync {
term_buffer.truncate_value_bytes(end_of_path_idx); term_buffer.truncate_value_bytes(end_of_path_idx);
term_buffer.append_bytes(token.text.as_bytes()); term_buffer.append_bytes(token.text.as_bytes());
let start_position = indexing_position.end_position + token.position as u32; let start_position = indexing_position.end_position + token.position as u32;
end_position = end_position.max(start_position + token.position_length as u32); end_position = start_position + token.position_length as u32;
let unordered_term_id = self.subscribe(doc_id, start_position, term_buffer, ctx); let unordered_term_id = self.subscribe(doc_id, start_position, term_buffer, ctx);
if let Some(term_id_fast_field_writer) = term_id_fast_field_writer_opt.as_mut() { if let Some(term_id_fast_field_writer) = term_id_fast_field_writer_opt.as_mut() {
term_id_fast_field_writer.add_val(unordered_term_id); term_id_fast_field_writer.add_val(unordered_term_id);
View File
@@ -33,7 +33,7 @@ where
&'a self, &'a self,
term_dict: &'a TermDictionary, term_dict: &'a TermDictionary,
) -> io::Result<TermStreamer<'a, &'a A>> { ) -> io::Result<TermStreamer<'a, &'a A>> {
let automaton: &A = &self.automaton; let automaton: &A = &*self.automaton;
let term_stream_builder = term_dict.search(automaton); let term_stream_builder = term_dict.search(automaton);
term_stream_builder.into_stream() term_stream_builder.into_stream()
} }
View File
@@ -86,7 +86,10 @@ impl DocSet for BitSetDocSet {
self.doc self.doc
} }
/// Returns the number of values set in the underlying bitset. /// Returns half of the `max_doc`
/// This is quite a terrible heuristic,
/// but we don't have access to any better
/// value.
fn size_hint(&self) -> u32 { fn size_hint(&self) -> u32 {
self.docs.len() as u32 self.docs.len() as u32
} }
View File
@@ -5,7 +5,7 @@ use crate::postings::FreqReadingOption;
use crate::query::explanation::does_not_match; use crate::query::explanation::does_not_match;
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner}; use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
use crate::query::term_query::TermScorer; use crate::query::term_query::TermScorer;
use crate::query::weight::{for_each_docset, for_each_pruning_scorer, for_each_scorer}; use crate::query::weight::{for_each_pruning_scorer, for_each_scorer};
use crate::query::{ use crate::query::{
intersect_scorers, EmptyScorer, Exclude, Explanation, Occur, RequiredOptionalScorer, Scorer, intersect_scorers, EmptyScorer, Exclude, Explanation, Occur, RequiredOptionalScorer, Scorer,
Union, Weight, Union, Weight,
@@ -219,24 +219,6 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
Ok(()) Ok(())
} }
fn for_each_no_score(
&self,
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId),
) -> crate::Result<()> {
let scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?;
match scorer {
SpecializedScorer::TermUnion(term_scorers) => {
let mut union_scorer = Union::build(term_scorers, &self.score_combiner_fn);
for_each_docset(&mut union_scorer, callback);
}
SpecializedScorer::Other(mut scorer) => {
for_each_docset(scorer.as_mut(), callback);
}
}
Ok(())
}
/// Calls `callback` with all of the `(doc, score)` for which score /// Calls `callback` with all of the `(doc, score)` for which score
/// is exceeding a given threshold. /// is exceeding a given threshold.
/// ///
View File
@@ -18,7 +18,6 @@ mod phrase_query;
mod query; mod query;
mod query_parser; mod query_parser;
mod range_query; mod range_query;
mod range_query_ip_fastfield;
mod regex_query; mod regex_query;
mod reqopt_scorer; mod reqopt_scorer;
mod scorer; mod scorer;
View File
@@ -6,13 +6,12 @@ use common::BitSet;
use crate::core::{Searcher, SegmentReader}; use crate::core::{Searcher, SegmentReader};
use crate::error::TantivyError; use crate::error::TantivyError;
use crate::query::explanation::does_not_match; use crate::query::explanation::does_not_match;
use crate::query::range_query_ip_fastfield::IPFastFieldRangeWeight;
use crate::query::{BitSetDocSet, ConstScorer, Explanation, Query, Scorer, Weight}; use crate::query::{BitSetDocSet, ConstScorer, Explanation, Query, Scorer, Weight};
use crate::schema::{Field, IndexRecordOption, Term, Type}; use crate::schema::{Field, IndexRecordOption, Term, Type};
use crate::termdict::{TermDictionary, TermStreamer}; use crate::termdict::{TermDictionary, TermStreamer};
use crate::{DocId, Score}; use crate::{DocId, Score};
pub(crate) fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>( fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
bound: &Bound<TFrom>, bound: &Bound<TFrom>,
transform: &Transform, transform: &Transform,
) -> Bound<TTo> { ) -> Bound<TTo> {
@@ -30,17 +29,8 @@ pub(crate) fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
/// ///
/// # Implementation /// # Implementation
/// ///
/// ## Default /// The current implementation will iterate over the terms within the range
/// The default implementation collects all documents _upfront_ into a `BitSet`. /// and append all of the matching documents into a `BitSet`.
/// This is done by iterating over the terms within the range and loading all docs for each
/// `TermInfo` from the inverted index (posting list) and putting them into a `BitSet`.
/// Depending on the number of terms matched, this is a potentially expensive operation.
///
/// ## IP fast field
/// For IP fast fields a custom variant is used that scans the fast field. Unlike the default
/// variant, we can walk over it lazily, since the fastfield is implicitly ordered by
/// DocId.
///
/// ///
/// # Example /// # Example
/// ///
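A minimal sketch of issuing such a range query through the query parser, mirroring `search_ip_range_test` further down (the field name, the addresses, and the `index`/`searcher` setup are assumptions taken from that test):
    let query = QueryParser::for_index(&index, vec![])
        .parse_query("ip:[127.0.0.1 TO 127.0.0.20]")
        .unwrap();
    let num_hits = searcher.search(&query, &Count).unwrap();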
@@ -259,8 +249,7 @@ impl Query for RangeQuery {
_scoring_enabled: bool, _scoring_enabled: bool,
) -> crate::Result<Box<dyn Weight>> { ) -> crate::Result<Box<dyn Weight>> {
let schema = searcher.schema(); let schema = searcher.schema();
let field_type = schema.get_field_entry(self.field).field_type(); let value_type = schema.get_field_entry(self.field).field_type().value_type();
let value_type = field_type.value_type();
if value_type != self.value_type { if value_type != self.value_type {
let err_msg = format!( let err_msg = format!(
"Create a range query of the type {:?}, when the field given was of type {:?}", "Create a range query of the type {:?}, when the field given was of type {:?}",
@@ -268,20 +257,11 @@ impl Query for RangeQuery {
); );
return Err(TantivyError::SchemaError(err_msg)); return Err(TantivyError::SchemaError(err_msg));
} }
Ok(Box::new(RangeWeight {
if field_type.is_ip_addr() && field_type.is_fast() { field: self.field,
Ok(Box::new(IPFastFieldRangeWeight::new( left_bound: self.left_bound.clone(),
self.field, right_bound: self.right_bound.clone(),
&self.left_bound, }))
&self.right_bound,
)))
} else {
Ok(Box::new(RangeWeight {
field: self.field,
left_bound: self.left_bound.clone(),
right_bound: self.right_bound.clone(),
}))
}
} }
} }
@@ -355,7 +335,7 @@ mod tests {
use super::RangeQuery; use super::RangeQuery;
use crate::collector::{Count, TopDocs}; use crate::collector::{Count, TopDocs};
use crate::query::QueryParser; use crate::query::QueryParser;
use crate::schema::{Document, Field, IntoIpv6Addr, Schema, FAST, INDEXED, STORED, TEXT}; use crate::schema::{Document, Field, IntoIpv6Addr, Schema, INDEXED, STORED, TEXT};
use crate::{doc, Index}; use crate::{doc, Index};
#[test] #[test]
@@ -529,24 +509,10 @@ mod tests {
Ok(()) Ok(())
} }
#[test]
fn search_ip_range_test_posting_list() {
search_ip_range_test_opt(false);
}
#[test] #[test]
fn search_ip_range_test() { fn search_ip_range_test() {
search_ip_range_test_opt(true);
}
fn search_ip_range_test_opt(with_fast_field: bool) {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let ip_field = if with_fast_field { let ip_field = schema_builder.add_ip_addr_field("ip", INDEXED | STORED);
schema_builder.add_ip_addr_field("ip", INDEXED | STORED | FAST)
} else {
schema_builder.add_ip_addr_field("ip", INDEXED | STORED)
};
let text_field = schema_builder.add_text_field("text", TEXT | STORED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let ip_addr_1 = IpAddr::from_str("127.0.0.10").unwrap().into_ipv6_addr(); let ip_addr_1 = IpAddr::from_str("127.0.0.10").unwrap().into_ipv6_addr();
@@ -554,22 +520,16 @@ mod tests {
{ {
let mut index_writer = index.writer(3_000_000).unwrap(); let mut index_writer = index.writer(3_000_000).unwrap();
for _ in 0..1_000 { index_writer
index_writer .add_document(doc!(
.add_document(doc!( ip_field => ip_addr_1
ip_field => ip_addr_1, ))
text_field => "BLUBBER" .unwrap();
)) index_writer
.unwrap(); .add_document(doc!(
} ip_field => ip_addr_2
for _ in 0..1_000 { ))
index_writer .unwrap();
.add_document(doc!(
ip_field => ip_addr_2,
text_field => "BLOBBER"
))
.unwrap();
}
index_writer.commit().unwrap(); index_writer.commit().unwrap();
} }
@@ -583,25 +543,24 @@ mod tests {
count count
}; };
let query_from_text = |text: &str| { let query_from_text = |text: &str| {
QueryParser::for_index(&index, vec![]) QueryParser::for_index(&index, vec![ip_field])
.parse_query(text) .parse_query(text)
.unwrap() .unwrap()
}; };
// Inclusive range
assert_eq!( assert_eq!(
get_num_hits(query_from_text("ip:[127.0.0.1 TO 127.0.0.20]")), get_num_hits(query_from_text("ip:[127.0.0.1 TO 127.0.0.20]")),
2000 2
); );
assert_eq!( assert_eq!(
get_num_hits(query_from_text("ip:[127.0.0.10 TO 127.0.0.20]")), get_num_hits(query_from_text("ip:[127.0.0.10 TO 127.0.0.20]")),
2000 2
); );
assert_eq!( assert_eq!(
get_num_hits(query_from_text("ip:[127.0.0.11 TO 127.0.0.20]")), get_num_hits(query_from_text("ip:[127.0.0.11 TO 127.0.0.20]")),
1000 1
); );
assert_eq!( assert_eq!(
@@ -609,84 +568,9 @@ mod tests {
0 0
); );
assert_eq!(get_num_hits(query_from_text("ip:[127.0.0.11 TO *]")), 1000); assert_eq!(get_num_hits(query_from_text("ip:[127.0.0.11 TO *]")), 1);
assert_eq!(get_num_hits(query_from_text("ip:[127.0.0.21 TO *]")), 0); assert_eq!(get_num_hits(query_from_text("ip:[127.0.0.21 TO *]")), 0);
assert_eq!(get_num_hits(query_from_text("ip:[* TO 127.0.0.9]")), 0); assert_eq!(get_num_hits(query_from_text("ip:[* TO 127.0.0.9]")), 0);
assert_eq!(get_num_hits(query_from_text("ip:[* TO 127.0.0.10]")), 1000); assert_eq!(get_num_hits(query_from_text("ip:[* TO 127.0.0.10]")), 1);
// Exclusive range
assert_eq!(
get_num_hits(query_from_text("ip:{127.0.0.1 TO 127.0.0.20}")),
1000
);
assert_eq!(
get_num_hits(query_from_text("ip:{127.0.0.1 TO 127.0.0.21}")),
2000
);
assert_eq!(
get_num_hits(query_from_text("ip:{127.0.0.10 TO 127.0.0.20}")),
0
);
assert_eq!(
get_num_hits(query_from_text("ip:{127.0.0.11 TO 127.0.0.20}")),
0
);
assert_eq!(
get_num_hits(query_from_text("ip:{127.0.0.11 TO 127.0.0.19}")),
0
);
assert_eq!(get_num_hits(query_from_text("ip:{127.0.0.11 TO *}")), 1000);
assert_eq!(get_num_hits(query_from_text("ip:{127.0.0.10 TO *}")), 1000);
assert_eq!(get_num_hits(query_from_text("ip:{127.0.0.21 TO *}")), 0);
assert_eq!(get_num_hits(query_from_text("ip:{127.0.0.20 TO *}")), 0);
assert_eq!(get_num_hits(query_from_text("ip:{127.0.0.19 TO *}")), 1000);
assert_eq!(get_num_hits(query_from_text("ip:{* TO 127.0.0.9}")), 0);
assert_eq!(get_num_hits(query_from_text("ip:{* TO 127.0.0.10}")), 0);
assert_eq!(get_num_hits(query_from_text("ip:{* TO 127.0.0.11}")), 1000);
// Inclusive/Exclusive range
assert_eq!(
get_num_hits(query_from_text("ip:[127.0.0.1 TO 127.0.0.20}")),
1000
);
assert_eq!(
get_num_hits(query_from_text("ip:{127.0.0.1 TO 127.0.0.20]")),
2000
);
// Intersection
assert_eq!(
get_num_hits(query_from_text(
"text:BLUBBER AND ip:[127.0.0.10 TO 127.0.0.10]"
)),
1000
);
assert_eq!(
get_num_hits(query_from_text(
"text:BLOBBER AND ip:[127.0.0.10 TO 127.0.0.10]"
)),
0
);
assert_eq!(
get_num_hits(query_from_text(
"text:BLOBBER AND ip:[127.0.0.20 TO 127.0.0.20]"
)),
1000
);
assert_eq!(
get_num_hits(query_from_text(
"text:BLUBBER AND ip:[127.0.0.20 TO 127.0.0.20]"
)),
0
);
} }
} }
View File
@@ -1,723 +0,0 @@
//! IP Fastfields support efficient scanning for range queries.
//! We use this variant only if the fastfield exists, otherwise the default in `range_query` is
//! used, which uses the term dictionary + postings.
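//! A sketch of the schema flag that selects this path (an assumption distilled from
//! `RangeQuery::weight` in `range_query.rs` and the tests there): only an ip field declared
//! with `FAST` is scanned here, otherwise the posting-list based default applies.
//!
//!     let ip_fast = schema_builder.add_ip_addr_field("ip", INDEXED | STORED | FAST);
//!     let ip_dict = schema_builder.add_ip_addr_field("ip_dict", INDEXED | STORED);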
use std::net::Ipv6Addr;
use std::ops::{Bound, RangeInclusive};
use std::sync::Arc;
use common::BinarySerializable;
use fastfield_codecs::{Column, MonotonicallyMappableToU128};
use super::range_query::map_bound;
use super::{ConstScorer, Explanation, Scorer, Weight};
use crate::fastfield::MultiValuedU128FastFieldReader;
use crate::schema::{Cardinality, Field};
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, TERMINATED};
/// `IPFastFieldRangeWeight` uses the ip address fast field to execute range queries.
pub struct IPFastFieldRangeWeight {
field: Field,
left_bound: Bound<Ipv6Addr>,
right_bound: Bound<Ipv6Addr>,
}
impl IPFastFieldRangeWeight {
pub fn new(field: Field, left_bound: &Bound<Vec<u8>>, right_bound: &Bound<Vec<u8>>) -> Self {
let ip_from_bound_raw_data = |data: &Vec<u8>| {
let left_ip_u128: u128 =
u128::from_be(BinarySerializable::deserialize(&mut &data[..]).unwrap());
Ipv6Addr::from_u128(left_ip_u128)
};
let left_bound = map_bound(left_bound, &ip_from_bound_raw_data);
let right_bound = map_bound(right_bound, &ip_from_bound_raw_data);
Self {
field,
left_bound,
right_bound,
}
}
}
impl Weight for IPFastFieldRangeWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let field_type = reader.schema().get_field_entry(self.field).field_type();
match field_type.fastfield_cardinality().unwrap() {
Cardinality::SingleValue => {
let ip_addr_fast_field: Arc<dyn Column<Ipv6Addr>> =
reader.fast_fields().ip_addr(self.field)?;
let value_range = bound_to_value_range(
&self.left_bound,
&self.right_bound,
ip_addr_fast_field.min_value(),
ip_addr_fast_field.max_value(),
);
let docset = IpRangeDocSet::new(value_range, ip_addr_fast_field);
Ok(Box::new(ConstScorer::new(docset, boost)))
}
Cardinality::MultiValues => {
let ip_addr_fast_field: MultiValuedU128FastFieldReader<Ipv6Addr> =
reader.fast_fields().ip_addrs(self.field)?;
let value_range = bound_to_value_range(
&self.left_bound,
&self.right_bound,
ip_addr_fast_field.min_value(),
ip_addr_fast_field.max_value(),
);
let docset = IpRangeDocSet::new(value_range, Arc::new(ip_addr_fast_field));
Ok(Box::new(ConstScorer::new(docset, boost)))
}
}
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(TantivyError::InvalidArgument(format!(
"Document #({}) does not match",
doc
)));
}
let explanation = Explanation::new("Const", scorer.score());
Ok(explanation)
}
}
fn bound_to_value_range(
left_bound: &Bound<Ipv6Addr>,
right_bound: &Bound<Ipv6Addr>,
min_value: Ipv6Addr,
max_value: Ipv6Addr,
) -> RangeInclusive<Ipv6Addr> {
let start_value = match left_bound {
Bound::Included(ip_addr) => *ip_addr,
Bound::Excluded(ip_addr) => Ipv6Addr::from(ip_addr.to_u128() + 1),
Bound::Unbounded => min_value,
};
let end_value = match right_bound {
Bound::Included(ip_addr) => *ip_addr,
Bound::Excluded(ip_addr) => Ipv6Addr::from(ip_addr.to_u128() - 1),
Bound::Unbounded => max_value,
};
start_value..=end_value
}
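// Worked example (illustrative values): an excluded lower bound is bumped up by one and an
// unbounded upper bound falls back to the column maximum, e.g.
//     bound_to_value_range(&Bound::Excluded(Ipv6Addr::from_u128(9)), &Bound::Unbounded,
//                          Ipv6Addr::from_u128(0), Ipv6Addr::from_u128(20))
//         == Ipv6Addr::from_u128(10)..=Ipv6Addr::from_u128(20)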
/// Helper to have a cursor over a vec of docids
struct VecCursor {
docs: Vec<u32>,
current_pos: usize,
}
impl VecCursor {
fn new() -> Self {
Self {
docs: Vec::with_capacity(32),
current_pos: 0,
}
}
fn next(&mut self) -> Option<u32> {
self.current_pos += 1;
self.current()
}
#[inline]
fn current(&self) -> Option<u32> {
self.docs.get(self.current_pos).map(|el| *el as u32)
}
fn get_cleared_data(&mut self) -> &mut Vec<u32> {
self.docs.clear();
self.current_pos = 0;
&mut self.docs
}
fn last_value(&self) -> Option<u32> {
self.docs.iter().last().cloned()
}
fn is_empty(&self) -> bool {
self.current_pos >= self.docs.len()
}
}
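// Refill/consume sketch (doc ids are illustrative; this mirrors how `fetch_block` below uses it):
//     cursor.get_cleared_data().extend_from_slice(&[3, 7, 9]);
//     assert_eq!(cursor.current(), Some(3));
//     assert_eq!(cursor.next(), Some(7));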
struct IpRangeDocSet<T> {
/// The range filter on the values.
value_range: RangeInclusive<Ipv6Addr>,
ip_addrs: T,
/// The next docid start range to fetch (inclusive).
next_fetch_start: u32,
/// Number of docs range checked in a batch.
///
/// There are two patterns.
/// - We do a full scan. => We can load large chunks. We don't know in advance if a seek call
/// will come, so we start with small chunks.
/// - We load docs, interspersed with seek calls. When there are big jumps in the seek, we
/// should load small chunks. When the seeks are small, we can employ the same strategy as on a
/// full scan.
fetch_horizon: u32,
/// Current batch of loaded docs.
loaded_docs: VecCursor,
last_seek_pos_opt: Option<u32>,
}
const DEFAULT_FETCH_HORIZON: u32 = 128;
impl<T> IpRangeDocSet<T>
where Self: SingleOrMultivalued
{
fn new(value_range: RangeInclusive<Ipv6Addr>, ip_addrs: T) -> Self {
let mut ip_range_docset = Self {
value_range,
ip_addrs,
loaded_docs: VecCursor::new(),
next_fetch_start: 0,
fetch_horizon: DEFAULT_FETCH_HORIZON,
last_seek_pos_opt: None,
};
ip_range_docset.reset_fetch_range();
ip_range_docset.fetch_block();
ip_range_docset
}
fn reset_fetch_range(&mut self) {
self.fetch_horizon = DEFAULT_FETCH_HORIZON;
}
/// Fetches the next block of matching doc ids, doubling the horizon until docs are found or the end is reached.
fn fetch_block(&mut self) {
const MAX_HORIZON: u32 = 100_000;
while self.loaded_docs.is_empty() {
let finished_to_end = self.fetch_horizon(self.fetch_horizon);
if finished_to_end {
break;
}
// Fetch more data, increase horizon. Horizon only gets reset when doing a seek.
self.fetch_horizon = (self.fetch_horizon * 2).min(MAX_HORIZON);
}
}
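// Horizon growth sketch (constants as above): every batch that yields no docs doubles the
// window, and a large seek jump resets it via `reset_fetch_range`.
//     let mut horizon = DEFAULT_FETCH_HORIZON;   // 128
//     horizon = (horizon * 2).min(MAX_HORIZON);  // 256 after one empty batch
//     horizon = (horizon * 2).min(MAX_HORIZON);  // 512, and so on until docs are found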
/// Checks whether the distance between consecutive seek calls is large.
fn is_last_seek_distance_large(&self, new_seek: DocId) -> bool {
if let Some(last_seek_pos) = self.last_seek_pos_opt {
(new_seek - last_seek_pos) >= 128
} else {
true
}
}
}
trait SingleOrMultivalued {
fn num_docs(&self) -> u32;
fn fetch_horizon(&mut self, horizon: u32) -> bool {
// Have different implementations for single value and multivalue.
todo!();
// let mut finished_to_end = false;
// let limit = self.num_docs();
// let mut end = self.next_fetch_start + horizon;
// if end >= limit {
// end = limit;
// finished_to_end = true;
// }
// let last_loaded_docs_val = self
// .is_multivalue
// .then(|| self.loaded_docs.last_value())
// .flatten();
// let last_loaded_docs_val =
// if self.is_multivalue {
// self.loaded_docs.last_value()
// } else {
// None
// };
// let loaded_docs_data = self.loaded_docs.get_cleared_data();
// self.ip_addr_fast_field.get_docids_for_value_range(
// self.value_range.clone(),
// self.next_fetch_start..end,
// loaded_docs_data,
// );
// // In case of multivalues, we may have an overlap of the same docid between fetching
// // blocks.
// if let Some(last_value) = last_loaded_docs_val {
// while self.loaded_docs.current() == Some(last_value) {
// self.loaded_docs.next();
// }
// }
// self.next_fetch_start = end;
// finished_to_end
}
}
impl SingleOrMultivalued for IpRangeDocSet<Arc<dyn Column<Ipv6Addr>>> {
fn num_docs(&self) -> u32 {
self.ip_addrs.num_docs()
}
}
impl SingleOrMultivalued for IpRangeDocSet<Arc<MultiValuedU128FastFieldReader<Ipv6Addr>>> {
fn num_docs(&self) -> u32 {
self.ip_addrs.get_index_reader().num_docs()
}
}
impl<T: Send> DocSet for IpRangeDocSet<T>
where Self: SingleOrMultivalued
{
#[inline]
fn advance(&mut self) -> DocId {
if let Some(docid) = self.loaded_docs.next() {
docid as u32
} else {
if self.next_fetch_start >= self.num_docs() as u32 {
return TERMINATED;
}
self.fetch_block();
self.loaded_docs.current().unwrap_or(TERMINATED)
}
}
#[inline]
fn doc(&self) -> DocId {
self.loaded_docs
.current()
.map(|el| el as u32)
.unwrap_or(TERMINATED)
}
/// Advances the `DocSet` forward until reaching the target, or going to the
/// lowest [`DocId`] greater than the target.
///
/// If the end of the `DocSet` is reached, [`TERMINATED`] is returned.
///
/// Calling `.seek(target)` on a terminated `DocSet` is legal. Implementation
/// of `DocSet` should support it.
///
/// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a `DocSet`.
fn seek(&mut self, target: DocId) -> DocId {
if self.is_last_seek_distance_large(target) {
self.reset_fetch_range();
}
if target > self.next_fetch_start {
self.next_fetch_start = target;
}
let mut doc = self.doc();
debug_assert!(doc <= target);
while doc < target {
doc = self.advance();
}
self.last_seek_pos_opt = Some(target);
doc
}
fn size_hint(&self) -> u32 {
0 // a better heuristic would be possible by checking the number of hits when fetching a block
}
}
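// Consumption sketch (the standard `DocSet` loop referenced in the `seek` docs above;
// `callback` is a placeholder):
//     let mut doc = docset.doc();
//     while doc != TERMINATED {
//         callback(doc);
//         doc = docset.advance();
//     }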
#[cfg(test)]
mod tests {
use proptest::prelude::ProptestConfig;
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest};
use super::*;
use crate::collector::Count;
use crate::query::QueryParser;
use crate::schema::{IpAddrOptions, Schema, FAST, INDEXED, STORED, STRING};
use crate::Index;
#[derive(Clone, Debug)]
pub struct Doc {
pub id: String,
pub ip: Ipv6Addr,
}
fn operation_strategy() -> impl Strategy<Value = Doc> {
prop_oneof![
(0u64..10_000u64).prop_map(doc_from_id_1),
(1u64..10_000u64).prop_map(doc_from_id_2),
]
}
pub fn doc_from_id_1(id: u64) -> Doc {
let id = id * 1000;
Doc {
// ip != id
id: id.to_string(),
ip: Ipv6Addr::from_u128(id as u128),
}
}
fn doc_from_id_2(id: u64) -> Doc {
let id = id * 1000;
Doc {
// ip != id
id: (id - 1).to_string(),
ip: Ipv6Addr::from_u128(id as u128),
}
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(10))]
#[test]
fn test_ip_range_for_docs_prop(ops in proptest::collection::vec(operation_strategy(), 1..1000)) {
assert!(test_ip_range_for_docs(ops).is_ok());
}
}
#[test]
fn ip_range_regression1_test() {
let ops = vec![doc_from_id_1(0)];
assert!(test_ip_range_for_docs(ops).is_ok());
}
#[test]
fn ip_range_regression2_test() {
let ops = vec![
doc_from_id_1(52),
doc_from_id_1(63),
doc_from_id_1(12),
doc_from_id_2(91),
doc_from_id_2(33),
];
assert!(test_ip_range_for_docs(ops).is_ok());
}
#[test]
fn ip_range_regression3_test() {
let ops = vec![doc_from_id_1(1), doc_from_id_1(2), doc_from_id_1(3)];
assert!(test_ip_range_for_docs(ops).is_ok());
}
pub fn create_index_from_docs(docs: &[Doc]) -> Index {
let mut schema_builder = Schema::builder();
let ip_field = schema_builder.add_ip_addr_field("ip", INDEXED | STORED | FAST);
let ips_field = schema_builder.add_ip_addr_field(
"ips",
IpAddrOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed(),
);
let text_field = schema_builder.add_text_field("id", STRING | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer(3_000_000).unwrap();
for doc in docs.iter() {
index_writer
.add_document(doc!(
ips_field => doc.ip,
ips_field => doc.ip,
ip_field => doc.ip,
text_field => doc.id.to_string(),
))
.unwrap();
}
index_writer.commit().unwrap();
}
index
}
fn test_ip_range_for_docs(docs: Vec<Doc>) -> crate::Result<()> {
let index = create_index_from_docs(&docs);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let get_num_hits = |query| searcher.search(&query, &(Count)).unwrap();
let query_from_text = |text: &str| {
QueryParser::for_index(&index, vec![])
.parse_query(text)
.unwrap()
};
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string())
};
let test_sample = |sample_docs: Vec<Doc>| {
let mut ips: Vec<Ipv6Addr> = sample_docs.iter().map(|doc| doc.ip).collect();
ips.sort();
let expected_num_hits = docs
.iter()
.filter(|doc| (ips[0]..=ips[1]).contains(&doc.ip))
.count();
let query = gen_query_inclusive("ip", ips[0], ips[1]);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
let query = gen_query_inclusive("ips", ips[0], ips[1]);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
// Intersection search
let id_filter = sample_docs[0].id.to_string();
let expected_num_hits = docs
.iter()
.filter(|doc| (ips[0]..=ips[1]).contains(&doc.ip) && doc.id == id_filter)
.count();
let query = format!(
"{} AND id:{}",
gen_query_inclusive("ip", ips[0], ips[1]),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
// Intersection search on multivalue ip field
let id_filter = sample_docs[0].id.to_string();
let query = format!(
"{} AND id:{}",
gen_query_inclusive("ips", ips[0], ips[1]),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
};
test_sample(vec![docs[0].clone(), docs[0].clone()]);
if docs.len() > 1 {
test_sample(vec![docs[0].clone(), docs[1].clone()]);
test_sample(vec![docs[1].clone(), docs[1].clone()]);
}
if docs.len() > 2 {
test_sample(vec![docs[1].clone(), docs[2].clone()]);
}
Ok(())
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use test::Bencher;
use super::tests::*;
use super::*;
use crate::collector::Count;
use crate::query::QueryParser;
use crate::Index;
fn get_index_0_to_100() -> Index {
let mut rng = StdRng::from_seed([1u8; 32]);
let num_vals = 100_000;
let docs: Vec<_> = (0..num_vals)
.map(|_i| {
let id = if rng.gen_bool(0.01) {
"veryfew".to_string() // 1%
} else if rng.gen_bool(0.1) {
"few".to_string() // 9%
} else {
"many".to_string() // 90%
};
Doc {
id,
// Multiply by 1000, so that we create many buckets in the compact space
// The benches depend on this range to select n-percent of elements with the
// methods below.
ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
}
})
.collect();
let index = create_index_from_docs(&docs);
index
}
fn get_90_percent() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(90 * 1000);
start..=end
}
fn get_10_percent() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(10 * 1000);
start..=end
}
fn get_1_percent() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(10 * 1000);
let end = Ipv6Addr::from_u128(10 * 1000);
start..=end
}
fn excute_query(
field: &str,
ip_range: RangeInclusive<Ipv6Addr>,
suffix: &str,
index: &Index,
) -> usize {
let gen_query_inclusive = |from: &Ipv6Addr, to: &Ipv6Addr| {
format!(
"{}:[{} TO {}] {}",
field,
&from.to_string(),
&to.to_string(),
suffix
)
};
let query = gen_query_inclusive(ip_range.start(), ip_range.end());
let query_from_text = |text: &str| {
QueryParser::for_index(&index, vec![])
.parse_query(text)
.unwrap()
};
let query = query_from_text(&query);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
searcher.search(&query, &(Count)).unwrap()
}
#[bench]
fn bench_ip_range_hit_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_90_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_10_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_1_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_10_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_1_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_1_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_1_percent(), "AND id:veryfew", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_10_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_90_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_90_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ip", get_90_percent(), "AND id:veryfew", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_90_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_10_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_1_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_10_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_1_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_1_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_1_percent(), "AND id:veryfew", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_10_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_90_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_90_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ips", get_90_percent(), "AND id:veryfew", &index));
}
}
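
A note on the `Ipv6Addr::from_u128` calls used throughout the benchmark above: `from_u128` is presumably a crate-local helper, since the standard library exposes the same integer/address mapping through its `From` conversions. A standalone sketch of the equivalent conversion (not the benchmark's actual helper):

use std::net::Ipv6Addr;

// The integer occupies the low bits of the 128-bit address, and the
// conversion round-trips, which is all the range helpers above rely on.
let addr = Ipv6Addr::from(10_000u128);
assert_eq!(u128::from(addr), 10_000u128);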

View File

@@ -115,7 +115,7 @@ mod tests {
pub fn test_term_set_query() -> crate::Result<()> { pub fn test_term_set_query() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let field1 = schema_builder.add_text_field("field1", TEXT); let field1 = schema_builder.add_text_field("field1", TEXT);
let field2 = schema_builder.add_text_field("field2", TEXT); let field2 = schema_builder.add_text_field("field1", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {

View File

@@ -5,7 +5,7 @@ use crate::fieldnorm::FieldNormReader;
use crate::postings::SegmentPostings; use crate::postings::SegmentPostings;
use crate::query::bm25::Bm25Weight; use crate::query::bm25::Bm25Weight;
use crate::query::explanation::does_not_match; use crate::query::explanation::does_not_match;
use crate::query::weight::{for_each_docset, for_each_scorer}; use crate::query::weight::for_each_scorer;
use crate::query::{Explanation, Scorer, Weight}; use crate::query::{Explanation, Scorer, Weight};
use crate::schema::IndexRecordOption; use crate::schema::IndexRecordOption;
use crate::{DocId, Score, Term}; use crate::{DocId, Score, Term};
@@ -56,18 +56,6 @@ impl Weight for TermWeight {
Ok(()) Ok(())
} }
/// Iterates through all of the document matched by the DocSet
/// `DocSet` and push the scored documents to the collector.
fn for_each_no_score(
&self,
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId),
) -> crate::Result<()> {
let mut scorer = self.specialized_scorer(reader, 1.0)?;
for_each_docset(&mut scorer, callback);
Ok(())
}
/// Calls `callback` with all of the `(doc, score)` for which score /// Calls `callback` with all of the `(doc, score)` for which score
/// is exceeding a given threshold. /// is exceeding a given threshold.
/// ///

View File

@@ -94,8 +94,8 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Union<TScorer, TScoreCombin
self.doc = min_doc; self.doc = min_doc;
refill( refill(
&mut self.docsets, &mut self.docsets,
&mut self.bitsets, &mut *self.bitsets,
&mut self.scores, &mut *self.scores,
min_doc, min_doc,
); );
true true

View File

@@ -1,10 +1,10 @@
use super::Scorer; use super::Scorer;
use crate::core::SegmentReader; use crate::core::SegmentReader;
use crate::query::Explanation; use crate::query::Explanation;
use crate::{DocId, DocSet, Score, TERMINATED}; use crate::{DocId, Score, TERMINATED};
/// Iterates through all of the documents and scores matched by the DocSet /// Iterates through all of the document matched by the DocSet
/// `DocSet`. /// `DocSet` and push the scored documents to the collector.
pub(crate) fn for_each_scorer<TScorer: Scorer + ?Sized>( pub(crate) fn for_each_scorer<TScorer: Scorer + ?Sized>(
scorer: &mut TScorer, scorer: &mut TScorer,
callback: &mut dyn FnMut(DocId, Score), callback: &mut dyn FnMut(DocId, Score),
@@ -16,16 +16,6 @@ pub(crate) fn for_each_scorer<TScorer: Scorer + ?Sized>(
} }
} }
/// Iterates through all of the documents matched by the DocSet
/// `DocSet`.
pub(crate) fn for_each_docset<T: DocSet + ?Sized>(docset: &mut T, callback: &mut dyn FnMut(DocId)) {
let mut doc = docset.doc();
while doc != TERMINATED {
callback(doc);
doc = docset.advance();
}
}
/// Calls `callback` with all of the `(doc, score)` for which score /// Calls `callback` with all of the `(doc, score)` for which score
/// is exceeding a given threshold. /// is exceeding a given threshold.
/// ///
@@ -88,18 +78,6 @@ pub trait Weight: Send + Sync + 'static {
Ok(()) Ok(())
} }
/// Iterates through all of the document matched by the DocSet
/// `DocSet` and push the scored documents to the collector.
fn for_each_no_score(
&self,
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId),
) -> crate::Result<()> {
let mut docset = self.scorer(reader, 1.0)?;
for_each_docset(docset.as_mut(), callback);
Ok(())
}
/// Calls `callback` with all of the `(doc, score)` for which score /// Calls `callback` with all of the `(doc, score)` for which score
/// is exceeding a given threshold. /// is exceeding a given threshold.
/// ///

View File

@@ -176,11 +176,6 @@ impl FieldType {
} }
} }
/// returns true if this is an ip address field
pub fn is_ip_addr(&self) -> bool {
matches!(self, FieldType::IpAddr(_))
}
/// returns true if the field is indexed. /// returns true if the field is indexed.
pub fn is_indexed(&self) -> bool { pub fn is_indexed(&self) -> bool {
match *self { match *self {
@@ -237,11 +232,11 @@ impl FieldType {
/// returns true if the field is fast. /// returns true if the field is fast.
pub fn fastfield_cardinality(&self) -> Option<Cardinality> { pub fn fastfield_cardinality(&self) -> Option<Cardinality> {
match *self { match *self {
FieldType::Bytes(ref bytes_options) => { FieldType::Bytes(ref bytes_options) if bytes_options.is_fast() => {
bytes_options.is_fast().then_some(Cardinality::SingleValue) Some(Cardinality::SingleValue)
} }
FieldType::Str(ref text_options) => { FieldType::Str(ref text_options) if text_options.is_fast() => {
text_options.is_fast().then_some(Cardinality::MultiValues) Some(Cardinality::MultiValues)
} }
FieldType::U64(ref int_options) FieldType::U64(ref int_options)
| FieldType::I64(ref int_options) | FieldType::I64(ref int_options)
@@ -250,7 +245,7 @@ impl FieldType {
FieldType::Date(ref date_options) => date_options.get_fastfield_cardinality(), FieldType::Date(ref date_options) => date_options.get_fastfield_cardinality(),
FieldType::Facet(_) => Some(Cardinality::MultiValues), FieldType::Facet(_) => Some(Cardinality::MultiValues),
FieldType::JsonObject(_) => None, FieldType::JsonObject(_) => None,
FieldType::IpAddr(ref ip_addr_options) => ip_addr_options.get_fastfield_cardinality(), _ => None,
} }
} }
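
The two versions of `fastfield_cardinality` above differ only in how the fast-field flag becomes an `Option`: `bool::then_some(x)` yields `Some(x)` when the flag is true and `None` otherwise, which is exactly what the guarded match arm with an explicit `Some` spells out. A quick standalone check:

// `cond.then_some(value)` is shorthand for `if cond { Some(value) } else { None }`.
assert_eq!(true.then_some(1), Some(1));
assert_eq!(false.then_some(1), None);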

View File

@@ -46,9 +46,13 @@ impl SchemaBuilder {
/// Adds a new u64 field. /// Adds a new u64 field.
/// Returns the associated field handle /// Returns the associated field handle
/// ///
/// # Panics /// # Caution
/// ///
/// Panics when field already exists. /// Appending two fields with the same name
/// will result in the shadowing of the first
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_u64_field<T: Into<NumericOptions>>( pub fn add_u64_field<T: Into<NumericOptions>>(
&mut self, &mut self,
field_name_str: &str, field_name_str: &str,
@@ -62,9 +66,13 @@ impl SchemaBuilder {
/// Adds a new i64 field. /// Adds a new i64 field.
/// Returns the associated field handle /// Returns the associated field handle
/// ///
/// # Panics /// # Caution
/// ///
/// Panics when field already exists. /// Appending two fields with the same name
/// will result in the shadowing of the first
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_i64_field<T: Into<NumericOptions>>( pub fn add_i64_field<T: Into<NumericOptions>>(
&mut self, &mut self,
field_name_str: &str, field_name_str: &str,
@@ -78,9 +86,13 @@ impl SchemaBuilder {
/// Adds a new f64 field. /// Adds a new f64 field.
/// Returns the associated field handle /// Returns the associated field handle
/// ///
/// # Panics /// # Caution
/// ///
/// Panics when field already exists. /// Appending two fields with the same name
/// will result in the shadowing of the first
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_f64_field<T: Into<NumericOptions>>( pub fn add_f64_field<T: Into<NumericOptions>>(
&mut self, &mut self,
field_name_str: &str, field_name_str: &str,
@@ -94,9 +106,13 @@ impl SchemaBuilder {
/// Adds a new bool field. /// Adds a new bool field.
/// Returns the associated field handle /// Returns the associated field handle
/// ///
/// # Panics /// # Caution
/// ///
/// Panics when field already exists. /// Appending two fields with the same name
/// will result in the shadowing of the first
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_bool_field<T: Into<NumericOptions>>( pub fn add_bool_field<T: Into<NumericOptions>>(
&mut self, &mut self,
field_name_str: &str, field_name_str: &str,
@@ -112,9 +128,13 @@ impl SchemaBuilder {
/// Internally, Tantivy simply stores dates as i64 UTC timestamps, /// Internally, Tantivy simply stores dates as i64 UTC timestamps,
/// while the user supplies DateTime values for convenience. /// while the user supplies DateTime values for convenience.
/// ///
/// # Panics /// # Caution
/// ///
/// Panics when field already exists. /// Appending two fields with the same name
/// will result in the shadowing of the first
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_date_field<T: Into<DateOptions>>( pub fn add_date_field<T: Into<DateOptions>>(
&mut self, &mut self,
field_name_str: &str, field_name_str: &str,
@@ -128,9 +148,13 @@ impl SchemaBuilder {
/// Adds a ip field. /// Adds a ip field.
/// Returns the associated field handle. /// Returns the associated field handle.
/// ///
/// # Panics /// # Caution
/// ///
/// Panics when field already exists. /// Appending two fields with the same name
/// will result in the shadowing of the first
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_ip_addr_field<T: Into<IpAddrOptions>>( pub fn add_ip_addr_field<T: Into<IpAddrOptions>>(
&mut self, &mut self,
field_name_str: &str, field_name_str: &str,
@@ -144,9 +168,13 @@ impl SchemaBuilder {
/// Adds a new text field. /// Adds a new text field.
/// Returns the associated field handle /// Returns the associated field handle
/// ///
/// # Panics /// # Caution
/// ///
/// Panics when field already exists. /// Appending two fields with the same name
/// will result in the shadowing of the first
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_text_field<T: Into<TextOptions>>( pub fn add_text_field<T: Into<TextOptions>>(
&mut self, &mut self,
field_name_str: &str, field_name_str: &str,
@@ -200,10 +228,8 @@ impl SchemaBuilder {
pub fn add_field(&mut self, field_entry: FieldEntry) -> Field { pub fn add_field(&mut self, field_entry: FieldEntry) -> Field {
let field = Field::from_field_id(self.fields.len() as u32); let field = Field::from_field_id(self.fields.len() as u32);
let field_name = field_entry.name().to_string(); let field_name = field_entry.name().to_string();
if let Some(_previous_value) = self.fields_map.insert(field_name, field) {
panic!("Field already exists in schema {}", field_entry.name());
};
self.fields.push(field_entry); self.fields.push(field_entry);
self.fields_map.insert(field_name, field);
field field
} }
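
To make the documented behavior concrete: with the duplicate-name check removed from `add_field` above, registering the same field name twice shadows the first entry instead of panicking. A small sketch with a hypothetical "title" field:

use tantivy::schema::{Schema, TEXT};

let mut schema_builder = Schema::builder();
let first = schema_builder.add_text_field("title", TEXT);
// Without the panic check, this second call succeeds; the name "title" now
// resolves to `second`, and only that entry is used at indexing time, as the
// "# Caution" notes above describe.
let second = schema_builder.add_text_field("title", TEXT);
assert_ne!(first, second);
let schema = schema_builder.build();
assert_eq!(schema.get_field("title"), Some(second));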

View File

@@ -2,7 +2,7 @@ use std::io;
use common::{BinarySerializable, FixedSize, HasLen}; use common::{BinarySerializable, FixedSize, HasLen};
use super::{Decompressor, DOC_STORE_VERSION}; use super::Decompressor;
use crate::directory::FileSlice; use crate::directory::FileSlice;
#[derive(Debug, Clone, PartialEq)] #[derive(Debug, Clone, PartialEq)]
@@ -17,7 +17,6 @@ pub struct DocStoreFooter {
/// - reserved for future use: 15 bytes /// - reserved for future use: 15 bytes
impl BinarySerializable for DocStoreFooter { impl BinarySerializable for DocStoreFooter {
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> { fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
BinarySerializable::serialize(&DOC_STORE_VERSION, writer)?;
BinarySerializable::serialize(&self.offset, writer)?; BinarySerializable::serialize(&self.offset, writer)?;
BinarySerializable::serialize(&self.decompressor.get_id(), writer)?; BinarySerializable::serialize(&self.decompressor.get_id(), writer)?;
writer.write_all(&[0; 15])?; writer.write_all(&[0; 15])?;
@@ -25,13 +24,6 @@ impl BinarySerializable for DocStoreFooter {
} }
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> { fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let doc_store_version = u32::deserialize(reader)?;
if doc_store_version != DOC_STORE_VERSION {
panic!(
"actual doc store version: {}, expected: {}",
doc_store_version, DOC_STORE_VERSION
);
}
let offset = u64::deserialize(reader)?; let offset = u64::deserialize(reader)?;
let compressor_id = u8::deserialize(reader)?; let compressor_id = u8::deserialize(reader)?;
let mut skip_buf = [0; 15]; let mut skip_buf = [0; 15];
@@ -44,7 +36,7 @@ impl BinarySerializable for DocStoreFooter {
} }
impl FixedSize for DocStoreFooter { impl FixedSize for DocStoreFooter {
const SIZE_IN_BYTES: usize = 28; const SIZE_IN_BYTES: usize = 24;
} }
impl DocStoreFooter { impl DocStoreFooter {
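
The `SIZE_IN_BYTES` change from 28 to 24 follows directly from what remains serialized once the `DOC_STORE_VERSION` prefix is dropped:

// offset (u64)         :  8 bytes
// decompressor id (u8) :  1 byte
// reserved padding     : 15 bytes
// total                : 24 bytes (28 with the removed u32 version prefix)
const _: () = assert!(8 + 1 + 15 == 24);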

View File

@@ -44,9 +44,6 @@ pub use self::reader::{CacheStats, StoreReader};
pub use self::writer::StoreWriter; pub use self::writer::StoreWriter;
mod store_compressor; mod store_compressor;
/// Doc store version in footer to handle format changes.
pub(crate) const DOC_STORE_VERSION: u32 = 1;
#[cfg(feature = "lz4-compression")] #[cfg(feature = "lz4-compression")]
mod compression_lz4_block; mod compression_lz4_block;

View File

@@ -229,10 +229,10 @@ fn test_empty_string() -> crate::Result<()> {
let buffer: Vec<u8> = { let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap(); let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
term_dictionary_builder term_dictionary_builder
.insert([], &make_term_info(1_u64)) .insert(&[], &make_term_info(1_u64))
.unwrap(); .unwrap();
term_dictionary_builder term_dictionary_builder
.insert([1u8], &make_term_info(2_u64)) .insert(&[1u8], &make_term_info(2_u64))
.unwrap(); .unwrap();
term_dictionary_builder.finish()? term_dictionary_builder.finish()?
}; };
@@ -252,7 +252,7 @@ fn stream_range_test_dict() -> crate::Result<TermDictionary> {
let mut term_dictionary_builder = TermDictionaryBuilder::create(Vec::new())?; let mut term_dictionary_builder = TermDictionaryBuilder::create(Vec::new())?;
for i in 0u8..10u8 { for i in 0u8..10u8 {
let number_arr = [i; 1]; let number_arr = [i; 1];
term_dictionary_builder.insert(number_arr, &make_term_info(i as u64))?; term_dictionary_builder.insert(&number_arr, &make_term_info(i as u64))?;
} }
term_dictionary_builder.finish()? term_dictionary_builder.finish()?
}; };

View File

@@ -126,7 +126,6 @@ mod ngram_tokenizer;
mod raw_tokenizer; mod raw_tokenizer;
mod remove_long; mod remove_long;
mod simple_tokenizer; mod simple_tokenizer;
mod split_compound_words;
mod stemmer; mod stemmer;
mod stop_word_filter; mod stop_word_filter;
mod tokenized_string; mod tokenized_string;
@@ -142,7 +141,6 @@ pub use self::ngram_tokenizer::NgramTokenizer;
pub use self::raw_tokenizer::RawTokenizer; pub use self::raw_tokenizer::RawTokenizer;
pub use self::remove_long::RemoveLongFilter; pub use self::remove_long::RemoveLongFilter;
pub use self::simple_tokenizer::SimpleTokenizer; pub use self::simple_tokenizer::SimpleTokenizer;
pub use self::split_compound_words::SplitCompoundWords;
pub use self::stemmer::{Language, Stemmer}; pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter; pub use self::stop_word_filter::StopWordFilter;
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString}; pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};

View File

@@ -1,252 +0,0 @@
use std::sync::Arc;
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind, StateID};
use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
/// A [`TokenFilter`] which splits compound words into their parts
/// based on a given dictionary.
///
/// Words only will be split if they can be fully decomposed into
/// consecutive matches into the given dictionary.
///
/// This is mostly useful to split [compound nouns][compound] common to many
/// Germanic languages into their constituents.
///
/// # Example
///
/// The quality of the dictionary determines the quality of the splits,
/// e.g. the missing stem "back" of "backen" implies that "brotbackautomat"
/// is not split in the following example.
///
/// ```rust
/// use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};
///
/// let tokenizer =
/// TextAnalyzer::from(SimpleTokenizer).filter(SplitCompoundWords::from_dictionary([
/// "dampf", "schiff", "fahrt", "brot", "backen", "automat",
/// ]));
///
/// let mut stream = tokenizer.token_stream("dampfschifffahrt");
/// assert_eq!(stream.next().unwrap().text, "dampf");
/// assert_eq!(stream.next().unwrap().text, "schiff");
/// assert_eq!(stream.next().unwrap().text, "fahrt");
/// assert_eq!(stream.next(), None);
///
/// let mut stream = tokenizer.token_stream("brotbackautomat");
/// assert_eq!(stream.next().unwrap().text, "brotbackautomat");
/// assert_eq!(stream.next(), None);
/// ```
///
/// [compound]: https://en.wikipedia.org/wiki/Compound_(linguistics)
#[derive(Clone)]
pub struct SplitCompoundWords<S: StateID> {
dict: Arc<AhoCorasick<S>>,
}
impl SplitCompoundWords<usize> {
/// Create a filter from a given dictionary.
///
/// The dictionary will be used to construct an [`AhoCorasick`] automaton
/// with reasonable defaults. See [`from_automaton`][Self::from_automaton] if
/// more control over its construction is required.
pub fn from_dictionary<I, P>(dict: I) -> Self
where
I: IntoIterator<Item = P>,
P: AsRef<[u8]>,
{
let dict = AhoCorasickBuilder::new()
.match_kind(MatchKind::LeftmostLongest)
.build(dict);
Self::from_automaton(dict)
}
}
impl<S: StateID> SplitCompoundWords<S> {
/// Create a filter from a given automaton.
///
/// The automaton should use one of the leftmost-first match kinds
/// and it should not be anchored.
pub fn from_automaton(dict: AhoCorasick<S>) -> Self {
Self {
dict: Arc::new(dict),
}
}
}
impl<S: StateID + Send + Sync + 'static> TokenFilter for SplitCompoundWords<S> {
fn transform<'a>(&self, stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
BoxTokenStream::from(SplitCompoundWordsTokenStream {
dict: self.dict.clone(),
tail: stream,
cuts: Vec::new(),
parts: Vec::new(),
})
}
}
struct SplitCompoundWordsTokenStream<'a, S: StateID> {
dict: Arc<AhoCorasick<S>>,
tail: BoxTokenStream<'a>,
cuts: Vec<usize>,
parts: Vec<Token>,
}
impl<'a, S: StateID> SplitCompoundWordsTokenStream<'a, S> {
// Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
// can fully be split into consecutive matches against `self.dict`.
fn split(&mut self) {
let token = self.tail.token();
let mut text = token.text.as_str();
self.cuts.clear();
let mut pos = 0;
for match_ in self.dict.find_iter(text) {
if pos != match_.start() {
break;
}
self.cuts.push(pos);
pos = match_.end();
}
if pos == token.text.len() {
// Fill `self.parts` in reverse order,
// so that `self.parts.pop()` yields
// the tokens in their original order.
for pos in self.cuts.iter().rev() {
let (head, tail) = text.split_at(*pos);
text = head;
self.parts.push(Token {
text: tail.to_owned(),
..*token
});
}
}
}
}
impl<'a, S: StateID> TokenStream for SplitCompoundWordsTokenStream<'a, S> {
fn advance(&mut self) -> bool {
self.parts.pop();
if !self.parts.is_empty() {
return true;
}
if !self.tail.advance() {
return false;
}
// Will yield either `self.parts.last()` or
// `self.tail.token()` if it could not be split.
self.split();
true
}
fn token(&self) -> &Token {
self.parts.last().unwrap_or_else(|| self.tail.token())
}
fn token_mut(&mut self) -> &mut Token {
self.parts
.last_mut()
.unwrap_or_else(|| self.tail.token_mut())
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tokenizer::{SimpleTokenizer, TextAnalyzer};
#[test]
fn splitting_compound_words_works() {
let tokenizer = TextAnalyzer::from(SimpleTokenizer)
.filter(SplitCompoundWords::from_dictionary(["foo", "bar"]));
{
let mut stream = tokenizer.token_stream("");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foo bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobarbaz");
assert_eq!(stream.next().unwrap().text, "foobarbaz");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("baz foobar qux");
assert_eq!(stream.next().unwrap().text, "baz");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "qux");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobar foobar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobar foo bar foobar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobazbar foo bar foobar");
assert_eq!(stream.next().unwrap().text, "foobazbar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobar qux foobar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "qux");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("barfoo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next(), None);
}
}
}
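
The gap check in `split` above hinges on `find_iter` reporting consecutive, leftmost-longest matches. A standalone sketch with the same two-word dictionary as the tests, using the builder exactly as the removed filter does:

use aho_corasick::{AhoCorasickBuilder, MatchKind};

let dict = AhoCorasickBuilder::new()
    .match_kind(MatchKind::LeftmostLongest)
    .build(["foo", "bar"]);

// "foobar" is covered back-to-back, so the token would be split...
let spans: Vec<(usize, usize)> = dict.find_iter("foobar").map(|m| (m.start(), m.end())).collect();
assert_eq!(spans, vec![(0, 3), (3, 6)]);

// ...while "foobazbar" leaves a gap over "baz", so it is kept whole.
let spans: Vec<(usize, usize)> = dict.find_iter("foobazbar").map(|m| (m.start(), m.end())).collect();
assert_eq!(spans, vec![(0, 3), (6, 9)]);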

View File

@@ -1,6 +1,3 @@
use std::borrow::Cow;
use std::mem;
use rust_stemmers::{self, Algorithm}; use rust_stemmers::{self, Algorithm};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
@@ -87,7 +84,6 @@ impl TokenFilter for Stemmer {
BoxTokenStream::from(StemmerTokenStream { BoxTokenStream::from(StemmerTokenStream {
tail: token_stream, tail: token_stream,
stemmer: inner_stemmer, stemmer: inner_stemmer,
buffer: String::new(),
}) })
} }
} }
@@ -95,7 +91,6 @@ impl TokenFilter for Stemmer {
pub struct StemmerTokenStream<'a> { pub struct StemmerTokenStream<'a> {
tail: BoxTokenStream<'a>, tail: BoxTokenStream<'a>,
stemmer: rust_stemmers::Stemmer, stemmer: rust_stemmers::Stemmer,
buffer: String,
} }
impl<'a> TokenStream for StemmerTokenStream<'a> { impl<'a> TokenStream for StemmerTokenStream<'a> {
@@ -103,16 +98,10 @@ impl<'a> TokenStream for StemmerTokenStream<'a> {
if !self.tail.advance() { if !self.tail.advance() {
return false; return false;
} }
let token = self.tail.token_mut(); // TODO remove allocation
let stemmed_str = self.stemmer.stem(&token.text); let stemmed_str: String = self.stemmer.stem(&self.token().text).into_owned();
match stemmed_str { self.token_mut().text.clear();
Cow::Owned(stemmed_str) => token.text = stemmed_str, self.token_mut().text.push_str(&stemmed_str);
Cow::Borrowed(stemmed_str) => {
self.buffer.clear();
self.buffer.push_str(stemmed_str);
mem::swap(&mut token.text, &mut self.buffer);
}
}
true true
} }
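
For context, the stream whose `advance` loop is rewritten above is used like any other token filter. A minimal usage sketch, assuming English stemming:

use tantivy::tokenizer::{Language, SimpleTokenizer, Stemmer, TextAnalyzer};

let tokenizer = TextAnalyzer::from(SimpleTokenizer).filter(Stemmer::new(Language::English));
let mut stream = tokenizer.token_stream("walking walked walks");
assert_eq!(stream.next().unwrap().text, "walk");
assert_eq!(stream.next().unwrap().text, "walk");
assert_eq!(stream.next().unwrap().text, "walk");
assert!(stream.next().is_none());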

View File

@@ -10,25 +10,34 @@
//! assert_eq!(stream.next().unwrap().text, "crafty"); //! assert_eq!(stream.next().unwrap().text, "crafty");
//! assert!(stream.next().is_none()); //! assert!(stream.next().is_none());
//! ``` //! ```
use std::sync::Arc; use std::collections::HashSet;
use std::hash::BuildHasherDefault;
use rustc_hash::FxHashSet; use fnv::FnvHasher;
use super::{Token, TokenFilter, TokenStream}; use super::{Token, TokenFilter, TokenStream};
use crate::tokenizer::BoxTokenStream; use crate::tokenizer::BoxTokenStream;
// configure our hashers for SPEED
type StopWordHasher = BuildHasherDefault<FnvHasher>;
type StopWordHashSet = HashSet<String, StopWordHasher>;
/// `TokenFilter` that removes stop words from a token stream /// `TokenFilter` that removes stop words from a token stream
#[derive(Clone)] #[derive(Clone)]
pub struct StopWordFilter { pub struct StopWordFilter {
words: Arc<FxHashSet<String>>, words: StopWordHashSet,
} }
impl StopWordFilter { impl StopWordFilter {
/// Creates a `StopWordFilter` given a list of words to remove /// Creates a `StopWordFilter` given a list of words to remove
pub fn remove<W: IntoIterator<Item = String>>(words: W) -> StopWordFilter { pub fn remove(words: Vec<String>) -> StopWordFilter {
StopWordFilter { let mut set = StopWordHashSet::default();
words: Arc::new(words.into_iter().collect()),
for word in words {
set.insert(word);
} }
StopWordFilter { words: set }
} }
fn english() -> StopWordFilter { fn english() -> StopWordFilter {
@@ -38,12 +47,12 @@ impl StopWordFilter {
"there", "these", "they", "this", "to", "was", "will", "with", "there", "these", "they", "this", "to", "was", "will", "with",
]; ];
StopWordFilter::remove(words.iter().map(|&s| s.to_string())) StopWordFilter::remove(words.iter().map(|&s| s.to_string()).collect())
} }
} }
pub struct StopWordFilterStream<'a> { pub struct StopWordFilterStream<'a> {
words: Arc<FxHashSet<String>>, words: StopWordHashSet,
tail: BoxTokenStream<'a>, tail: BoxTokenStream<'a>,
} }
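
Finally, a usage sketch of the constructor signature restored above, which takes an owned `Vec<String>` rather than a generic iterator:

use tantivy::tokenizer::{SimpleTokenizer, StopWordFilter, TextAnalyzer};

let stop_words: Vec<String> = ["a", "an", "the"].iter().map(|s| s.to_string()).collect();
let tokenizer = TextAnalyzer::from(SimpleTokenizer).filter(StopWordFilter::remove(stop_words));

let mut stream = tokenizer.token_stream("the old man and the sea");
assert_eq!(stream.next().unwrap().text, "old");
assert_eq!(stream.next().unwrap().text, "man");
assert_eq!(stream.next().unwrap().text, "and");
assert_eq!(stream.next().unwrap().text, "sea");
assert!(stream.next().is_none());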