Compare commits


1 Commit

Author: Paul Masurel
SHA1: 727d024a23
Date: 2022-10-20 10:19:41 +09:00

Bugfix: broken positions.

For a Field with several FieldValues, when a value contained no token at all,
the token position was reinitialized to 0.

As a result, PhraseQueries can show false positives. In addition, after the
computation of the position delta, we can underflow u32 and end up with a
gigantic delta.

We haven't been able to fully explain the bug in #1629, but it is assumed that
in some corner cases these deltas can cause a panic.

Closes #1629
64 changed files with 562 additions and 102157 deletions
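
To make the failure mode described in the commit message concrete, here is a
minimal, self-contained Rust sketch. It is not tantivy's actual indexing code
and the variable names are invented for illustration; it only shows how an
unsigned position delta wraps around once the token position counter is reset
to 0 between values of the same field.

```rust
// Minimal sketch, NOT tantivy's indexing code: it illustrates how an unsigned
// position delta can wrap around when the token position counter is
// (incorrectly) reset to 0 for a later value of the same field.
fn main() {
    // Position reached at the end of the previous FieldValue.
    let prev_position: u32 = 5;
    // Buggy path: the counter restarts at 0 instead of continuing from 5.
    let next_position: u32 = 0;

    // Delta encoding assumes positions never decrease; 0 - 5 wraps around.
    let delta = next_position.wrapping_sub(prev_position);
    assert_eq!(delta, u32::MAX - 4); // 4_294_967_291: a "gigantic delta"

    println!("position delta = {delta}");
}
```

With plain `-` instead of `wrapping_sub`, the same subtraction would panic in a
debug build and wrap silently in a release build, which matches the "gigantic
delta" behaviour the commit message describes.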

View File

@@ -1,7 +1,6 @@
Tantivy 0.19 Tantivy 0.19
================================ ================================
- Limit fast fields to u32 (`get_val(u32)`) [#1644](https://github.com/quickwit-oss/tantivy/pull/1644) (@PSeitz)
- Major bugfix: Fix missing fieldnorms for u64, i64, f64, bool, bytes and date [#1620](https://github.com/quickwit-oss/tantivy/pull/1620) (@PSeitz) - Major bugfix: Fix missing fieldnorms for u64, i64, f64, bool, bytes and date [#1620](https://github.com/quickwit-oss/tantivy/pull/1620) (@PSeitz)
- Updated [Date Field Type](https://github.com/quickwit-oss/tantivy/pull/1396) - Updated [Date Field Type](https://github.com/quickwit-oss/tantivy/pull/1396)
The `DateTime` type has been updated to hold timestamps with microseconds precision. The `DateTime` type has been updated to hold timestamps with microseconds precision.
@@ -28,6 +27,7 @@ Tantivy 0.19
- [#1582](https://github.com/quickwit-oss/tantivy/pull/1582 (@PSeitz) - [#1582](https://github.com/quickwit-oss/tantivy/pull/1582 (@PSeitz)
- [#1611](https://github.com/quickwit-oss/tantivy/pull/1611 (@PSeitz) - [#1611](https://github.com/quickwit-oss/tantivy/pull/1611 (@PSeitz)
Tantivy 0.18 Tantivy 0.18
================================ ================================
@@ -44,10 +44,6 @@ Tantivy 0.18
- Add terms aggregation (@PSeitz) - Add terms aggregation (@PSeitz)
- Add support for zstd compression (@kryesh) - Add support for zstd compression (@kryesh)
Tantivy 0.18.1
================================
- Hotfix: positions computation. #1629 (@fmassot, @fulmicoton, @PSeitz)
Tantivy 0.17 Tantivy 0.17
================================ ================================

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "tantivy" name = "tantivy"
version = "0.19.0-dev" version = "0.18.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"] authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT" license = "MIT"
categories = ["database-implementations", "data-structures"] categories = ["database-implementations", "data-structures"]
@@ -14,13 +14,12 @@ edition = "2021"
rust-version = "1.62" rust-version = "1.62"
[dependencies] [dependencies]
oneshot = "0.1.5" oneshot = "0.1.3"
base64 = "0.13.0" base64 = "0.13.0"
byteorder = "1.4.3" byteorder = "1.4.3"
crc32fast = "1.3.2" crc32fast = "1.3.2"
once_cell = "1.10.0" once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] } regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
aho-corasick = "0.7"
tantivy-fst = "0.4.0" tantivy-fst = "0.4.0"
memmap2 = { version = "0.5.3", optional = true } memmap2 = { version = "0.5.3", optional = true }
lz4_flex = { version = "0.9.2", default-features = false, features = ["checked-decode"], optional = true } lz4_flex = { version = "0.9.2", default-features = false, features = ["checked-decode"], optional = true }
@@ -46,7 +45,7 @@ rust-stemmers = "1.2.0"
downcast-rs = "1.2.0" downcast-rs = "1.2.0"
bitpacking = { version = "0.8.4", default-features = false, features = ["bitpacker4x"] } bitpacking = { version = "0.8.4", default-features = false, features = ["bitpacker4x"] }
census = "0.4.0" census = "0.4.0"
rustc-hash = "1.1.0" fnv = "1.0.7"
thiserror = "1.0.30" thiserror = "1.0.30"
htmlescape = "0.3.1" htmlescape = "0.3.1"
fail = "0.5.0" fail = "0.5.0"
@@ -61,7 +60,6 @@ measure_time = "0.8.2"
ciborium = { version = "0.2", optional = true} ciborium = { version = "0.2", optional = true}
async-trait = "0.1.53" async-trait = "0.1.53"
arc-swap = "1.5.0" arc-swap = "1.5.0"
yoke = { version = "0.6.2", features = ["derive"] }
[target.'cfg(windows)'.dependencies] [target.'cfg(windows)'.dependencies]
winapi = "0.3.9" winapi = "0.3.9"

File diff suppressed because it is too large.

View File

@@ -1,159 +1,116 @@
use criterion::{criterion_group, criterion_main, Criterion}; use criterion::{criterion_group, criterion_main, Criterion};
use itertools::Itertools;
use pprof::criterion::{Output, PProfProfiler}; use pprof::criterion::{Output, PProfProfiler};
use serde_json::{self, Value as JsonValue}; use tantivy::schema::{INDEXED, STORED, STRING, TEXT};
use tantivy::directory::RamDirectory; use tantivy::Index;
use tantivy::schema::{
FieldValue, TextFieldIndexing, TextOptions, Value, INDEXED, STORED, STRING, TEXT,
};
use tantivy::{Document, Index, IndexBuilder};
const HDFS_LOGS: &str = include_str!("hdfs.json"); const HDFS_LOGS: &str = include_str!("hdfs.json");
const NUM_REPEATS: usize = 20; const NUM_REPEATS: usize = 2;
pub fn hdfs_index_benchmark(c: &mut Criterion) { pub fn hdfs_index_benchmark(c: &mut Criterion) {
let mut schema_builder = tantivy::schema::SchemaBuilder::new(); let schema = {
let text_indexing_options = TextFieldIndexing::default() let mut schema_builder = tantivy::schema::SchemaBuilder::new();
.set_tokenizer("default") schema_builder.add_u64_field("timestamp", INDEXED);
.set_fieldnorms(false) schema_builder.add_text_field("body", TEXT);
.set_index_option(tantivy::schema::IndexRecordOption::WithFreqsAndPositions); schema_builder.add_text_field("severity", STRING);
let mut text_options = TextOptions::default().set_indexing_options(text_indexing_options); schema_builder.build()
let text_field = schema_builder.add_text_field("body", text_options); };
let schema = schema_builder.build(); let schema_with_store = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
// prepare doc schema_builder.add_u64_field("timestamp", INDEXED | STORED);
let mut documents_no_array = Vec::new(); schema_builder.add_text_field("body", TEXT | STORED);
let mut documents_with_array = Vec::new(); schema_builder.add_text_field("severity", STRING | STORED);
for doc_json in HDFS_LOGS.trim().split("\n") { schema_builder.build()
let json_obj: serde_json::Map<String, JsonValue> = serde_json::from_str(doc_json).unwrap(); };
let text = json_obj.get("body").unwrap().as_str().unwrap(); let dynamic_schema = {
let mut doc_no_array = Document::new(); let mut schema_builder = tantivy::schema::SchemaBuilder::new();
doc_no_array.add_text(text_field, text); schema_builder.add_json_field("json", TEXT);
documents_no_array.push(doc_no_array); schema_builder.build()
let mut doc_with_array = Document::new(); };
doc_with_array.add_borrowed_values(text.to_owned(), |text| {
text.split(' ')
.map(|text| FieldValue::new(text_field, text.into()))
.collect()
});
documents_with_array.push(doc_with_array);
}
let mut group = c.benchmark_group("index-hdfs"); let mut group = c.benchmark_group("index-hdfs");
group.sample_size(20); group.sample_size(20);
group.bench_function("index-hdfs-no-commit", |b| { group.bench_function("index-hdfs-no-commit", |b| {
b.iter(|| { b.iter(|| {
let ram_directory = RamDirectory::create(); let index = Index::create_in_ram(schema.clone());
let mut index_writer = IndexBuilder::new() let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
.schema(schema.clone())
.single_segment_index_writer(ram_directory, 100_000_000)
.unwrap();
for _ in 0..NUM_REPEATS { for _ in 0..NUM_REPEATS {
let documents_cloned = documents_no_array.clone(); for doc_json in HDFS_LOGS.trim().split("\n") {
for doc in documents_cloned { let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap(); index_writer.add_document(doc).unwrap();
} }
} }
}) })
}); });
group.bench_function("index-hdfs-with-array-no-commit", |b| { group.bench_function("index-hdfs-with-commit", |b| {
b.iter(|| { b.iter(|| {
let ram_directory = RamDirectory::create(); let index = Index::create_in_ram(schema.clone());
let mut index_writer = IndexBuilder::new() let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
.schema(schema.clone())
.single_segment_index_writer(ram_directory, 100_000_000)
.unwrap();
for _ in 0..NUM_REPEATS { for _ in 0..NUM_REPEATS {
let documents_with_array_cloned = documents_with_array.clone(); for doc_json in HDFS_LOGS.trim().split("\n") {
for doc in documents_with_array_cloned { let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
}
index_writer.commit().unwrap();
})
});
group.bench_function("index-hdfs-no-commit-with-docstore", |b| {
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split("\n") {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap(); index_writer.add_document(doc).unwrap();
} }
} }
}) })
}); });
// group.bench_function("index-hdfs-with-commit", |b| { group.bench_function("index-hdfs-with-commit-with-docstore", |b| {
// b.iter(|| { b.iter(|| {
// let ram_directory = RamDirectory::create(); let index = Index::create_in_ram(schema_with_store.clone());
// let mut index_writer = IndexBuilder::new() let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
// .schema(schema.clone()) for _ in 0..NUM_REPEATS {
// .single_segment_index_writer(ram_directory, 100_000_000) for doc_json in HDFS_LOGS.trim().split("\n") {
// .unwrap(); let doc = schema.parse_document(doc_json).unwrap();
// for _ in 0..NUM_REPEATS { index_writer.add_document(doc).unwrap();
// for doc_json in HDFS_LOGS.trim().split("\n") { }
// let doc = schema.parse_document(doc_json).unwrap(); }
// index_writer.add_document(doc).unwrap(); index_writer.commit().unwrap();
// } })
// } });
// index_writer.commit().unwrap(); group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
// }) b.iter(|| {
// }); let index = Index::create_in_ram(dynamic_schema.clone());
// group.bench_function("index-hdfs-no-commit-with-docstore", |b| { let json_field = dynamic_schema.get_field("json").unwrap();
// b.iter(|| { let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
// let ram_directory = RamDirectory::create(); for _ in 0..NUM_REPEATS {
// let mut index_writer = IndexBuilder::new() for doc_json in HDFS_LOGS.trim().split("\n") {
// .schema(schema.clone()) let json_val: serde_json::Map<String, serde_json::Value> =
// .single_segment_index_writer(ram_directory, 100_000_000) serde_json::from_str(doc_json).unwrap();
// .unwrap(); let doc = tantivy::doc!(json_field=>json_val);
// for _ in 0..NUM_REPEATS { index_writer.add_document(doc).unwrap();
// for doc_json in HDFS_LOGS.trim().split("\n") { }
// let doc = schema.parse_document(doc_json).unwrap(); }
// index_writer.add_document(doc).unwrap(); index_writer.commit().unwrap();
// } })
// } });
// }) group.bench_function("index-hdfs-with-commit-json-without-docstore", |b| {
// }); b.iter(|| {
// group.bench_function("index-hdfs-with-commit-with-docstore", |b| { let index = Index::create_in_ram(dynamic_schema.clone());
// b.iter(|| { let json_field = dynamic_schema.get_field("json").unwrap();
// let ram_directory = RamDirectory::create(); let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
// let mut index_writer = IndexBuilder::new() for _ in 0..NUM_REPEATS {
// .schema(schema.clone()) for doc_json in HDFS_LOGS.trim().split("\n") {
// .single_segment_index_writer(ram_directory, 100_000_000) let json_val: serde_json::Map<String, serde_json::Value> =
// .unwrap(); serde_json::from_str(doc_json).unwrap();
// for _ in 0..NUM_REPEATS { let doc = tantivy::doc!(json_field=>json_val);
// for doc_json in HDFS_LOGS.trim().split("\n") { index_writer.add_document(doc).unwrap();
// let doc = schema.parse_document(doc_json).unwrap(); }
// index_writer.add_document(doc).unwrap(); }
// } index_writer.commit().unwrap();
// } })
// index_writer.commit().unwrap(); });
// })
// });
// group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
// b.iter(|| {
// let ram_directory = RamDirectory::create();
// let mut index_writer = IndexBuilder::new()
// .schema(schema.clone())
// .single_segment_index_writer(ram_directory, 100_000_000)
// .unwrap();
// for _ in 0..NUM_REPEATS {
// for doc_json in HDFS_LOGS.trim().split("\n") {
// let json_val: serde_json::Map<String, serde_json::Value> =
// serde_json::from_str(doc_json).unwrap();
// let doc = tantivy::doc!(json_field=>json_val);
// index_writer.add_document(doc).unwrap();
// }
// }
// index_writer.commit().unwrap();
// })
// });
// group.bench_function("index-hdfs-with-commit-json-without-docstore", |b| {
// b.iter(|| {
// let ram_directory = RamDirectory::create();
// let mut index_writer = IndexBuilder::new()
// .schema(schema.clone())
// .single_segment_index_writer(ram_directory, 100_000_000)
// .unwrap();
// for _ in 0..NUM_REPEATS {
// for doc_json in HDFS_LOGS.trim().split("\n") {
// let json_val: serde_json::Map<String, serde_json::Value> =
// serde_json::from_str(doc_json).unwrap();
// let doc = tantivy::doc!(json_field=>json_val);
// index_writer.add_document(doc).unwrap();
// }
// }
// index_writer.commit().unwrap();
// })
//});
} }
criterion_group! { criterion_group! {

View File

@@ -87,15 +87,15 @@ impl BitUnpacker {
} }
#[inline] #[inline]
pub fn get(&self, idx: u32, data: &[u8]) -> u64 { pub fn get(&self, idx: u64, data: &[u8]) -> u64 {
if self.num_bits == 0 { if self.num_bits == 0 {
return 0u64; return 0u64;
} }
let addr_in_bits = idx * self.num_bits as u32; let addr_in_bits = idx * self.num_bits;
let addr = addr_in_bits >> 3; let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7; let bit_shift = addr_in_bits & 7;
debug_assert!( debug_assert!(
addr + 8 <= data.len() as u32, addr + 8 <= data.len() as u64,
"The fast field field should have been padded with 7 bytes." "The fast field field should have been padded with 7 bytes."
); );
let bytes: [u8; 8] = (&data[(addr as usize)..(addr as usize) + 8]) let bytes: [u8; 8] = (&data[(addr as usize)..(addr as usize) + 8])
@@ -130,7 +130,7 @@ mod test {
fn test_bitpacker_util(len: usize, num_bits: u8) { fn test_bitpacker_util(len: usize, num_bits: u8) {
let (bitunpacker, vals, data) = create_fastfield_bitpacker(len, num_bits); let (bitunpacker, vals, data) = create_fastfield_bitpacker(len, num_bits);
for (i, val) in vals.iter().enumerate() { for (i, val) in vals.iter().enumerate() {
assert_eq!(bitunpacker.get(i as u32, &data), *val); assert_eq!(bitunpacker.get(i as u64, &data), *val);
} }
} }

View File

@@ -130,7 +130,7 @@ impl BlockedBitpacker {
let pos_in_block = idx % BLOCK_SIZE as usize; let pos_in_block = idx % BLOCK_SIZE as usize;
if let Some(metadata) = self.offset_and_bits.get(metadata_pos) { if let Some(metadata) = self.offset_and_bits.get(metadata_pos) {
let unpacked = BitUnpacker::new(metadata.num_bits()).get( let unpacked = BitUnpacker::new(metadata.num_bits()).get(
pos_in_block as u32, pos_in_block as u64,
&self.compressed_blocks[metadata.offset() as usize..], &self.compressed_blocks[metadata.offset() as usize..],
); );
unpacked + metadata.base_value() unpacked + metadata.base_value()

View File

@@ -1,4 +1,3 @@
use std::borrow::Cow;
use std::io::{Read, Write}; use std::io::{Read, Write};
use std::{fmt, io}; use std::{fmt, io};
@@ -211,23 +210,6 @@ impl BinarySerializable for String {
} }
} }
impl<'a> BinarySerializable for Cow<'a, str> {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
let data: &[u8] = self.as_bytes();
VInt(data.len() as u64).serialize(writer)?;
writer.write_all(data)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let string_length = VInt::deserialize(reader)?.val() as usize;
let mut result = String::with_capacity(string_length);
reader
.take(string_length as u64)
.read_to_string(&mut result)?;
Ok(Cow::Owned(result))
}
}
#[cfg(test)] #[cfg(test)]
pub mod test { pub mod test {

View File

@@ -105,7 +105,7 @@ impl SegmentCollector for StatsSegmentCollector {
type Fruit = Option<Stats>; type Fruit = Option<Stats>;
fn collect(&mut self, doc: u32, _score: Score) { fn collect(&mut self, doc: u32, _score: Score) {
let value = self.fast_field_reader.get_val(doc) as f64; let value = self.fast_field_reader.get_val(doc as u64) as f64;
self.stats.count += 1; self.stats.count += 1;
self.stats.sum += value; self.stats.sum += value;
self.stats.squared_sum += value * value; self.stats.squared_sum += value * value;

View File

@@ -51,7 +51,7 @@ impl Warmer for DynamicPriceColumn {
let product_id_reader = segment.fast_fields().u64(self.field)?; let product_id_reader = segment.fast_fields().u64(self.field)?;
let product_ids: Vec<ProductId> = segment let product_ids: Vec<ProductId> = segment
.doc_ids_alive() .doc_ids_alive()
.map(|doc| product_id_reader.get_val(doc)) .map(|doc| product_id_reader.get_val(doc as u64))
.collect(); .collect();
let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter(); let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
let mut price_vals: Vec<Price> = Vec::new(); let mut price_vals: Vec<Price> = Vec::new();

View File

@@ -65,7 +65,7 @@ mod tests {
b.iter(|| { b.iter(|| {
let mut a = 0u64; let mut a = 0u64;
for _ in 0..n { for _ in 0..n {
a = column.get_val(a as u32); a = column.get_val(a as u64);
} }
a a
}); });
@@ -101,7 +101,7 @@ mod tests {
fn get_u128_column_from_data(data: &[u128]) -> Arc<dyn Column<u128>> { fn get_u128_column_from_data(data: &[u128]) -> Arc<dyn Column<u128>> {
let mut out = vec![]; let mut out = vec![];
let iter_gen = || data.iter().cloned(); let iter_gen = || data.iter().cloned();
serialize_u128(iter_gen, data.len() as u32, &mut out).unwrap(); serialize_u128(iter_gen, data.len() as u64, &mut out).unwrap();
let out = OwnedBytes::new(out); let out = OwnedBytes::new(out);
open_u128::<u128>(out).unwrap() open_u128::<u128>(out).unwrap()
} }
@@ -111,15 +111,7 @@ mod tests {
let (major_item, _minor_item, data) = get_data_50percent_item(); let (major_item, _minor_item, data) = get_data_50percent_item();
let column = get_u128_column_from_data(&data); let column = get_u128_column_from_data(&data);
b.iter(|| { b.iter(|| column.get_between_vals(major_item..=major_item));
let mut positions = Vec::new();
column.get_positions_for_value_range(
major_item..=major_item,
0..data.len() as u32,
&mut positions,
);
positions
});
} }
#[bench] #[bench]
@@ -127,15 +119,7 @@ mod tests {
let (_major_item, minor_item, data) = get_data_50percent_item(); let (_major_item, minor_item, data) = get_data_50percent_item();
let column = get_u128_column_from_data(&data); let column = get_u128_column_from_data(&data);
b.iter(|| { b.iter(|| column.get_between_vals(minor_item..=minor_item));
let mut positions = Vec::new();
column.get_positions_for_value_range(
minor_item..=minor_item,
0..data.len() as u32,
&mut positions,
);
positions
});
} }
#[bench] #[bench]
@@ -143,15 +127,7 @@ mod tests {
let (_major_item, _minor_item, data) = get_data_50percent_item(); let (_major_item, _minor_item, data) = get_data_50percent_item();
let column = get_u128_column_from_data(&data); let column = get_u128_column_from_data(&data);
b.iter(|| { b.iter(|| column.get_between_vals(0..=u128::MAX));
let mut positions = Vec::new();
column.get_positions_for_value_range(
0..=u128::MAX,
0..data.len() as u32,
&mut positions,
);
positions
});
} }
#[bench] #[bench]
@@ -161,7 +137,7 @@ mod tests {
b.iter(|| { b.iter(|| {
let mut a = 0u128; let mut a = 0u128;
for i in 0u64..column.num_vals() as u64 { for i in 0u64..column.num_vals() as u64 {
a += column.get_val(i as u32); a += column.get_val(i);
} }
a a
}); });
@@ -175,7 +151,7 @@ mod tests {
let n = column.num_vals(); let n = column.num_vals();
let mut a = 0u128; let mut a = 0u128;
for i in (0..n / 5).map(|val| val * 5) { for i in (0..n / 5).map(|val| val * 5) {
a += column.get_val(i); a += column.get_val(i as u64);
} }
a a
}); });
@@ -200,9 +176,9 @@ mod tests {
let n = permutation.len(); let n = permutation.len();
let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation); let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation);
b.iter(|| { b.iter(|| {
let mut a = 0; let mut a = 0u64;
for i in (0..n / 7).map(|val| val * 7) { for i in (0..n / 7).map(|val| val * 7) {
a += column.get_val(i as u32); a += column.get_val(i as u64);
} }
a a
}); });
@@ -215,7 +191,7 @@ mod tests {
let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation); let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation);
b.iter(|| { b.iter(|| {
let mut a = 0u64; let mut a = 0u64;
for i in 0u32..n as u32 { for i in 0u64..n as u64 {
a += column.get_val(i); a += column.get_val(i);
} }
a a
@@ -229,8 +205,8 @@ mod tests {
let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation); let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation);
b.iter(|| { b.iter(|| {
let mut a = 0u64; let mut a = 0u64;
for i in 0..n { for i in 0..n as u64 {
a += column.get_val(i as u32); a += column.get_val(i);
} }
a a
}); });

View File

@@ -17,7 +17,7 @@ pub struct BitpackedReader {
impl Column for BitpackedReader { impl Column for BitpackedReader {
#[inline] #[inline]
fn get_val(&self, doc: u32) -> u64 { fn get_val(&self, doc: u64) -> u64 {
self.bit_unpacker.get(doc, &self.data) self.bit_unpacker.get(doc, &self.data)
} }
#[inline] #[inline]
@@ -30,7 +30,7 @@ impl Column for BitpackedReader {
self.normalized_header.max_value self.normalized_header.max_value
} }
#[inline] #[inline]
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.normalized_header.num_vals self.normalized_header.num_vals
} }
} }

View File

@@ -36,7 +36,7 @@ impl BinarySerializable for Block {
} }
} }
fn compute_num_blocks(num_vals: u32) -> usize { fn compute_num_blocks(num_vals: u64) -> usize {
(num_vals as usize + CHUNK_SIZE - 1) / CHUNK_SIZE (num_vals as usize + CHUNK_SIZE - 1) / CHUNK_SIZE
} }
@@ -72,13 +72,13 @@ impl FastFieldCodec for BlockwiseLinearCodec {
// Estimate first_chunk and extrapolate // Estimate first_chunk and extrapolate
fn estimate(column: &dyn crate::Column) -> Option<f32> { fn estimate(column: &dyn crate::Column) -> Option<f32> {
if column.num_vals() < 10 * CHUNK_SIZE as u32 { if column.num_vals() < 10 * CHUNK_SIZE as u64 {
return None; return None;
} }
let mut first_chunk: Vec<u64> = column.iter().take(CHUNK_SIZE as usize).collect(); let mut first_chunk: Vec<u64> = column.iter().take(CHUNK_SIZE as usize).collect();
let line = Line::train(&VecColumn::from(&first_chunk)); let line = Line::train(&VecColumn::from(&first_chunk));
for (i, buffer_val) in first_chunk.iter_mut().enumerate() { for (i, buffer_val) in first_chunk.iter_mut().enumerate() {
let interpolated_val = line.eval(i as u32); let interpolated_val = line.eval(i as u64);
*buffer_val = buffer_val.wrapping_sub(interpolated_val); *buffer_val = buffer_val.wrapping_sub(interpolated_val);
} }
let estimated_bit_width = first_chunk let estimated_bit_width = first_chunk
@@ -95,7 +95,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
}; };
let num_bits = estimated_bit_width as u64 * column.num_vals() as u64 let num_bits = estimated_bit_width as u64 * column.num_vals() as u64
// function metadata per block // function metadata per block
+ metadata_per_block as u64 * (column.num_vals() as u64 / CHUNK_SIZE as u64); + metadata_per_block as u64 * (column.num_vals() / CHUNK_SIZE as u64);
let num_bits_uncompressed = 64 * column.num_vals(); let num_bits_uncompressed = 64 * column.num_vals();
Some(num_bits as f32 / num_bits_uncompressed as f32) Some(num_bits as f32 / num_bits_uncompressed as f32)
} }
@@ -121,7 +121,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
assert!(!buffer.is_empty()); assert!(!buffer.is_empty());
for (i, buffer_val) in buffer.iter_mut().enumerate() { for (i, buffer_val) in buffer.iter_mut().enumerate() {
let interpolated_val = line.eval(i as u32); let interpolated_val = line.eval(i as u64);
*buffer_val = buffer_val.wrapping_sub(interpolated_val); *buffer_val = buffer_val.wrapping_sub(interpolated_val);
} }
let bit_width = buffer.iter().copied().map(compute_num_bits).max().unwrap(); let bit_width = buffer.iter().copied().map(compute_num_bits).max().unwrap();
@@ -161,9 +161,9 @@ pub struct BlockwiseLinearReader {
impl Column for BlockwiseLinearReader { impl Column for BlockwiseLinearReader {
#[inline(always)] #[inline(always)]
fn get_val(&self, idx: u32) -> u64 { fn get_val(&self, idx: u64) -> u64 {
let block_id = (idx / CHUNK_SIZE as u32) as usize; let block_id = (idx / CHUNK_SIZE as u64) as usize;
let idx_within_block = idx % (CHUNK_SIZE as u32); let idx_within_block = idx % (CHUNK_SIZE as u64);
let block = &self.blocks[block_id]; let block = &self.blocks[block_id];
let interpoled_val: u64 = block.line.eval(idx_within_block); let interpoled_val: u64 = block.line.eval(idx_within_block);
let block_bytes = &self.data[block.data_start_offset..]; let block_bytes = &self.data[block.data_start_offset..];
@@ -180,7 +180,7 @@ impl Column for BlockwiseLinearReader {
self.normalized_header.max_value self.normalized_header.max_value
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.normalized_header.num_vals self.normalized_header.num_vals
} }
} }

View File

@@ -1,5 +1,5 @@
use std::marker::PhantomData; use std::marker::PhantomData;
use std::ops::{Range, RangeInclusive}; use std::ops::RangeInclusive;
use tantivy_bitpacker::minmax; use tantivy_bitpacker::minmax;
@@ -14,7 +14,7 @@ pub trait Column<T: PartialOrd = u64>: Send + Sync {
/// # Panics /// # Panics
/// ///
/// May panic if `idx` is greater than the column length. /// May panic if `idx` is greater than the column length.
fn get_val(&self, idx: u32) -> T; fn get_val(&self, idx: u64) -> T;
/// Fills an output buffer with the fast field values /// Fills an output buffer with the fast field values
/// associated with the `DocId` going from /// associated with the `DocId` going from
@@ -27,28 +27,21 @@ pub trait Column<T: PartialOrd = u64>: Send + Sync {
#[inline] #[inline]
fn get_range(&self, start: u64, output: &mut [T]) { fn get_range(&self, start: u64, output: &mut [T]) {
for (out, idx) in output.iter_mut().zip(start..) { for (out, idx) in output.iter_mut().zip(start..) {
*out = self.get_val(idx as u32); *out = self.get_val(idx);
} }
} }
/// Get the positions of values which are in the provided value range. /// Return the positions of values which are in the provided range.
///
/// Note that position == docid for single value fast fields
#[inline] #[inline]
fn get_positions_for_value_range( fn get_between_vals(&self, range: RangeInclusive<T>) -> Vec<u64> {
&self, let mut vals = Vec::new();
value_range: RangeInclusive<T>, for idx in 0..self.num_vals() {
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
let doc_id_range = doc_id_range.start..doc_id_range.end.min(self.num_vals());
for idx in doc_id_range.start..doc_id_range.end {
let val = self.get_val(idx); let val = self.get_val(idx);
if value_range.contains(&val) { if range.contains(&val) {
positions.push(idx); vals.push(idx);
} }
} }
vals
} }
/// Returns the minimum value for this fast field. /// Returns the minimum value for this fast field.
@@ -68,7 +61,7 @@ pub trait Column<T: PartialOrd = u64>: Send + Sync {
fn max_value(&self) -> T; fn max_value(&self) -> T;
/// The number of values in the column. /// The number of values in the column.
fn num_vals(&self) -> u32; fn num_vals(&self) -> u64;
/// Returns a iterator over the data /// Returns a iterator over the data
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = T> + 'a> { fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = T> + 'a> {
@@ -84,7 +77,7 @@ pub struct VecColumn<'a, T = u64> {
} }
impl<'a, C: Column<T>, T: Copy + PartialOrd> Column<T> for &'a C { impl<'a, C: Column<T>, T: Copy + PartialOrd> Column<T> for &'a C {
fn get_val(&self, idx: u32) -> T { fn get_val(&self, idx: u64) -> T {
(*self).get_val(idx) (*self).get_val(idx)
} }
@@ -96,7 +89,7 @@ impl<'a, C: Column<T>, T: Copy + PartialOrd> Column<T> for &'a C {
(*self).max_value() (*self).max_value()
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
(*self).num_vals() (*self).num_vals()
} }
@@ -110,7 +103,7 @@ impl<'a, C: Column<T>, T: Copy + PartialOrd> Column<T> for &'a C {
} }
impl<'a, T: Copy + PartialOrd + Send + Sync> Column<T> for VecColumn<'a, T> { impl<'a, T: Copy + PartialOrd + Send + Sync> Column<T> for VecColumn<'a, T> {
fn get_val(&self, position: u32) -> T { fn get_val(&self, position: u64) -> T {
self.values[position as usize] self.values[position as usize]
} }
@@ -126,8 +119,8 @@ impl<'a, T: Copy + PartialOrd + Send + Sync> Column<T> for VecColumn<'a, T> {
self.max_value self.max_value
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.values.len() as u32 self.values.len() as u64
} }
fn get_range(&self, start: u64, output: &mut [T]) { fn get_range(&self, start: u64, output: &mut [T]) {
@@ -163,7 +156,7 @@ struct MonotonicMappingColumn<C, T, Input> {
/// monotonic_mapping.inverse(monotonic_mapping.mapping(el)) == el /// monotonic_mapping.inverse(monotonic_mapping.mapping(el)) == el
/// ///
/// The inverse of the mapping is required for: /// The inverse of the mapping is required for:
/// `fn get_positions_for_value_range(&self, range: RangeInclusive<T>) -> Vec<u64> ` /// `fn get_between_vals(&self, range: RangeInclusive<T>) -> Vec<u64> `
/// The user provides the original value range and we need to monotonic map them in the same way the /// The user provides the original value range and we need to monotonic map them in the same way the
/// serialization does before calling the underlying column. /// serialization does before calling the underlying column.
/// ///
@@ -195,7 +188,7 @@ where
Output: PartialOrd + Send + Sync + Clone, Output: PartialOrd + Send + Sync + Clone,
{ {
#[inline] #[inline]
fn get_val(&self, idx: u32) -> Output { fn get_val(&self, idx: u64) -> Output {
let from_val = self.from_column.get_val(idx); let from_val = self.from_column.get_val(idx);
self.monotonic_mapping.mapping(from_val) self.monotonic_mapping.mapping(from_val)
} }
@@ -210,7 +203,7 @@ where
self.monotonic_mapping.mapping(from_max_value) self.monotonic_mapping.mapping(from_max_value)
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.from_column.num_vals() self.from_column.num_vals()
} }
@@ -222,17 +215,10 @@ where
) )
} }
fn get_positions_for_value_range( fn get_between_vals(&self, range: RangeInclusive<Output>) -> Vec<u64> {
&self, self.from_column.get_between_vals(
range: RangeInclusive<Output>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
self.from_column.get_positions_for_value_range(
self.monotonic_mapping.inverse(range.start().clone()) self.monotonic_mapping.inverse(range.start().clone())
..=self.monotonic_mapping.inverse(range.end().clone()), ..=self.monotonic_mapping.inverse(range.end().clone()),
doc_id_range,
positions,
) )
} }
@@ -255,7 +241,7 @@ where
T: Iterator + Clone + ExactSizeIterator + Send + Sync, T: Iterator + Clone + ExactSizeIterator + Send + Sync,
T::Item: PartialOrd, T::Item: PartialOrd,
{ {
fn get_val(&self, idx: u32) -> T::Item { fn get_val(&self, idx: u64) -> T::Item {
self.0.clone().nth(idx as usize).unwrap() self.0.clone().nth(idx as usize).unwrap()
} }
@@ -267,8 +253,8 @@ where
self.0.clone().last().unwrap() self.0.clone().last().unwrap()
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.0.len() as u32 self.0.len() as u64
} }
fn iter(&self) -> Box<dyn Iterator<Item = T::Item> + '_> { fn iter(&self) -> Box<dyn Iterator<Item = T::Item> + '_> {

View File

@@ -57,7 +57,7 @@ fn num_bits(val: u128) -> u8 {
/// metadata. /// metadata.
pub fn get_compact_space( pub fn get_compact_space(
values_deduped_sorted: &BTreeSet<u128>, values_deduped_sorted: &BTreeSet<u128>,
total_num_values: u32, total_num_values: u64,
cost_per_blank: usize, cost_per_blank: usize,
) -> CompactSpace { ) -> CompactSpace {
let mut compact_space_builder = CompactSpaceBuilder::new(); let mut compact_space_builder = CompactSpaceBuilder::new();

View File

@@ -14,7 +14,7 @@ use std::{
cmp::Ordering, cmp::Ordering,
collections::BTreeSet, collections::BTreeSet,
io::{self, Write}, io::{self, Write},
ops::{Range, RangeInclusive}, ops::RangeInclusive,
}; };
use common::{BinarySerializable, CountingWriter, VInt, VIntU128}; use common::{BinarySerializable, CountingWriter, VInt, VIntU128};
@@ -165,13 +165,13 @@ pub struct IPCodecParams {
bit_unpacker: BitUnpacker, bit_unpacker: BitUnpacker,
min_value: u128, min_value: u128,
max_value: u128, max_value: u128,
num_vals: u32, num_vals: u64,
num_bits: u8, num_bits: u8,
} }
impl CompactSpaceCompressor { impl CompactSpaceCompressor {
/// Taking the vals as Vec may cost a lot of memory. It is used to sort the vals. /// Taking the vals as Vec may cost a lot of memory. It is used to sort the vals.
pub fn train_from(iter: impl Iterator<Item = u128>, num_vals: u32) -> Self { pub fn train_from(iter: impl Iterator<Item = u128>, num_vals: u64) -> Self {
let mut values_sorted = BTreeSet::new(); let mut values_sorted = BTreeSet::new();
values_sorted.extend(iter); values_sorted.extend(iter);
let total_num_values = num_vals; let total_num_values = num_vals;
@@ -200,7 +200,7 @@ impl CompactSpaceCompressor {
bit_unpacker: BitUnpacker::new(num_bits), bit_unpacker: BitUnpacker::new(num_bits),
min_value, min_value,
max_value, max_value,
num_vals: total_num_values, num_vals: total_num_values as u64,
num_bits, num_bits,
}, },
} }
@@ -267,7 +267,7 @@ impl BinarySerializable for IPCodecParams {
let _header_flags = u64::deserialize(reader)?; let _header_flags = u64::deserialize(reader)?;
let min_value = VIntU128::deserialize(reader)?.0; let min_value = VIntU128::deserialize(reader)?.0;
let max_value = VIntU128::deserialize(reader)?.0; let max_value = VIntU128::deserialize(reader)?.0;
let num_vals = VIntU128::deserialize(reader)?.0 as u32; let num_vals = VIntU128::deserialize(reader)?.0 as u64;
let num_bits = u8::deserialize(reader)?; let num_bits = u8::deserialize(reader)?;
let compact_space = CompactSpace::deserialize(reader)?; let compact_space = CompactSpace::deserialize(reader)?;
@@ -284,7 +284,7 @@ impl BinarySerializable for IPCodecParams {
impl Column<u128> for CompactSpaceDecompressor { impl Column<u128> for CompactSpaceDecompressor {
#[inline] #[inline]
fn get_val(&self, doc: u32) -> u128 { fn get_val(&self, doc: u64) -> u128 {
self.get(doc) self.get(doc)
} }
@@ -296,7 +296,7 @@ impl Column<u128> for CompactSpaceDecompressor {
self.max_value() self.max_value()
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.params.num_vals self.params.num_vals
} }
@@ -304,15 +304,8 @@ impl Column<u128> for CompactSpaceDecompressor {
fn iter(&self) -> Box<dyn Iterator<Item = u128> + '_> { fn iter(&self) -> Box<dyn Iterator<Item = u128> + '_> {
Box::new(self.iter()) Box::new(self.iter())
} }
fn get_between_vals(&self, range: RangeInclusive<u128>) -> Vec<u64> {
#[inline] self.get_between_vals(range)
fn get_positions_for_value_range(
&self,
value_range: RangeInclusive<u128>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
self.get_positions_for_value_range(value_range, doc_id_range, positions)
} }
} }
@@ -347,19 +340,12 @@ impl CompactSpaceDecompressor {
/// Comparing on compact space: Real dataset 1.08 GElements/s /// Comparing on compact space: Real dataset 1.08 GElements/s
/// ///
/// Comparing on original space: Real dataset .06 GElements/s (not completely optimized) /// Comparing on original space: Real dataset .06 GElements/s (not completely optimized)
#[inline] pub fn get_between_vals(&self, range: RangeInclusive<u128>) -> Vec<u64> {
pub fn get_positions_for_value_range( if range.start() > range.end() {
&self, return Vec::new();
value_range: RangeInclusive<u128>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
if value_range.start() > value_range.end() {
return;
} }
let doc_id_range = doc_id_range.start..doc_id_range.end.min(self.num_vals()); let from_value = *range.start();
let from_value = *value_range.start(); let to_value = *range.end();
let to_value = *value_range.end();
assert!(to_value >= from_value); assert!(to_value >= from_value);
let compact_from = self.u128_to_compact(from_value); let compact_from = self.u128_to_compact(from_value);
let compact_to = self.u128_to_compact(to_value); let compact_to = self.u128_to_compact(to_value);
@@ -367,7 +353,7 @@ impl CompactSpaceDecompressor {
// Quick return, if both ranges fall into the same non-mapped space, the range can't cover // Quick return, if both ranges fall into the same non-mapped space, the range can't cover
// any values, so we can early exit // any values, so we can early exit
match (compact_to, compact_from) { match (compact_to, compact_from) {
(Err(pos1), Err(pos2)) if pos1 == pos2 => return, (Err(pos1), Err(pos2)) if pos1 == pos2 => return Vec::new(),
_ => {} _ => {}
} }
@@ -389,28 +375,27 @@ impl CompactSpaceDecompressor {
}); });
let range = compact_from..=compact_to; let range = compact_from..=compact_to;
let mut positions = Vec::new();
let scan_num_docs = doc_id_range.end - doc_id_range.start;
let step_size = 4; let step_size = 4;
let cutoff = doc_id_range.start + scan_num_docs - scan_num_docs % step_size; let cutoff = self.params.num_vals - self.params.num_vals % step_size;
let mut push_if_in_range = |idx, val| { let mut push_if_in_range = |idx, val| {
if range.contains(&val) { if range.contains(&val) {
positions.push(idx); positions.push(idx);
} }
}; };
let get_val = |idx| self.params.bit_unpacker.get(idx, &self.data); let get_val = |idx| self.params.bit_unpacker.get(idx as u64, &self.data);
// unrolled loop // unrolled loop
for idx in (doc_id_range.start..cutoff).step_by(step_size as usize) { for idx in (0..cutoff).step_by(step_size as usize) {
let idx1 = idx; let idx1 = idx;
let idx2 = idx + 1; let idx2 = idx + 1;
let idx3 = idx + 2; let idx3 = idx + 2;
let idx4 = idx + 3; let idx4 = idx + 3;
let val1 = get_val(idx1 as u32); let val1 = get_val(idx1);
let val2 = get_val(idx2 as u32); let val2 = get_val(idx2);
let val3 = get_val(idx3 as u32); let val3 = get_val(idx3);
let val4 = get_val(idx4 as u32); let val4 = get_val(idx4);
push_if_in_range(idx1, val1); push_if_in_range(idx1, val1);
push_if_in_range(idx2, val2); push_if_in_range(idx2, val2);
push_if_in_range(idx3, val3); push_if_in_range(idx3, val3);
@@ -418,15 +403,17 @@ impl CompactSpaceDecompressor {
} }
// handle rest // handle rest
for idx in cutoff..doc_id_range.end { for idx in cutoff..self.params.num_vals {
push_if_in_range(idx, get_val(idx as u32)); push_if_in_range(idx, get_val(idx));
} }
positions
} }
#[inline] #[inline]
fn iter_compact(&self) -> impl Iterator<Item = u64> + '_ { fn iter_compact(&self) -> impl Iterator<Item = u64> + '_ {
(0..self.params.num_vals) (0..self.params.num_vals)
.map(move |idx| self.params.bit_unpacker.get(idx, &self.data) as u64) .map(move |idx| self.params.bit_unpacker.get(idx as u64, &self.data) as u64)
} }
#[inline] #[inline]
@@ -438,7 +425,7 @@ impl CompactSpaceDecompressor {
} }
#[inline] #[inline]
pub fn get(&self, idx: u32) -> u128 { pub fn get(&self, idx: u64) -> u128 {
let compact = self.params.bit_unpacker.get(idx, &self.data); let compact = self.params.bit_unpacker.get(idx, &self.data);
self.compact_to_u128(compact) self.compact_to_u128(compact)
} }
@@ -465,7 +452,7 @@ mod tests {
] ]
.into_iter() .into_iter()
.collect(); .collect();
let compact_space = get_compact_space(ips, ips.len() as u32, 11); let compact_space = get_compact_space(ips, ips.len() as u64, 11);
let amplitude = compact_space.amplitude_compact_space(); let amplitude = compact_space.amplitude_compact_space();
assert_eq!(amplitude, 17); assert_eq!(amplitude, 17);
assert_eq!(1, compact_space.u128_to_compact(2).unwrap()); assert_eq!(1, compact_space.u128_to_compact(2).unwrap());
@@ -496,7 +483,7 @@ mod tests {
#[test] #[test]
fn compact_space_amplitude_test() { fn compact_space_amplitude_test() {
let ips = &[100000u128, 1000000].into_iter().collect(); let ips = &[100000u128, 1000000].into_iter().collect();
let compact_space = get_compact_space(ips, ips.len() as u32, 1); let compact_space = get_compact_space(ips, ips.len() as u64, 1);
let amplitude = compact_space.amplitude_compact_space(); let amplitude = compact_space.amplitude_compact_space();
assert_eq!(amplitude, 2); assert_eq!(amplitude, 2);
} }
@@ -504,21 +491,16 @@ mod tests {
fn test_all(data: OwnedBytes, expected: &[u128]) { fn test_all(data: OwnedBytes, expected: &[u128]) {
let decompressor = CompactSpaceDecompressor::open(data).unwrap(); let decompressor = CompactSpaceDecompressor::open(data).unwrap();
for (idx, expected_val) in expected.iter().cloned().enumerate() { for (idx, expected_val) in expected.iter().cloned().enumerate() {
let val = decompressor.get(idx as u32); let val = decompressor.get(idx as u64);
assert_eq!(val, expected_val); assert_eq!(val, expected_val);
let test_range = |range: RangeInclusive<u128>| { let test_range = |range: RangeInclusive<u128>| {
let expected_positions = expected let expected_positions = expected
.iter() .iter()
.positions(|val| range.contains(val)) .positions(|val| range.contains(val))
.map(|pos| pos as u32) .map(|pos| pos as u64)
.collect::<Vec<_>>(); .collect::<Vec<_>>();
let mut positions = Vec::new(); let positions = decompressor.get_between_vals(range);
decompressor.get_positions_for_value_range(
range,
0..decompressor.num_vals(),
&mut positions,
);
assert_eq!(positions, expected_positions); assert_eq!(positions, expected_positions);
}; };
@@ -533,7 +515,7 @@ mod tests {
let mut out = Vec::new(); let mut out = Vec::new();
serialize_u128( serialize_u128(
|| u128_vals.iter().cloned(), || u128_vals.iter().cloned(),
u128_vals.len() as u32, u128_vals.len() as u64,
&mut out, &mut out,
) )
.unwrap(); .unwrap();
@@ -558,107 +540,24 @@ mod tests {
]; ];
let data = test_aux_vals(vals); let data = test_aux_vals(vals);
let decomp = CompactSpaceDecompressor::open(data).unwrap(); let decomp = CompactSpaceDecompressor::open(data).unwrap();
let complete_range = 0..vals.len() as u32; let positions = decomp.get_between_vals(0..=1);
for (pos, val) in vals.iter().enumerate() {
let val = *val as u128;
let pos = pos as u32;
let mut positions = Vec::new();
decomp.get_positions_for_value_range(val..=val, pos..pos + 1, &mut positions);
assert_eq!(positions, vec![pos]);
}
// handle docid range out of bounds
let positions = get_positions_for_value_range_helper(&decomp, 0..=1, 1..u32::MAX);
assert_eq!(positions, vec![]);
let positions =
get_positions_for_value_range_helper(&decomp, 0..=1, complete_range.clone());
assert_eq!(positions, vec![0]); assert_eq!(positions, vec![0]);
let positions = let positions = decomp.get_between_vals(0..=2);
get_positions_for_value_range_helper(&decomp, 0..=2, complete_range.clone());
assert_eq!(positions, vec![0]); assert_eq!(positions, vec![0]);
let positions = let positions = decomp.get_between_vals(0..=3);
get_positions_for_value_range_helper(&decomp, 0..=3, complete_range.clone());
assert_eq!(positions, vec![0, 2]); assert_eq!(positions, vec![0, 2]);
assert_eq!( assert_eq!(decomp.get_between_vals(99999u128..=99999u128), vec![3]);
get_positions_for_value_range_helper( assert_eq!(decomp.get_between_vals(99999u128..=100000u128), vec![3, 4]);
&decomp, assert_eq!(decomp.get_between_vals(99998u128..=100000u128), vec![3, 4]);
99999u128..=99999u128, assert_eq!(decomp.get_between_vals(99998u128..=99999u128), vec![3]);
complete_range.clone() assert_eq!(decomp.get_between_vals(99998u128..=99998u128), vec![]);
), assert_eq!(decomp.get_between_vals(333u128..=333u128), vec![8]);
vec![3] assert_eq!(decomp.get_between_vals(332u128..=333u128), vec![8]);
); assert_eq!(decomp.get_between_vals(332u128..=334u128), vec![8]);
assert_eq!( assert_eq!(decomp.get_between_vals(333u128..=334u128), vec![8]);
get_positions_for_value_range_helper(
&decomp,
99999u128..=100000u128,
complete_range.clone()
),
vec![3, 4]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
99998u128..=100000u128,
complete_range.clone()
),
vec![3, 4]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
99998u128..=99999u128,
complete_range.clone()
),
vec![3]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
99998u128..=99998u128,
complete_range.clone()
),
vec![]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
333u128..=333u128,
complete_range.clone()
),
vec![8]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
332u128..=333u128,
complete_range.clone()
),
vec![8]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
332u128..=334u128,
complete_range.clone()
),
vec![8]
);
assert_eq!(
get_positions_for_value_range_helper(
&decomp,
333u128..=334u128,
complete_range.clone()
),
vec![8]
);
assert_eq!( assert_eq!(
get_positions_for_value_range_helper( decomp.get_between_vals(4_000_211_221u128..=5_000_000_000u128),
&decomp,
4_000_211_221u128..=5_000_000_000u128,
complete_range.clone()
),
vec![6, 7] vec![6, 7]
); );
} }
@@ -683,29 +582,12 @@ mod tests {
]; ];
let data = test_aux_vals(vals); let data = test_aux_vals(vals);
let decomp = CompactSpaceDecompressor::open(data).unwrap(); let decomp = CompactSpaceDecompressor::open(data).unwrap();
let complete_range = 0..vals.len() as u32; let positions = decomp.get_between_vals(0..=5);
assert_eq!( assert_eq!(positions, vec![]);
get_positions_for_value_range_helper(&decomp, 0..=5, complete_range.clone()), let positions = decomp.get_between_vals(0..=100);
vec![] assert_eq!(positions, vec![0]);
); let positions = decomp.get_between_vals(0..=105);
assert_eq!( assert_eq!(positions, vec![0]);
get_positions_for_value_range_helper(&decomp, 0..=100, complete_range.clone()),
vec![0]
);
assert_eq!(
get_positions_for_value_range_helper(&decomp, 0..=105, complete_range.clone()),
vec![0]
);
}
fn get_positions_for_value_range_helper<C: Column<T> + ?Sized, T: PartialOrd>(
column: &C,
value_range: RangeInclusive<T>,
doc_id_range: Range<u32>,
) -> Vec<u32> {
let mut positions = Vec::new();
column.get_positions_for_value_range(value_range, doc_id_range, &mut positions);
positions
} }
#[test] #[test]
@@ -726,33 +608,13 @@ mod tests {
5_000_000_000, 5_000_000_000,
]; ];
let mut out = Vec::new(); let mut out = Vec::new();
serialize_u128(|| vals.iter().cloned(), vals.len() as u32, &mut out).unwrap(); serialize_u128(|| vals.iter().cloned(), vals.len() as u64, &mut out).unwrap();
let decomp = open_u128::<u128>(OwnedBytes::new(out)).unwrap(); let decomp = open_u128::<u128>(OwnedBytes::new(out)).unwrap();
let complete_range = 0..vals.len() as u32;
assert_eq!( assert_eq!(decomp.get_between_vals(199..=200), vec![0]);
get_positions_for_value_range_helper(&*decomp, 199..=200, complete_range.clone()), assert_eq!(decomp.get_between_vals(199..=201), vec![0, 1]);
vec![0] assert_eq!(decomp.get_between_vals(200..=200), vec![0]);
); assert_eq!(decomp.get_between_vals(1_000_000..=1_000_000), vec![11]);
assert_eq!(
get_positions_for_value_range_helper(&*decomp, 199..=201, complete_range.clone()),
vec![0, 1]
);
assert_eq!(
get_positions_for_value_range_helper(&*decomp, 200..=200, complete_range.clone()),
vec![0]
);
assert_eq!(
get_positions_for_value_range_helper(
&*decomp,
1_000_000..=1_000_000,
complete_range.clone()
),
vec![11]
);
} }
#[test] #[test]

View File

@@ -199,9 +199,9 @@ mod tests {
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0); let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
let reader = crate::open::<u64>(OwnedBytes::new(out)).unwrap(); let reader = crate::open::<u64>(OwnedBytes::new(out)).unwrap();
assert_eq!(reader.num_vals(), data.len() as u32); assert_eq!(reader.num_vals(), data.len() as u64);
for (doc, orig_val) in data.iter().copied().enumerate() { for (doc, orig_val) in data.iter().copied().enumerate() {
let val = reader.get_val(doc as u32); let val = reader.get_val(doc as u64);
assert_eq!( assert_eq!(
val, orig_val, val, orig_val,
"val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data \ "val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data \
@@ -211,18 +211,13 @@ mod tests {
if !data.is_empty() { if !data.is_empty() {
let test_rand_idx = rand::thread_rng().gen_range(0..=data.len() - 1); let test_rand_idx = rand::thread_rng().gen_range(0..=data.len() - 1);
let expected_positions: Vec<u32> = data let expected_positions: Vec<u64> = data
.iter() .iter()
.enumerate() .enumerate()
.filter(|(_, el)| **el == data[test_rand_idx]) .filter(|(_, el)| **el == data[test_rand_idx])
.map(|(pos, _)| pos as u32) .map(|(pos, _)| pos as u64)
.collect(); .collect();
let mut positions = Vec::new(); let positions = reader.get_between_vals(data[test_rand_idx]..=data[test_rand_idx]);
reader.get_positions_for_value_range(
data[test_rand_idx]..=data[test_rand_idx],
0..data.len() as u32,
&mut positions,
);
assert_eq!(expected_positions, positions); assert_eq!(expected_positions, positions);
} }
Some((estimation, actual_compression)) Some((estimation, actual_compression))
@@ -434,7 +429,7 @@ mod bench {
b.iter(|| { b.iter(|| {
let mut sum = 0u64; let mut sum = 0u64;
for pos in value_iter() { for pos in value_iter() {
let val = col.get_val(pos as u32); let val = col.get_val(pos as u64);
sum = sum.wrapping_add(val); sum = sum.wrapping_add(val);
} }
sum sum
@@ -446,7 +441,7 @@ mod bench {
b.iter(|| { b.iter(|| {
let mut sum = 0u64; let mut sum = 0u64;
for pos in value_iter() { for pos in value_iter() {
let val = col.get_val(pos as u32); let val = col.get_val(pos as u64);
sum = sum.wrapping_add(val); sum = sum.wrapping_add(val);
} }
sum sum

View File

@@ -1,5 +1,5 @@
use std::io; use std::io;
use std::num::NonZeroU32; use std::num::NonZeroU64;
use common::{BinarySerializable, VInt}; use common::{BinarySerializable, VInt};
@@ -29,7 +29,7 @@ pub struct Line {
/// compute_slope(y0, y1) /// compute_slope(y0, y1)
/// = compute_slope(y0 + X % 2^64, y1 + X % 2^64) /// = compute_slope(y0 + X % 2^64, y1 + X % 2^64)
/// ` /// `
fn compute_slope(y0: u64, y1: u64, num_vals: NonZeroU32) -> u64 { fn compute_slope(y0: u64, y1: u64, num_vals: NonZeroU64) -> u64 {
let dy = y1.wrapping_sub(y0); let dy = y1.wrapping_sub(y0);
let sign = dy <= (1 << 63); let sign = dy <= (1 << 63);
let abs_dy = if sign { let abs_dy = if sign {
@@ -43,7 +43,7 @@ fn compute_slope(y0: u64, y1: u64, num_vals: NonZeroU32) -> u64 {
return 0u64; return 0u64;
} }
let abs_slope = (abs_dy << 32) / num_vals.get() as u64; let abs_slope = (abs_dy << 32) / num_vals.get();
if sign { if sign {
abs_slope abs_slope
} else { } else {
@@ -62,8 +62,8 @@ fn compute_slope(y0: u64, y1: u64, num_vals: NonZeroU32) -> u64 {
impl Line { impl Line {
#[inline(always)] #[inline(always)]
pub fn eval(&self, x: u32) -> u64 { pub fn eval(&self, x: u64) -> u64 {
let linear_part = ((x as u64).wrapping_mul(self.slope) >> 32) as i32 as u64; let linear_part = (x.wrapping_mul(self.slope) >> 32) as i32 as u64;
self.intercept.wrapping_add(linear_part) self.intercept.wrapping_add(linear_part)
} }
@@ -75,7 +75,7 @@ impl Line {
Self::train_from( Self::train_from(
first_val, first_val,
last_val, last_val,
num_vals as u32, num_vals,
sample_positions_and_values.iter().cloned(), sample_positions_and_values.iter().cloned(),
) )
} }
@@ -84,11 +84,11 @@ impl Line {
fn train_from( fn train_from(
first_val: u64, first_val: u64,
last_val: u64, last_val: u64,
num_vals: u32, num_vals: u64,
positions_and_values: impl Iterator<Item = (u64, u64)>, positions_and_values: impl Iterator<Item = (u64, u64)>,
) -> Self { ) -> Self {
// TODO replace with let else // TODO replace with let else
let idx_last_val = if let Some(idx_last_val) = NonZeroU32::new(num_vals - 1) { let idx_last_val = if let Some(idx_last_val) = NonZeroU64::new(num_vals - 1) {
idx_last_val idx_last_val
} else { } else {
return Line::default(); return Line::default();
@@ -129,7 +129,7 @@ impl Line {
}; };
let heuristic_shift = y0.wrapping_sub(MID_POINT); let heuristic_shift = y0.wrapping_sub(MID_POINT);
line.intercept = positions_and_values line.intercept = positions_and_values
.map(|(pos, y)| y.wrapping_sub(line.eval(pos as u32))) .map(|(pos, y)| y.wrapping_sub(line.eval(pos)))
.min_by_key(|&val| val.wrapping_sub(heuristic_shift)) .min_by_key(|&val| val.wrapping_sub(heuristic_shift))
.unwrap_or(0u64); //< Never happens. .unwrap_or(0u64); //< Never happens.
line line
@@ -199,7 +199,7 @@ mod tests {
let line = Line::train(&VecColumn::from(&ys)); let line = Line::train(&VecColumn::from(&ys));
ys.iter() ys.iter()
.enumerate() .enumerate()
.map(|(x, y)| y.wrapping_sub(line.eval(x as u32))) .map(|(x, y)| y.wrapping_sub(line.eval(x as u64)))
.max() .max()
} }

View File

@@ -19,7 +19,7 @@ pub struct LinearReader {
impl Column for LinearReader { impl Column for LinearReader {
#[inline] #[inline]
fn get_val(&self, doc: u32) -> u64 { fn get_val(&self, doc: u64) -> u64 {
let interpoled_val: u64 = self.linear_params.line.eval(doc); let interpoled_val: u64 = self.linear_params.line.eval(doc);
let bitpacked_diff = self.linear_params.bit_unpacker.get(doc, &self.data); let bitpacked_diff = self.linear_params.bit_unpacker.get(doc, &self.data);
interpoled_val.wrapping_add(bitpacked_diff) interpoled_val.wrapping_add(bitpacked_diff)
@@ -37,7 +37,7 @@ impl Column for LinearReader {
} }
#[inline] #[inline]
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.header.num_vals self.header.num_vals
} }
} }
@@ -93,7 +93,7 @@ impl FastFieldCodec for LinearCodec {
.iter() .iter()
.enumerate() .enumerate()
.map(|(pos, actual_value)| { .map(|(pos, actual_value)| {
let calculated_value = line.eval(pos as u32); let calculated_value = line.eval(pos as u64);
actual_value.wrapping_sub(calculated_value) actual_value.wrapping_sub(calculated_value)
}) })
.max() .max()
@@ -108,7 +108,7 @@ impl FastFieldCodec for LinearCodec {
let mut bit_packer = BitPacker::new(); let mut bit_packer = BitPacker::new();
for (pos, actual_value) in column.iter().enumerate() { for (pos, actual_value) in column.iter().enumerate() {
let calculated_value = line.eval(pos as u32); let calculated_value = line.eval(pos as u64);
let offset = actual_value.wrapping_sub(calculated_value); let offset = actual_value.wrapping_sub(calculated_value);
bit_packer.write(offset, num_bits, write)?; bit_packer.write(offset, num_bits, write)?;
} }
@@ -140,7 +140,7 @@ impl FastFieldCodec for LinearCodec {
let estimated_bit_width = sample_positions_and_values let estimated_bit_width = sample_positions_and_values
.into_iter() .into_iter()
.map(|(pos, actual_value)| { .map(|(pos, actual_value)| {
let interpolated_val = line.eval(pos as u32); let interpolated_val = line.eval(pos as u64);
actual_value.wrapping_sub(interpolated_val) actual_value.wrapping_sub(interpolated_val)
}) })
.map(|diff| ((diff as f32 * 1.5) * 2.0) as u64) .map(|diff| ((diff as f32 * 1.5) * 2.0) as u64)

View File

@@ -90,7 +90,7 @@ fn bench_ip() {
{ {
let mut data = vec![]; let mut data = vec![];
for dataset in dataset.chunks(500_000) { for dataset in dataset.chunks(500_000) {
serialize_u128(|| dataset.iter().cloned(), dataset.len() as u32, &mut data).unwrap(); serialize_u128(|| dataset.iter().cloned(), dataset.len() as u64, &mut data).unwrap();
} }
let compression = data.len() as f64 / (dataset.len() * 16) as f64; let compression = data.len() as f64 / (dataset.len() * 16) as f64;
println!("Compression 50_000 chunks {:.4}", compression); println!("Compression 50_000 chunks {:.4}", compression);
@@ -103,7 +103,7 @@ fn bench_ip() {
let mut data = vec![]; let mut data = vec![];
{ {
print_time!("creation"); print_time!("creation");
serialize_u128(|| dataset.iter().cloned(), dataset.len() as u32, &mut data).unwrap(); serialize_u128(|| dataset.iter().cloned(), dataset.len() as u64, &mut data).unwrap();
} }
let compression = data.len() as f64 / (dataset.len() * 16) as f64; let compression = data.len() as f64 / (dataset.len() * 16) as f64;
@@ -115,15 +115,9 @@ fn bench_ip() {
let decompressor = open_u128::<u128>(OwnedBytes::new(data)).unwrap(); let decompressor = open_u128::<u128>(OwnedBytes::new(data)).unwrap();
// Sample some ranges // Sample some ranges
let mut doc_values = Vec::new();
for value in dataset.iter().take(1110).skip(1100).cloned() { for value in dataset.iter().take(1110).skip(1100).cloned() {
doc_values.clear();
print_time!("get range"); print_time!("get range");
decompressor.get_positions_for_value_range( let doc_values = decompressor.get_between_vals(value..=value);
value..=value,
0..decompressor.num_vals(),
&mut doc_values,
);
println!("{:?}", doc_values.len()); println!("{:?}", doc_values.len());
} }
} }

View File

@@ -46,14 +46,14 @@ use crate::{
#[derive(Debug, Copy, Clone)] #[derive(Debug, Copy, Clone)]
pub struct NormalizedHeader { pub struct NormalizedHeader {
/// The number of values in the underlying column. /// The number of values in the underlying column.
pub num_vals: u32, pub num_vals: u64,
/// The max value of the underlying column. /// The max value of the underlying column.
pub max_value: u64, pub max_value: u64,
} }
#[derive(Debug, Copy, Clone)] #[derive(Debug, Copy, Clone)]
pub(crate) struct Header { pub(crate) struct Header {
pub num_vals: u32, pub num_vals: u64,
pub min_value: u64, pub min_value: u64,
pub max_value: u64, pub max_value: u64,
pub gcd: Option<NonZeroU64>, pub gcd: Option<NonZeroU64>,
@@ -110,7 +110,7 @@ pub fn normalize_column<C: Column>(
impl BinarySerializable for Header { impl BinarySerializable for Header {
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> { fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
VInt(self.num_vals as u64).serialize(writer)?; VInt(self.num_vals).serialize(writer)?;
VInt(self.min_value).serialize(writer)?; VInt(self.min_value).serialize(writer)?;
VInt(self.max_value - self.min_value).serialize(writer)?; VInt(self.max_value - self.min_value).serialize(writer)?;
if let Some(gcd) = self.gcd { if let Some(gcd) = self.gcd {
@@ -123,7 +123,7 @@ impl BinarySerializable for Header {
} }
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> { fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let num_vals = VInt::deserialize(reader)?.0 as u32; let num_vals = VInt::deserialize(reader)?.0;
let min_value = VInt::deserialize(reader)?.0; let min_value = VInt::deserialize(reader)?.0;
let amplitude = VInt::deserialize(reader)?.0; let amplitude = VInt::deserialize(reader)?.0;
let max_value = min_value + amplitude; let max_value = min_value + amplitude;
@@ -164,7 +164,7 @@ pub fn estimate<T: MonotonicallyMappableToU64>(
/// Serializes u128 values with the compact space codec. /// Serializes u128 values with the compact space codec.
pub fn serialize_u128<F: Fn() -> I, I: Iterator<Item = u128>>( pub fn serialize_u128<F: Fn() -> I, I: Iterator<Item = u128>>(
iter_gen: F, iter_gen: F,
num_vals: u32, num_vals: u64,
output: &mut impl io::Write, output: &mut impl io::Write,
) -> io::Result<()> { ) -> io::Result<()> {
// TODO write header, to later support more codecs // TODO write header, to later support more codecs
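The header above writes num_vals, min_value and the amplitude (max_value - min_value) as variable-length integers, so typical headers stay tiny. Here is a self-contained sketch of that layout; the LEB128-style varint below is only an illustration, not tantivy's VInt implementation.

use std::io::{self, Read, Write};

fn write_vint<W: Write>(mut v: u64, w: &mut W) -> io::Result<()> {
    loop {
        let byte = (v & 0x7f) as u8;
        v >>= 7;
        if v == 0 {
            return w.write_all(&[byte]);
        }
        w.write_all(&[byte | 0x80])?;
    }
}

fn read_vint<R: Read>(r: &mut R) -> io::Result<u64> {
    let mut out = 0u64;
    let mut shift = 0;
    loop {
        let mut buf = [0u8; 1];
        r.read_exact(&mut buf)?;
        out |= ((buf[0] & 0x7f) as u64) << shift;
        if buf[0] & 0x80 == 0 {
            return Ok(out);
        }
        shift += 7;
    }
}

struct ToyHeader {
    num_vals: u64,
    min_value: u64,
    max_value: u64,
}

impl ToyHeader {
    fn serialize<W: Write>(&self, w: &mut W) -> io::Result<()> {
        write_vint(self.num_vals, w)?;
        write_vint(self.min_value, w)?;
        // Storing the amplitude instead of max_value keeps the varint short.
        write_vint(self.max_value - self.min_value, w)
    }
    fn deserialize<R: Read>(r: &mut R) -> io::Result<ToyHeader> {
        let num_vals = read_vint(r)?;
        let min_value = read_vint(r)?;
        let amplitude = read_vint(r)?;
        Ok(ToyHeader { num_vals, min_value, max_value: min_value + amplitude })
    }
}

fn main() -> io::Result<()> {
    let header = ToyHeader { num_vals: 1_000, min_value: 500, max_value: 530 };
    let mut bytes = Vec::new();
    header.serialize(&mut bytes)?;
    let decoded = ToyHeader::deserialize(&mut bytes.as_slice())?;
    assert_eq!(decoded.max_value, 530);
    Ok(())
}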

View File

@@ -62,20 +62,6 @@ fn word<'a>() -> impl Parser<&'a str, Output = String> {
}) })
} }
// word variant that allows more characters, e.g. for range queries that don't allow field
// specifier
fn relaxed_word<'a>() -> impl Parser<&'a str, Output = String> {
(
satisfy(|c: char| {
!c.is_whitespace() && !['`', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
}),
many(satisfy(|c: char| {
!c.is_whitespace() && !['{', '}', '"', '[', ']', '(', ')'].contains(&c)
})),
)
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
}
/// Parses a date time according to rfc3339 /// Parses a date time according to rfc3339
/// 2015-08-02T18:54:42+02 /// 2015-08-02T18:54:42+02
/// 2021-04-13T19:46:26.266051969+00:00 /// 2021-04-13T19:46:26.266051969+00:00
@@ -195,8 +181,8 @@ fn spaces1<'a>() -> impl Parser<&'a str, Output = ()> {
fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> { fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
let range_term_val = || { let range_term_val = || {
attempt(date_time()) attempt(date_time())
.or(word())
.or(negative_number()) .or(negative_number())
.or(relaxed_word())
.or(char('*').with(value("*".to_string()))) .or(char('*').with(value("*".to_string())))
}; };
@@ -663,34 +649,6 @@ mod test {
.expect("Cannot parse date range") .expect("Cannot parse date range")
.0; .0;
assert_eq!(res6, expected_flexible_dates); assert_eq!(res6, expected_flexible_dates);
// IP Range Unbounded
let expected_weight = UserInputLeaf::Range {
field: Some("ip".to_string()),
lower: UserInputBound::Inclusive("::1".to_string()),
upper: UserInputBound::Unbounded,
};
let res1 = range()
.parse("ip: >=::1")
.expect("Cannot parse ip v6 format")
.0;
let res2 = range()
.parse("ip:[::1 TO *}")
.expect("Cannot parse ip v6 format")
.0;
assert_eq!(res1, expected_weight);
assert_eq!(res2, expected_weight);
// IP Range Bounded
let expected_weight = UserInputLeaf::Range {
field: Some("ip".to_string()),
lower: UserInputBound::Inclusive("::0.0.0.50".to_string()),
upper: UserInputBound::Exclusive("::0.0.0.52".to_string()),
};
let res1 = range()
.parse("ip:[::0.0.0.50 TO ::0.0.0.52}")
.expect("Cannot parse ip v6 format")
.0;
assert_eq!(res1, expected_weight);
} }
#[test] #[test]

View File

@@ -6,7 +6,7 @@
use std::collections::HashMap; use std::collections::HashMap;
use rustc_hash::FxHashMap; use fnv::FnvHashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::agg_req::BucketAggregationInternal; use super::agg_req::BucketAggregationInternal;
@@ -145,7 +145,7 @@ pub enum BucketEntries<T> {
/// Vector format bucket entries /// Vector format bucket entries
Vec(Vec<T>), Vec(Vec<T>),
/// HashMap format bucket entries /// HashMap format bucket entries
HashMap(FxHashMap<String, T>), HashMap(FnvHashMap<String, T>),
} }
/// This is the default entry for a bucket, which contains a key, count, and optionally /// This is the default entry for a bucket, which contains a key, count, and optionally

View File

@@ -331,10 +331,10 @@ impl SegmentHistogramCollector {
.expect("unexpected fast field cardinatility"); .expect("unexpected fast field cardinatility");
let mut iter = doc.chunks_exact(4); let mut iter = doc.chunks_exact(4);
for docs in iter.by_ref() { for docs in iter.by_ref() {
let val0 = self.f64_from_fastfield_u64(accessor.get_val(docs[0])); let val0 = self.f64_from_fastfield_u64(accessor.get_val(docs[0] as u64));
let val1 = self.f64_from_fastfield_u64(accessor.get_val(docs[1])); let val1 = self.f64_from_fastfield_u64(accessor.get_val(docs[1] as u64));
let val2 = self.f64_from_fastfield_u64(accessor.get_val(docs[2])); let val2 = self.f64_from_fastfield_u64(accessor.get_val(docs[2] as u64));
let val3 = self.f64_from_fastfield_u64(accessor.get_val(docs[3])); let val3 = self.f64_from_fastfield_u64(accessor.get_val(docs[3] as u64));
let bucket_pos0 = get_bucket_num(val0); let bucket_pos0 = get_bucket_num(val0);
let bucket_pos1 = get_bucket_num(val1); let bucket_pos1 = get_bucket_num(val1);
@@ -371,7 +371,7 @@ impl SegmentHistogramCollector {
)?; )?;
} }
for &doc in iter.remainder() { for &doc in iter.remainder() {
let val = f64_from_fastfield_u64(accessor.get_val(doc), &self.field_type); let val = f64_from_fastfield_u64(accessor.get_val(doc as u64), &self.field_type);
if !bounds.contains(val) { if !bounds.contains(val) {
continue; continue;
} }
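The collect_block above walks doc ids four at a time with chunks_exact(4) and picks up the tail via remainder(), giving the compiler four independent column lookups per iteration. A stand-in sketch of the same loop shape, with a plain Vec playing the role of the fast field column:

fn collect_block(docs: &[u32], column: &[u64], out: &mut Vec<u64>) {
    let mut chunks = docs.chunks_exact(4);
    for quad in chunks.by_ref() {
        // Four independent lookups per iteration, mirroring the unrolled
        // aggregation collectors in the hunk above.
        let v0 = column[quad[0] as usize];
        let v1 = column[quad[1] as usize];
        let v2 = column[quad[2] as usize];
        let v3 = column[quad[3] as usize];
        out.extend_from_slice(&[v0, v1, v2, v3]);
    }
    for &doc in chunks.remainder() {
        out.push(column[doc as usize]);
    }
}

fn main() {
    let column: Vec<u64> = (0..10).map(|i| i * 10).collect();
    let mut out = Vec::new();
    collect_block(&[0, 2, 4, 6, 9], &column, &mut out);
    assert_eq!(out, vec![0, 20, 40, 60, 90]);
}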

View File

@@ -1,7 +1,7 @@
use std::fmt::Debug; use std::fmt::Debug;
use std::ops::Range; use std::ops::Range;
use rustc_hash::FxHashMap; use fnv::FnvHashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use crate::aggregation::agg_req_with_accessor::{ use crate::aggregation::agg_req_with_accessor::{
@@ -176,7 +176,7 @@ impl SegmentRangeCollector {
) -> crate::Result<IntermediateBucketResult> { ) -> crate::Result<IntermediateBucketResult> {
let field_type = self.field_type; let field_type = self.field_type;
let buckets: FxHashMap<SerializedKey, IntermediateRangeBucketEntry> = self let buckets: FnvHashMap<SerializedKey, IntermediateRangeBucketEntry> = self
.buckets .buckets
.into_iter() .into_iter()
.map(move |range_bucket| { .map(move |range_bucket| {
@@ -263,10 +263,10 @@ impl SegmentRangeCollector {
.as_single() .as_single()
.expect("unexpected fast field cardinality"); .expect("unexpected fast field cardinality");
for docs in iter.by_ref() { for docs in iter.by_ref() {
let val1 = accessor.get_val(docs[0]); let val1 = accessor.get_val(docs[0] as u64);
let val2 = accessor.get_val(docs[1]); let val2 = accessor.get_val(docs[1] as u64);
let val3 = accessor.get_val(docs[2]); let val3 = accessor.get_val(docs[2] as u64);
let val4 = accessor.get_val(docs[3]); let val4 = accessor.get_val(docs[3] as u64);
let bucket_pos1 = self.get_bucket_pos(val1); let bucket_pos1 = self.get_bucket_pos(val1);
let bucket_pos2 = self.get_bucket_pos(val2); let bucket_pos2 = self.get_bucket_pos(val2);
let bucket_pos3 = self.get_bucket_pos(val3); let bucket_pos3 = self.get_bucket_pos(val3);
@@ -278,7 +278,7 @@ impl SegmentRangeCollector {
self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation)?; self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation)?;
} }
for &doc in iter.remainder() { for &doc in iter.remainder() {
let val = accessor.get_val(doc); let val = accessor.get_val(doc as u64);
let bucket_pos = self.get_bucket_pos(val); let bucket_pos = self.get_bucket_pos(val);
self.increment_bucket(bucket_pos, doc, &bucket_with_accessor.sub_aggregation)?; self.increment_bucket(bucket_pos, doc, &bucket_with_accessor.sub_aggregation)?;
} }

View File

@@ -1,7 +1,7 @@
use std::fmt::Debug; use std::fmt::Debug;
use fnv::FnvHashMap;
use itertools::Itertools; use itertools::Itertools;
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::{CustomOrder, Order, OrderTarget}; use super::{CustomOrder, Order, OrderTarget};
@@ -199,7 +199,7 @@ impl TermsAggregationInternal {
#[derive(Clone, Debug, PartialEq)] #[derive(Clone, Debug, PartialEq)]
/// Container to store term_ids and their buckets. /// Container to store term_ids and their buckets.
struct TermBuckets { struct TermBuckets {
pub(crate) entries: FxHashMap<u32, TermBucketEntry>, pub(crate) entries: FnvHashMap<u32, TermBucketEntry>,
blueprint: Option<SegmentAggregationResultsCollector>, blueprint: Option<SegmentAggregationResultsCollector>,
} }
@@ -397,7 +397,7 @@ impl SegmentTermCollector {
.expect("internal error: inverted index not loaded for term aggregation"); .expect("internal error: inverted index not loaded for term aggregation");
let term_dict = inverted_index.terms(); let term_dict = inverted_index.terms();
let mut dict: FxHashMap<String, IntermediateTermBucketEntry> = Default::default(); let mut dict: FnvHashMap<String, IntermediateTermBucketEntry> = Default::default();
let mut buffer = vec![]; let mut buffer = vec![];
for (term_id, entry) in entries { for (term_id, entry) in entries {
term_dict term_dict
@@ -1129,9 +1129,9 @@ mod tests {
assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma"); assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4); assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4);
assert_eq!(res["my_texts"]["buckets"][1]["key"], "termc"); assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 0); assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 0);
assert_eq!(res["my_texts"]["buckets"][2]["key"], "termb"); assert_eq!(res["my_texts"]["buckets"][2]["key"], "termc");
assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 0); assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 0);
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0); assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0); assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);

View File

@@ -5,8 +5,8 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use std::collections::HashMap; use std::collections::HashMap;
use fnv::FnvHashMap;
use itertools::Itertools; use itertools::Itertools;
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::agg_req::{ use super::agg_req::{
@@ -288,7 +288,7 @@ impl IntermediateBucketResult {
.keyed; .keyed;
let buckets = if is_keyed { let buckets = if is_keyed {
let mut bucket_map = let mut bucket_map =
FxHashMap::with_capacity_and_hasher(buckets.len(), Default::default()); FnvHashMap::with_capacity_and_hasher(buckets.len(), Default::default());
for bucket in buckets { for bucket in buckets {
bucket_map.insert(bucket.key.to_string(), bucket); bucket_map.insert(bucket.key.to_string(), bucket);
} }
@@ -308,7 +308,7 @@ impl IntermediateBucketResult {
let buckets = if req.as_histogram().unwrap().keyed { let buckets = if req.as_histogram().unwrap().keyed {
let mut bucket_map = let mut bucket_map =
FxHashMap::with_capacity_and_hasher(buckets.len(), Default::default()); FnvHashMap::with_capacity_and_hasher(buckets.len(), Default::default());
for bucket in buckets { for bucket in buckets {
bucket_map.insert(bucket.key.to_string(), bucket); bucket_map.insert(bucket.key.to_string(), bucket);
} }
@@ -396,13 +396,13 @@ impl IntermediateBucketResult {
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)] #[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
/// Range aggregation including error counts /// Range aggregation including error counts
pub struct IntermediateRangeBucketResult { pub struct IntermediateRangeBucketResult {
pub(crate) buckets: FxHashMap<SerializedKey, IntermediateRangeBucketEntry>, pub(crate) buckets: FnvHashMap<SerializedKey, IntermediateRangeBucketEntry>,
} }
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)] #[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
/// Term aggregation including error counts /// Term aggregation including error counts
pub struct IntermediateTermBucketResult { pub struct IntermediateTermBucketResult {
pub(crate) entries: FxHashMap<String, IntermediateTermBucketEntry>, pub(crate) entries: FnvHashMap<String, IntermediateTermBucketEntry>,
pub(crate) sum_other_doc_count: u64, pub(crate) sum_other_doc_count: u64,
pub(crate) doc_count_error_upper_bound: u64, pub(crate) doc_count_error_upper_bound: u64,
} }
@@ -499,8 +499,8 @@ trait MergeFruits {
} }
fn merge_maps<V: MergeFruits + Clone>( fn merge_maps<V: MergeFruits + Clone>(
entries_left: &mut FxHashMap<SerializedKey, V>, entries_left: &mut FnvHashMap<SerializedKey, V>,
mut entries_right: FxHashMap<SerializedKey, V>, mut entries_right: FnvHashMap<SerializedKey, V>,
) { ) {
for (name, entry_left) in entries_left.iter_mut() { for (name, entry_left) in entries_left.iter_mut() {
if let Some(entry_right) = entries_right.remove(name) { if let Some(entry_right) = entries_right.remove(name) {
@@ -626,7 +626,7 @@ mod tests {
fn get_sub_test_tree(data: &[(String, u64)]) -> IntermediateAggregationResults { fn get_sub_test_tree(data: &[(String, u64)]) -> IntermediateAggregationResults {
let mut map = HashMap::new(); let mut map = HashMap::new();
let mut buckets = FxHashMap::default(); let mut buckets = FnvHashMap::default();
for (key, doc_count) in data { for (key, doc_count) in data {
buckets.insert( buckets.insert(
key.to_string(), key.to_string(),
@@ -653,7 +653,7 @@ mod tests {
data: &[(String, u64, String, u64)], data: &[(String, u64, String, u64)],
) -> IntermediateAggregationResults { ) -> IntermediateAggregationResults {
let mut map = HashMap::new(); let mut map = HashMap::new();
let mut buckets: FxHashMap<_, _> = Default::default(); let mut buckets: FnvHashMap<_, _> = Default::default();
for (key, doc_count, sub_aggregation_key, sub_aggregation_count) in data { for (key, doc_count, sub_aggregation_key, sub_aggregation_count) in data {
buckets.insert( buckets.insert(
key.to_string(), key.to_string(),
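merge_maps above folds the right-hand map into the left one: keys present on both sides merge their entries, keys only on the right are moved across. The same shape, with plain u64 counts standing in for the intermediate bucket entries:

use std::collections::HashMap;

fn merge_counts(left: &mut HashMap<String, u64>, mut right: HashMap<String, u64>) {
    // Keys present on both sides merge in place...
    for (key, left_count) in left.iter_mut() {
        if let Some(right_count) = right.remove(key) {
            *left_count += right_count;
        }
    }
    // ...and keys that only exist on the right are moved over unchanged.
    for (key, right_count) in right {
        left.insert(key, right_count);
    }
}

fn main() {
    let mut left: HashMap<String, u64> = [("a".to_string(), 2)].into_iter().collect();
    let right: HashMap<String, u64> =
        [("a".to_string(), 3), ("b".to_string(), 1)].into_iter().collect();
    merge_counts(&mut left, right);
    assert_eq!(left["a"], 5);
    assert_eq!(left["b"], 1);
}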

View File

@@ -60,10 +60,10 @@ impl SegmentAverageCollector {
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column<u64>) { pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column<u64>) {
let mut iter = doc.chunks_exact(4); let mut iter = doc.chunks_exact(4);
for docs in iter.by_ref() { for docs in iter.by_ref() {
let val1 = field.get_val(docs[0]); let val1 = field.get_val(docs[0] as u64);
let val2 = field.get_val(docs[1]); let val2 = field.get_val(docs[1] as u64);
let val3 = field.get_val(docs[2]); let val3 = field.get_val(docs[2] as u64);
let val4 = field.get_val(docs[3]); let val4 = field.get_val(docs[3] as u64);
let val1 = f64_from_fastfield_u64(val1, &self.field_type); let val1 = f64_from_fastfield_u64(val1, &self.field_type);
let val2 = f64_from_fastfield_u64(val2, &self.field_type); let val2 = f64_from_fastfield_u64(val2, &self.field_type);
let val3 = f64_from_fastfield_u64(val3, &self.field_type); let val3 = f64_from_fastfield_u64(val3, &self.field_type);
@@ -74,7 +74,7 @@ impl SegmentAverageCollector {
self.data.collect(val4); self.data.collect(val4);
} }
for &doc in iter.remainder() { for &doc in iter.remainder() {
let val = field.get_val(doc); let val = field.get_val(doc as u64);
let val = f64_from_fastfield_u64(val, &self.field_type); let val = f64_from_fastfield_u64(val, &self.field_type);
self.data.collect(val); self.data.collect(val);
} }

View File

@@ -166,10 +166,10 @@ impl SegmentStatsCollector {
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column<u64>) { pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column<u64>) {
let mut iter = doc.chunks_exact(4); let mut iter = doc.chunks_exact(4);
for docs in iter.by_ref() { for docs in iter.by_ref() {
let val1 = field.get_val(docs[0]); let val1 = field.get_val(docs[0] as u64);
let val2 = field.get_val(docs[1]); let val2 = field.get_val(docs[1] as u64);
let val3 = field.get_val(docs[2]); let val3 = field.get_val(docs[2] as u64);
let val4 = field.get_val(docs[3]); let val4 = field.get_val(docs[3] as u64);
let val1 = f64_from_fastfield_u64(val1, &self.field_type); let val1 = f64_from_fastfield_u64(val1, &self.field_type);
let val2 = f64_from_fastfield_u64(val2, &self.field_type); let val2 = f64_from_fastfield_u64(val2, &self.field_type);
let val3 = f64_from_fastfield_u64(val3, &self.field_type); let val3 = f64_from_fastfield_u64(val3, &self.field_type);
@@ -180,7 +180,7 @@ impl SegmentStatsCollector {
self.stats.collect(val4); self.stats.collect(val4);
} }
for &doc in iter.remainder() { for &doc in iter.remainder() {
let val = field.get_val(doc); let val = field.get_val(doc as u64);
let val = f64_from_fastfield_u64(val, &self.field_type); let val = f64_from_fastfield_u64(val, &self.field_type);
self.stats.collect(val); self.stats.collect(val);
} }

View File

@@ -177,7 +177,7 @@ where
type Fruit = TSegmentCollector::Fruit; type Fruit = TSegmentCollector::Fruit;
fn collect(&mut self, doc: u32, score: Score) { fn collect(&mut self, doc: u32, score: Score) {
let value = self.fast_field_reader.get_val(doc); let value = self.fast_field_reader.get_val(doc as u64);
if (self.predicate)(value) { if (self.predicate)(value) {
self.segment_collector.collect(doc, score) self.segment_collector.collect(doc, score)
} }
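The collect above reads the doc's fast field value, applies the stored predicate, and only forwards matching docs to the wrapped collector. A toy version of that filter, with a Vec column and a Vec of hits standing in for tantivy's reader and inner segment collector:

struct FilterCollector<P: Fn(u64) -> bool> {
    column: Vec<u64>,
    predicate: P,
    collected: Vec<u32>,
}

impl<P: Fn(u64) -> bool> FilterCollector<P> {
    fn collect(&mut self, doc: u32) {
        let value = self.column[doc as usize];
        if (self.predicate)(value) {
            self.collected.push(doc);
        }
    }
}

fn main() {
    let mut collector = FilterCollector {
        column: vec![5, 50, 500],
        predicate: |popularity: u64| popularity >= 50,
        collected: Vec::new(),
    };
    for doc in 0..3 {
        collector.collect(doc);
    }
    assert_eq!(collector.collected, vec![1, 2]);
}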

View File

@@ -94,7 +94,7 @@ impl SegmentCollector for SegmentHistogramCollector {
type Fruit = Vec<u64>; type Fruit = Vec<u64>;
fn collect(&mut self, doc: DocId, _score: Score) { fn collect(&mut self, doc: DocId, _score: Score) {
let value = self.ff_reader.get_val(doc); let value = self.ff_reader.get_val(doc as u64);
self.histogram_computer.add_value(value); self.histogram_computer.add_value(value);
} }

View File

@@ -201,7 +201,7 @@ impl SegmentCollector for FastFieldSegmentCollector {
type Fruit = Vec<u64>; type Fruit = Vec<u64>;
fn collect(&mut self, doc: DocId, _score: Score) { fn collect(&mut self, doc: DocId, _score: Score) {
let val = self.reader.get_val(doc); let val = self.reader.get_val(doc as u64);
self.vals.push(val); self.vals.push(val);
} }

View File

@@ -137,7 +137,7 @@ struct ScorerByFastFieldReader {
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader { impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
fn score(&mut self, doc: DocId) -> u64 { fn score(&mut self, doc: DocId) -> u64 {
self.ff_reader.get_val(doc) self.ff_reader.get_val(doc as u64)
} }
} }
@@ -458,7 +458,7 @@ impl TopDocs {
/// ///
/// // We can now define our actual scoring function /// // We can now define our actual scoring function
/// move |doc: DocId, original_score: Score| { /// move |doc: DocId, original_score: Score| {
/// let popularity: u64 = popularity_reader.get_val(doc); /// let popularity: u64 = popularity_reader.get_val(doc as u64);
/// // Well.. For the sake of the example we use a simple logarithm /// // Well.. For the sake of the example we use a simple logarithm
/// // function. /// // function.
/// let popularity_boost_score = ((2u64 + popularity) as Score).log2(); /// let popularity_boost_score = ((2u64 + popularity) as Score).log2();
@@ -567,8 +567,8 @@ impl TopDocs {
/// ///
/// // We can now define our actual scoring function /// // We can now define our actual scoring function
/// move |doc: DocId| { /// move |doc: DocId| {
/// let popularity: u64 = popularity_reader.get_val(doc); /// let popularity: u64 = popularity_reader.get_val(doc as u64);
/// let boosted: u64 = boosted_reader.get_val(doc); /// let boosted: u64 = boosted_reader.get_val(doc as u64);
/// // Score do not have to be `f64` in tantivy. /// // Score do not have to be `f64` in tantivy.
/// // Here we return a couple to get lexicographical order /// // Here we return a couple to get lexicographical order
/// // for free. /// // for free.

View File

@@ -32,9 +32,10 @@ impl BytesFastFieldReader {
Ok(BytesFastFieldReader { idx_reader, values }) Ok(BytesFastFieldReader { idx_reader, values })
} }
fn range(&self, doc: DocId) -> Range<u32> { fn range(&self, doc: DocId) -> Range<u64> {
let start = self.idx_reader.get_val(doc) as u32; let idx = doc as u64;
let end = self.idx_reader.get_val(doc + 1) as u32; let start = self.idx_reader.get_val(idx);
let end = self.idx_reader.get_val(idx + 1);
start..end start..end
} }
@@ -47,7 +48,7 @@ impl BytesFastFieldReader {
/// Returns the length of the bytes associated with the given `doc` /// Returns the length of the bytes associated with the given `doc`
pub fn num_bytes(&self, doc: DocId) -> u64 { pub fn num_bytes(&self, doc: DocId) -> u64 {
let range = self.range(doc); let range = self.range(doc);
(range.end - range.start) as u64 range.end - range.start
} }
/// Returns the overall number of bytes in this bytes fast field. /// Returns the overall number of bytes in this bytes fast field.
@@ -57,7 +58,7 @@ impl BytesFastFieldReader {
} }
impl MultiValueLength for BytesFastFieldReader { impl MultiValueLength for BytesFastFieldReader {
fn get_range(&self, doc_id: DocId) -> std::ops::Range<u32> { fn get_range(&self, doc_id: DocId) -> std::ops::Range<u64> {
self.range(doc_id) self.range(doc_id)
} }
fn get_len(&self, doc_id: DocId) -> u64 { fn get_len(&self, doc_id: DocId) -> u64 {
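The range helper above reads two consecutive entries from an offsets column: the bytes for doc d live between offsets[d] and offsets[d + 1], and a trailing sentinel keeps the last doc uniform. A plain-Vec sketch of that layout:

use std::ops::Range;

struct ToyBytesColumn {
    offsets: Vec<u64>, // one entry per document plus a trailing sentinel
    data: Vec<u8>,
}

impl ToyBytesColumn {
    fn range(&self, doc: u32) -> Range<usize> {
        let start = self.offsets[doc as usize] as usize;
        let end = self.offsets[doc as usize + 1] as usize;
        start..end
    }
    fn get_bytes(&self, doc: u32) -> &[u8] {
        &self.data[self.range(doc)]
    }
    fn num_bytes(&self, doc: u32) -> u64 {
        let range = self.range(doc);
        (range.end - range.start) as u64
    }
}

fn main() {
    let column = ToyBytesColumn {
        offsets: vec![0, 3, 3, 7],
        data: b"fooquux".to_vec(),
    };
    assert_eq!(column.get_bytes(0), b"foo");
    assert_eq!(column.num_bytes(1), 0); // empty value for doc 1
    assert_eq!(column.get_bytes(2), b"quux");
}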

View File

@@ -51,7 +51,7 @@ mod writer;
/// for a doc_id /// for a doc_id
pub trait MultiValueLength { pub trait MultiValueLength {
/// returns the positions for a docid /// returns the positions for a docid
fn get_range(&self, doc_id: DocId) -> std::ops::Range<u32>; fn get_range(&self, doc_id: DocId) -> std::ops::Range<u64>;
/// returns the num of values associated with a doc_id /// returns the num of values associated with a doc_id
fn get_len(&self, doc_id: DocId) -> u64; fn get_len(&self, doc_id: DocId) -> u64;
/// returns the sum of num values for all doc_ids /// returns the sum of num values for all doc_ids
@@ -184,9 +184,9 @@ mod tests {
#[test] #[test]
pub fn test_fastfield() { pub fn test_fastfield() {
let test_fastfield = fastfield_codecs::serialize_and_load(&[100u64, 200u64, 300u64][..]); let test_fastfield = fastfield_codecs::serialize_and_load(&[100u64, 200u64, 300u64][..]);
assert_eq!(test_fastfield.get_val(0), 100); assert_eq!(test_fastfield.get_val(0u64), 100);
assert_eq!(test_fastfield.get_val(1), 200); assert_eq!(test_fastfield.get_val(1u64), 200);
assert_eq!(test_fastfield.get_val(2), 300); assert_eq!(test_fastfield.get_val(2u64), 300);
} }
#[test] #[test]
@@ -402,7 +402,7 @@ mod tests {
assert_eq!(fast_field_reader.min_value(), -100i64); assert_eq!(fast_field_reader.min_value(), -100i64);
assert_eq!(fast_field_reader.max_value(), 9_999i64); assert_eq!(fast_field_reader.max_value(), 9_999i64);
for (doc, i) in (-100i64..10_000i64).enumerate() { for (doc, i) in (-100i64..10_000i64).enumerate() {
assert_eq!(fast_field_reader.get_val(doc as u32), i); assert_eq!(fast_field_reader.get_val(doc as u64), i);
} }
let mut buffer = vec![0i64; 100]; let mut buffer = vec![0i64; 100];
fast_field_reader.get_range(53, &mut buffer[..]); fast_field_reader.get_range(53, &mut buffer[..]);
@@ -484,7 +484,7 @@ mod tests {
let fast_field_reader = open::<u64>(data)?; let fast_field_reader = open::<u64>(data)?;
for a in 0..n { for a in 0..n {
assert_eq!(fast_field_reader.get_val(a as u32), permutation[a as usize]); assert_eq!(fast_field_reader.get_val(a as u64), permutation[a as usize]);
} }
} }
Ok(()) Ok(())
@@ -976,7 +976,7 @@ mod tests {
let test_fastfield = open::<DateTime>(file.read_bytes()?)?; let test_fastfield = open::<DateTime>(file.read_bytes()?)?;
for (i, time) in times.iter().enumerate() { for (i, time) in times.iter().enumerate() {
assert_eq!(test_fastfield.get_val(i as u32), time.truncate(precision)); assert_eq!(test_fastfield.get_val(i as u64), time.truncate(precision));
} }
Ok(len) Ok(len)
} }

View File

@@ -515,7 +515,7 @@ mod bench {
for val in block { for val in block {
doc.add_u64(field, *val); doc.add_u64(field, *val);
} }
fast_field_writers.add_document(&doc).unwrap(); fast_field_writers.add_document(&doc);
} }
fast_field_writers fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None) .serialize(&mut serializer, &HashMap::new(), None)
@@ -573,7 +573,7 @@ mod bench {
for val in block { for val in block {
doc.add_u64(field, *val); doc.add_u64(field, *val);
} }
fast_field_writers.add_document(&doc).unwrap(); fast_field_writers.add_document(&doc);
} }
fast_field_writers fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None) .serialize(&mut serializer, &HashMap::new(), None)
@@ -606,7 +606,7 @@ mod bench {
for val in block { for val in block {
doc.add_u64(field, *val); doc.add_u64(field, *val);
} }
fast_field_writers.add_document(&doc).unwrap(); fast_field_writers.add_document(&doc);
} }
fast_field_writers fast_field_writers
.serialize(&mut serializer, &HashMap::new(), Some(&doc_id_mapping)) .serialize(&mut serializer, &HashMap::new(), Some(&doc_id_mapping))

View File

@@ -33,19 +33,19 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
/// Returns `[start, end)`, such that the values associated with /// Returns `[start, end)`, such that the values associated with
/// the given document are `start..end`. /// the given document are `start..end`.
#[inline] #[inline]
fn range(&self, doc: DocId) -> Range<u32> { fn range(&self, doc: DocId) -> Range<u64> {
let start = self.idx_reader.get_val(doc) as u32; let idx = doc as u64;
let end = self.idx_reader.get_val(doc + 1) as u32; let start = self.idx_reader.get_val(idx);
let end = self.idx_reader.get_val(idx + 1);
start..end start..end
} }
/// Returns the array of values associated with the given `doc`. /// Returns the array of values associated with the given `doc`.
#[inline] #[inline]
fn get_vals_for_range(&self, range: Range<u32>, vals: &mut Vec<Item>) { fn get_vals_for_range(&self, range: Range<u64>, vals: &mut Vec<Item>) {
let len = (range.end - range.start) as usize; let len = (range.end - range.start) as usize;
vals.resize(len, Item::make_zero()); vals.resize(len, Item::make_zero());
self.vals_reader self.vals_reader.get_range(range.start, &mut vals[..]);
.get_range(range.start as u64, &mut vals[..]);
} }
/// Returns the array of values associated with the given `doc`. /// Returns the array of values associated with the given `doc`.
@@ -88,7 +88,7 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
} }
impl<Item: FastValue> MultiValueLength for MultiValuedFastFieldReader<Item> { impl<Item: FastValue> MultiValueLength for MultiValuedFastFieldReader<Item> {
fn get_range(&self, doc_id: DocId) -> Range<u32> { fn get_range(&self, doc_id: DocId) -> Range<u64> {
self.range(doc_id) self.range(doc_id)
} }
fn get_len(&self, doc_id: DocId) -> u64 { fn get_len(&self, doc_id: DocId) -> u64 {
@@ -127,9 +127,9 @@ impl<T: MonotonicallyMappableToU128> MultiValuedU128FastFieldReader<T> {
/// Returns `[start, end)`, such that the values associated /// Returns `[start, end)`, such that the values associated
/// to the given document are `start..end`. /// to the given document are `start..end`.
#[inline] #[inline]
fn range(&self, doc: DocId) -> Range<u32> { fn range(&self, doc: DocId) -> Range<u64> {
let start = self.idx_reader.get_val(doc) as u32; let start = self.idx_reader.get_val(doc as u64);
let end = self.idx_reader.get_val(doc + 1) as u32; let end = self.idx_reader.get_val(doc as u64 + 1);
start..end start..end
} }
@@ -145,11 +145,10 @@ impl<T: MonotonicallyMappableToU128> MultiValuedU128FastFieldReader<T> {
/// Returns the array of values associated to the given `doc`. /// Returns the array of values associated to the given `doc`.
#[inline] #[inline]
fn get_vals_for_range(&self, range: Range<u32>, vals: &mut Vec<T>) { fn get_vals_for_range(&self, range: Range<u64>, vals: &mut Vec<T>) {
let len = (range.end - range.start) as usize; let len = (range.end - range.start) as usize;
vals.resize(len, T::from_u128(0)); vals.resize(len, T::from_u128(0));
self.vals_reader self.vals_reader.get_range(range.start, &mut vals[..]);
.get_range(range.start as u64, &mut vals[..]);
} }
/// Returns the array of values associated to the given `doc`. /// Returns the array of values associated to the given `doc`.
@@ -160,14 +159,8 @@ impl<T: MonotonicallyMappableToU128> MultiValuedU128FastFieldReader<T> {
} }
/// Returns all docids which are in the provided value range /// Returns all docids which are in the provided value range
pub fn get_positions_for_value_range( pub fn get_between_vals(&self, range: RangeInclusive<T>) -> Vec<DocId> {
&self, let positions = self.vals_reader.get_between_vals(range);
value_range: RangeInclusive<T>,
doc_id_range: Range<u32>,
) -> Vec<DocId> {
let mut positions = Vec::new(); // TODO replace
self.vals_reader
.get_positions_for_value_range(value_range, doc_id_range, &mut positions);
positions_to_docids(&positions, self.idx_reader.as_ref()) positions_to_docids(&positions, self.idx_reader.as_ref())
} }
@@ -210,7 +203,7 @@ impl<T: MonotonicallyMappableToU128> MultiValuedU128FastFieldReader<T> {
} }
impl<T: MonotonicallyMappableToU128> MultiValueLength for MultiValuedU128FastFieldReader<T> { impl<T: MonotonicallyMappableToU128> MultiValueLength for MultiValuedU128FastFieldReader<T> {
fn get_range(&self, doc_id: DocId) -> std::ops::Range<u32> { fn get_range(&self, doc_id: DocId) -> std::ops::Range<u64> {
self.range(doc_id) self.range(doc_id)
} }
fn get_len(&self, doc_id: DocId) -> u64 { fn get_len(&self, doc_id: DocId) -> u64 {
@@ -230,14 +223,14 @@ impl<T: MonotonicallyMappableToU128> MultiValueLength for MultiValuedU128FastFie
/// ///
/// TODO: Instead of a linear scan we can employ an exponential search into binary search to match a /// TODO: Instead of a linear scan we can employ an exponential search into binary search to match a
/// docid to its value position. /// docid to its value position.
fn positions_to_docids<C: Column + ?Sized>(positions: &[u32], idx_reader: &C) -> Vec<DocId> { fn positions_to_docids<C: Column + ?Sized>(positions: &[u64], idx_reader: &C) -> Vec<DocId> {
let mut docs = vec![]; let mut docs = vec![];
let mut cur_doc = 0u32; let mut cur_doc = 0u32;
let mut last_doc = None; let mut last_doc = None;
for pos in positions { for pos in positions {
loop { loop {
let end = idx_reader.get_val(cur_doc + 1) as u32; let end = idx_reader.get_val(cur_doc as u64 + 1);
if end > *pos { if end > *pos {
// avoid duplicates // avoid duplicates
if Some(cur_doc) == last_doc { if Some(cur_doc) == last_doc {
@@ -265,7 +258,7 @@ mod tests {
#[test] #[test]
fn test_positions_to_docid() { fn test_positions_to_docid() {
let positions = vec![10u32, 11, 15, 20, 21, 22]; let positions = vec![10u64, 11, 15, 20, 21, 22];
let offsets = vec![0, 10, 12, 15, 22, 23]; let offsets = vec![0, 10, 12, 15, 22, 23];
{ {

View File

@@ -3,7 +3,7 @@ use std::io;
use fastfield_codecs::{ use fastfield_codecs::{
Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64, VecColumn, Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64, VecColumn,
}; };
use rustc_hash::FxHashMap; use fnv::FnvHashMap;
use super::get_fastfield_codecs_for_multivalue; use super::get_fastfield_codecs_for_multivalue;
use crate::fastfield::writer::unexpected_value; use crate::fastfield::writer::unexpected_value;
@@ -144,7 +144,7 @@ impl MultiValuedFastFieldWriter {
pub fn serialize( pub fn serialize(
mut self, mut self,
serializer: &mut CompositeFastFieldSerializer, serializer: &mut CompositeFastFieldSerializer,
term_mapping_opt: Option<&FxHashMap<UnorderedTermId, TermOrdinal>>, term_mapping_opt: Option<&FnvHashMap<UnorderedTermId, TermOrdinal>>,
doc_id_map: Option<&DocIdMapping>, doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> { ) -> io::Result<()> {
{ {
@@ -219,7 +219,7 @@ pub(crate) struct MultivalueStartIndex<'a, C: Column> {
impl<'a, C: Column> MultivalueStartIndex<'a, C> { impl<'a, C: Column> MultivalueStartIndex<'a, C> {
pub fn new(column: &'a C, doc_id_map: &'a DocIdMapping) -> Self { pub fn new(column: &'a C, doc_id_map: &'a DocIdMapping) -> Self {
assert_eq!(column.num_vals(), doc_id_map.num_old_doc_ids() as u32 + 1); assert_eq!(column.num_vals(), doc_id_map.num_old_doc_ids() as u64 + 1);
let (min, max) = let (min, max) =
tantivy_bitpacker::minmax(iter_remapped_multivalue_index(doc_id_map, column)) tantivy_bitpacker::minmax(iter_remapped_multivalue_index(doc_id_map, column))
.unwrap_or((0u64, 0u64)); .unwrap_or((0u64, 0u64));
@@ -232,7 +232,7 @@ impl<'a, C: Column> MultivalueStartIndex<'a, C> {
} }
} }
impl<'a, C: Column> Column for MultivalueStartIndex<'a, C> { impl<'a, C: Column> Column for MultivalueStartIndex<'a, C> {
fn get_val(&self, _idx: u32) -> u64 { fn get_val(&self, _idx: u64) -> u64 {
unimplemented!() unimplemented!()
} }
@@ -244,8 +244,8 @@ impl<'a, C: Column> Column for MultivalueStartIndex<'a, C> {
self.max self.max
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
(self.doc_id_map.num_new_doc_ids() + 1) as u32 (self.doc_id_map.num_new_doc_ids() + 1) as u64
} }
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> { fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
@@ -262,7 +262,7 @@ fn iter_remapped_multivalue_index<'a, C: Column>(
) -> impl Iterator<Item = u64> + 'a { ) -> impl Iterator<Item = u64> + 'a {
let mut offset = 0; let mut offset = 0;
std::iter::once(0).chain(doc_id_map.iter_old_doc_ids().map(move |old_doc| { std::iter::once(0).chain(doc_id_map.iter_old_doc_ids().map(move |old_doc| {
let num_vals_for_doc = column.get_val(old_doc + 1) - column.get_val(old_doc); let num_vals_for_doc = column.get_val(old_doc as u64 + 1) - column.get_val(old_doc as u64);
offset += num_vals_for_doc; offset += num_vals_for_doc;
offset as u64 offset as u64
})) }))
@@ -369,7 +369,7 @@ impl MultiValueU128FastFieldWriter {
serializer.create_u128_fast_field_with_idx( serializer.create_u128_fast_field_with_idx(
self.field, self.field,
iter_gen, iter_gen,
self.vals.len() as u32, self.vals.len() as u64,
1, 1,
)?; )?;
} }
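iter_remapped_multivalue_index above rebuilds the start-index column after doc-id remapping: walk docs in their new order, take each old doc's value count from the old offsets, and accumulate. The same computation in isolation:

fn remap_offsets(old_offsets: &[u64], new_to_old: &[usize]) -> Vec<u64> {
    let mut offset = 0u64;
    std::iter::once(0)
        .chain(new_to_old.iter().map(|&old_doc| {
            // Count of values the old doc carried, taken from the old column.
            let num_vals_for_doc = old_offsets[old_doc + 1] - old_offsets[old_doc];
            offset += num_vals_for_doc;
            offset
        }))
        .collect()
}

fn main() {
    // Three docs with 2, 0 and 3 values respectively...
    let old_offsets = vec![0u64, 2, 2, 5];
    // ...reordered so the third doc comes first.
    let new_offsets = remap_offsets(&old_offsets, &[2, 0, 1]);
    assert_eq!(new_offsets, vec![0, 3, 5, 5]);
}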

View File

@@ -90,7 +90,7 @@ impl CompositeFastFieldSerializer {
&mut self, &mut self,
field: Field, field: Field,
iter_gen: F, iter_gen: F,
num_vals: u32, num_vals: u64,
idx: usize, idx: usize,
) -> io::Result<()> { ) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, idx); let field_write = self.composite_write.for_field_with_idx(field, idx);

View File

@@ -3,7 +3,7 @@ use std::io;
use common; use common;
use fastfield_codecs::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64}; use fastfield_codecs::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
use rustc_hash::FxHashMap; use fnv::FnvHashMap;
use tantivy_bitpacker::BlockedBitpacker; use tantivy_bitpacker::BlockedBitpacker;
use super::multivalued::{MultiValueU128FastFieldWriter, MultiValuedFastFieldWriter}; use super::multivalued::{MultiValueU128FastFieldWriter, MultiValuedFastFieldWriter};
@@ -256,7 +256,7 @@ impl FastFieldsWriter {
pub fn serialize( pub fn serialize(
self, self,
serializer: &mut CompositeFastFieldSerializer, serializer: &mut CompositeFastFieldSerializer,
mapping: &HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>>, mapping: &HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>,
doc_id_map: Option<&DocIdMapping>, doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> { ) -> io::Result<()> {
for field_writer in self.term_id_writers { for field_writer in self.term_id_writers {
@@ -363,7 +363,7 @@ impl U128FastFieldWriter {
serializer.create_u128_fast_field_with_idx( serializer.create_u128_fast_field_with_idx(
self.field, self.field,
iter_gen, iter_gen,
self.val_count as u32, self.val_count as u64,
0, 0,
)?; )?;
} else { } else {
@@ -371,7 +371,7 @@ impl U128FastFieldWriter {
serializer.create_u128_fast_field_with_idx( serializer.create_u128_fast_field_with_idx(
self.field, self.field,
iter_gen, iter_gen,
self.val_count as u32, self.val_count as u64,
0, 0,
)?; )?;
} }
@@ -511,7 +511,7 @@ impl IntFastFieldWriter {
vals: &self.vals, vals: &self.vals,
min_value: min, min_value: min,
max_value: max, max_value: max,
num_vals: self.val_count as u32, num_vals: self.val_count as u64,
}; };
serializer.create_auto_detect_u64_fast_field(self.field, fastfield_accessor)?; serializer.create_auto_detect_u64_fast_field(self.field, fastfield_accessor)?;
@@ -526,7 +526,7 @@ struct WriterFastFieldAccessProvider<'map, 'bitp> {
vals: &'bitp BlockedBitpacker, vals: &'bitp BlockedBitpacker,
min_value: u64, min_value: u64,
max_value: u64, max_value: u64,
num_vals: u32, num_vals: u64,
} }
impl<'map, 'bitp> Column for WriterFastFieldAccessProvider<'map, 'bitp> { impl<'map, 'bitp> Column for WriterFastFieldAccessProvider<'map, 'bitp> {
@@ -538,7 +538,7 @@ impl<'map, 'bitp> Column for WriterFastFieldAccessProvider<'map, 'bitp> {
/// # Panics /// # Panics
/// ///
/// May panic if `doc` is greater than the index. /// May panic if `doc` is greater than the index.
fn get_val(&self, _doc: u32) -> u64 { fn get_val(&self, _doc: u64) -> u64 {
unimplemented!() unimplemented!()
} }
@@ -562,7 +562,7 @@ impl<'map, 'bitp> Column for WriterFastFieldAccessProvider<'map, 'bitp> {
self.max_value self.max_value
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.num_vals self.num_vals
} }
} }

View File

@@ -1472,7 +1472,7 @@ mod tests {
let fast_field_reader = segment_reader.fast_fields().u64(id_field)?; let fast_field_reader = segment_reader.fast_fields().u64(id_field)?;
let in_order_alive_ids: Vec<u64> = segment_reader let in_order_alive_ids: Vec<u64> = segment_reader
.doc_ids_alive() .doc_ids_alive()
.map(|doc| fast_field_reader.get_val(doc)) .map(|doc| fast_field_reader.get_val(doc as u64))
.collect(); .collect();
assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 1, 0]); assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 1, 0]);
Ok(()) Ok(())
@@ -1533,7 +1533,7 @@ mod tests {
let fast_field_reader = segment_reader.fast_fields().u64(id_field)?; let fast_field_reader = segment_reader.fast_fields().u64(id_field)?;
let in_order_alive_ids: Vec<u64> = segment_reader let in_order_alive_ids: Vec<u64> = segment_reader
.doc_ids_alive() .doc_ids_alive()
.map(|doc| fast_field_reader.get_val(doc)) .map(|doc| fast_field_reader.get_val(doc as u64))
.collect(); .collect();
assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 2, 0]); assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 2, 0]);
Ok(()) Ok(())
@@ -1619,7 +1619,6 @@ mod tests {
); );
let large_text_field = schema_builder.add_text_field("large_text_field", TEXT | STORED); let large_text_field = schema_builder.add_text_field("large_text_field", TEXT | STORED);
let multi_text_fields = schema_builder.add_text_field("multi_text_fields", TEXT | STORED);
let multi_numbers = schema_builder.add_u64_field( let multi_numbers = schema_builder.add_u64_field(
"multi_numbers", "multi_numbers",
@@ -1659,12 +1658,6 @@ mod tests {
let ip_exists = |id| id % 3 != 0; // 0 does not exist let ip_exists = |id| id % 3 != 0; // 0 does not exist
let multi_text_field_text1 = "test1 test2 test3 test1 test2 test3";
// rotate left
let multi_text_field_text2 = "test2 test3 test1 test2 test3 test1";
// rotate right
let multi_text_field_text3 = "test3 test1 test2 test3 test1 test2";
for &op in ops { for &op in ops {
match op { match op {
IndexingOp::AddDoc { id } => { IndexingOp::AddDoc { id } => {
@@ -1685,10 +1678,7 @@ mod tests {
multi_bools => (id % 2u64) == 0, multi_bools => (id % 2u64) == 0,
text_field => id.to_string(), text_field => id.to_string(),
facet_field => facet, facet_field => facet,
large_text_field => LOREM, large_text_field=> LOREM
multi_text_fields => multi_text_field_text1,
multi_text_fields => multi_text_field_text2,
multi_text_fields => multi_text_field_text3,
))?; ))?;
} else { } else {
index_writer.add_document(doc!(id_field=>id, index_writer.add_document(doc!(id_field=>id,
@@ -1706,10 +1696,7 @@ mod tests {
multi_bools => (id % 2u64) == 0, multi_bools => (id % 2u64) == 0,
text_field => id.to_string(), text_field => id.to_string(),
facet_field => facet, facet_field => facet,
large_text_field => LOREM, large_text_field=> LOREM
multi_text_fields => multi_text_field_text1,
multi_text_fields => multi_text_field_text2,
multi_text_fields => multi_text_field_text3,
))?; ))?;
} }
} }
@@ -1760,7 +1747,7 @@ mod tests {
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap(); let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
segment_reader segment_reader
.doc_ids_alive() .doc_ids_alive()
.map(move |doc| ff_reader.get_val(doc)) .map(move |doc| ff_reader.get_val(doc as u64))
}) })
.collect(); .collect();
@@ -1771,7 +1758,7 @@ mod tests {
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap(); let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
segment_reader segment_reader
.doc_ids_alive() .doc_ids_alive()
.map(move |doc| ff_reader.get_val(doc)) .map(move |doc| ff_reader.get_val(doc as u64))
}) })
.collect(); .collect();
@@ -1804,7 +1791,7 @@ mod tests {
.flat_map(|segment_reader| { .flat_map(|segment_reader| {
let ff_reader = segment_reader.fast_fields().ip_addr(ip_field).unwrap(); let ff_reader = segment_reader.fast_fields().ip_addr(ip_field).unwrap();
segment_reader.doc_ids_alive().flat_map(move |doc| { segment_reader.doc_ids_alive().flat_map(move |doc| {
let val = ff_reader.get_val(doc); let val = ff_reader.get_val(doc as u64);
if val == Ipv6Addr::from_u128(0) { if val == Ipv6Addr::from_u128(0) {
// TODO Fix null handling // TODO Fix null handling
None None
@@ -1861,7 +1848,7 @@ mod tests {
ff_reader.get_vals(doc, &mut vals); ff_reader.get_vals(doc, &mut vals);
assert_eq!(vals.len(), 2); assert_eq!(vals.len(), 2);
assert_eq!(vals[0], vals[1]); assert_eq!(vals[0], vals[1]);
assert_eq!(id_reader.get_val(doc), vals[0]); assert_eq!(id_reader.get_val(doc as u64), vals[0]);
let mut bool_vals = vec![]; let mut bool_vals = vec![];
bool_ff_reader.get_vals(doc, &mut bool_vals); bool_ff_reader.get_vals(doc, &mut bool_vals);
@@ -1935,21 +1922,11 @@ mod tests {
for (existing_id, count) in &expected_ids_and_num_occurrences { for (existing_id, count) in &expected_ids_and_num_occurrences {
let (existing_id, count) = (*existing_id, *count); let (existing_id, count) = (*existing_id, *count);
let get_num_hits = |field| do_search(&existing_id.to_string(), field).len() as u64; let assert_field = |field| do_search(&existing_id.to_string(), field).len() as u64;
assert_eq!(get_num_hits(text_field), count); assert_eq!(assert_field(text_field), count);
assert_eq!(get_num_hits(i64_field), count); assert_eq!(assert_field(i64_field), count);
assert_eq!(get_num_hits(f64_field), count); assert_eq!(assert_field(f64_field), count);
assert_eq!(get_num_hits(id_field), count); assert_eq!(assert_field(id_field), count);
// Test multi text
assert_eq!(
do_search("\"test1 test2\"", multi_text_fields).len(),
num_docs_expected
);
assert_eq!(
do_search("\"test2 test3\"", multi_text_fields).len(),
num_docs_expected
);
// Test bytes // Test bytes
let term = Term::from_field_bytes(bytes_field, existing_id.to_le_bytes().as_slice()); let term = Term::from_field_bytes(bytes_field, existing_id.to_le_bytes().as_slice());
@@ -2012,7 +1989,7 @@ mod tests {
facet_reader facet_reader
.facet_from_ord(facet_ords[0], &mut facet) .facet_from_ord(facet_ords[0], &mut facet)
.unwrap(); .unwrap();
let id = ff_reader.get_val(doc_id); let id = ff_reader.get_val(doc_id as u64);
let facet_expected = Facet::from(&("/cola/".to_string() + &id.to_string())); let facet_expected = Facet::from(&("/cola/".to_string() + &id.to_string()));
assert_eq!(facet, facet_expected); assert_eq!(facet, facet_expected);

View File

@@ -1,6 +1,6 @@
use fastfield_codecs::MonotonicallyMappableToU64; use fastfield_codecs::MonotonicallyMappableToU64;
use fnv::FnvHashMap;
use murmurhash32::murmurhash2; use murmurhash32::murmurhash2;
use rustc_hash::FxHashMap;
use crate::fastfield::FastValue; use crate::fastfield::FastValue;
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter}; use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
@@ -52,7 +52,7 @@ use crate::{DatePrecision, DateTime, DocId, Term};
/// path map to the same index position as long as the probability is relatively low. /// path map to the same index position as long as the probability is relatively low.
#[derive(Default)] #[derive(Default)]
struct IndexingPositionsPerPath { struct IndexingPositionsPerPath {
positions_per_path: FxHashMap<u32, IndexingPosition>, positions_per_path: FnvHashMap<u32, IndexingPosition>,
} }
impl IndexingPositionsPerPath { impl IndexingPositionsPerPath {

View File

@@ -368,7 +368,7 @@ impl IndexMerger {
fast_field_serializer.create_u128_fast_field_with_idx( fast_field_serializer.create_u128_fast_field_with_idx(
field, field,
iter_gen, iter_gen,
doc_id_mapping.len() as u32, doc_id_mapping.len() as u64,
1, 1,
)?; )?;
@@ -397,13 +397,13 @@ impl IndexMerger {
let iter_gen = || { let iter_gen = || {
doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| { doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| {
let fast_field_reader = &fast_field_readers[doc_addr.segment_ord as usize]; let fast_field_reader = &fast_field_readers[doc_addr.segment_ord as usize];
fast_field_reader.get_val(doc_addr.doc_id) fast_field_reader.get_val(doc_addr.doc_id as u64)
}) })
}; };
fast_field_serializer.create_u128_fast_field_with_idx( fast_field_serializer.create_u128_fast_field_with_idx(
field, field,
iter_gen, iter_gen,
doc_id_mapping.len() as u32, doc_id_mapping.len() as u64,
0, 0,
)?; )?;
Ok(()) Ok(())
@@ -510,8 +510,8 @@ impl IndexMerger {
doc_id_reader_pair doc_id_reader_pair
.into_iter() .into_iter()
.kmerge_by(|a, b| { .kmerge_by(|a, b| {
let val1 = a.2.get_val(a.0); let val1 = a.2.get_val(a.0 as u64);
let val2 = b.2.get_val(b.0); let val2 = b.2.get_val(b.0 as u64);
if sort_by_field.order == Order::Asc { if sort_by_field.order == Order::Asc {
val1 < val2 val1 < val2
} else { } else {
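The kmerge_by above merges per-segment doc streams into one stream ordered by the sort field, assuming each segment is already sorted by that field. A std-only stand-in using a BinaryHeap (the real code goes through itertools and fast field readers):

use std::cmp::Reverse;
use std::collections::BinaryHeap;

// Merge per-segment doc lists into one stream ordered by value (ascending).
// Each segment is a Vec of values indexed by doc id, already sorted.
fn merge_by_value(segments: &[Vec<u64>]) -> Vec<(usize, u32)> {
    // Heap entries: (value, segment ordinal, doc id), smallest value first.
    let mut heap = BinaryHeap::new();
    for (segment_ord, values) in segments.iter().enumerate() {
        if !values.is_empty() {
            heap.push(Reverse((values[0], segment_ord, 0u32)));
        }
    }
    let mut merged = Vec::new();
    while let Some(Reverse((_, segment_ord, doc))) = heap.pop() {
        merged.push((segment_ord, doc));
        let next_doc = doc + 1;
        if let Some(&next_val) = segments[segment_ord].get(next_doc as usize) {
            heap.push(Reverse((next_val, segment_ord, next_doc)));
        }
    }
    merged
}

fn main() {
    let segments = vec![vec![5u64, 40], vec![10, 20, 50]];
    // Docs come out ordered by value: 5, 10, 20, 40, 50.
    assert_eq!(
        merge_by_value(&segments),
        vec![(0, 0), (1, 0), (1, 1), (0, 1), (1, 2)]
    );
}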

View File

@@ -190,13 +190,13 @@ mod tests {
assert_eq!(fast_field.get_val(4), 2u64); assert_eq!(fast_field.get_val(4), 2u64);
assert_eq!(fast_field.get_val(3), 3u64); assert_eq!(fast_field.get_val(3), 3u64);
if force_disjunct_segment_sort_values { if force_disjunct_segment_sort_values {
assert_eq!(fast_field.get_val(2), 20u64); assert_eq!(fast_field.get_val(2u64), 20u64);
assert_eq!(fast_field.get_val(1), 100u64); assert_eq!(fast_field.get_val(1u64), 100u64);
} else { } else {
assert_eq!(fast_field.get_val(2), 10u64); assert_eq!(fast_field.get_val(2u64), 10u64);
assert_eq!(fast_field.get_val(1), 20u64); assert_eq!(fast_field.get_val(1u64), 20u64);
} }
assert_eq!(fast_field.get_val(0), 1_000u64); assert_eq!(fast_field.get_val(0u64), 1_000u64);
// test new field norm mapping // test new field norm mapping
{ {
@@ -545,7 +545,7 @@ mod bench_sorted_index_merge {
// add values in order of the new doc_ids // add values in order of the new doc_ids
let mut val = 0; let mut val = 0;
for (doc_id, _reader, field_reader) in sorted_doc_ids { for (doc_id, _reader, field_reader) in sorted_doc_ids {
val = field_reader.get_val(doc_id); val = field_reader.get_val(doc_id as u64);
} }
val val

View File

@@ -158,6 +158,7 @@ impl SegmentWriter {
let doc_id = self.max_doc; let doc_id = self.max_doc;
let vals_grouped_by_field = doc let vals_grouped_by_field = doc
.field_values() .field_values()
.iter()
.sorted_by_key(|el| el.field()) .sorted_by_key(|el| el.field())
.group_by(|el| el.field()); .group_by(|el| el.field());
for (field, field_values) in &vals_grouped_by_field { for (field, field_values) in &vals_grouped_by_field {
@@ -501,17 +502,9 @@ mod tests {
let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap(); let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap();
let doc = reader.get(0).unwrap(); let doc = reader.get(0).unwrap();
assert_eq!(doc.value_count(), 2); assert_eq!(doc.field_values().len(), 2);
let mut field_value_iter = doc.field_values(); assert_eq!(doc.field_values()[0].value().as_text(), Some("A"));
assert_eq!( assert_eq!(doc.field_values()[1].value().as_text(), Some("title"));
field_value_iter.next().unwrap().value().as_text(),
Some("A")
);
assert_eq!(
field_value_iter.next().unwrap().value().as_text(),
Some("title")
);
assert!(field_value_iter.next().is_none());
} }
#[test] #[test]
@@ -792,90 +785,4 @@ mod tests {
// On release this was [2, 1]. (< note the decreasing values) // On release this was [2, 1]. (< note the decreasing values)
assert_eq!(positions, &[2, 5]); assert_eq!(positions, &[2, 5]);
} }
#[test]
fn test_multiple_field_value_and_long_tokens() {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let mut doc = Document::default();
// This is a bit of a contrived example.
let tokens = PreTokenizedString {
text: "roller-coaster".to_string(),
tokens: vec![Token {
offset_from: 0,
offset_to: 14,
position: 0,
text: "rollercoaster".to_string(),
position_length: 2,
}],
};
doc.add_pre_tokenized_text(text, tokens.clone());
doc.add_pre_tokenized_text(text, tokens);
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let seg_reader = searcher.segment_reader(0);
let inv_index = seg_reader.inverted_index(text).unwrap();
let term = Term::from_field_text(text, "rollercoaster");
let mut postings = inv_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
assert_eq!(postings.doc(), 0u32);
let mut positions = Vec::new();
postings.positions(&mut positions);
assert_eq!(positions, &[0, 3]); //< as opposed to 0, 2 if we had a position length of 1.
}
#[test]
fn test_last_token_not_ending_last() {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let mut doc = Document::default();
// This is a bit of a contrived example.
let tokens = PreTokenizedString {
text: "contrived-example".to_string(), //< I can't think of a use case where this corner case happens in real life.
tokens: vec![
Token {
// Not the last token, yet ends after the last token.
offset_from: 0,
offset_to: 14,
position: 0,
text: "long_token".to_string(),
position_length: 3,
},
Token {
offset_from: 0,
offset_to: 14,
position: 1,
text: "short".to_string(),
position_length: 1,
},
],
};
doc.add_pre_tokenized_text(text, tokens);
doc.add_text(text, "hello");
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let seg_reader = searcher.segment_reader(0);
let inv_index = seg_reader.inverted_index(text).unwrap();
let term = Term::from_field_text(text, "hello");
let mut postings = inv_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
assert_eq!(postings.doc(), 0u32);
let mut positions = Vec::new();
postings.positions(&mut positions);
assert_eq!(positions, &[4]); //< as opposed to 3 if we had a position length of 1.
}
} }
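The removed tests above pin the expected interaction between position_length and multi-valued fields: the expected positions [0, 3] and [4] only work out if the next value starts one past the furthest token end of the previous value. Below is a small sketch of that bookkeeping, as an interpretation of those expectations rather than the indexer's actual code.

#[derive(Clone)]
struct ToyToken {
    position: u32,
    position_length: u32,
}

fn flatten_positions(values: &[Vec<ToyToken>]) -> Vec<u32> {
    let mut positions = Vec::new();
    let mut offset = 0u32;
    for tokens in values {
        let mut max_end = offset;
        for token in tokens {
            let pos = offset + token.position;
            positions.push(pos);
            max_end = max_end.max(pos + token.position_length);
        }
        // The next value starts one past the furthest end, so phrases cannot
        // accidentally straddle two values.
        offset = max_end + 1;
    }
    positions
}

fn main() {
    // Two identical values, each a single token spanning two positions:
    // the second token lands at 3, matching the `[0, 3]` expectation above.
    let long = vec![ToyToken { position: 0, position_length: 2 }];
    assert_eq!(flatten_positions(&[long.clone(), long]), vec![0, 3]);
}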

View File

@@ -12,7 +12,7 @@ pub(crate) struct RemappedDocIdColumn<'a> {
fast_field_readers: Vec<Arc<dyn Column<u64>>>, fast_field_readers: Vec<Arc<dyn Column<u64>>>,
min_value: u64, min_value: u64,
max_value: u64, max_value: u64,
num_vals: u32, num_vals: u64,
} }
fn compute_min_max_val( fn compute_min_max_val(
@@ -32,7 +32,7 @@ fn compute_min_max_val(
// we need to recompute the max / min // we need to recompute the max / min
segment_reader segment_reader
.doc_ids_alive() .doc_ids_alive()
.map(|doc_id| u64_reader.get_val(doc_id)) .map(|doc_id| u64_reader.get_val(doc_id as u64))
.minmax() .minmax()
.into_option() .into_option()
} }
@@ -73,13 +73,13 @@ impl<'a> RemappedDocIdColumn<'a> {
fast_field_readers, fast_field_readers,
min_value, min_value,
max_value, max_value,
num_vals: doc_id_mapping.len() as u32, num_vals: doc_id_mapping.len() as u64,
} }
} }
} }
impl<'a> Column for RemappedDocIdColumn<'a> { impl<'a> Column for RemappedDocIdColumn<'a> {
fn get_val(&self, _doc: u32) -> u64 { fn get_val(&self, _doc: u64) -> u64 {
unimplemented!() unimplemented!()
} }
@@ -90,7 +90,7 @@ impl<'a> Column for RemappedDocIdColumn<'a> {
.map(|old_doc_addr| { .map(|old_doc_addr| {
let fast_field_reader = let fast_field_reader =
&self.fast_field_readers[old_doc_addr.segment_ord as usize]; &self.fast_field_readers[old_doc_addr.segment_ord as usize];
fast_field_reader.get_val(old_doc_addr.doc_id) fast_field_reader.get_val(old_doc_addr.doc_id as u64)
}), }),
) )
} }
@@ -102,7 +102,7 @@ impl<'a> Column for RemappedDocIdColumn<'a> {
self.max_value self.max_value
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.num_vals self.num_vals
} }
} }

View File

@@ -13,7 +13,7 @@ pub(crate) struct RemappedDocIdMultiValueColumn<'a> {
fast_field_readers: Vec<MultiValuedFastFieldReader<u64>>, fast_field_readers: Vec<MultiValuedFastFieldReader<u64>>,
min_value: u64, min_value: u64,
max_value: u64, max_value: u64,
num_vals: u32, num_vals: u64,
} }
impl<'a> RemappedDocIdMultiValueColumn<'a> { impl<'a> RemappedDocIdMultiValueColumn<'a> {
@@ -61,13 +61,13 @@ impl<'a> RemappedDocIdMultiValueColumn<'a> {
fast_field_readers, fast_field_readers,
min_value, min_value,
max_value, max_value,
num_vals: num_vals as u32, num_vals: num_vals as u64,
} }
} }
} }
impl<'a> Column for RemappedDocIdMultiValueColumn<'a> { impl<'a> Column for RemappedDocIdMultiValueColumn<'a> {
fn get_val(&self, _pos: u32) -> u64 { fn get_val(&self, _pos: u64) -> u64 {
unimplemented!() unimplemented!()
} }
@@ -89,7 +89,7 @@ impl<'a> Column for RemappedDocIdMultiValueColumn<'a> {
self.max_value self.max_value
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.num_vals self.num_vals
} }
} }
@@ -99,7 +99,7 @@ pub(crate) struct RemappedDocIdMultiValueIndexColumn<'a, T: MultiValueLength> {
multi_value_length_readers: Vec<&'a T>, multi_value_length_readers: Vec<&'a T>,
min_value: u64, min_value: u64,
max_value: u64, max_value: u64,
num_vals: u32, num_vals: u64,
} }
impl<'a, T: MultiValueLength> RemappedDocIdMultiValueIndexColumn<'a, T> { impl<'a, T: MultiValueLength> RemappedDocIdMultiValueIndexColumn<'a, T> {
@@ -123,7 +123,7 @@ impl<'a, T: MultiValueLength> RemappedDocIdMultiValueIndexColumn<'a, T> {
max_value += multi_value_length_reader.get_len(doc); max_value += multi_value_length_reader.get_len(doc);
} }
} }
num_vals += segment_reader.num_docs(); num_vals += segment_reader.num_docs() as u64;
multi_value_length_readers.push(multi_value_length_reader); multi_value_length_readers.push(multi_value_length_reader);
} }
Self { Self {
@@ -137,7 +137,7 @@ impl<'a, T: MultiValueLength> RemappedDocIdMultiValueIndexColumn<'a, T> {
} }
impl<'a, T: MultiValueLength + Send + Sync> Column for RemappedDocIdMultiValueIndexColumn<'a, T> { impl<'a, T: MultiValueLength + Send + Sync> Column for RemappedDocIdMultiValueIndexColumn<'a, T> {
fn get_val(&self, _pos: u32) -> u64 { fn get_val(&self, _pos: u64) -> u64 {
unimplemented!() unimplemented!()
} }
@@ -162,7 +162,7 @@ impl<'a, T: MultiValueLength + Send + Sync> Column for RemappedDocIdMultiValueIn
self.max_value self.max_value
} }
fn num_vals(&self) -> u32 { fn num_vals(&self) -> u64 {
self.num_vals self.num_vals
} }
} }

View File

@@ -311,7 +311,7 @@ pub use crate::postings::Postings;
pub use crate::schema::{DateOptions, DatePrecision, Document, Term}; pub use crate::schema::{DateOptions, DatePrecision, Document, Term};
/// Index format version. /// Index format version.
const INDEX_FORMAT_VERSION: u32 = 5; const INDEX_FORMAT_VERSION: u32 = 4;
/// Structure version for the index. /// Structure version for the index.
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)] #[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -819,7 +819,7 @@ pub mod tests {
fn test_indexedfield_not_in_documents() -> crate::Result<()> { fn test_indexedfield_not_in_documents() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let absent_field = schema_builder.add_text_field("absent_text", TEXT); let absent_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
@@ -1001,7 +1001,7 @@ pub mod tests {
let fast_field_signed = schema_builder.add_i64_field("signed", FAST); let fast_field_signed = schema_builder.add_i64_field("signed", FAST);
let fast_field_float = schema_builder.add_f64_field("float", FAST); let fast_field_float = schema_builder.add_f64_field("float", FAST);
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let stored_int_field = schema_builder.add_u64_field("stored_int", STORED); let stored_int_field = schema_builder.add_u64_field("text", STORED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);

View File

@@ -3,7 +3,7 @@ use std::io;
use std::marker::PhantomData; use std::marker::PhantomData;
use std::ops::Range; use std::ops::Range;
use rustc_hash::FxHashMap; use fnv::FnvHashMap;
use super::stacker::Addr; use super::stacker::Addr;
use crate::fastfield::MultiValuedFastFieldWriter; use crate::fastfield::MultiValuedFastFieldWriter;
@@ -56,12 +56,12 @@ pub(crate) fn serialize_postings(
doc_id_map: Option<&DocIdMapping>, doc_id_map: Option<&DocIdMapping>,
schema: &Schema, schema: &Schema,
serializer: &mut InvertedIndexSerializer, serializer: &mut InvertedIndexSerializer,
) -> crate::Result<HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>>> { ) -> crate::Result<HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>> {
let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> = let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> =
Vec::with_capacity(ctx.term_index.len()); Vec::with_capacity(ctx.term_index.len());
term_offsets.extend(ctx.term_index.iter()); term_offsets.extend(ctx.term_index.iter());
term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone()); term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone());
let mut unordered_term_mappings: HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>> = let mut unordered_term_mappings: HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>> =
HashMap::new(); HashMap::new();
let field_offsets = make_field_partition(&term_offsets); let field_offsets = make_field_partition(&term_offsets);
@@ -74,7 +74,7 @@ pub(crate) fn serialize_postings(
let unordered_term_ids = term_offsets[byte_offsets.clone()] let unordered_term_ids = term_offsets[byte_offsets.clone()]
.iter() .iter()
.map(|&(_, _, bucket)| bucket); .map(|&(_, _, bucket)| bucket);
let mapping: FxHashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids let mapping: FnvHashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
.enumerate() .enumerate()
.map(|(term_ord, unord_term_id)| { .map(|(term_ord, unord_term_id)| {
(unord_term_id as UnorderedTermId, term_ord as TermOrdinal) (unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
@@ -170,7 +170,7 @@ pub(crate) trait PostingsWriter: Send + Sync {
term_buffer.truncate_value_bytes(end_of_path_idx); term_buffer.truncate_value_bytes(end_of_path_idx);
term_buffer.append_bytes(token.text.as_bytes()); term_buffer.append_bytes(token.text.as_bytes());
let start_position = indexing_position.end_position + token.position as u32; let start_position = indexing_position.end_position + token.position as u32;
end_position = end_position.max(start_position + token.position_length as u32); end_position = start_position + token.position_length as u32;
let unordered_term_id = self.subscribe(doc_id, start_position, term_buffer, ctx); let unordered_term_id = self.subscribe(doc_id, start_position, term_buffer, ctx);
if let Some(term_id_fast_field_writer) = term_id_fast_field_writer_opt.as_mut() { if let Some(term_id_fast_field_writer) = term_id_fast_field_writer_opt.as_mut() {
term_id_fast_field_writer.add_val(unordered_term_id); term_id_fast_field_writer.add_val(unordered_term_id);
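The only semantic change in the `PostingsWriter` hunk above is the last changed line: one side assigns `end_position` directly from the current token's start plus length, while the other takes the maximum with its previous value, so the running end position can never move backwards across the tokens of a multi-valued field. A toy walk-through with made-up numbers (not tantivy code):

```rust
fn main() {
    // End positions produced by successive tokens; the last one is smaller.
    let token_ends = [10u32, 12, 3];
    let (mut assigned, mut maxed) = (0u32, 0u32);
    for &end in &token_ends {
        assigned = end;          // plain assignment: moves backwards (12 -> 3)
        maxed = maxed.max(end);  // `.max()` keeps the running end position monotonic
    }
    assert_eq!(assigned, 3);
    assert_eq!(maxed, 12);
}
```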

View File

@@ -86,7 +86,10 @@ impl DocSet for BitSetDocSet {
self.doc self.doc
} }
/// Returns the number of values set in the underlying bitset. /// Returns half of the `max_doc`
/// This is quite a terrible heuristic,
/// but we don't have access to any better
/// value.
fn size_hint(&self) -> u32 { fn size_hint(&self) -> u32 {
self.docs.len() as u32 self.docs.len() as u32
} }
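Only the doc comment changes in this hunk; both sides already return `self.docs.len() as u32`. A self-contained illustration of what the corrected comment describes, using a hypothetical bitset type rather than tantivy's `common::BitSet`:

```rust
// Hypothetical stand-in: `len` reports the number of set bits, not `max_doc / 2`.
struct TinyBitSet {
    words: Vec<u64>,
}

impl TinyBitSet {
    fn with_max_doc(max_doc: u32) -> Self {
        TinyBitSet { words: vec![0; (max_doc as usize + 63) / 64] }
    }
    fn insert(&mut self, doc: u32) {
        self.words[(doc / 64) as usize] |= 1u64 << (doc % 64);
    }
    fn len(&self) -> u32 {
        self.words.iter().map(|w| w.count_ones()).sum()
    }
}

fn main() {
    let mut docs = TinyBitSet::with_max_doc(1_000);
    for doc in [3u32, 17, 256] {
        docs.insert(doc);
    }
    assert_eq!(docs.len(), 3); // the size hint reflects set bits
}
```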

View File

@@ -18,7 +18,6 @@ mod phrase_query;
mod query; mod query;
mod query_parser; mod query_parser;
mod range_query; mod range_query;
mod range_query_ip_fastfield;
mod regex_query; mod regex_query;
mod reqopt_scorer; mod reqopt_scorer;
mod scorer; mod scorer;

View File

@@ -31,7 +31,7 @@ pub struct MoreLikeThisQuery {
#[derive(Debug, PartialEq, Clone)] #[derive(Debug, PartialEq, Clone)]
enum TargetDocument { enum TargetDocument {
DocumentAdress(DocAddress), DocumentAdress(DocAddress),
DocumentFields(Vec<(Field, Vec<Value<'static>>)>), DocumentFields(Vec<(Field, Vec<Value>)>),
} }
impl MoreLikeThisQuery { impl MoreLikeThisQuery {
@@ -160,10 +160,7 @@ impl MoreLikeThisQueryBuilder {
/// that will be used to compose the resulting query. /// that will be used to compose the resulting query.
/// This interface is meant to be used when you want to provide your own set of fields /// This interface is meant to be used when you want to provide your own set of fields
/// not necessarily from a specific document. /// not necessarily from a specific document.
pub fn with_document_fields( pub fn with_document_fields(self, doc_fields: Vec<(Field, Vec<Value>)>) -> MoreLikeThisQuery {
self,
doc_fields: Vec<(Field, Vec<Value<'static>>)>,
) -> MoreLikeThisQuery {
MoreLikeThisQuery { MoreLikeThisQuery {
mlt: self.mlt, mlt: self.mlt,
target: TargetDocument::DocumentFields(doc_fields), target: TargetDocument::DocumentFields(doc_fields),

View File

@@ -6,13 +6,12 @@ use common::BitSet;
use crate::core::{Searcher, SegmentReader}; use crate::core::{Searcher, SegmentReader};
use crate::error::TantivyError; use crate::error::TantivyError;
use crate::query::explanation::does_not_match; use crate::query::explanation::does_not_match;
use crate::query::range_query_ip_fastfield::IPFastFieldRangeWeight;
use crate::query::{BitSetDocSet, ConstScorer, Explanation, Query, Scorer, Weight}; use crate::query::{BitSetDocSet, ConstScorer, Explanation, Query, Scorer, Weight};
use crate::schema::{Field, IndexRecordOption, Term, Type}; use crate::schema::{Field, IndexRecordOption, Term, Type};
use crate::termdict::{TermDictionary, TermStreamer}; use crate::termdict::{TermDictionary, TermStreamer};
use crate::{DocId, Score}; use crate::{DocId, Score};
pub(crate) fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>( fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
bound: &Bound<TFrom>, bound: &Bound<TFrom>,
transform: &Transform, transform: &Transform,
) -> Bound<TTo> { ) -> Bound<TTo> {
@@ -30,17 +29,8 @@ pub(crate) fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
/// ///
/// # Implementation /// # Implementation
/// ///
/// ## Default /// The current implement will iterate over the terms within the range
/// The default implementation collects all documents _upfront_ into a `BitSet`. /// and append all of the document cross into a `BitSet`.
/// This is done by iterating over the terms within the range and loading all docs for each
/// `TermInfo` from the inverted index (posting list) and put them into a `BitSet`.
/// Depending on the number of terms matched, this is a potentially expensive operation.
///
/// ## IP fast field
/// For IP fast fields a custom variant is used, by scanning the fast field. Unlike the default
/// variant we can walk in a lazy fashion over it, since the fastfield is implicit orderered by
/// DocId.
///
/// ///
/// # Example /// # Example
/// ///
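A compact sketch of the two evaluation strategies this doc comment contrasts, over assumed in-memory inputs instead of tantivy's real posting lists and fast field columns:

```rust
use std::ops::RangeInclusive;

// Default strategy: iterate every term falling in the range and OR its documents
// into an upfront bitset (a plain Vec<bool> here).
fn collect_via_postings(term_postings_in_range: &[Vec<u32>], max_doc: u32) -> Vec<bool> {
    let mut matches = vec![false; max_doc as usize];
    for postings in term_postings_in_range {
        for &doc in postings {
            matches[doc as usize] = true;
        }
    }
    matches
}

// Fast field strategy: the column is addressed by doc id, so matches can be produced
// lazily and in doc id order by testing each stored value against the range.
fn scan_fast_field<'a>(
    column: &'a [u128],
    value_range: RangeInclusive<u128>,
) -> impl Iterator<Item = u32> + 'a {
    column
        .iter()
        .enumerate()
        .filter(move |&(_, value)| value_range.contains(value))
        .map(|(doc, _)| doc as u32)
}
```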
@@ -259,8 +249,7 @@ impl Query for RangeQuery {
_scoring_enabled: bool, _scoring_enabled: bool,
) -> crate::Result<Box<dyn Weight>> { ) -> crate::Result<Box<dyn Weight>> {
let schema = searcher.schema(); let schema = searcher.schema();
let field_type = schema.get_field_entry(self.field).field_type(); let value_type = schema.get_field_entry(self.field).field_type().value_type();
let value_type = field_type.value_type();
if value_type != self.value_type { if value_type != self.value_type {
let err_msg = format!( let err_msg = format!(
"Create a range query of the type {:?}, when the field given was of type {:?}", "Create a range query of the type {:?}, when the field given was of type {:?}",
@@ -268,20 +257,11 @@ impl Query for RangeQuery {
); );
return Err(TantivyError::SchemaError(err_msg)); return Err(TantivyError::SchemaError(err_msg));
} }
-            if field_type.is_ip_addr() && field_type.is_fast() {
-                Ok(Box::new(IPFastFieldRangeWeight::new(
-                    self.field,
-                    &self.left_bound,
-                    &self.right_bound,
-                )))
-            } else {
-                Ok(Box::new(RangeWeight {
-                    field: self.field,
-                    left_bound: self.left_bound.clone(),
-                    right_bound: self.right_bound.clone(),
-                }))
-            }
+        Ok(Box::new(RangeWeight {
+            field: self.field,
+            left_bound: self.left_bound.clone(),
+            right_bound: self.right_bound.clone(),
+        }))
} }
} }
@@ -355,7 +335,7 @@ mod tests {
use super::RangeQuery; use super::RangeQuery;
use crate::collector::{Count, TopDocs}; use crate::collector::{Count, TopDocs};
use crate::query::QueryParser; use crate::query::QueryParser;
use crate::schema::{Document, Field, IntoIpv6Addr, Schema, FAST, INDEXED, STORED, TEXT}; use crate::schema::{Document, Field, IntoIpv6Addr, Schema, INDEXED, STORED, TEXT};
use crate::{doc, Index}; use crate::{doc, Index};
#[test] #[test]
@@ -529,24 +509,10 @@ mod tests {
Ok(()) Ok(())
} }
#[test]
fn search_ip_range_test_posting_list() {
search_ip_range_test_opt(false);
}
#[test] #[test]
fn search_ip_range_test() { fn search_ip_range_test() {
search_ip_range_test_opt(true);
}
fn search_ip_range_test_opt(with_fast_field: bool) {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let ip_field = if with_fast_field { let ip_field = schema_builder.add_ip_addr_field("ip", INDEXED | STORED);
schema_builder.add_ip_addr_field("ip", INDEXED | STORED | FAST)
} else {
schema_builder.add_ip_addr_field("ip", INDEXED | STORED)
};
let text_field = schema_builder.add_text_field("text", TEXT | STORED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let ip_addr_1 = IpAddr::from_str("127.0.0.10").unwrap().into_ipv6_addr(); let ip_addr_1 = IpAddr::from_str("127.0.0.10").unwrap().into_ipv6_addr();
@@ -554,22 +520,16 @@ mod tests {
{ {
let mut index_writer = index.writer(3_000_000).unwrap(); let mut index_writer = index.writer(3_000_000).unwrap();
-            for _ in 0..1_000 {
-                index_writer
-                    .add_document(doc!(
-                        ip_field => ip_addr_1,
-                        text_field => "BLUBBER"
-                    ))
-                    .unwrap();
-            }
-            for _ in 0..1_000 {
-                index_writer
-                    .add_document(doc!(
-                        ip_field => ip_addr_2,
-                        text_field => "BLOBBER"
-                    ))
-                    .unwrap();
-            }
+            index_writer
+                .add_document(doc!(
+                    ip_field => ip_addr_1
+                ))
+                .unwrap();
+            index_writer
+                .add_document(doc!(
+                    ip_field => ip_addr_2
+                ))
+                .unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();
} }
@@ -583,25 +543,24 @@ mod tests {
count count
}; };
let query_from_text = |text: &str| { let query_from_text = |text: &str| {
QueryParser::for_index(&index, vec![]) QueryParser::for_index(&index, vec![ip_field])
.parse_query(text) .parse_query(text)
.unwrap() .unwrap()
}; };
// Inclusive range
assert_eq!( assert_eq!(
get_num_hits(query_from_text("ip:[127.0.0.1 TO 127.0.0.20]")), get_num_hits(query_from_text("ip:[127.0.0.1 TO 127.0.0.20]")),
2000 2
); );
assert_eq!( assert_eq!(
get_num_hits(query_from_text("ip:[127.0.0.10 TO 127.0.0.20]")), get_num_hits(query_from_text("ip:[127.0.0.10 TO 127.0.0.20]")),
2000 2
); );
assert_eq!( assert_eq!(
get_num_hits(query_from_text("ip:[127.0.0.11 TO 127.0.0.20]")), get_num_hits(query_from_text("ip:[127.0.0.11 TO 127.0.0.20]")),
1000 1
); );
assert_eq!( assert_eq!(
@@ -609,84 +568,9 @@ mod tests {
0 0
); );
assert_eq!(get_num_hits(query_from_text("ip:[127.0.0.11 TO *]")), 1000); assert_eq!(get_num_hits(query_from_text("ip:[127.0.0.11 TO *]")), 1);
assert_eq!(get_num_hits(query_from_text("ip:[127.0.0.21 TO *]")), 0); assert_eq!(get_num_hits(query_from_text("ip:[127.0.0.21 TO *]")), 0);
assert_eq!(get_num_hits(query_from_text("ip:[* TO 127.0.0.9]")), 0); assert_eq!(get_num_hits(query_from_text("ip:[* TO 127.0.0.9]")), 0);
assert_eq!(get_num_hits(query_from_text("ip:[* TO 127.0.0.10]")), 1000); assert_eq!(get_num_hits(query_from_text("ip:[* TO 127.0.0.10]")), 1);
// Exclusive range
assert_eq!(
get_num_hits(query_from_text("ip:{127.0.0.1 TO 127.0.0.20}")),
1000
);
assert_eq!(
get_num_hits(query_from_text("ip:{127.0.0.1 TO 127.0.0.21}")),
2000
);
assert_eq!(
get_num_hits(query_from_text("ip:{127.0.0.10 TO 127.0.0.20}")),
0
);
assert_eq!(
get_num_hits(query_from_text("ip:{127.0.0.11 TO 127.0.0.20}")),
0
);
assert_eq!(
get_num_hits(query_from_text("ip:{127.0.0.11 TO 127.0.0.19}")),
0
);
assert_eq!(get_num_hits(query_from_text("ip:{127.0.0.11 TO *}")), 1000);
assert_eq!(get_num_hits(query_from_text("ip:{127.0.0.10 TO *}")), 1000);
assert_eq!(get_num_hits(query_from_text("ip:{127.0.0.21 TO *}")), 0);
assert_eq!(get_num_hits(query_from_text("ip:{127.0.0.20 TO *}")), 0);
assert_eq!(get_num_hits(query_from_text("ip:{127.0.0.19 TO *}")), 1000);
assert_eq!(get_num_hits(query_from_text("ip:{* TO 127.0.0.9}")), 0);
assert_eq!(get_num_hits(query_from_text("ip:{* TO 127.0.0.10}")), 0);
assert_eq!(get_num_hits(query_from_text("ip:{* TO 127.0.0.11}")), 1000);
// Inclusive/Exclusive range
assert_eq!(
get_num_hits(query_from_text("ip:[127.0.0.1 TO 127.0.0.20}")),
1000
);
assert_eq!(
get_num_hits(query_from_text("ip:{127.0.0.1 TO 127.0.0.20]")),
2000
);
// Intersection
assert_eq!(
get_num_hits(query_from_text(
"text:BLUBBER AND ip:[127.0.0.10 TO 127.0.0.10]"
)),
1000
);
assert_eq!(
get_num_hits(query_from_text(
"text:BLOBBER AND ip:[127.0.0.10 TO 127.0.0.10]"
)),
0
);
assert_eq!(
get_num_hits(query_from_text(
"text:BLOBBER AND ip:[127.0.0.20 TO 127.0.0.20]"
)),
1000
);
assert_eq!(
get_num_hits(query_from_text(
"text:BLUBBER AND ip:[127.0.0.20 TO 127.0.0.20]"
)),
0
);
} }
} }
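For reference, a condensed standalone version of the test above, assuming the side of this comparison on which IP address fields exist; it shows the inclusive bracket syntax the query parser accepts:

```rust
use std::net::IpAddr;
use std::str::FromStr;

use tantivy::collector::Count;
use tantivy::query::QueryParser;
use tantivy::schema::{IntoIpv6Addr, Schema, INDEXED, STORED};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let ip_field = schema_builder.add_ip_addr_field("ip", INDEXED | STORED);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer(3_000_000)?;
    let ip_addr_1 = IpAddr::from_str("127.0.0.10").unwrap().into_ipv6_addr();
    let ip_addr_2 = IpAddr::from_str("127.0.0.20").unwrap().into_ipv6_addr();
    index_writer.add_document(doc!(ip_field => ip_addr_1))?;
    index_writer.add_document(doc!(ip_field => ip_addr_2))?;
    index_writer.commit()?;

    let searcher = index.reader()?.searcher();
    let query = QueryParser::for_index(&index, vec![])
        .parse_query("ip:[127.0.0.1 TO 127.0.0.20]")
        .unwrap();
    // `[a TO b]` is inclusive on both ends, so both documents match.
    assert_eq!(searcher.search(&query, &Count)?, 2);
    Ok(())
}
```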

View File

@@ -1,595 +0,0 @@
//! IP Fastfields support efficient scanning for range queries.
//! We use this variant only if the fastfield exists, otherwise the default in `range_query` is
//! used, which uses the term dictionary + postings.
use std::net::Ipv6Addr;
use std::ops::{Bound, RangeInclusive};
use std::sync::Arc;
use common::BinarySerializable;
use fastfield_codecs::{Column, MonotonicallyMappableToU128};
use super::range_query::map_bound;
use super::{ConstScorer, Explanation, Scorer, Weight};
use crate::schema::{Cardinality, Field};
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, TERMINATED};
/// `IPFastFieldRangeWeight` uses the ip address fast field to execute range queries.
pub struct IPFastFieldRangeWeight {
field: Field,
left_bound: Bound<Ipv6Addr>,
right_bound: Bound<Ipv6Addr>,
}
impl IPFastFieldRangeWeight {
pub fn new(field: Field, left_bound: &Bound<Vec<u8>>, right_bound: &Bound<Vec<u8>>) -> Self {
let ip_from_bound_raw_data = |data: &Vec<u8>| {
let left_ip_u128: u128 =
u128::from_be(BinarySerializable::deserialize(&mut &data[..]).unwrap());
Ipv6Addr::from_u128(left_ip_u128)
};
let left_bound = map_bound(left_bound, &ip_from_bound_raw_data);
let right_bound = map_bound(right_bound, &ip_from_bound_raw_data);
Self {
field,
left_bound,
right_bound,
}
}
}
impl Weight for IPFastFieldRangeWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let field_type = reader.schema().get_field_entry(self.field).field_type();
match field_type.fastfield_cardinality().unwrap() {
Cardinality::SingleValue => {
let ip_addr_fast_field = reader.fast_fields().ip_addr(self.field)?;
let value_range = bound_to_value_range(
&self.left_bound,
&self.right_bound,
ip_addr_fast_field.as_ref(),
);
let docset = IpRangeDocSet::new(value_range, ip_addr_fast_field);
Ok(Box::new(ConstScorer::new(docset, boost)))
}
Cardinality::MultiValues => unimplemented!(),
}
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(TantivyError::InvalidArgument(format!(
"Document #({}) does not match",
doc
)));
}
let explanation = Explanation::new("Const", scorer.score());
Ok(explanation)
}
}
fn bound_to_value_range(
left_bound: &Bound<Ipv6Addr>,
right_bound: &Bound<Ipv6Addr>,
column: &dyn Column<Ipv6Addr>,
) -> RangeInclusive<Ipv6Addr> {
let start_value = match left_bound {
Bound::Included(ip_addr) => *ip_addr,
Bound::Excluded(ip_addr) => Ipv6Addr::from(ip_addr.to_u128() + 1),
Bound::Unbounded => column.min_value(),
};
let end_value = match right_bound {
Bound::Included(ip_addr) => *ip_addr,
Bound::Excluded(ip_addr) => Ipv6Addr::from(ip_addr.to_u128() - 1),
Bound::Unbounded => column.max_value(),
};
start_value..=end_value
}
/// Helper to have a cursor over a vec of docids
struct VecCursor {
docs: Vec<u32>,
current_pos: usize,
}
impl VecCursor {
fn new() -> Self {
Self {
docs: Vec::with_capacity(32),
current_pos: 0,
}
}
fn next(&mut self) -> Option<u32> {
self.current_pos += 1;
self.current()
}
#[inline]
fn current(&self) -> Option<u32> {
self.docs.get(self.current_pos).map(|el| *el as u32)
}
fn get_cleared_data(&mut self) -> &mut Vec<u32> {
self.docs.clear();
self.current_pos = 0;
&mut self.docs
}
fn is_empty(&self) -> bool {
self.current_pos >= self.docs.len()
}
}
struct IpRangeDocSet {
/// The range filter on the values.
value_range: RangeInclusive<Ipv6Addr>,
ip_addr_fast_field: Arc<dyn Column<Ipv6Addr>>,
/// The next docid start range to fetch (inclusive).
next_fetch_start: u32,
/// Number of docs range checked in a batch.
///
/// There are two patterns.
/// - We do a full scan. => We can load large chunks. We don't know in advance if seek call
/// will come, so we start with small chunks
/// - We load docs, interspersed with seek calls. When there are big jumps in the seek, we
/// should load small chunks. When the seeks are small, we can employ the same strategy as on a
/// full scan.
fetch_horizon: u32,
/// Current batch of loaded docs.
loaded_docs: VecCursor,
last_seek_pos_opt: Option<u32>,
}
const DEFALT_FETCH_HORIZON: u32 = 128;
impl IpRangeDocSet {
fn new(
value_range: RangeInclusive<Ipv6Addr>,
ip_addr_fast_field: Arc<dyn Column<Ipv6Addr>>,
) -> Self {
let mut ip_range_docset = Self {
value_range,
ip_addr_fast_field,
loaded_docs: VecCursor::new(),
next_fetch_start: 0,
fetch_horizon: DEFALT_FETCH_HORIZON,
last_seek_pos_opt: None,
};
ip_range_docset.reset_fetch_range();
ip_range_docset.fetch_block();
ip_range_docset
}
fn reset_fetch_range(&mut self) {
self.fetch_horizon = DEFALT_FETCH_HORIZON;
}
/// Returns true if more data could be fetched
fn fetch_block(&mut self) {
const MAX_HORIZON: u32 = 100_000;
while self.loaded_docs.is_empty() {
let finished_to_end = self.fetch_horizon(self.fetch_horizon);
if finished_to_end {
break;
}
// Fetch more data, increase horizon. Horizon only gets reset when doing a seek.
self.fetch_horizon = (self.fetch_horizon * 2).min(MAX_HORIZON);
}
}
/// check if the distance between the seek calls is large
fn is_last_seek_distance_large(&self, new_seek: DocId) -> bool {
if let Some(last_seek_pos) = self.last_seek_pos_opt {
(new_seek - last_seek_pos) >= 128
} else {
true
}
}
/// Fetches a block for docid range [next_fetch_start .. next_fetch_start + HORIZON]
fn fetch_horizon(&mut self, horizon: u32) -> bool {
let mut finished_to_end = false;
let limit = self.ip_addr_fast_field.num_vals();
let mut end = self.next_fetch_start + horizon;
if end >= limit {
end = limit;
finished_to_end = true;
}
let data = self.loaded_docs.get_cleared_data();
self.ip_addr_fast_field.get_positions_for_value_range(
self.value_range.clone(),
self.next_fetch_start..end,
data,
);
self.next_fetch_start = end;
finished_to_end
}
}
impl DocSet for IpRangeDocSet {
#[inline]
fn advance(&mut self) -> DocId {
if let Some(docid) = self.loaded_docs.next() {
docid as u32
} else {
if self.next_fetch_start >= self.ip_addr_fast_field.num_vals() as u32 {
return TERMINATED;
}
self.fetch_block();
self.loaded_docs.current().unwrap_or(TERMINATED)
}
}
#[inline]
fn doc(&self) -> DocId {
self.loaded_docs
.current()
.map(|el| el as u32)
.unwrap_or(TERMINATED)
}
/// Advances the `DocSet` forward until reaching the target, or going to the
/// lowest [`DocId`] greater than the target.
///
/// If the end of the `DocSet` is reached, [`TERMINATED`] is returned.
///
/// Calling `.seek(target)` on a terminated `DocSet` is legal. Implementation
/// of `DocSet` should support it.
///
/// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a `DocSet`.
fn seek(&mut self, target: DocId) -> DocId {
if self.is_last_seek_distance_large(target) {
self.reset_fetch_range();
}
if target > self.next_fetch_start {
self.next_fetch_start = target;
}
let mut doc = self.doc();
debug_assert!(doc <= target);
while doc < target {
doc = self.advance();
}
self.last_seek_pos_opt = Some(target);
doc
}
fn size_hint(&self) -> u32 {
0 // heuristic possible by checking number of hits when fetching a block
}
}
#[cfg(test)]
mod tests {
use proptest::prelude::ProptestConfig;
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest};
use super::*;
use crate::collector::Count;
use crate::query::QueryParser;
use crate::schema::{Schema, FAST, INDEXED, STORED, STRING};
use crate::Index;
#[derive(Clone, Debug)]
pub struct Doc {
pub id: String,
pub ip: Ipv6Addr,
}
fn operation_strategy() -> impl Strategy<Value = Doc> {
prop_oneof![
(0u64..100u64).prop_map(doc_from_id_1),
(1u64..100u64).prop_map(doc_from_id_2),
]
}
pub fn doc_from_id_1(id: u64) -> Doc {
Doc {
// ip != id
id: id.to_string(),
ip: Ipv6Addr::from_u128(id as u128),
}
}
fn doc_from_id_2(id: u64) -> Doc {
Doc {
// ip != id
id: (id - 1).to_string(),
ip: Ipv6Addr::from_u128(id as u128),
}
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(10))]
#[test]
fn test_ip_range_for_docs_prop(ops in proptest::collection::vec(operation_strategy(), 1..1000)) {
assert!(test_ip_range_for_docs(ops).is_ok());
}
}
#[test]
fn ip_range_regression1_test() {
let ops = vec![
doc_from_id_1(52),
doc_from_id_1(63),
doc_from_id_1(12),
doc_from_id_2(91),
doc_from_id_2(33),
];
assert!(test_ip_range_for_docs(ops).is_ok());
}
#[test]
fn ip_range_regression2_test() {
let ops = vec![doc_from_id_1(0)];
assert!(test_ip_range_for_docs(ops).is_ok());
}
pub fn create_index_from_docs(docs: &[Doc]) -> Index {
let mut schema_builder = Schema::builder();
let ip_field = schema_builder.add_ip_addr_field("ip", INDEXED | STORED | FAST);
let text_field = schema_builder.add_text_field("id", STRING | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer(3_000_000).unwrap();
for doc in docs.iter() {
index_writer
.add_document(doc!(
ip_field => doc.ip,
text_field => doc.id.to_string(),
))
.unwrap();
}
index_writer.commit().unwrap();
}
index
}
fn test_ip_range_for_docs(docs: Vec<Doc>) -> crate::Result<()> {
let index = create_index_from_docs(&docs);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let get_num_hits = |query| searcher.search(&query, &(Count)).unwrap();
let query_from_text = |text: &str| {
QueryParser::for_index(&index, vec![])
.parse_query(text)
.unwrap()
};
let gen_query_inclusive = |from: Ipv6Addr, to: Ipv6Addr| {
format!("ip:[{} TO {}]", &from.to_string(), &to.to_string())
};
let test_sample = |sample_docs: Vec<Doc>| {
let mut ips: Vec<Ipv6Addr> = sample_docs.iter().map(|doc| doc.ip).collect();
ips.sort();
let expected_num_hits = docs
.iter()
.filter(|doc| (ips[0]..=ips[1]).contains(&doc.ip))
.count();
let query = gen_query_inclusive(ips[0], ips[1]);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
// Intersection search
let id_filter = sample_docs[0].id.to_string();
let expected_num_hits = docs
.iter()
.filter(|doc| (ips[0]..=ips[1]).contains(&doc.ip) && doc.id == id_filter)
.count();
let query = format!("{} AND id:{}", query, &id_filter);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
};
test_sample(vec![docs[0].clone(), docs[0].clone()]);
if docs.len() > 1 {
test_sample(vec![docs[0].clone(), docs[1].clone()]);
test_sample(vec![docs[1].clone(), docs[1].clone()]);
}
if docs.len() > 2 {
test_sample(vec![docs[1].clone(), docs[2].clone()]);
}
Ok(())
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use rand::{thread_rng, Rng};
use test::Bencher;
use super::tests::*;
use super::*;
use crate::collector::Count;
use crate::query::QueryParser;
use crate::Index;
fn get_index_0_to_100() -> Index {
let mut rng = thread_rng();
let num_vals = 100_000;
let docs: Vec<_> = (0..num_vals)
.map(|_i| {
let id = if rng.gen_bool(0.01) {
"veryfew".to_string() // 1%
} else if rng.gen_bool(0.1) {
"few".to_string() // 9%
} else {
"many".to_string() // 90%
};
Doc {
id: id,
// Multiply by 1000, so that we create many buckets in the compact space
ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
}
})
.collect();
let index = create_index_from_docs(&docs);
index
}
fn excute_query(
start_inclusive: Ipv6Addr,
end_inclusive: Ipv6Addr,
suffix: &str,
index: &Index,
) -> usize {
let gen_query_inclusive = |from: Ipv6Addr, to: Ipv6Addr| {
format!(
"ip:[{} TO {}] {}",
&from.to_string(),
&to.to_string(),
suffix
)
};
let query = gen_query_inclusive(start_inclusive, end_inclusive);
let query_from_text = |text: &str| {
QueryParser::for_index(&index, vec![])
.parse_query(text)
.unwrap()
};
let query = query_from_text(&query);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
searcher.search(&query, &(Count)).unwrap()
}
#[bench]
fn bench_ip_range_hit_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(90 * 1000);
excute_query(start, end, "", &index)
});
}
#[bench]
fn bench_ip_range_hit_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(10 * 1000);
excute_query(start, end, "", &index)
});
}
#[bench]
fn bench_ip_range_hit_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| {
let start = Ipv6Addr::from_u128(10 * 1000);
let end = Ipv6Addr::from_u128(10 * 1000);
excute_query(start, end, "", &index)
});
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(10 * 1000);
excute_query(start, end, "AND id:few", &index)
});
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| {
let start = Ipv6Addr::from_u128(10 * 1000);
let end = Ipv6Addr::from_u128(10 * 1000);
excute_query(start, end, "AND id:few", &index)
});
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| {
let start = Ipv6Addr::from_u128(10 * 1000);
let end = Ipv6Addr::from_u128(10 * 1000);
excute_query(start, end, "AND id:many", &index)
});
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| {
let start = Ipv6Addr::from_u128(10 * 1000);
let end = Ipv6Addr::from_u128(10 * 1000);
excute_query(start, end, "AND id:veryfew", &index)
});
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(10 * 1000);
excute_query(start, end, "AND id:many", &index)
});
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(90 * 1000);
excute_query(start, end, "AND id:many", &index)
});
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(90 * 1000);
excute_query(start, end, "AND id:few", &index)
});
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(90 * 1000);
excute_query(start, end, "AND id:veryfew", &index)
});
}
}
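The batching strategy described in the comments of `IpRangeDocSet` above (start with a small horizon, keep doubling it up to a cap until a block yields at least one hit) can be summarized by this standalone sketch, with a plain slice standing in for the IP fast field column:

```rust
use std::ops::RangeInclusive;

/// Scan `column[start..]` in growing blocks; return the doc ids of the first
/// non-empty block and the position where the next fetch should resume.
fn fetch_next_block(
    column: &[u128],
    value_range: &RangeInclusive<u128>,
    mut start: usize,
    mut horizon: usize,
) -> (Vec<u32>, usize) {
    const MAX_HORIZON: usize = 100_000;
    let mut hits = Vec::new();
    while hits.is_empty() && start < column.len() {
        let end = (start + horizon).min(column.len());
        for (offset, value) in column[start..end].iter().enumerate() {
            if value_range.contains(value) {
                hits.push((start + offset) as u32);
            }
        }
        start = end;
        // As in `fetch_block` above: keep doubling the horizon while blocks come up empty.
        horizon = (horizon * 2).min(MAX_HORIZON);
    }
    (hits, start)
}

fn main() {
    let column: Vec<u128> = (0u128..1_000).map(|i| i % 7).collect();
    let (hits, resume_at) = fetch_next_block(&column, &(3u128..=4), 0, 128);
    assert!(!hits.is_empty());
    assert_eq!(resume_at, 128);
}
```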

View File

@@ -115,7 +115,7 @@ mod tests {
pub fn test_term_set_query() -> crate::Result<()> { pub fn test_term_set_query() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let field1 = schema_builder.add_text_field("field1", TEXT); let field1 = schema_builder.add_text_field("field1", TEXT);
let field2 = schema_builder.add_text_field("field2", TEXT); let field2 = schema_builder.add_text_field("field1", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {

View File

@@ -1,105 +1,35 @@
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
use std::io::{self, Read, Write}; use std::io::{self, Read, Write};
use std::mem;
use std::net::Ipv6Addr; use std::net::Ipv6Addr;
use std::sync::Arc;
use std::{fmt, mem};
use common::{BinarySerializable, VInt}; use common::{BinarySerializable, VInt};
use itertools::Either;
use yoke::erased::ErasedArcCart;
use yoke::Yoke;
use super::*; use super::*;
use crate::schema::value::MaybeOwnedString;
use crate::tokenizer::PreTokenizedString; use crate::tokenizer::PreTokenizedString;
use crate::DateTime; use crate::DateTime;
/// A group of FieldValue sharing an underlying storage
///
/// Or a single owned FieldValue.
#[derive(Clone)]
enum FieldValueGroup {
Single(FieldValue<'static>),
Group(Yoke<VecFieldValue<'static>, ErasedArcCart>),
}
// this NewType is required to make it possible to yoke a vec with non 'static inner values.
#[derive(yoke::Yokeable, Clone)]
struct VecFieldValue<'a>(Vec<FieldValue<'a>>);
impl<'a> std::ops::Deref for VecFieldValue<'a> {
type Target = Vec<FieldValue<'a>>;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl<'a> From<Vec<FieldValue<'a>>> for VecFieldValue<'a> {
fn from(field_values: Vec<FieldValue>) -> VecFieldValue {
VecFieldValue(field_values)
}
}
impl FieldValueGroup {
fn iter(&self) -> impl Iterator<Item = &FieldValue> {
match self {
FieldValueGroup::Single(field_value) => Either::Left(std::iter::once(field_value)),
FieldValueGroup::Group(field_values) => Either::Right(field_values.get().iter()),
}
}
fn count(&self) -> usize {
match self {
FieldValueGroup::Single(_) => 1,
FieldValueGroup::Group(field_values) => field_values.get().len(),
}
}
}
impl From<Vec<FieldValue<'static>>> for FieldValueGroup {
fn from(field_values: Vec<FieldValue<'static>>) -> FieldValueGroup {
FieldValueGroup::Group(
Yoke::new_always_owned(field_values.into())
.wrap_cart_in_arc()
.erase_arc_cart(),
)
}
}
/// Tantivy's Document is the object that can /// Tantivy's Document is the object that can
/// be indexed and then searched for. /// be indexed and then searched for.
/// ///
/// Documents are fundamentally a collection of unordered couples `(field, value)`. /// Documents are fundamentally a collection of unordered couples `(field, value)`.
/// In this list, one field may appear more than once. /// In this list, one field may appear more than once.
#[derive(Clone, Default)] #[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Default)]
// TODO bring back Ser/De and Debug
//#[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Default)]
//#[serde(bound(deserialize = "'static: 'de, 'de: 'static"))]
pub struct Document { pub struct Document {
field_values: Vec<FieldValueGroup>, field_values: Vec<FieldValue>,
} }
-impl fmt::Debug for Document {
-    fn fmt(&self, _: &mut fmt::Formatter<'_>) -> fmt::Result {
-        todo!()
-    }
-}
-impl From<Vec<FieldValue<'static>>> for Document {
-    fn from(field_values: Vec<FieldValue<'static>>) -> Self {
-        let field_values = vec![field_values.into()];
+impl From<Vec<FieldValue>> for Document {
+    fn from(field_values: Vec<FieldValue>) -> Self {
Document { field_values } Document { field_values }
} }
} }
impl PartialEq for Document { impl PartialEq for Document {
fn eq(&self, other: &Document) -> bool { fn eq(&self, other: &Document) -> bool {
// super slow, but only here for tests // super slow, but only here for tests
let convert_to_comparable_map = |field_values| { let convert_to_comparable_map = |field_values: &[FieldValue]| {
let mut field_value_set: HashMap<Field, HashSet<String>> = Default::default(); let mut field_value_set: HashMap<Field, HashSet<String>> = Default::default();
for field_value in field_values { for field_value in field_values.iter() {
// for some reason rustc fails to guess the type
let field_value: &FieldValue = field_value;
let json_val = serde_json::to_string(field_value.value()).unwrap(); let json_val = serde_json::to_string(field_value.value()).unwrap();
field_value_set field_value_set
.entry(field_value.field()) .entry(field_value.field())
@@ -109,9 +39,9 @@ impl PartialEq for Document {
field_value_set field_value_set
}; };
let self_field_values: HashMap<Field, HashSet<String>> = let self_field_values: HashMap<Field, HashSet<String>> =
convert_to_comparable_map(self.field_values()); convert_to_comparable_map(&self.field_values);
let other_field_values: HashMap<Field, HashSet<String>> = let other_field_values: HashMap<Field, HashSet<String>> =
convert_to_comparable_map(other.field_values()); convert_to_comparable_map(&other.field_values);
self_field_values.eq(&other_field_values) self_field_values.eq(&other_field_values)
} }
} }
@@ -119,13 +49,12 @@ impl PartialEq for Document {
impl Eq for Document {} impl Eq for Document {}
impl IntoIterator for Document { impl IntoIterator for Document {
type Item = FieldValue<'static>; type Item = FieldValue;
type IntoIter = std::vec::IntoIter<FieldValue<'static>>; type IntoIter = std::vec::IntoIter<FieldValue>;
fn into_iter(self) -> Self::IntoIter { fn into_iter(self) -> Self::IntoIter {
todo!() self.field_values.into_iter()
// self.field_values.into_iter()
} }
} }
@@ -155,7 +84,7 @@ impl Document {
/// Add a text field. /// Add a text field.
pub fn add_text<S: ToString>(&mut self, field: Field, text: S) { pub fn add_text<S: ToString>(&mut self, field: Field, text: S) {
let value = Value::Str(MaybeOwnedString::from_string(text.to_string())); let value = Value::Str(text.to_string());
self.add_field_value(field, value); self.add_field_value(field, value);
} }
@@ -209,35 +138,15 @@ impl Document {
} }
/// Add a (field, value) to the document. /// Add a (field, value) to the document.
pub fn add_field_value<T: Into<Value<'static>>>(&mut self, field: Field, typed_val: T) { pub fn add_field_value<T: Into<Value>>(&mut self, field: Field, typed_val: T) {
let value = typed_val.into(); let value = typed_val.into();
let field_value = FieldValue { field, value }; let field_value = FieldValue { field, value };
self.field_values.push(FieldValueGroup::Single(field_value)); self.field_values.push(field_value);
}
/// Add multiple borrowed values, also taking the container they're borrowing from
// TODO add a try_ variant?
pub fn add_borrowed_values<T, F>(&mut self, storage: T, f: F)
where
T: Send + Sync + 'static,
F: FnOnce(&T) -> Vec<FieldValue>,
{
let yoke =
Yoke::attach_to_cart(Arc::new(storage), |storage| f(storage).into()).erase_arc_cart();
self.field_values.push(FieldValueGroup::Group(yoke));
} }
/// field_values accessor /// field_values accessor
pub fn field_values(&self) -> impl Iterator<Item = &FieldValue> { pub fn field_values(&self) -> &[FieldValue] {
self.field_values.iter().flat_map(|group| group.iter()) &self.field_values
}
/// Return the total number of values
///
/// More efficient than calling `self.field_values().count()`
pub fn value_count(&self) -> usize {
self.field_values.iter().map(|group| group.count()).sum()
} }
/// Sort and groups the field_values by field. /// Sort and groups the field_values by field.
@@ -245,7 +154,7 @@ impl Document {
/// The result of this method is not cached and is /// The result of this method is not cached and is
/// computed on the fly when this method is called. /// computed on the fly when this method is called.
pub fn get_sorted_field_values(&self) -> Vec<(Field, Vec<&Value>)> { pub fn get_sorted_field_values(&self) -> Vec<(Field, Vec<&Value>)> {
let mut field_values: Vec<&FieldValue> = self.field_values().collect(); let mut field_values: Vec<&FieldValue> = self.field_values().iter().collect();
field_values.sort_by_key(|field_value| field_value.field()); field_values.sort_by_key(|field_value| field_value.field());
let mut field_values_it = field_values.into_iter(); let mut field_values_it = field_values.into_iter();
@@ -280,7 +189,6 @@ impl Document {
pub fn get_all(&self, field: Field) -> impl Iterator<Item = &Value> { pub fn get_all(&self, field: Field) -> impl Iterator<Item = &Value> {
self.field_values self.field_values
.iter() .iter()
.flat_map(|group| group.iter())
.filter(move |field_value| field_value.field() == field) .filter(move |field_value| field_value.field() == field)
.map(FieldValue::value) .map(FieldValue::value)
} }
@@ -294,6 +202,7 @@ impl Document {
pub fn serialize_stored<W: Write>(&self, schema: &Schema, writer: &mut W) -> io::Result<()> { pub fn serialize_stored<W: Write>(&self, schema: &Schema, writer: &mut W) -> io::Result<()> {
let stored_field_values = || { let stored_field_values = || {
self.field_values() self.field_values()
.iter()
.filter(|field_value| schema.get_field_entry(field_value.field()).is_stored()) .filter(|field_value| schema.get_field_entry(field_value.field()).is_stored())
}; };
let num_field_values = stored_field_values().count(); let num_field_values = stored_field_values().count();
@@ -307,9 +216,7 @@ impl Document {
} => { } => {
let field_value = FieldValue { let field_value = FieldValue {
field: *field, field: *field,
value: Value::Str(MaybeOwnedString::from_string( value: Value::Str(pre_tokenized_text.text.to_string()),
pre_tokenized_text.text.to_string(),
)),
}; };
field_value.serialize(writer)?; field_value.serialize(writer)?;
} }
@@ -323,7 +230,7 @@ impl Document {
impl BinarySerializable for Document { impl BinarySerializable for Document {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> { fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
let field_values = self.field_values(); let field_values = self.field_values();
VInt(self.value_count() as u64).serialize(writer)?; VInt(field_values.len() as u64).serialize(writer)?;
for field_value in field_values { for field_value in field_values {
field_value.serialize(writer)?; field_value.serialize(writer)?;
} }
@@ -352,7 +259,7 @@ mod tests {
let text_field = schema_builder.add_text_field("title", TEXT); let text_field = schema_builder.add_text_field("title", TEXT);
let mut doc = Document::default(); let mut doc = Document::default();
doc.add_text(text_field, "My title"); doc.add_text(text_field, "My title");
assert_eq!(doc.value_count(), 1); assert_eq!(doc.field_values().len(), 1);
} }
#[test] #[test]
@@ -366,7 +273,7 @@ mod tests {
.clone(), .clone(),
); );
doc.add_text(Field::from_field_id(1), "hello"); doc.add_text(Field::from_field_id(1), "hello");
assert_eq!(doc.value_count(), 2); assert_eq!(doc.field_values().len(), 2);
let mut payload: Vec<u8> = Vec::new(); let mut payload: Vec<u8> = Vec::new();
doc.serialize(&mut payload).unwrap(); doc.serialize(&mut payload).unwrap();
assert_eq!(payload.len(), 26); assert_eq!(payload.len(), 26);
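A usage sketch of the `add_borrowed_values` API declared in the hunk above; this only applies to the side of the comparison whose `FieldValue` carries a lifetime, with the bounds exactly as shown there:

```rust
use tantivy::schema::{FieldValue, Schema, Value, TEXT};
use tantivy::Document;

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let _schema = schema_builder.build();

    let mut doc = Document::default();
    // The owned storage is moved into the document; the produced field values
    // borrow from it instead of copying each string.
    let storage: Vec<String> = vec!["first value".to_string(), "second value".to_string()];
    doc.add_borrowed_values(storage, |strings| {
        strings
            .iter()
            .map(|s| FieldValue::new(title, Value::from(s.as_str())))
            .collect()
    });
    assert_eq!(doc.value_count(), 2);
}
```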

View File

@@ -9,7 +9,6 @@ use super::ip_options::IpAddrOptions;
use super::{Cardinality, IntoIpv6Addr}; use super::{Cardinality, IntoIpv6Addr};
use crate::schema::bytes_options::BytesOptions; use crate::schema::bytes_options::BytesOptions;
use crate::schema::facet_options::FacetOptions; use crate::schema::facet_options::FacetOptions;
use crate::schema::value::MaybeOwnedString;
use crate::schema::{ use crate::schema::{
DateOptions, Facet, IndexRecordOption, JsonObjectOptions, NumericOptions, TextFieldIndexing, DateOptions, Facet, IndexRecordOption, JsonObjectOptions, NumericOptions, TextFieldIndexing,
TextOptions, Value, TextOptions, Value,
@@ -177,11 +176,6 @@ impl FieldType {
} }
} }
/// returns true if this is an ip address field
pub fn is_ip_addr(&self) -> bool {
matches!(self, FieldType::IpAddr(_))
}
/// returns true if the field is indexed. /// returns true if the field is indexed.
pub fn is_indexed(&self) -> bool { pub fn is_indexed(&self) -> bool {
match *self { match *self {
@@ -238,11 +232,11 @@ impl FieldType {
/// returns true if the field is fast. /// returns true if the field is fast.
pub fn fastfield_cardinality(&self) -> Option<Cardinality> { pub fn fastfield_cardinality(&self) -> Option<Cardinality> {
match *self { match *self {
FieldType::Bytes(ref bytes_options) => { FieldType::Bytes(ref bytes_options) if bytes_options.is_fast() => {
bytes_options.is_fast().then_some(Cardinality::SingleValue) Some(Cardinality::SingleValue)
} }
FieldType::Str(ref text_options) => { FieldType::Str(ref text_options) if text_options.is_fast() => {
text_options.is_fast().then_some(Cardinality::MultiValues) Some(Cardinality::MultiValues)
} }
FieldType::U64(ref int_options) FieldType::U64(ref int_options)
| FieldType::I64(ref int_options) | FieldType::I64(ref int_options)
@@ -251,7 +245,7 @@ impl FieldType {
FieldType::Date(ref date_options) => date_options.get_fastfield_cardinality(), FieldType::Date(ref date_options) => date_options.get_fastfield_cardinality(),
FieldType::Facet(_) => Some(Cardinality::MultiValues), FieldType::Facet(_) => Some(Cardinality::MultiValues),
FieldType::JsonObject(_) => None, FieldType::JsonObject(_) => None,
FieldType::IpAddr(ref ip_addr_options) => ip_addr_options.get_fastfield_cardinality(), _ => None,
} }
} }
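In the `fastfield_cardinality` hunks above, the substantive difference is the dedicated `IpAddr` arm on one side; for the `Bytes` and `Str` arms, the change between a match guard falling through to `None` and `bool::then_some` is purely stylistic. The two spellings are equivalent:

```rust
#[derive(Debug, PartialEq)]
enum Cardinality {
    SingleValue,
}

fn main() {
    let is_fast = true;
    // Guard style: the arm only matches when the flag is set, otherwise it falls through.
    let with_guard = match is_fast {
        true => Some(Cardinality::SingleValue),
        false => None,
    };
    // `then_some` style: always reach the arm, let the bool decide Some/None.
    let with_then_some = is_fast.then_some(Cardinality::SingleValue);
    assert_eq!(with_guard, with_then_some);
}
```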
@@ -330,7 +324,7 @@ impl FieldType {
/// Tantivy will not try to cast values. /// Tantivy will not try to cast values.
/// For instance, If the json value is the integer `3` and the /// For instance, If the json value is the integer `3` and the
/// target field is a `Str`, this method will return an Error. /// target field is a `Str`, this method will return an Error.
pub fn value_from_json(&self, json: JsonValue) -> Result<Value<'static>, ValueParsingError> { pub fn value_from_json(&self, json: JsonValue) -> Result<Value, ValueParsingError> {
match json { match json {
JsonValue::String(field_text) => { JsonValue::String(field_text) => {
match self { match self {
@@ -342,7 +336,7 @@ impl FieldType {
})?; })?;
Ok(DateTime::from_utc(dt_with_fixed_tz).into()) Ok(DateTime::from_utc(dt_with_fixed_tz).into())
} }
FieldType::Str(_) => Ok(Value::Str(MaybeOwnedString::from_string(field_text))), FieldType::Str(_) => Ok(Value::Str(field_text)),
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => { FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => {
Err(ValueParsingError::TypeError { Err(ValueParsingError::TypeError {
expected: "an integer", expected: "an integer",

View File

@@ -7,13 +7,12 @@ use crate::schema::{Field, Value};
/// `FieldValue` holds together a `Field` and its `Value`. /// `FieldValue` holds together a `Field` and its `Value`.
#[allow(missing_docs)] #[allow(missing_docs)]
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(bound(deserialize = "'a: 'de, 'de: 'a"))] pub struct FieldValue {
pub struct FieldValue<'a> {
pub field: Field, pub field: Field,
pub value: Value<'a>, pub value: Value,
} }
impl<'a> FieldValue<'a> { impl FieldValue {
/// Constructor /// Constructor
pub fn new(field: Field, value: Value) -> FieldValue { pub fn new(field: Field, value: Value) -> FieldValue {
FieldValue { field, value } FieldValue { field, value }
@@ -30,13 +29,13 @@ impl<'a> FieldValue<'a> {
} }
} }
impl<'a> From<FieldValue<'a>> for Value<'a> { impl From<FieldValue> for Value {
fn from(field_value: FieldValue<'a>) -> Self { fn from(field_value: FieldValue) -> Self {
field_value.value field_value.value
} }
} }
impl<'a> BinarySerializable for FieldValue<'a> { impl BinarySerializable for FieldValue {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> { fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
self.field.serialize(writer)?; self.field.serialize(writer)?;
self.value.serialize(writer) self.value.serialize(writer)

View File

@@ -10,5 +10,4 @@ use crate::schema::Value;
/// A `NamedFieldDocument` is a simple representation of a document /// A `NamedFieldDocument` is a simple representation of a document
/// as a `BTreeMap<String, Vec<Value>>`. /// as a `BTreeMap<String, Vec<Value>>`.
#[derive(Debug, Deserialize, Serialize)] #[derive(Debug, Deserialize, Serialize)]
#[serde(bound(deserialize = "'static: 'de, 'de: 'static"))] pub struct NamedFieldDocument(pub BTreeMap<String, Vec<Value>>);
pub struct NamedFieldDocument(pub BTreeMap<String, Vec<Value<'static>>>);

View File

@@ -46,9 +46,13 @@ impl SchemaBuilder {
/// Adds a new u64 field. /// Adds a new u64 field.
/// Returns the associated field handle /// Returns the associated field handle
/// ///
/// # Panics /// # Caution
/// ///
/// Panics when field already exists. /// Appending two fields with the same name
/// will result in the shadowing of the first
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_u64_field<T: Into<NumericOptions>>( pub fn add_u64_field<T: Into<NumericOptions>>(
&mut self, &mut self,
field_name_str: &str, field_name_str: &str,
@@ -62,9 +66,13 @@ impl SchemaBuilder {
/// Adds a new i64 field. /// Adds a new i64 field.
/// Returns the associated field handle /// Returns the associated field handle
/// ///
/// # Panics /// # Caution
/// ///
/// Panics when field already exists. /// Appending two fields with the same name
/// will result in the shadowing of the first
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_i64_field<T: Into<NumericOptions>>( pub fn add_i64_field<T: Into<NumericOptions>>(
&mut self, &mut self,
field_name_str: &str, field_name_str: &str,
@@ -78,9 +86,13 @@ impl SchemaBuilder {
/// Adds a new f64 field. /// Adds a new f64 field.
/// Returns the associated field handle /// Returns the associated field handle
/// ///
/// # Panics /// # Caution
/// ///
/// Panics when field already exists. /// Appending two fields with the same name
/// will result in the shadowing of the first
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_f64_field<T: Into<NumericOptions>>( pub fn add_f64_field<T: Into<NumericOptions>>(
&mut self, &mut self,
field_name_str: &str, field_name_str: &str,
@@ -94,9 +106,13 @@ impl SchemaBuilder {
/// Adds a new bool field. /// Adds a new bool field.
/// Returns the associated field handle /// Returns the associated field handle
/// ///
/// # Panics /// # Caution
/// ///
/// Panics when field already exists. /// Appending two fields with the same name
/// will result in the shadowing of the first
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_bool_field<T: Into<NumericOptions>>( pub fn add_bool_field<T: Into<NumericOptions>>(
&mut self, &mut self,
field_name_str: &str, field_name_str: &str,
@@ -112,9 +128,13 @@ impl SchemaBuilder {
/// Internally, Tantivy simply stores dates as i64 UTC timestamps, /// Internally, Tantivy simply stores dates as i64 UTC timestamps,
/// while the user supplies DateTime values for convenience. /// while the user supplies DateTime values for convenience.
/// ///
/// # Panics /// # Caution
/// ///
/// Panics when field already exists. /// Appending two fields with the same name
/// will result in the shadowing of the first
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_date_field<T: Into<DateOptions>>( pub fn add_date_field<T: Into<DateOptions>>(
&mut self, &mut self,
field_name_str: &str, field_name_str: &str,
@@ -128,9 +148,13 @@ impl SchemaBuilder {
/// Adds a ip field. /// Adds a ip field.
/// Returns the associated field handle. /// Returns the associated field handle.
/// ///
/// # Panics /// # Caution
/// ///
/// Panics when field already exists. /// Appending two fields with the same name
/// will result in the shadowing of the first
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_ip_addr_field<T: Into<IpAddrOptions>>( pub fn add_ip_addr_field<T: Into<IpAddrOptions>>(
&mut self, &mut self,
field_name_str: &str, field_name_str: &str,
@@ -144,9 +168,13 @@ impl SchemaBuilder {
/// Adds a new text field. /// Adds a new text field.
/// Returns the associated field handle /// Returns the associated field handle
/// ///
/// # Panics /// # Caution
/// ///
/// Panics when field already exists. /// Appending two fields with the same name
/// will result in the shadowing of the first
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_text_field<T: Into<TextOptions>>( pub fn add_text_field<T: Into<TextOptions>>(
&mut self, &mut self,
field_name_str: &str, field_name_str: &str,
@@ -200,10 +228,8 @@ impl SchemaBuilder {
pub fn add_field(&mut self, field_entry: FieldEntry) -> Field { pub fn add_field(&mut self, field_entry: FieldEntry) -> Field {
let field = Field::from_field_id(self.fields.len() as u32); let field = Field::from_field_id(self.fields.len() as u32);
let field_name = field_entry.name().to_string(); let field_name = field_entry.name().to_string();
if let Some(_previous_value) = self.fields_map.insert(field_name, field) {
panic!("Field already exists in schema {}", field_entry.name());
};
self.fields.push(field_entry); self.fields.push(field_entry);
self.fields_map.insert(field_name, field);
field field
} }
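The behavioural difference documented above surfaces in `add_field`: depending on which side of this comparison you are on, registering the same field name twice either panics or silently shadows the earlier definition.

```rust
use tantivy::schema::{Schema, TEXT};

fn main() {
    let mut schema_builder = Schema::builder();
    let _first = schema_builder.add_text_field("title", TEXT);
    // One side: panics with "Field already exists in schema title".
    // Other side: succeeds; the second definition shadows the first, and only it is indexed.
    let _second = schema_builder.add_text_field("title", TEXT);
    let _schema = schema_builder.build();
}
```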
@@ -308,11 +334,7 @@ impl Schema {
let mut field_map = BTreeMap::new(); let mut field_map = BTreeMap::new();
for (field, field_values) in doc.get_sorted_field_values() { for (field, field_values) in doc.get_sorted_field_values() {
let field_name = self.get_field_name(field); let field_name = self.get_field_name(field);
let values: Vec<Value> = field_values let values: Vec<Value> = field_values.into_iter().cloned().collect();
.into_iter()
.cloned()
.map(Value::into_owned)
.collect();
field_map.insert(field_name.to_string(), values); field_map.insert(field_name.to_string(), values);
} }
NamedFieldDocument(field_map) NamedFieldDocument(field_map)
@@ -342,21 +364,20 @@ impl Schema {
if let Some(field) = self.get_field(&field_name) { if let Some(field) = self.get_field(&field_name) {
let field_entry = self.get_field_entry(field); let field_entry = self.get_field_entry(field);
let field_type = field_entry.field_type(); let field_type = field_entry.field_type();
// TODO rewrite this with shared allocation?
match json_value { match json_value {
JsonValue::Array(json_items) => { JsonValue::Array(json_items) => {
for json_item in json_items { for json_item in json_items {
let value = field_type let value = field_type
.value_from_json(json_item) .value_from_json(json_item)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?; .map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add_field_value(field, value.into_owned()); doc.add_field_value(field, value);
} }
} }
_ => { _ => {
let value = field_type let value = field_type
.value_from_json(json_value) .value_from_json(json_value)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?; .map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add_field_value(field, value.into_owned()); doc.add_field_value(field, value);
} }
} }
} }
@@ -711,7 +732,7 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
{ {
let doc = schema.parse_document("{}").unwrap(); let doc = schema.parse_document("{}").unwrap();
assert_eq!(doc.value_count(), 0); assert!(doc.field_values().is_empty());
} }
{ {
let doc = schema let doc = schema

View File

@@ -1,7 +1,6 @@
use std::fmt; use std::fmt;
use std::net::Ipv6Addr; use std::net::Ipv6Addr;
pub use not_safe::MaybeOwnedString;
use serde::de::Visitor; use serde::de::Visitor;
use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde::{Deserialize, Deserializer, Serialize, Serializer};
use serde_json::Map; use serde_json::Map;
@@ -13,9 +12,9 @@ use crate::DateTime;
/// Value represents the value of a any field. /// Value represents the value of a any field.
/// It is an enum over all over all of the possible field type. /// It is an enum over all over all of the possible field type.
#[derive(Debug, Clone, PartialEq)] #[derive(Debug, Clone, PartialEq)]
pub enum Value<'a> { pub enum Value {
/// The str type is used for any text information. /// The str type is used for any text information.
Str(MaybeOwnedString<'a>), Str(String),
/// Pre-tokenized str type, /// Pre-tokenized str type,
PreTokStr(PreTokenizedString), PreTokStr(PreTokenizedString),
/// Unsigned 64-bits Integer `u64` /// Unsigned 64-bits Integer `u64`
@@ -31,38 +30,16 @@ pub enum Value<'a> {
/// Facet /// Facet
Facet(Facet), Facet(Facet),
/// Arbitrarily sized byte array /// Arbitrarily sized byte array
// TODO allow Cow<'a, [u8]>
Bytes(Vec<u8>), Bytes(Vec<u8>),
/// Json object value. /// Json object value.
// TODO allow Cow keys and borrowed values
JsonObject(serde_json::Map<String, serde_json::Value>), JsonObject(serde_json::Map<String, serde_json::Value>),
/// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`. /// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`.
IpAddr(Ipv6Addr), IpAddr(Ipv6Addr),
} }
impl<'a> Value<'a> { impl Eq for Value {}
/// Convert a borrowing [`Value`] to an owning one.
pub fn into_owned(self) -> Value<'static> {
use Value::*;
match self {
Str(val) => Str(MaybeOwnedString::from_string(val.into_string())),
PreTokStr(val) => PreTokStr(val),
U64(val) => U64(val),
I64(val) => I64(val),
F64(val) => F64(val),
Bool(val) => Bool(val),
Date(val) => Date(val),
Facet(val) => Facet(val),
Bytes(val) => Bytes(val),
JsonObject(val) => JsonObject(val),
IpAddr(val) => IpAddr(val),
}
}
}
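`into_owned` only exists on the side of the comparison where `Value` carries a lifetime; a small usage sketch under that assumption:

```rust
use tantivy::schema::Value;

fn main() {
    let text = String::from("hello");
    // `From<&str>` yields a `Value` borrowing `text` on the lifetime-carrying side.
    let borrowed = Value::from(text.as_str());
    // `into_owned` copies the borrowed string so the value can outlive `text`.
    let owned: Value<'static> = borrowed.into_owned();
    drop(text);
    assert_eq!(owned.as_text(), Some("hello"));
}
```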
impl<'a> Eq for Value<'a> {} impl Serialize for Value {
impl<'a> Serialize for Value<'a> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer { where S: Serializer {
match *self { match *self {
@@ -88,13 +65,13 @@ impl<'a> Serialize for Value<'a> {
} }
} }
impl<'de> Deserialize<'de> for Value<'de> { impl<'de> Deserialize<'de> for Value {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de> { where D: Deserializer<'de> {
struct ValueVisitor; struct ValueVisitor;
impl<'de> Visitor<'de> for ValueVisitor { impl<'de> Visitor<'de> for ValueVisitor {
type Value = Value<'de>; type Value = Value;
fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
formatter.write_str("a string or u32") formatter.write_str("a string or u32")
@@ -116,13 +93,12 @@ impl<'de> Deserialize<'de> for Value<'de> {
Ok(Value::Bool(v)) Ok(Value::Bool(v))
} }
// TODO add visit_borrowed_str
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E> { fn visit_str<E>(self, v: &str) -> Result<Self::Value, E> {
Ok(Value::Str(MaybeOwnedString::from_string(v.to_owned()))) Ok(Value::Str(v.to_owned()))
} }
fn visit_string<E>(self, v: String) -> Result<Self::Value, E> { fn visit_string<E>(self, v: String) -> Result<Self::Value, E> {
Ok(Value::Str(MaybeOwnedString::from_string(v))) Ok(Value::Str(v))
} }
} }
@@ -130,7 +106,7 @@ impl<'de> Deserialize<'de> for Value<'de> {
} }
} }
impl<'a> Value<'a> { impl Value {
/// Returns the text value, provided the value is of the `Str` type. /// Returns the text value, provided the value is of the `Str` type.
/// (Returns `None` if the value is not of the `Str` type). /// (Returns `None` if the value is not of the `Str` type).
pub fn as_text(&self) -> Option<&str> { pub fn as_text(&self) -> Option<&str> {
@@ -248,87 +224,86 @@ impl<'a> Value<'a> {
} }
} }
impl From<String> for Value<'static> { impl From<String> for Value {
fn from(s: String) -> Value<'static> { fn from(s: String) -> Value {
Value::Str(MaybeOwnedString::from_string(s)) Value::Str(s)
} }
} }
impl From<Ipv6Addr> for Value<'static> { impl From<Ipv6Addr> for Value {
fn from(v: Ipv6Addr) -> Value<'static> { fn from(v: Ipv6Addr) -> Value {
Value::IpAddr(v) Value::IpAddr(v)
} }
} }
impl From<u64> for Value<'static> { impl From<u64> for Value {
fn from(v: u64) -> Value<'static> { fn from(v: u64) -> Value {
Value::U64(v) Value::U64(v)
} }
} }
impl From<i64> for Value<'static> { impl From<i64> for Value {
fn from(v: i64) -> Value<'static> { fn from(v: i64) -> Value {
Value::I64(v) Value::I64(v)
} }
} }
impl From<f64> for Value<'static> { impl From<f64> for Value {
fn from(v: f64) -> Value<'static> { fn from(v: f64) -> Value {
Value::F64(v) Value::F64(v)
} }
} }
impl From<bool> for Value<'static> { impl From<bool> for Value {
fn from(b: bool) -> Self { fn from(b: bool) -> Self {
Value::Bool(b) Value::Bool(b)
} }
} }
impl From<DateTime> for Value<'static> { impl From<DateTime> for Value {
fn from(dt: DateTime) -> Value<'static> { fn from(dt: DateTime) -> Value {
Value::Date(dt) Value::Date(dt)
} }
} }
impl<'a> From<&'a str> for Value<'a> { impl<'a> From<&'a str> for Value {
fn from(s: &'a str) -> Value<'a> { fn from(s: &'a str) -> Value {
Value::Str(MaybeOwnedString::from_str(s)) Value::Str(s.to_string())
} }
} }
// TODO change lifetime to 'a impl<'a> From<&'a [u8]> for Value {
impl<'a> From<&'a [u8]> for Value<'static> { fn from(bytes: &'a [u8]) -> Value {
fn from(bytes: &'a [u8]) -> Value<'static> {
Value::Bytes(bytes.to_vec()) Value::Bytes(bytes.to_vec())
} }
} }
impl From<Facet> for Value<'static> { impl From<Facet> for Value {
fn from(facet: Facet) -> Value<'static> { fn from(facet: Facet) -> Value {
Value::Facet(facet) Value::Facet(facet)
} }
} }
impl From<Vec<u8>> for Value<'static> { impl From<Vec<u8>> for Value {
fn from(bytes: Vec<u8>) -> Value<'static> { fn from(bytes: Vec<u8>) -> Value {
Value::Bytes(bytes) Value::Bytes(bytes)
} }
} }
impl From<PreTokenizedString> for Value<'static> { impl From<PreTokenizedString> for Value {
fn from(pretokenized_string: PreTokenizedString) -> Value<'static> { fn from(pretokenized_string: PreTokenizedString) -> Value {
Value::PreTokStr(pretokenized_string) Value::PreTokStr(pretokenized_string)
} }
} }
impl From<serde_json::Map<String, serde_json::Value>> for Value<'static> { impl From<serde_json::Map<String, serde_json::Value>> for Value {
fn from(json_object: serde_json::Map<String, serde_json::Value>) -> Value<'static> { fn from(json_object: serde_json::Map<String, serde_json::Value>) -> Value {
Value::JsonObject(json_object) Value::JsonObject(json_object)
} }
} }
impl From<serde_json::Value> for Value<'static> { impl From<serde_json::Value> for Value {
fn from(json_value: serde_json::Value) -> Value<'static> { fn from(json_value: serde_json::Value) -> Value {
match json_value { match json_value {
serde_json::Value::Object(json_object) => Value::JsonObject(json_object), serde_json::Value::Object(json_object) => Value::JsonObject(json_object),
_ => { _ => {
@@ -345,7 +320,7 @@ mod binary_serialize {
use common::{f64_to_u64, u64_to_f64, BinarySerializable}; use common::{f64_to_u64, u64_to_f64, BinarySerializable};
use fastfield_codecs::MonotonicallyMappableToU128; use fastfield_codecs::MonotonicallyMappableToU128;
use super::{MaybeOwnedString, Value}; use super::Value;
use crate::schema::Facet; use crate::schema::Facet;
use crate::tokenizer::PreTokenizedString; use crate::tokenizer::PreTokenizedString;
use crate::DateTime; use crate::DateTime;
@@ -366,13 +341,12 @@ mod binary_serialize {
const TOK_STR_CODE: u8 = 0; const TOK_STR_CODE: u8 = 0;
impl<'a> BinarySerializable for Value<'a> { impl BinarySerializable for Value {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> { fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
match *self { match *self {
Value::Str(ref text) => { Value::Str(ref text) => {
TEXT_CODE.serialize(writer)?; TEXT_CODE.serialize(writer)?;
// TODO impl trait for MaybeOwnedString text.serialize(writer)
text.as_str().to_owned().serialize(writer)
} }
Value::PreTokStr(ref tok_str) => { Value::PreTokStr(ref tok_str) => {
EXT_CODE.serialize(writer)?; EXT_CODE.serialize(writer)?;
@@ -434,7 +408,7 @@ mod binary_serialize {
match type_code { match type_code {
TEXT_CODE => { TEXT_CODE => {
let text = String::deserialize(reader)?; let text = String::deserialize(reader)?;
Ok(Value::Str(MaybeOwnedString::from_string(text))) Ok(Value::Str(text))
} }
U64_CODE => { U64_CODE => {
let value = u64::deserialize(reader)?; let value = u64::deserialize(reader)?;
@@ -576,104 +550,3 @@ mod tests {
assert_eq!(serialized_value_json, r#""1996-12-20T01:39:57Z""#); assert_eq!(serialized_value_json, r#""1996-12-20T01:39:57Z""#);
} }
} }
mod not_safe {
use std::ops::Deref;
union Ref<'a, T: ?Sized> {
shared: &'a T,
uniq: &'a mut T,
}
pub struct MaybeOwnedString<'a> {
string: Ref<'a, str>,
capacity: usize,
}
impl<'a> MaybeOwnedString<'a> {
pub fn from_str(string: &'a str) -> MaybeOwnedString<'a> {
MaybeOwnedString {
string: Ref { shared: string },
capacity: 0,
}
}
pub fn from_string(mut string: String) -> MaybeOwnedString<'static> {
string.shrink_to_fit(); // <= actually important for safety, todo use the Vec .as_ptr instead
let mut s = std::mem::ManuallyDrop::new(string);
let ptr = s.as_mut_ptr();
let len = s.len();
let capacity = s.capacity();
let string = unsafe {
std::str::from_utf8_unchecked_mut(std::slice::from_raw_parts_mut(ptr, len))
};
MaybeOwnedString {
string: Ref { uniq: string },
capacity,
}
}
pub fn into_string(self) -> String {
// Wrap in `ManuallyDrop` so that `Drop` does not also free the buffer we
// hand back to the caller below, which would be a double free.
let mut this = std::mem::ManuallyDrop::new(self);
if this.capacity != 0 {
let string = unsafe { &mut this.string.uniq };
return unsafe {
String::from_raw_parts(string.as_mut_ptr(), string.len(), this.capacity)
};
}
// Borrowed or empty: nothing to reclaim, just copy the text out.
this.as_str().to_owned()
}
pub fn as_str(&self) -> &str {
self.deref()
}
}
impl<'a> Deref for MaybeOwnedString<'a> {
type Target = str;
#[inline]
fn deref(&self) -> &str {
unsafe { self.string.shared }
}
}
impl<'a> Drop for MaybeOwnedString<'a> {
fn drop(&mut self) {
// if capacity is 0, either it's an empty String so there is no dealloc to do, or it's
// borrowed
if self.capacity != 0 {
let string = unsafe { &mut self.string.uniq };
unsafe { String::from_raw_parts(string.as_mut_ptr(), self.len(), self.capacity) };
}
}
}
impl<'a> Clone for MaybeOwnedString<'a> {
fn clone(&self) -> Self {
if self.capacity == 0 {
MaybeOwnedString {
string: Ref {
shared: unsafe { self.string.shared },
},
capacity: 0,
}
} else {
MaybeOwnedString::from_string(self.deref().to_owned())
}
}
}
impl<'a> std::fmt::Debug for MaybeOwnedString<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(self.deref())
}
}
impl<'a> PartialEq for MaybeOwnedString<'a> {
fn eq(&self, other: &Self) -> bool {
self.deref() == other.deref()
}
}
}
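
A rough usage sketch of `MaybeOwnedString` as defined above, assuming it is in scope via the re-export at the top of this file: `capacity == 0` doubles as the "borrowed or empty" discriminant, so borrowed strings are never freed, while owned strings carry their allocation along and hand it back without copying. Illustrative only, and it presumes the `Drop`-safe `into_string()` above.

```rust
fn demo() {
    // Borrowed: no allocation is taken over, `Drop` is a no-op (capacity == 0).
    let borrowed = MaybeOwnedString::from_str("compound");
    assert_eq!(borrowed.as_str(), "compound");

    // Owned: the original `String` buffer travels with the value...
    let owned = MaybeOwnedString::from_string(String::from("compound"));
    assert_eq!(owned.as_str(), "compound");

    // ...and is reclaimed here without copying the bytes.
    let back: String = owned.into_string();
    assert_eq!(back, "compound");
}
```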

View File

@@ -126,7 +126,6 @@ mod ngram_tokenizer;
mod raw_tokenizer; mod raw_tokenizer;
mod remove_long; mod remove_long;
mod simple_tokenizer; mod simple_tokenizer;
mod split_compound_words;
mod stemmer; mod stemmer;
mod stop_word_filter; mod stop_word_filter;
mod tokenized_string; mod tokenized_string;
@@ -142,7 +141,6 @@ pub use self::ngram_tokenizer::NgramTokenizer;
pub use self::raw_tokenizer::RawTokenizer; pub use self::raw_tokenizer::RawTokenizer;
pub use self::remove_long::RemoveLongFilter; pub use self::remove_long::RemoveLongFilter;
pub use self::simple_tokenizer::SimpleTokenizer; pub use self::simple_tokenizer::SimpleTokenizer;
pub use self::split_compound_words::SplitCompoundWords;
pub use self::stemmer::{Language, Stemmer}; pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter; pub use self::stop_word_filter::StopWordFilter;
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString}; pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};

View File

@@ -1,252 +0,0 @@
use std::sync::Arc;
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind, StateID};
use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
/// A [`TokenFilter`] which splits compound words into their parts
/// based on a given dictionary.
///
/// Words will only be split if they can be fully decomposed into
/// consecutive matches against the given dictionary.
///
/// This is mostly useful to split [compound nouns][compound] common to many
/// Germanic languages into their constituents.
///
/// # Example
///
/// The quality of the dictionary determines the quality of the splits,
/// e.g. the missing stem "back" of "backen" implies that "brotbackautomat"
/// is not split in the following example.
///
/// ```rust
/// use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};
///
/// let tokenizer =
/// TextAnalyzer::from(SimpleTokenizer).filter(SplitCompoundWords::from_dictionary([
/// "dampf", "schiff", "fahrt", "brot", "backen", "automat",
/// ]));
///
/// let mut stream = tokenizer.token_stream("dampfschifffahrt");
/// assert_eq!(stream.next().unwrap().text, "dampf");
/// assert_eq!(stream.next().unwrap().text, "schiff");
/// assert_eq!(stream.next().unwrap().text, "fahrt");
/// assert_eq!(stream.next(), None);
///
/// let mut stream = tokenizer.token_stream("brotbackautomat");
/// assert_eq!(stream.next().unwrap().text, "brotbackautomat");
/// assert_eq!(stream.next(), None);
/// ```
///
/// [compound]: https://en.wikipedia.org/wiki/Compound_(linguistics)
#[derive(Clone)]
pub struct SplitCompoundWords<S: StateID> {
dict: Arc<AhoCorasick<S>>,
}
impl SplitCompoundWords<usize> {
/// Create a filter from a given dictionary.
///
/// The dictionary will be used to construct an [`AhoCorasick`] automaton
/// with reasonable defaults. See [`from_automaton`][Self::from_automaton] if
/// more control over its construction is required.
pub fn from_dictionary<I, P>(dict: I) -> Self
where
I: IntoIterator<Item = P>,
P: AsRef<[u8]>,
{
let dict = AhoCorasickBuilder::new()
.match_kind(MatchKind::LeftmostLongest)
.build(dict);
Self::from_automaton(dict)
}
}
impl<S: StateID> SplitCompoundWords<S> {
/// Create a filter from a given automaton.
///
/// The automaton should use one of the leftmost match kinds (e.g. leftmost-longest)
/// and it should not be anchored.
pub fn from_automaton(dict: AhoCorasick<S>) -> Self {
Self {
dict: Arc::new(dict),
}
}
}
impl<S: StateID + Send + Sync + 'static> TokenFilter for SplitCompoundWords<S> {
fn transform<'a>(&self, stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
BoxTokenStream::from(SplitCompoundWordsTokenStream {
dict: self.dict.clone(),
tail: stream,
cuts: Vec::new(),
parts: Vec::new(),
})
}
}
struct SplitCompoundWordsTokenStream<'a, S: StateID> {
dict: Arc<AhoCorasick<S>>,
tail: BoxTokenStream<'a>,
cuts: Vec<usize>,
parts: Vec<Token>,
}
impl<'a, S: StateID> SplitCompoundWordsTokenStream<'a, S> {
// Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
// can fully be split into consecutive matches against `self.dict`.
fn split(&mut self) {
let token = self.tail.token();
let mut text = token.text.as_str();
self.cuts.clear();
let mut pos = 0;
for match_ in self.dict.find_iter(text) {
if pos != match_.start() {
break;
}
self.cuts.push(pos);
pos = match_.end();
}
if pos == token.text.len() {
// Fill `self.parts` in reverse order,
// so that `self.parts.pop()` yields
// the tokens in their original order.
for pos in self.cuts.iter().rev() {
let (head, tail) = text.split_at(*pos);
text = head;
self.parts.push(Token {
text: tail.to_owned(),
..*token
});
}
}
}
}
impl<'a, S: StateID> TokenStream for SplitCompoundWordsTokenStream<'a, S> {
fn advance(&mut self) -> bool {
self.parts.pop();
if !self.parts.is_empty() {
return true;
}
if !self.tail.advance() {
return false;
}
// Will yield either `self.parts.last()` or
// `self.tail.token()` if it could not be split.
self.split();
true
}
fn token(&self) -> &Token {
self.parts.last().unwrap_or_else(|| self.tail.token())
}
fn token_mut(&mut self) -> &mut Token {
self.parts
.last_mut()
.unwrap_or_else(|| self.tail.token_mut())
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tokenizer::{SimpleTokenizer, TextAnalyzer};
#[test]
fn splitting_compound_words_works() {
let tokenizer = TextAnalyzer::from(SimpleTokenizer)
.filter(SplitCompoundWords::from_dictionary(["foo", "bar"]));
{
let mut stream = tokenizer.token_stream("");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foo bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobarbaz");
assert_eq!(stream.next().unwrap().text, "foobarbaz");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("baz foobar qux");
assert_eq!(stream.next().unwrap().text, "baz");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "qux");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobar foobar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobar foo bar foobar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobazbar foo bar foobar");
assert_eq!(stream.next().unwrap().text, "foobazbar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobar qux foobar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "qux");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("barfoo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next(), None);
}
}
}
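
Taken on its own, the decomposition rule that `split()` implements above can be sketched as a standalone function: a word is replaced by its parts only when leftmost-longest dictionary matches cover it completely and consecutively. This uses the same aho-corasick 0.7 API as the removed filter; `decompose` itself is a hypothetical helper, not tantivy code.

```rust
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind, StateID};

fn decompose<'a, S: StateID>(dict: &AhoCorasick<S>, word: &'a str) -> Option<Vec<&'a str>> {
    let mut parts = Vec::new();
    let mut pos = 0;
    for m in dict.find_iter(word) {
        if m.start() != pos {
            // Gap between matches: the word cannot be fully decomposed.
            return None;
        }
        parts.push(&word[m.start()..m.end()]);
        pos = m.end();
    }
    // Only a complete, gap-free cover counts as a decomposition.
    if pos == word.len() && !parts.is_empty() {
        Some(parts)
    } else {
        None
    }
}

fn main() {
    let dict = AhoCorasickBuilder::new()
        .match_kind(MatchKind::LeftmostLongest)
        .build(["dampf", "schiff", "fahrt", "brot", "backen", "automat"]);

    // Fully covered by consecutive matches: the word is split.
    assert_eq!(
        decompose(&dict, "dampfschifffahrt"),
        Some(vec!["dampf", "schiff", "fahrt"])
    );
    // "back" is missing from the dictionary, so coverage has a gap: keep intact.
    assert_eq!(decompose(&dict, "brotbackautomat"), None);
}
```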

View File

@@ -1,6 +1,3 @@
use std::borrow::Cow;
use std::mem;
use rust_stemmers::{self, Algorithm}; use rust_stemmers::{self, Algorithm};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
@@ -87,7 +84,6 @@ impl TokenFilter for Stemmer {
BoxTokenStream::from(StemmerTokenStream { BoxTokenStream::from(StemmerTokenStream {
tail: token_stream, tail: token_stream,
stemmer: inner_stemmer, stemmer: inner_stemmer,
buffer: String::new(),
}) })
} }
} }
@@ -95,7 +91,6 @@ impl TokenFilter for Stemmer {
pub struct StemmerTokenStream<'a> { pub struct StemmerTokenStream<'a> {
tail: BoxTokenStream<'a>, tail: BoxTokenStream<'a>,
stemmer: rust_stemmers::Stemmer, stemmer: rust_stemmers::Stemmer,
buffer: String,
} }
impl<'a> TokenStream for StemmerTokenStream<'a> { impl<'a> TokenStream for StemmerTokenStream<'a> {
@@ -103,16 +98,10 @@ impl<'a> TokenStream for StemmerTokenStream<'a> {
if !self.tail.advance() { if !self.tail.advance() {
return false; return false;
} }
let token = self.tail.token_mut(); // TODO remove allocation
let stemmed_str = self.stemmer.stem(&token.text); let stemmed_str: String = self.stemmer.stem(&self.token().text).into_owned();
match stemmed_str { self.token_mut().text.clear();
Cow::Owned(stemmed_str) => token.text = stemmed_str, self.token_mut().text.push_str(&stemmed_str);
Cow::Borrowed(stemmed_str) => {
self.buffer.clear();
self.buffer.push_str(stemmed_str);
mem::swap(&mut token.text, &mut self.buffer);
}
}
true true
} }
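
The left-hand version of `advance()` above relies on a small buffer-reuse pattern for `Cow` results: an owned stem is moved in directly, while a borrowed stem is copied into a reusable scratch buffer and swapped in, avoiding a per-token allocation. A minimal standalone sketch, assuming the rust-stemmers API already used by this file (`stem_in_place` is a hypothetical helper):

```rust
use std::borrow::Cow;
use std::mem;

use rust_stemmers::{Algorithm, Stemmer};

fn stem_in_place(stemmer: &Stemmer, text: &mut String, buffer: &mut String) {
    let stemmed = stemmer.stem(text.as_str());
    match stemmed {
        // The stemmer allocated a new string: move it in directly.
        Cow::Owned(stemmed) => *text = stemmed,
        // Borrowed result: copy into the scratch buffer and swap,
        // so no fresh `String` is allocated for this token.
        Cow::Borrowed(stemmed) => {
            buffer.clear();
            buffer.push_str(stemmed);
            mem::swap(text, buffer);
        }
    }
}

fn main() {
    let stemmer = Stemmer::create(Algorithm::English);
    let mut buffer = String::new();
    for word in ["running", "run"] {
        let mut text = word.to_string();
        stem_in_place(&stemmer, &mut text, &mut buffer);
        println!("{word} -> {text}");
    }
}
```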

View File

@@ -10,21 +10,28 @@
//! assert_eq!(stream.next().unwrap().text, "crafty"); //! assert_eq!(stream.next().unwrap().text, "crafty");
//! assert!(stream.next().is_none()); //! assert!(stream.next().is_none());
//! ``` //! ```
use rustc_hash::FxHashSet; use std::collections::HashSet;
use std::hash::BuildHasherDefault;
use fnv::FnvHasher;
use super::{Token, TokenFilter, TokenStream}; use super::{Token, TokenFilter, TokenStream};
use crate::tokenizer::BoxTokenStream; use crate::tokenizer::BoxTokenStream;
// configure our hashers for SPEED
type StopWordHasher = BuildHasherDefault<FnvHasher>;
type StopWordHashSet = HashSet<String, StopWordHasher>;
/// `TokenFilter` that removes stop words from a token stream /// `TokenFilter` that removes stop words from a token stream
#[derive(Clone)] #[derive(Clone)]
pub struct StopWordFilter { pub struct StopWordFilter {
words: FxHashSet<String>, words: StopWordHashSet,
} }
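
Side by side, the set types in this hunk differ only in how the hasher is supplied; a small sketch of both constructions, using the crates referenced above (the word list is hypothetical):

```rust
use std::collections::HashSet;
use std::hash::BuildHasherDefault;

use fnv::FnvHasher;
use rustc_hash::FxHashSet;

// Right-hand column: std `HashSet` parameterized with an FNV hasher.
type StopWordHashSet = HashSet<String, BuildHasherDefault<FnvHasher>>;

fn main() {
    let words = ["the", "and", "of"];

    let fnv_set: StopWordHashSet = words.iter().map(|w| w.to_string()).collect();

    // Left-hand column: rustc-hash's `FxHashSet`, no hasher plumbing needed.
    let fx_set: FxHashSet<String> = words.iter().map(|w| w.to_string()).collect();

    assert_eq!(fnv_set.len(), fx_set.len());
    assert!(fx_set.contains("the"));
}
```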
impl StopWordFilter { impl StopWordFilter {
/// Creates a `StopWordFilter` given a list of words to remove /// Creates a `StopWordFilter` given a list of words to remove
pub fn remove(words: Vec<String>) -> StopWordFilter { pub fn remove(words: Vec<String>) -> StopWordFilter {
let mut set = FxHashSet::default(); let mut set = StopWordHashSet::default();
for word in words { for word in words {
set.insert(word); set.insert(word);
@@ -45,7 +52,7 @@ impl StopWordFilter {
} }
pub struct StopWordFilterStream<'a> { pub struct StopWordFilterStream<'a> {
words: FxHashSet<String>, words: StopWordHashSet,
tail: BoxTokenStream<'a>, tail: BoxTokenStream<'a>,
} }