mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-28 21:12:54 +00:00
Compare commits
27 Commits
update_exa
...
agg_format
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b345c11786 | ||
|
|
7ebcc15b17 | ||
|
|
1b4076691f | ||
|
|
eab660873a | ||
|
|
232f37126e | ||
|
|
13e9885dfd | ||
|
|
56d79cb203 | ||
|
|
0f4c2e27cf | ||
|
|
f9ae295507 | ||
|
|
d9db5302d9 | ||
|
|
e453848134 | ||
|
|
59084143ef | ||
|
|
511b027350 | ||
|
|
322f47eb47 | ||
|
|
72f61ff89c | ||
|
|
a141c3ec59 | ||
|
|
e90e7a25ae | ||
|
|
c3b92a5412 | ||
|
|
2f55511064 | ||
|
|
08b9fc0b31 | ||
|
|
714f363d43 | ||
|
|
93ff7365b0 | ||
|
|
8151925068 | ||
|
|
b960e40bc8 | ||
|
|
1095c9b073 | ||
|
|
c0686515a9 | ||
|
|
455156f51c |
4
.github/workflows/coverage.yml
vendored
4
.github/workflows/coverage.yml
vendored
@@ -15,11 +15,11 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Install Rust
|
||||
run: rustup toolchain install nightly-2024-04-10 --profile minimal --component llvm-tools-preview
|
||||
run: rustup toolchain install nightly-2024-07-01 --profile minimal --component llvm-tools-preview
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
- uses: taiki-e/install-action@cargo-llvm-cov
|
||||
- name: Generate code coverage
|
||||
run: cargo +nightly-2024-04-10 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
|
||||
run: cargo +nightly-2024-07-01 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v3
|
||||
continue-on-error: true
|
||||
|
||||
10
Cargo.toml
10
Cargo.toml
@@ -11,12 +11,11 @@ repository = "https://github.com/quickwit-oss/tantivy"
|
||||
readme = "README.md"
|
||||
keywords = ["search", "information", "retrieval"]
|
||||
edition = "2021"
|
||||
rust-version = "1.63"
|
||||
rust-version = "1.66"
|
||||
exclude = ["benches/*.json", "benches/*.txt"]
|
||||
|
||||
[dependencies]
|
||||
# Switch back to the non-forked oneshot crate once https://github.com/faern/oneshot/pull/35 is merged
|
||||
oneshot = { git = "https://github.com/fulmicoton/oneshot.git", rev = "b208f49" }
|
||||
oneshot = "0.1.7"
|
||||
base64 = "0.22.0"
|
||||
byteorder = "1.4.3"
|
||||
crc32fast = "1.3.2"
|
||||
@@ -39,7 +38,7 @@ levenshtein_automata = "0.2.1"
|
||||
uuid = { version = "1.0.0", features = ["v4", "serde"] }
|
||||
crossbeam-channel = "0.5.4"
|
||||
rust-stemmers = "1.2.0"
|
||||
downcast-rs = "1.2.0"
|
||||
downcast-rs = "1.2.1"
|
||||
bitpacking = { version = "0.9.2", default-features = false, features = [
|
||||
"bitpacker4x",
|
||||
] }
|
||||
@@ -64,7 +63,8 @@ query-grammar = { version = "0.22.0", path = "./query-grammar", package = "tanti
|
||||
tantivy-bitpacker = { version = "0.6", path = "./bitpacker" }
|
||||
common = { version = "0.7", path = "./common/", package = "tantivy-common" }
|
||||
tokenizer-api = { version = "0.3", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
|
||||
sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
|
||||
sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
|
||||
hyperloglogplus = { version = "0.4.1", features = ["const-loop"] }
|
||||
futures-util = { version = "0.3.28", optional = true }
|
||||
fnv = "1.0.7"
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
|
||||
|
||||
## Benchmark
|
||||
|
||||
The following [benchmark](https://tantivy-search.github.io/bench/) breakdowns
|
||||
The following [benchmark](https://tantivy-search.github.io/bench/) breaks down the
|
||||
performance for different types of queries/collections.
|
||||
|
||||
Your mileage WILL vary depending on the nature of queries and their load.
|
||||
@@ -101,7 +101,8 @@ cargo test
|
||||
## Companies Using Tantivy
|
||||
|
||||
<p align="left">
|
||||
<img align="center" src="doc/assets/images/etsy.png" alt="Etsy" height="25" width="auto" />
|
||||
<img align="center" src="doc/assets/images/etsy.png" alt="Etsy" height="25" width="auto" />
|
||||
<img align="center" src="doc/assets/images/paradedb.png" alt="ParadeDB" height="25" width="auto" />
|
||||
<img align="center" src="doc/assets/images/Nuclia.png#gh-light-mode-only" alt="Nuclia" height="25" width="auto" />
|
||||
<img align="center" src="doc/assets/images/humanfirst.png#gh-light-mode-only" alt="Humanfirst.ai" height="30" width="auto" />
|
||||
<img align="center" src="doc/assets/images/element.io.svg#gh-light-mode-only" alt="Element.io" height="25" width="auto" />
|
||||
|
||||
@@ -47,13 +47,19 @@ fn bench_agg(mut group: InputGroup<Index>) {
|
||||
register!(group, average_f64);
|
||||
register!(group, average_f64_u64);
|
||||
register!(group, stats_f64);
|
||||
register!(group, extendedstats_f64);
|
||||
register!(group, percentiles_f64);
|
||||
register!(group, terms_few);
|
||||
register!(group, terms_many);
|
||||
register!(group, terms_many_top_1000);
|
||||
register!(group, terms_many_order_by_term);
|
||||
register!(group, terms_many_with_top_hits);
|
||||
register!(group, terms_many_with_avg_sub_agg);
|
||||
register!(group, terms_many_json_mixed_type_with_sub_agg_card);
|
||||
register!(group, terms_many_json_mixed_type_with_avg_sub_agg);
|
||||
|
||||
register!(group, cardinality_agg);
|
||||
register!(group, terms_few_with_cardinality_agg);
|
||||
|
||||
register!(group, range_agg);
|
||||
register!(group, range_agg_with_avg_sub_agg);
|
||||
register!(group, range_agg_with_term_agg_few);
|
||||
@@ -105,7 +111,12 @@ fn stats_f64(index: &Index) {
|
||||
});
|
||||
exec_term_with_agg(index, agg_req)
|
||||
}
|
||||
|
||||
fn extendedstats_f64(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"extendedstats_f64": { "extended_stats": { "field": "score_f64", } }
|
||||
});
|
||||
exec_term_with_agg(index, agg_req)
|
||||
}
|
||||
fn percentiles_f64(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"mypercentiles": {
|
||||
@@ -117,6 +128,33 @@ fn percentiles_f64(index: &Index) {
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
fn cardinality_agg(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"cardinality": {
|
||||
"cardinality": {
|
||||
"field": "text_many_terms"
|
||||
},
|
||||
}
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
fn terms_few_with_cardinality_agg(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": {
|
||||
"terms": { "field": "text_few_terms" },
|
||||
"aggs": {
|
||||
"cardinality": {
|
||||
"cardinality": {
|
||||
"field": "text_many_terms"
|
||||
},
|
||||
}
|
||||
}
|
||||
},
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
fn terms_few(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": { "terms": { "field": "text_few_terms" } },
|
||||
@@ -129,6 +167,12 @@ fn terms_many(index: &Index) {
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
fn terms_many_top_1000(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": { "terms": { "field": "text_many_terms", "size": 1000 } },
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
fn terms_many_order_by_term(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": { "terms": { "field": "text_many_terms", "order": { "_key": "desc" } } },
|
||||
@@ -165,7 +209,7 @@ fn terms_many_with_avg_sub_agg(index: &Index) {
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
fn terms_many_json_mixed_type_with_sub_agg_card(index: &Index) {
|
||||
fn terms_many_json_mixed_type_with_avg_sub_agg(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": {
|
||||
"terms": { "field": "json.mixed_type" },
|
||||
@@ -262,6 +306,7 @@ fn range_agg_with_term_agg_many(index: &Index) {
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
fn histogram(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"rangef64": {
|
||||
|
||||
@@ -23,6 +23,16 @@ downcast-rs = "1.2.0"
|
||||
proptest = "1"
|
||||
more-asserts = "0.3.1"
|
||||
rand = "0.8"
|
||||
binggan = "0.8.1"
|
||||
|
||||
[[bench]]
|
||||
name = "bench_merge"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "bench_access"
|
||||
harness = false
|
||||
|
||||
|
||||
[features]
|
||||
unstable = []
|
||||
|
||||
67
columnar/benches/bench_access.rs
Normal file
67
columnar/benches/bench_access.rs
Normal file
@@ -0,0 +1,67 @@
|
||||
use binggan::{black_box, InputGroup};
|
||||
use common::*;
|
||||
use tantivy_columnar::Column;
|
||||
|
||||
pub mod common;
|
||||
|
||||
const NUM_DOCS: u32 = 2_000_000;
|
||||
|
||||
pub fn generate_columnar_and_open(card: Card, num_docs: u32) -> Column {
|
||||
let reader = generate_columnar_with_name(card, num_docs, "price");
|
||||
reader.read_columns("price").unwrap()[0]
|
||||
.open_u64_lenient()
|
||||
.unwrap()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let mut inputs = Vec::new();
|
||||
|
||||
let mut add_card = |card1: Card| {
|
||||
inputs.push((
|
||||
format!("{card1}"),
|
||||
generate_columnar_and_open(card1, NUM_DOCS),
|
||||
));
|
||||
};
|
||||
|
||||
add_card(Card::MultiSparse);
|
||||
add_card(Card::Multi);
|
||||
add_card(Card::Sparse);
|
||||
add_card(Card::Dense);
|
||||
add_card(Card::Full);
|
||||
|
||||
bench_group(InputGroup::new_with_inputs(inputs));
|
||||
}
|
||||
|
||||
fn bench_group(mut runner: InputGroup<Column>) {
|
||||
runner.register("access_values_for_doc", |column| {
|
||||
let mut sum = 0;
|
||||
for i in 0..NUM_DOCS {
|
||||
for value in column.values_for_doc(i) {
|
||||
sum += value;
|
||||
}
|
||||
}
|
||||
black_box(sum);
|
||||
});
|
||||
runner.register("access_first_vals", |column| {
|
||||
let mut sum = 0;
|
||||
const BLOCK_SIZE: usize = 32;
|
||||
let mut docs = vec![0; BLOCK_SIZE];
|
||||
let mut buffer = vec![None; BLOCK_SIZE];
|
||||
for i in (0..NUM_DOCS).step_by(BLOCK_SIZE) {
|
||||
// fill docs
|
||||
for idx in 0..BLOCK_SIZE {
|
||||
docs[idx] = idx as u32 + i;
|
||||
}
|
||||
|
||||
column.first_vals(&docs, &mut buffer);
|
||||
for val in buffer.iter() {
|
||||
let Some(val) = val else { continue };
|
||||
sum += *val;
|
||||
}
|
||||
}
|
||||
|
||||
black_box(sum);
|
||||
});
|
||||
runner.run();
|
||||
}
|
||||
@@ -31,7 +31,7 @@ fn get_test_columns() -> Columns {
|
||||
}
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer
|
||||
.serialize(data.len() as u32, None, &mut buffer)
|
||||
.serialize(data.len() as u32, &mut buffer)
|
||||
.unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
|
||||
|
||||
49
columnar/benches/bench_merge.rs
Normal file
49
columnar/benches/bench_merge.rs
Normal file
@@ -0,0 +1,49 @@
|
||||
pub mod common;
|
||||
|
||||
use binggan::{black_box, BenchRunner};
|
||||
use common::{generate_columnar_with_name, Card};
|
||||
use tantivy_columnar::*;
|
||||
|
||||
const NUM_DOCS: u32 = 100_000;
|
||||
|
||||
fn main() {
|
||||
let mut inputs = Vec::new();
|
||||
|
||||
let mut add_combo = |card1: Card, card2: Card| {
|
||||
inputs.push((
|
||||
format!("merge_{card1}_and_{card2}"),
|
||||
vec![
|
||||
generate_columnar_with_name(card1, NUM_DOCS, "price"),
|
||||
generate_columnar_with_name(card2, NUM_DOCS, "price"),
|
||||
],
|
||||
));
|
||||
};
|
||||
|
||||
add_combo(Card::Multi, Card::Multi);
|
||||
add_combo(Card::MultiSparse, Card::MultiSparse);
|
||||
add_combo(Card::Dense, Card::Dense);
|
||||
add_combo(Card::Sparse, Card::Sparse);
|
||||
add_combo(Card::Sparse, Card::Dense);
|
||||
add_combo(Card::MultiSparse, Card::Dense);
|
||||
add_combo(Card::MultiSparse, Card::Sparse);
|
||||
add_combo(Card::Multi, Card::Dense);
|
||||
add_combo(Card::Multi, Card::Sparse);
|
||||
|
||||
let runner: BenchRunner = BenchRunner::new();
|
||||
let mut group = runner.new_group();
|
||||
for (input_name, columnar_readers) in inputs.iter() {
|
||||
group.register_with_input(
|
||||
input_name,
|
||||
columnar_readers,
|
||||
move |columnar_readers: &Vec<ColumnarReader>| {
|
||||
let mut out = Vec::new();
|
||||
let columnar_readers = columnar_readers.iter().collect::<Vec<_>>();
|
||||
let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
|
||||
|
||||
merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
|
||||
black_box(out);
|
||||
},
|
||||
);
|
||||
}
|
||||
group.run();
|
||||
}
|
||||
59
columnar/benches/common.rs
Normal file
59
columnar/benches/common.rs
Normal file
@@ -0,0 +1,59 @@
|
||||
extern crate tantivy_columnar;
|
||||
|
||||
use core::fmt;
|
||||
use std::fmt::{Display, Formatter};
|
||||
|
||||
use tantivy_columnar::{ColumnarReader, ColumnarWriter};
|
||||
|
||||
pub enum Card {
|
||||
MultiSparse,
|
||||
Multi,
|
||||
Sparse,
|
||||
Dense,
|
||||
Full,
|
||||
}
|
||||
impl Display for Card {
|
||||
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
|
||||
match self {
|
||||
Card::MultiSparse => write!(f, "multi sparse 1/13"),
|
||||
Card::Multi => write!(f, "multi 2x"),
|
||||
Card::Sparse => write!(f, "sparse 1/13"),
|
||||
Card::Dense => write!(f, "dense 1/12"),
|
||||
Card::Full => write!(f, "full"),
|
||||
}
|
||||
}
|
||||
}
|
||||
pub fn generate_columnar_with_name(card: Card, num_docs: u32, column_name: &str) -> ColumnarReader {
|
||||
let mut columnar_writer = ColumnarWriter::default();
|
||||
|
||||
if let Card::MultiSparse = card {
|
||||
columnar_writer.record_numerical(0, column_name, 10u64);
|
||||
columnar_writer.record_numerical(0, column_name, 10u64);
|
||||
}
|
||||
|
||||
for i in 0..num_docs {
|
||||
match card {
|
||||
Card::MultiSparse | Card::Sparse => {
|
||||
if i % 13 == 0 {
|
||||
columnar_writer.record_numerical(i, column_name, i as u64);
|
||||
}
|
||||
}
|
||||
Card::Dense => {
|
||||
if i % 12 == 0 {
|
||||
columnar_writer.record_numerical(i, column_name, i as u64);
|
||||
}
|
||||
}
|
||||
Card::Full => {
|
||||
columnar_writer.record_numerical(i, column_name, i as u64);
|
||||
}
|
||||
Card::Multi => {
|
||||
columnar_writer.record_numerical(i, column_name, i as u64);
|
||||
columnar_writer.record_numerical(i, column_name, i as u64);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut wrt: Vec<u8> = Vec::new();
|
||||
columnar_writer.serialize(num_docs, &mut wrt).unwrap();
|
||||
ColumnarReader::open(wrt).unwrap()
|
||||
}
|
||||
BIN
columnar/compat_tests_data/v1.columnar
Normal file
BIN
columnar/compat_tests_data/v1.columnar
Normal file
Binary file not shown.
BIN
columnar/compat_tests_data/v2.columnar
Normal file
BIN
columnar/compat_tests_data/v2.columnar
Normal file
Binary file not shown.
@@ -136,7 +136,7 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
|
||||
.map(|value_row_id: RowId| self.values.get_val(value_row_id))
|
||||
}
|
||||
|
||||
/// Get the docids of values which are in the provided value range.
|
||||
/// Get the docids of values which are in the provided value and docid range.
|
||||
#[inline]
|
||||
pub fn get_docids_for_value_range(
|
||||
&self,
|
||||
|
||||
@@ -12,7 +12,7 @@ use crate::column_values::{
|
||||
CodecType, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
|
||||
};
|
||||
use crate::iterable::Iterable;
|
||||
use crate::StrColumn;
|
||||
use crate::{StrColumn, Version};
|
||||
|
||||
pub fn serialize_column_mappable_to_u128<T: MonotonicallyMappableToU128>(
|
||||
column_index: SerializableColumnIndex<'_>,
|
||||
@@ -40,25 +40,9 @@ pub fn serialize_column_mappable_to_u64<T: MonotonicallyMappableToU64>(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn open_column_u64<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::Result<Column<T>> {
|
||||
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
|
||||
let column_index_num_bytes = u32::from_le_bytes(
|
||||
column_index_num_bytes_payload
|
||||
.as_slice()
|
||||
.try_into()
|
||||
.unwrap(),
|
||||
);
|
||||
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
|
||||
let column_index = crate::column_index::open_column_index(column_index_data)?;
|
||||
let column_values = load_u64_based_column_values(column_values_data)?;
|
||||
Ok(Column {
|
||||
index: column_index,
|
||||
values: column_values,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn open_column_u128<T: MonotonicallyMappableToU128>(
|
||||
pub fn open_column_u64<T: MonotonicallyMappableToU64>(
|
||||
bytes: OwnedBytes,
|
||||
format_version: Version,
|
||||
) -> io::Result<Column<T>> {
|
||||
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
|
||||
let column_index_num_bytes = u32::from_le_bytes(
|
||||
@@ -68,7 +52,27 @@ pub fn open_column_u128<T: MonotonicallyMappableToU128>(
|
||||
.unwrap(),
|
||||
);
|
||||
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
|
||||
let column_index = crate::column_index::open_column_index(column_index_data)?;
|
||||
let column_index = crate::column_index::open_column_index(column_index_data, format_version)?;
|
||||
let column_values = load_u64_based_column_values(column_values_data)?;
|
||||
Ok(Column {
|
||||
index: column_index,
|
||||
values: column_values,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn open_column_u128<T: MonotonicallyMappableToU128>(
|
||||
bytes: OwnedBytes,
|
||||
format_version: Version,
|
||||
) -> io::Result<Column<T>> {
|
||||
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
|
||||
let column_index_num_bytes = u32::from_le_bytes(
|
||||
column_index_num_bytes_payload
|
||||
.as_slice()
|
||||
.try_into()
|
||||
.unwrap(),
|
||||
);
|
||||
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
|
||||
let column_index = crate::column_index::open_column_index(column_index_data, format_version)?;
|
||||
let column_values = crate::column_values::open_u128_mapped(column_values_data)?;
|
||||
Ok(Column {
|
||||
index: column_index,
|
||||
@@ -79,7 +83,10 @@ pub fn open_column_u128<T: MonotonicallyMappableToU128>(
|
||||
/// Open the column as u64.
|
||||
///
|
||||
/// See [`open_u128_as_compact_u64`] for more details.
|
||||
pub fn open_column_u128_as_compact_u64(bytes: OwnedBytes) -> io::Result<Column<u64>> {
|
||||
pub fn open_column_u128_as_compact_u64(
|
||||
bytes: OwnedBytes,
|
||||
format_version: Version,
|
||||
) -> io::Result<Column<u64>> {
|
||||
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
|
||||
let column_index_num_bytes = u32::from_le_bytes(
|
||||
column_index_num_bytes_payload
|
||||
@@ -88,7 +95,7 @@ pub fn open_column_u128_as_compact_u64(bytes: OwnedBytes) -> io::Result<Column<u
|
||||
.unwrap(),
|
||||
);
|
||||
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
|
||||
let column_index = crate::column_index::open_column_index(column_index_data)?;
|
||||
let column_index = crate::column_index::open_column_index(column_index_data, format_version)?;
|
||||
let column_values = crate::column_values::open_u128_as_compact_u64(column_values_data)?;
|
||||
Ok(Column {
|
||||
index: column_index,
|
||||
@@ -96,19 +103,19 @@ pub fn open_column_u128_as_compact_u64(bytes: OwnedBytes) -> io::Result<Column<u
|
||||
})
|
||||
}
|
||||
|
||||
pub fn open_column_bytes(data: OwnedBytes) -> io::Result<BytesColumn> {
|
||||
pub fn open_column_bytes(data: OwnedBytes, format_version: Version) -> io::Result<BytesColumn> {
|
||||
let (body, dictionary_len_bytes) = data.rsplit(4);
|
||||
let dictionary_len = u32::from_le_bytes(dictionary_len_bytes.as_slice().try_into().unwrap());
|
||||
let (dictionary_bytes, column_bytes) = body.split(dictionary_len as usize);
|
||||
let dictionary = Arc::new(Dictionary::from_bytes(dictionary_bytes)?);
|
||||
let term_ord_column = crate::column::open_column_u64::<u64>(column_bytes)?;
|
||||
let term_ord_column = crate::column::open_column_u64::<u64>(column_bytes, format_version)?;
|
||||
Ok(BytesColumn {
|
||||
dictionary,
|
||||
term_ord_column,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn open_column_str(data: OwnedBytes) -> io::Result<StrColumn> {
|
||||
let bytes_column = open_column_bytes(data)?;
|
||||
pub fn open_column_str(data: OwnedBytes, format_version: Version) -> io::Result<StrColumn> {
|
||||
let bytes_column = open_column_bytes(data, format_version)?;
|
||||
Ok(StrColumn::wrap(bytes_column))
|
||||
}
|
||||
|
||||
@@ -95,8 +95,12 @@ pub fn merge_column_index<'a>(
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use common::OwnedBytes;
|
||||
|
||||
use crate::column_index::merge::detect_cardinality;
|
||||
use crate::column_index::multivalued_index::MultiValueIndex;
|
||||
use crate::column_index::multivalued_index::{
|
||||
open_multivalued_index, serialize_multivalued_index, MultiValueIndex,
|
||||
};
|
||||
use crate::column_index::{merge_column_index, OptionalIndex, SerializableColumnIndex};
|
||||
use crate::{
|
||||
Cardinality, ColumnIndex, MergeRowOrder, RowAddr, RowId, ShuffleMergeOrder, StackMergeOrder,
|
||||
@@ -171,7 +175,11 @@ mod tests {
|
||||
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
|
||||
panic!("Excpected a multivalued index")
|
||||
};
|
||||
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
|
||||
let mut output = Vec::new();
|
||||
serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();
|
||||
let multivalue =
|
||||
open_multivalued_index(OwnedBytes::new(output), crate::Version::V2).unwrap();
|
||||
let start_indexes: Vec<RowId> = multivalue.get_start_index_column().iter().collect();
|
||||
assert_eq!(&start_indexes, &[0, 3, 5]);
|
||||
}
|
||||
|
||||
@@ -200,11 +208,16 @@ mod tests {
|
||||
],
|
||||
)
|
||||
.into();
|
||||
|
||||
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
|
||||
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
|
||||
panic!("Excpected a multivalued index")
|
||||
};
|
||||
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
|
||||
let mut output = Vec::new();
|
||||
serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();
|
||||
let multivalue =
|
||||
open_multivalued_index(OwnedBytes::new(output), crate::Version::V2).unwrap();
|
||||
let start_indexes: Vec<RowId> = multivalue.get_start_index_column().iter().collect();
|
||||
assert_eq!(&start_indexes, &[0, 3, 5, 6]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
use std::iter;
|
||||
|
||||
use crate::column_index::{SerializableColumnIndex, Set};
|
||||
use crate::column_index::{
|
||||
SerializableColumnIndex, SerializableMultivalueIndex, SerializableOptionalIndex, Set,
|
||||
};
|
||||
use crate::iterable::Iterable;
|
||||
use crate::{Cardinality, ColumnIndex, RowId, ShuffleMergeOrder};
|
||||
|
||||
@@ -14,15 +16,24 @@ pub fn merge_column_index_shuffled<'a>(
|
||||
Cardinality::Optional => {
|
||||
let non_null_row_ids =
|
||||
merge_column_index_shuffled_optional(column_indexes, shuffle_merge_order);
|
||||
SerializableColumnIndex::Optional {
|
||||
SerializableColumnIndex::Optional(SerializableOptionalIndex {
|
||||
non_null_row_ids,
|
||||
num_rows: shuffle_merge_order.num_rows(),
|
||||
}
|
||||
})
|
||||
}
|
||||
Cardinality::Multivalued => {
|
||||
let multivalue_start_index =
|
||||
merge_column_index_shuffled_multivalued(column_indexes, shuffle_merge_order);
|
||||
SerializableColumnIndex::Multivalued(multivalue_start_index)
|
||||
let non_null_row_ids =
|
||||
merge_column_index_shuffled_optional(column_indexes, shuffle_merge_order);
|
||||
SerializableColumnIndex::Multivalued(SerializableMultivalueIndex {
|
||||
doc_ids_with_values: SerializableOptionalIndex {
|
||||
non_null_row_ids,
|
||||
num_rows: shuffle_merge_order.num_rows(),
|
||||
},
|
||||
start_offsets: merge_column_index_shuffled_multivalued(
|
||||
column_indexes,
|
||||
shuffle_merge_order,
|
||||
),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -102,11 +113,18 @@ fn iter_num_values<'a>(
|
||||
|
||||
/// Transforms an iterator containing the number of vals per row (with `num_rows` elements)
|
||||
/// into a `start_offset` iterator starting at 0 and (with `num_rows + 1` element)
|
||||
///
|
||||
/// This will filter values with 0 values as these are covered by the optional index in the
|
||||
/// multivalue index.
|
||||
fn integrate_num_vals(num_vals: impl Iterator<Item = u32>) -> impl Iterator<Item = RowId> {
|
||||
iter::once(0u32).chain(num_vals.scan(0, |state, num_vals| {
|
||||
*state += num_vals;
|
||||
Some(*state)
|
||||
}))
|
||||
iter::once(0u32).chain(
|
||||
num_vals
|
||||
.filter(|num_vals| *num_vals != 0)
|
||||
.scan(0, |state, num_vals| {
|
||||
*state += num_vals;
|
||||
Some(*state)
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
impl<'a> Iterable<u32> for ShuffledMultivaluedIndex<'a> {
|
||||
@@ -134,7 +152,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_integrate_num_vals_several() {
|
||||
assert!(integrate_num_vals([3, 0, 10, 20].into_iter()).eq([0, 3, 3, 13, 33].into_iter()));
|
||||
assert!(integrate_num_vals([3, 0, 10, 20].into_iter()).eq([0, 3, 13, 33].into_iter()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -157,10 +175,10 @@ mod tests {
|
||||
Cardinality::Optional,
|
||||
&shuffle_merge_order,
|
||||
);
|
||||
let SerializableColumnIndex::Optional {
|
||||
let SerializableColumnIndex::Optional(SerializableOptionalIndex {
|
||||
non_null_row_ids,
|
||||
num_rows,
|
||||
} = serializable_index
|
||||
}) = serializable_index
|
||||
else {
|
||||
panic!()
|
||||
};
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
use std::iter;
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::column_index::{SerializableColumnIndex, Set};
|
||||
use crate::column_index::multivalued_index::{MultiValueIndex, SerializableMultivalueIndex};
|
||||
use crate::column_index::serialize::SerializableOptionalIndex;
|
||||
use crate::column_index::SerializableColumnIndex;
|
||||
use crate::iterable::Iterable;
|
||||
use crate::{Cardinality, ColumnIndex, RowId, StackMergeOrder};
|
||||
|
||||
@@ -15,23 +17,149 @@ pub fn merge_column_index_stacked<'a>(
|
||||
) -> SerializableColumnIndex<'a> {
|
||||
match cardinality_after_merge {
|
||||
Cardinality::Full => SerializableColumnIndex::Full,
|
||||
Cardinality::Optional => SerializableColumnIndex::Optional {
|
||||
Cardinality::Optional => SerializableColumnIndex::Optional(SerializableOptionalIndex {
|
||||
non_null_row_ids: Box::new(StackedOptionalIndex {
|
||||
columns,
|
||||
stack_merge_order,
|
||||
}),
|
||||
num_rows: stack_merge_order.num_rows(),
|
||||
},
|
||||
}),
|
||||
Cardinality::Multivalued => {
|
||||
let stacked_multivalued_index = StackedMultivaluedIndex {
|
||||
columns,
|
||||
stack_merge_order,
|
||||
};
|
||||
SerializableColumnIndex::Multivalued(Box::new(stacked_multivalued_index))
|
||||
let serializable_multivalue_index =
|
||||
make_serializable_multivalued_index(columns, stack_merge_order);
|
||||
SerializableColumnIndex::Multivalued(serializable_multivalue_index)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct StackedDocIdsWithValues<'a> {
|
||||
column_indexes: &'a [ColumnIndex],
|
||||
stack_merge_order: &'a StackMergeOrder,
|
||||
}
|
||||
|
||||
impl Iterable<u32> for StackedDocIdsWithValues<'_> {
|
||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
||||
Box::new((0..self.column_indexes.len()).flat_map(|i| {
|
||||
let column_index = &self.column_indexes[i];
|
||||
let doc_range = self.stack_merge_order.columnar_range(i);
|
||||
get_doc_ids_with_values(column_index, doc_range)
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
fn get_doc_ids_with_values<'a>(
|
||||
column_index: &'a ColumnIndex,
|
||||
doc_range: Range<u32>,
|
||||
) -> Box<dyn Iterator<Item = u32> + 'a> {
|
||||
match column_index {
|
||||
ColumnIndex::Empty { .. } => Box::new(0..0),
|
||||
ColumnIndex::Full => Box::new(doc_range),
|
||||
ColumnIndex::Optional(optional_index) => Box::new(
|
||||
optional_index
|
||||
.iter_rows()
|
||||
.map(move |row| row + doc_range.start),
|
||||
),
|
||||
ColumnIndex::Multivalued(multivalued_index) => match multivalued_index {
|
||||
MultiValueIndex::MultiValueIndexV1(multivalued_index) => {
|
||||
Box::new((0..multivalued_index.num_docs()).filter_map(move |docid| {
|
||||
let range = multivalued_index.range(docid);
|
||||
if range.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(docid + doc_range.start)
|
||||
}
|
||||
}))
|
||||
}
|
||||
MultiValueIndex::MultiValueIndexV2(multivalued_index) => Box::new(
|
||||
multivalued_index
|
||||
.optional_index
|
||||
.iter_rows()
|
||||
.map(move |row| row + doc_range.start),
|
||||
),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn stack_doc_ids_with_values<'a>(
|
||||
column_indexes: &'a [ColumnIndex],
|
||||
stack_merge_order: &'a StackMergeOrder,
|
||||
) -> SerializableOptionalIndex<'a> {
|
||||
let num_rows = stack_merge_order.num_rows();
|
||||
SerializableOptionalIndex {
|
||||
non_null_row_ids: Box::new(StackedDocIdsWithValues {
|
||||
column_indexes,
|
||||
stack_merge_order,
|
||||
}),
|
||||
num_rows,
|
||||
}
|
||||
}
|
||||
|
||||
struct StackedStartOffsets<'a> {
|
||||
column_indexes: &'a [ColumnIndex],
|
||||
stack_merge_order: &'a StackMergeOrder,
|
||||
}
|
||||
|
||||
fn get_num_values_iterator<'a>(
|
||||
column_index: &'a ColumnIndex,
|
||||
num_docs: u32,
|
||||
) -> Box<dyn Iterator<Item = u32> + 'a> {
|
||||
match column_index {
|
||||
ColumnIndex::Empty { .. } => Box::new(std::iter::empty()),
|
||||
ColumnIndex::Full => Box::new(std::iter::repeat(1u32).take(num_docs as usize)),
|
||||
ColumnIndex::Optional(optional_index) => {
|
||||
Box::new(std::iter::repeat(1u32).take(optional_index.num_non_nulls() as usize))
|
||||
}
|
||||
ColumnIndex::Multivalued(multivalued_index) => Box::new(
|
||||
multivalued_index
|
||||
.get_start_index_column()
|
||||
.iter()
|
||||
.scan(0u32, |previous_start_offset, current_start_offset| {
|
||||
let num_vals = current_start_offset - *previous_start_offset;
|
||||
*previous_start_offset = current_start_offset;
|
||||
Some(num_vals)
|
||||
})
|
||||
.skip(1),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterable<u32> for StackedStartOffsets<'a> {
|
||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
||||
let num_values_it = (0..self.column_indexes.len()).flat_map(|columnar_id| {
|
||||
let num_docs = self.stack_merge_order.columnar_range(columnar_id).len() as u32;
|
||||
let column_index = &self.column_indexes[columnar_id];
|
||||
get_num_values_iterator(column_index, num_docs)
|
||||
});
|
||||
Box::new(std::iter::once(0u32).chain(num_values_it.into_iter().scan(
|
||||
0u32,
|
||||
|cumulated, el| {
|
||||
*cumulated += el;
|
||||
Some(*cumulated)
|
||||
},
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
fn stack_start_offsets<'a>(
|
||||
column_indexes: &'a [ColumnIndex],
|
||||
stack_merge_order: &'a StackMergeOrder,
|
||||
) -> Box<dyn Iterable<u32> + 'a> {
|
||||
Box::new(StackedStartOffsets {
|
||||
column_indexes,
|
||||
stack_merge_order,
|
||||
})
|
||||
}
|
||||
|
||||
fn make_serializable_multivalued_index<'a>(
|
||||
columns: &'a [ColumnIndex],
|
||||
stack_merge_order: &'a StackMergeOrder,
|
||||
) -> SerializableMultivalueIndex<'a> {
|
||||
SerializableMultivalueIndex {
|
||||
doc_ids_with_values: stack_doc_ids_with_values(columns, stack_merge_order),
|
||||
start_offsets: stack_start_offsets(columns, stack_merge_order),
|
||||
}
|
||||
}
|
||||
|
||||
struct StackedOptionalIndex<'a> {
|
||||
columns: &'a [ColumnIndex],
|
||||
stack_merge_order: &'a StackMergeOrder,
|
||||
@@ -62,87 +190,3 @@ impl<'a> Iterable<RowId> for StackedOptionalIndex<'a> {
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
struct StackedMultivaluedIndex<'a> {
|
||||
columns: &'a [ColumnIndex],
|
||||
stack_merge_order: &'a StackMergeOrder,
|
||||
}
|
||||
|
||||
/// Returns an iterator of start offsets describing `column_index_opt` as if it were a
/// multivalued index.
///
/// For the `Empty`, `Full` and `Optional` cases the iterator yields `num_rows + 1` items.
/// For the `Multivalued` case it yields whatever the existing `start_index_column` yields.
fn convert_column_opt_to_multivalued_index<'a>(
|
||||
column_index_opt: &'a ColumnIndex,
|
||||
num_rows: RowId,
|
||||
) -> Box<dyn Iterator<Item = RowId> + 'a> {
|
||||
match column_index_opt {
|
||||
// Empty column: emit `num_rows + 1` zeros.
ColumnIndex::Empty { .. } => Box::new(iter::repeat(0u32).take(num_rows as usize + 1)),
|
||||
// Full column: emit 0, 1, ..., num_rows.
ColumnIndex::Full => Box::new(0..num_rows + 1),
|
||||
// Optional column: emit the rank of each row id, then the number of non-null rows as
// the final offset.
ColumnIndex::Optional(optional_index) => {
|
||||
Box::new(
|
||||
(0..num_rows)
|
||||
// TODO optimize
|
||||
.map(|row_id| optional_index.rank(row_id))
|
||||
.chain(std::iter::once(optional_index.num_non_nulls())),
|
||||
)
|
||||
}
|
||||
// Already multivalued: reuse the existing start index column as is.
ColumnIndex::Multivalued(multivalued_index) => multivalued_index.start_index_column.iter(),
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterable<RowId> for StackedMultivaluedIndex<'a> {
|
||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = RowId> + '_> {
|
||||
let multivalued_indexes =
|
||||
self.columns
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(columnar_id, column_opt)| {
|
||||
let num_rows =
|
||||
self.stack_merge_order.columnar_range(columnar_id).len() as RowId;
|
||||
convert_column_opt_to_multivalued_index(column_opt, num_rows)
|
||||
});
|
||||
stack_multivalued_indexes(multivalued_indexes)
|
||||
}
|
||||
}
|
||||
|
||||
// Refactor me
|
||||
/// Concatenates several start-offset iterators into a single iterator.
///
/// Values of the first iterator are emitted unchanged. Whenever the current iterator is
/// exhausted, the last emitted value becomes the offset added to the values of the
/// following iterator, and the first item of that following iterator is consumed without
/// being emitted. Iterators that yield nothing at all are skipped. Iteration ends once
/// `multivalued_indexes` is exhausted.
fn stack_multivalued_indexes<'a>(
|
||||
mut multivalued_indexes: impl Iterator<Item = Box<dyn Iterator<Item = RowId> + 'a>> + 'a,
|
||||
) -> Box<dyn Iterator<Item = RowId> + 'a> {
|
||||
let mut offset = 0;
|
||||
let mut last_row_id = 0;
|
||||
let mut current_it = multivalued_indexes.next();
|
||||
Box::new(std::iter::from_fn(move || loop {
|
||||
// `?` ends the iteration when there is no current iterator left.
if let Some(row_id) = current_it.as_mut()?.next() {
|
||||
last_row_id = offset + row_id;
|
||||
return Some(last_row_id);
|
||||
}
|
||||
// The current iterator is exhausted: the following values are shifted by the last
// emitted value.
offset = last_row_id;
|
||||
loop {
|
||||
current_it = multivalued_indexes.next();
|
||||
// Consume (and drop) the first item of the next iterator; if that iterator is empty,
// keep looking at the following ones.
if current_it.as_mut()?.next().is_some() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::RowId;
|
||||
|
||||
fn it<'a>(row_ids: &'a [RowId]) -> Box<dyn Iterator<Item = RowId> + 'a> {
|
||||
Box::new(row_ids.iter().copied())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_stack() {
|
||||
let columns = [
|
||||
it(&[0u32, 0u32]),
|
||||
it(&[0u32, 1u32, 1u32, 4u32]),
|
||||
it(&[0u32, 3u32, 5u32]),
|
||||
it(&[0u32, 4u32]),
|
||||
]
|
||||
.into_iter();
|
||||
let start_offsets: Vec<RowId> = super::stack_multivalued_indexes(columns).collect();
|
||||
assert_eq!(start_offsets, &[0, 0, 1, 1, 4, 7, 9, 13]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,8 +11,11 @@ mod serialize;
|
||||
use std::ops::Range;
|
||||
|
||||
pub use merge::merge_column_index;
|
||||
pub(crate) use multivalued_index::SerializableMultivalueIndex;
|
||||
pub use optional_index::{OptionalIndex, Set};
|
||||
pub use serialize::{open_column_index, serialize_column_index, SerializableColumnIndex};
|
||||
pub use serialize::{
|
||||
open_column_index, serialize_column_index, SerializableColumnIndex, SerializableOptionalIndex,
|
||||
};
|
||||
|
||||
use crate::column_index::multivalued_index::MultiValueIndex;
|
||||
use crate::{Cardinality, DocId, RowId};
|
||||
@@ -131,15 +134,41 @@ impl ColumnIndex {
|
||||
let row_end = optional_index.rank(doc_id_range.end);
|
||||
row_start..row_end
|
||||
}
|
||||
ColumnIndex::Multivalued(multivalued_index) => {
|
||||
let end_docid = doc_id_range.end.min(multivalued_index.num_docs() - 1) + 1;
|
||||
let start_docid = doc_id_range.start.min(end_docid);
|
||||
ColumnIndex::Multivalued(multivalued_index) => match multivalued_index {
|
||||
MultiValueIndex::MultiValueIndexV1(index) => {
|
||||
let row_start = index.start_index_column.get_val(doc_id_range.start);
|
||||
let row_end = index.start_index_column.get_val(doc_id_range.end);
|
||||
row_start..row_end
|
||||
}
|
||||
MultiValueIndex::MultiValueIndexV2(index) => {
|
||||
// In this case we will use the optional_index to select the next values
|
||||
// that are valid. There are different cases to consider:
|
||||
// "Not exists" below means that the docid does not exist in the optional
|
||||
// index, because it has no values.
|
||||
// * doc_id_range may cover a range of docids which are non existent
|
||||
// => rank
|
||||
// will give us the next document outside the range with a value. They both
|
||||
// get the same rank and therefore return a zero range
|
||||
//
|
||||
// * doc_id_range.start and doc_id_range.end may not exist, but docids in
|
||||
// between may have values
|
||||
// => rank will give us the next document outside the range with a value.
|
||||
//
|
||||
// * doc_id_range.start may be not existent but doc_id_range.end may exist
|
||||
// * doc_id_range.start may exist but doc_id_range.end may not exist
|
||||
// * doc_id_range.start and doc_id_range.end may exist
|
||||
// => rank on doc_id_range.end will give us the next value, which matches
|
||||
// how the `start_index_column` works, so we get the value start of the next
|
||||
// docid which we use to create the exclusive range.
|
||||
//
|
||||
let rank_start = index.optional_index.rank(doc_id_range.start);
|
||||
let row_start = index.start_index_column.get_val(rank_start);
|
||||
let rank_end = index.optional_index.rank(doc_id_range.end);
|
||||
let row_end = index.start_index_column.get_val(rank_end);
|
||||
|
||||
let row_start = multivalued_index.start_index_column.get_val(start_docid);
|
||||
let row_end = multivalued_index.start_index_column.get_val(end_docid);
|
||||
|
||||
row_start..row_end
|
||||
}
|
||||
row_start..row_end
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3,64 +3,98 @@ use std::io::Write;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::OwnedBytes;
|
||||
use common::{CountingWriter, OwnedBytes};
|
||||
|
||||
use super::optional_index::{open_optional_index, serialize_optional_index};
|
||||
use super::{OptionalIndex, SerializableOptionalIndex, Set};
|
||||
use crate::column_values::{
|
||||
load_u64_based_column_values, serialize_u64_based_column_values, CodecType, ColumnValues,
|
||||
};
|
||||
use crate::iterable::Iterable;
|
||||
use crate::{DocId, RowId};
|
||||
use crate::{DocId, RowId, Version};
|
||||
|
||||
/// Borrowed data handed to `serialize_multivalued_index`.
pub struct SerializableMultivalueIndex<'a> {
|
||||
/// Optional-index part (row ids plus row count). In `MultiValueIndex::for_test` it lists
/// the documents that have at least one value.
pub doc_ids_with_values: SerializableOptionalIndex<'a>,
|
||||
/// Start offsets. In `MultiValueIndex::for_test` they start with `0`, followed by one end
/// offset per document that has values.
pub start_offsets: Box<dyn Iterable<u32> + 'a>,
|
||||
}
|
||||
|
||||
pub fn serialize_multivalued_index(
|
||||
multivalued_index: &dyn Iterable<RowId>,
|
||||
multivalued_index: &SerializableMultivalueIndex,
|
||||
output: &mut impl Write,
|
||||
) -> io::Result<()> {
|
||||
let SerializableMultivalueIndex {
|
||||
doc_ids_with_values,
|
||||
start_offsets,
|
||||
} = multivalued_index;
|
||||
let mut count_writer = CountingWriter::wrap(output);
|
||||
let SerializableOptionalIndex {
|
||||
non_null_row_ids,
|
||||
num_rows,
|
||||
} = doc_ids_with_values;
|
||||
serialize_optional_index(&**non_null_row_ids, *num_rows, &mut count_writer)?;
|
||||
let optional_len = count_writer.written_bytes() as u32;
|
||||
let output = count_writer.finish();
|
||||
serialize_u64_based_column_values(
|
||||
multivalued_index,
|
||||
&**start_offsets,
|
||||
&[CodecType::Bitpacked, CodecType::Linear],
|
||||
output,
|
||||
)?;
|
||||
output.write_all(&optional_len.to_le_bytes())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn open_multivalued_index(bytes: OwnedBytes) -> io::Result<MultiValueIndex> {
|
||||
let start_index_column: Arc<dyn ColumnValues<RowId>> = load_u64_based_column_values(bytes)?;
|
||||
Ok(MultiValueIndex { start_index_column })
|
||||
pub fn open_multivalued_index(
|
||||
bytes: OwnedBytes,
|
||||
format_version: Version,
|
||||
) -> io::Result<MultiValueIndex> {
|
||||
match format_version {
|
||||
Version::V1 => {
|
||||
let start_index_column: Arc<dyn ColumnValues<RowId>> =
|
||||
load_u64_based_column_values(bytes)?;
|
||||
Ok(MultiValueIndex::MultiValueIndexV1(MultiValueIndexV1 {
|
||||
start_index_column,
|
||||
}))
|
||||
}
|
||||
Version::V2 => {
|
||||
let (body_bytes, optional_index_len) = bytes.rsplit(4);
|
||||
let optional_index_len =
|
||||
u32::from_le_bytes(optional_index_len.as_slice().try_into().unwrap());
|
||||
let (optional_index_bytes, start_index_bytes) =
|
||||
body_bytes.split(optional_index_len as usize);
|
||||
let optional_index = open_optional_index(optional_index_bytes)?;
|
||||
let start_index_column: Arc<dyn ColumnValues<RowId>> =
|
||||
load_u64_based_column_values(start_index_bytes)?;
|
||||
Ok(MultiValueIndex::MultiValueIndexV2(MultiValueIndexV2 {
|
||||
optional_index,
|
||||
start_index_column,
|
||||
}))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
/// Index to resolve value range for given doc_id.
|
||||
/// Starts at 0.
|
||||
pub struct MultiValueIndex {
|
||||
pub enum MultiValueIndex {
|
||||
MultiValueIndexV1(MultiValueIndexV1),
|
||||
MultiValueIndexV2(MultiValueIndexV2),
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
/// Index to resolve value range for given doc_id.
|
||||
/// Starts at 0.
|
||||
pub struct MultiValueIndexV1 {
|
||||
pub start_index_column: Arc<dyn crate::ColumnValues<RowId>>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for MultiValueIndex {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
f.debug_struct("MultiValuedIndex")
|
||||
.field("num_rows", &self.start_index_column.num_vals())
|
||||
.finish_non_exhaustive()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Arc<dyn ColumnValues<RowId>>> for MultiValueIndex {
|
||||
fn from(start_index_column: Arc<dyn ColumnValues<RowId>>) -> Self {
|
||||
MultiValueIndex { start_index_column }
|
||||
}
|
||||
}
|
||||
|
||||
impl MultiValueIndex {
|
||||
pub fn for_test(start_offsets: &[RowId]) -> MultiValueIndex {
|
||||
let mut buffer = Vec::new();
|
||||
serialize_multivalued_index(&start_offsets, &mut buffer).unwrap();
|
||||
let bytes = OwnedBytes::new(buffer);
|
||||
open_multivalued_index(bytes).unwrap()
|
||||
}
|
||||
|
||||
impl MultiValueIndexV1 {
|
||||
/// Returns `[start, end)`, such that the values associated with
|
||||
/// the given document are `start..end`.
|
||||
#[inline]
|
||||
pub(crate) fn range(&self, doc_id: DocId) -> Range<RowId> {
|
||||
if doc_id >= self.num_docs() {
|
||||
return 0..0;
|
||||
}
|
||||
let start = self.start_index_column.get_val(doc_id);
|
||||
let end = self.start_index_column.get_val(doc_id + 1);
|
||||
start..end
|
||||
@@ -83,7 +117,6 @@ impl MultiValueIndex {
|
||||
///
|
||||
/// TODO: Instead of a linear scan we can employ an exponential search into binary search to
|
||||
/// match a docid to its value position.
|
||||
#[allow(clippy::bool_to_int_with_if)]
|
||||
pub(crate) fn select_batch_in_place(&self, docid_start: DocId, ranks: &mut Vec<u32>) {
|
||||
if ranks.is_empty() {
|
||||
return;
|
||||
@@ -111,11 +144,170 @@ impl MultiValueIndex {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
/// Index to resolve value range for given doc_id.
|
||||
/// Starts at 0.
|
||||
///
/// A doc id is first converted to its rank in `optional_index`; that rank is then used to
/// read the value range from `start_index_column`.
pub struct MultiValueIndexV2 {
|
||||
/// Queried with `rank` / `rank_if_exists` to turn a doc id into a position in
/// `start_index_column`.
pub optional_index: OptionalIndex,
|
||||
/// Start offsets: the entries at `rank` and `rank + 1` delimit the values of a document.
pub start_index_column: Arc<dyn crate::ColumnValues<RowId>>,
|
||||
}
|
||||
|
||||
/// Debug output only shows `num_rows`, taken from `num_vals()` of the start index column of
/// whichever variant is held.
impl std::fmt::Debug for MultiValueIndex {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
// Both variants expose a `start_index_column`; pick the one that is held.
let index = match self {
|
||||
MultiValueIndex::MultiValueIndexV1(idx) => &idx.start_index_column,
|
||||
MultiValueIndex::MultiValueIndexV2(idx) => &idx.start_index_column,
|
||||
};
|
||||
f.debug_struct("MultiValuedIndex")
|
||||
.field("num_rows", &index.num_vals())
|
||||
.finish_non_exhaustive()
|
||||
}
|
||||
}
|
||||
|
||||
impl MultiValueIndex {
|
||||
/// Builds a `MultiValueIndex` from dense start offsets (one entry per document plus a
/// final end offset) by serializing it and reopening it with `Version::V2`.
///
/// # Panics
///
/// Panics if `start_offsets` is empty or does not start with `0`.
pub fn for_test(start_offsets: &[RowId]) -> MultiValueIndex {
|
||||
assert!(!start_offsets.is_empty());
|
||||
assert_eq!(start_offsets[0], 0);
|
||||
let mut doc_with_values = Vec::new();
|
||||
let mut compact_start_offsets: Vec<u32> = vec![0];
|
||||
// Keep only the documents that have at least one value; their end offsets form the
// compact start offsets.
for doc in 0..start_offsets.len() - 1 {
|
||||
if start_offsets[doc] < start_offsets[doc + 1] {
|
||||
doc_with_values.push(doc as RowId);
|
||||
compact_start_offsets.push(start_offsets[doc + 1]);
|
||||
}
|
||||
}
|
||||
let serializable_multivalued_index = SerializableMultivalueIndex {
|
||||
doc_ids_with_values: SerializableOptionalIndex {
|
||||
non_null_row_ids: Box::new(&doc_with_values[..]),
|
||||
num_rows: start_offsets.len() as u32 - 1,
|
||||
},
|
||||
start_offsets: Box::new(&compact_start_offsets[..]),
|
||||
};
|
||||
let mut buffer = Vec::new();
|
||||
serialize_multivalued_index(&serializable_multivalued_index, &mut buffer).unwrap();
|
||||
let bytes = OwnedBytes::new(buffer);
|
||||
open_multivalued_index(bytes, Version::V2).unwrap()
|
||||
}
|
||||
|
||||
/// Returns the start index column of whichever variant is held.
pub fn get_start_index_column(&self) -> &Arc<dyn crate::ColumnValues<RowId>> {
|
||||
match self {
|
||||
MultiValueIndex::MultiValueIndexV1(idx) => &idx.start_index_column,
|
||||
MultiValueIndex::MultiValueIndexV2(idx) => &idx.start_index_column,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `[start, end)` values range, such that the values associated with
|
||||
/// the given document are `start..end`.
|
||||
#[inline]
|
||||
pub(crate) fn range(&self, doc_id: DocId) -> Range<RowId> {
|
||||
match self {
|
||||
MultiValueIndex::MultiValueIndexV1(idx) => idx.range(doc_id),
|
||||
MultiValueIndex::MultiValueIndexV2(idx) => idx.range(doc_id),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the number of documents in the index.
|
||||
#[inline]
|
||||
pub fn num_docs(&self) -> u32 {
|
||||
match self {
|
||||
// V1 derives the count from the start index column: number of entries minus one.
MultiValueIndex::MultiValueIndexV1(idx) => idx.start_index_column.num_vals() - 1,
|
||||
// V2 asks the optional index.
MultiValueIndex::MultiValueIndexV2(idx) => idx.optional_index.num_docs(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
|
||||
/// docids. Positions are converted in place to docids.
|
||||
///
|
||||
/// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the
|
||||
/// index.
|
||||
///
|
||||
/// Correctness: positions need to be sorted. idx_reader needs to contain monotonically
|
||||
/// increasing positions.
|
||||
///
|
||||
/// TODO: Instead of a linear scan we can employ an exponential search into binary search to
|
||||
/// match a docid to its value position.
|
||||
pub(crate) fn select_batch_in_place(&self, docid_start: DocId, ranks: &mut Vec<u32>) {
|
||||
match self {
|
||||
MultiValueIndex::MultiValueIndexV1(idx) => {
|
||||
idx.select_batch_in_place(docid_start, ranks)
|
||||
}
|
||||
MultiValueIndex::MultiValueIndexV2(idx) => {
|
||||
idx.select_batch_in_place(docid_start, ranks)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
impl MultiValueIndexV2 {
|
||||
/// Returns `[start, end)`, such that the values associated with
|
||||
/// the given document are `start..end`.
|
||||
///
/// Returns `0..0` when `doc_id` is not contained in the optional index.
#[inline]
|
||||
pub(crate) fn range(&self, doc_id: DocId) -> Range<RowId> {
|
||||
let Some(rank) = self.optional_index.rank_if_exists(doc_id) else {
|
||||
return 0..0;
|
||||
};
|
||||
// The start index column is addressed by rank, not by doc id.
let start = self.start_index_column.get_val(rank);
|
||||
let end = self.start_index_column.get_val(rank + 1);
|
||||
start..end
|
||||
}
|
||||
|
||||
/// Returns the number of documents in the index.
|
||||
#[inline]
|
||||
pub fn num_docs(&self) -> u32 {
|
||||
self.optional_index.num_docs()
|
||||
}
|
||||
|
||||
/// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
|
||||
/// docids. Positions are converted in place to docids.
|
||||
///
|
||||
/// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the
|
||||
/// index.
|
||||
///
|
||||
/// Correctness: positions need to be sorted. idx_reader needs to contain monotonically
|
||||
/// increasing positions.
|
||||
///
|
||||
/// TODO: Instead of a linear scan we can employ an exponential search into binary search to
|
||||
/// match a docid to its value position.
|
||||
pub(crate) fn select_batch_in_place(&self, docid_start: DocId, ranks: &mut Vec<u32>) {
|
||||
if ranks.is_empty() {
|
||||
return;
|
||||
}
|
||||
// Scan position in `start_index_column`, starting at the rank of `docid_start` in the
// optional index.
let mut cur_pos_in_idx = self.optional_index.rank(docid_start);
|
||||
let mut last_doc = None;
|
||||
|
||||
assert!(cur_pos_in_idx <= ranks[0]);
|
||||
|
||||
let mut write_doc_pos = 0;
|
||||
for i in 0..ranks.len() {
|
||||
let pos = ranks[i];
|
||||
// Advance until the entry whose end offset lies past `pos`.
loop {
|
||||
let end = self.start_index_column.get_val(cur_pos_in_idx + 1);
|
||||
if end > pos {
|
||||
ranks[write_doc_pos] = cur_pos_in_idx;
|
||||
// Only advance the write cursor when this entry differs from the previously written
// one, so that each entry is recorded once.
write_doc_pos += if last_doc == Some(cur_pos_in_idx) {
|
||||
0
|
||||
} else {
|
||||
1
|
||||
};
|
||||
last_doc = Some(cur_pos_in_idx);
|
||||
break;
|
||||
}
|
||||
cur_pos_in_idx += 1;
|
||||
}
|
||||
}
|
||||
ranks.truncate(write_doc_pos);
|
||||
|
||||
// `ranks` now holds positions in the optional index; translate them with `select`.
for rank in ranks.iter_mut() {
|
||||
*rank = self.optional_index.select(*rank);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::ops::Range;
|
||||
|
||||
use super::MultiValueIndex;
|
||||
use crate::{ColumnarReader, DynamicColumn};
|
||||
|
||||
fn index_to_pos_helper(
|
||||
index: &MultiValueIndex,
|
||||
@@ -134,6 +326,7 @@ mod tests {
|
||||
let positions = &[10u32, 11, 15, 20, 21, 22];
|
||||
assert_eq!(index_to_pos_helper(&index, 0..5, positions), vec![1, 3, 4]);
|
||||
assert_eq!(index_to_pos_helper(&index, 1..5, positions), vec![1, 3, 4]);
|
||||
|
||||
assert_eq!(index_to_pos_helper(&index, 0..5, &[9]), vec![0]);
|
||||
assert_eq!(index_to_pos_helper(&index, 1..5, &[10]), vec![1]);
|
||||
assert_eq!(index_to_pos_helper(&index, 1..5, &[11]), vec![1]);
|
||||
@@ -141,4 +334,67 @@ mod tests {
|
||||
assert_eq!(index_to_pos_helper(&index, 2..5, &[12, 14]), vec![2]);
|
||||
assert_eq!(index_to_pos_helper(&index, 2..5, &[12, 14, 15]), vec![2, 3]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_range_to_rowids() {
|
||||
use crate::ColumnarWriter;
|
||||
|
||||
let mut columnar_writer = ColumnarWriter::default();
|
||||
|
||||
// This column gets coerced to u64
|
||||
columnar_writer.record_numerical(1, "full", u64::MAX);
|
||||
columnar_writer.record_numerical(1, "full", u64::MAX);
|
||||
|
||||
columnar_writer.record_numerical(5, "full", u64::MAX);
|
||||
columnar_writer.record_numerical(5, "full", u64::MAX);
|
||||
|
||||
let mut wrt: Vec<u8> = Vec::new();
|
||||
columnar_writer.serialize(7, &mut wrt).unwrap();
|
||||
|
||||
let reader = ColumnarReader::open(wrt).unwrap();
|
||||
// Open the column as u64
|
||||
let column = reader.read_columns("full").unwrap()[0]
|
||||
.open()
|
||||
.unwrap()
|
||||
.coerce_numerical(crate::NumericalType::U64)
|
||||
.unwrap();
|
||||
let DynamicColumn::U64(column) = column else {
|
||||
panic!();
|
||||
};
|
||||
|
||||
let row_id_range = column.index.docid_range_to_rowids(1..2);
|
||||
assert_eq!(row_id_range, 0..2);
|
||||
|
||||
let row_id_range = column.index.docid_range_to_rowids(0..2);
|
||||
assert_eq!(row_id_range, 0..2);
|
||||
|
||||
let row_id_range = column.index.docid_range_to_rowids(0..4);
|
||||
assert_eq!(row_id_range, 0..2);
|
||||
|
||||
let row_id_range = column.index.docid_range_to_rowids(3..4);
|
||||
assert_eq!(row_id_range, 2..2);
|
||||
|
||||
let row_id_range = column.index.docid_range_to_rowids(1..6);
|
||||
assert_eq!(row_id_range, 0..4);
|
||||
|
||||
let row_id_range = column.index.docid_range_to_rowids(3..6);
|
||||
assert_eq!(row_id_range, 2..4);
|
||||
|
||||
let row_id_range = column.index.docid_range_to_rowids(0..6);
|
||||
assert_eq!(row_id_range, 0..4);
|
||||
|
||||
let row_id_range = column.index.docid_range_to_rowids(0..6);
|
||||
assert_eq!(row_id_range, 0..4);
|
||||
|
||||
let check = |range, expected| {
|
||||
let full_range = 0..=u64::MAX;
|
||||
let mut docids = Vec::new();
|
||||
column.get_docids_for_value_range(full_range, range, &mut docids);
|
||||
assert_eq!(docids, expected);
|
||||
};
|
||||
|
||||
// check(0..1, vec![]);
|
||||
// check(0..2, vec![1]);
|
||||
check(1..2, vec![1]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -86,8 +86,14 @@ pub struct OptionalIndex {
|
||||
block_metas: Arc<[BlockMeta]>,
|
||||
}
|
||||
|
||||
/// Lets a borrowed `OptionalIndex` be used as an `Iterable<u32>`; iteration goes through
/// `iter_rows()`.
impl<'a> Iterable<u32> for &'a OptionalIndex {
|
||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
||||
Box::new(self.iter_rows())
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for OptionalIndex {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
f.debug_struct("OptionalIndex")
|
||||
.field("num_rows", &self.num_rows)
|
||||
.field("num_non_null_rows", &self.num_non_null_rows)
|
||||
@@ -196,6 +202,7 @@ impl Set<RowId> for OptionalIndex {
|
||||
} = row_addr_from_row_id(doc_id);
|
||||
let block_meta = self.block_metas[block_id as usize];
|
||||
let block = self.block(block_meta);
|
||||
|
||||
let block_offset_row_id = match block {
|
||||
Block::Dense(dense_block) => dense_block.rank(in_block_row_id),
|
||||
Block::Sparse(sparse_block) => sparse_block.rank(in_block_row_id),
|
||||
|
||||
@@ -28,10 +28,11 @@ pub trait Set<T> {
|
||||
/// Returns true if the element is contained in the Set
|
||||
fn contains(&self, el: T) -> bool;
|
||||
|
||||
/// Returns the number of rows in the set that are < `el`
|
||||
/// Returns the element's rank (its position in the set).
|
||||
/// If the set does not contain the element, it will return the next existing element's rank.
|
||||
fn rank(&self, el: T) -> T;
|
||||
|
||||
/// If the set contains `el` returns the element rank.
|
||||
/// If the set contains `el`, returns the element's rank (its position in the set).
|
||||
/// If the set does not contain the element, it returns `None`.
|
||||
fn rank_if_exists(&self, el: T) -> Option<T>;
|
||||
|
||||
|
||||
@@ -22,8 +22,8 @@ fn test_set_helper<C: SetCodec<Item = u16>>(vals: &[u16]) -> usize {
|
||||
vals.iter().cloned().take_while(|v| *v < val).count() as u16
|
||||
);
|
||||
}
|
||||
for rank in 0..vals.len() {
|
||||
assert_eq!(tested_set.select(rank as u16), vals[rank]);
|
||||
for (rank, val) in vals.iter().enumerate() {
|
||||
assert_eq!(tested_set.select(rank as u16), *val);
|
||||
}
|
||||
buffer.len()
|
||||
}
|
||||
@@ -107,3 +107,41 @@ fn test_simple_translate_codec_idx_to_original_idx_dense() {
|
||||
assert_eq!(i, select_cursor.select(i));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_simple_translate_idx_to_value_idx_dense() {
|
||||
let mut buffer = Vec::new();
|
||||
DenseBlockCodec::serialize([1, 10].iter().copied(), &mut buffer).unwrap();
|
||||
let tested_set = DenseBlockCodec::open(buffer.as_slice());
|
||||
assert!(tested_set.contains(1));
|
||||
assert!(!tested_set.contains(2));
|
||||
assert_eq!(tested_set.rank(0), 0);
|
||||
assert_eq!(tested_set.rank(1), 0);
|
||||
for rank in 2..10 {
|
||||
// ranks that don't exist select the next highest one
|
||||
assert_eq!(tested_set.rank_if_exists(rank), None);
|
||||
assert_eq!(tested_set.rank(rank), 1);
|
||||
}
|
||||
assert_eq!(tested_set.rank(10), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_simple_translate_idx_to_value_idx_sparse() {
|
||||
let mut buffer = Vec::new();
|
||||
SparseBlockCodec::serialize([1, 10].iter().copied(), &mut buffer).unwrap();
|
||||
let tested_set = SparseBlockCodec::open(buffer.as_slice());
|
||||
assert!(tested_set.contains(1));
|
||||
assert!(!tested_set.contains(2));
|
||||
assert_eq!(tested_set.rank(0), 0);
|
||||
assert_eq!(tested_set.select(tested_set.rank(0)), 1);
|
||||
assert_eq!(tested_set.rank(1), 0);
|
||||
assert_eq!(tested_set.select(tested_set.rank(1)), 1);
|
||||
for rank in 2..10 {
|
||||
// ranks that don't exist select the next highest one
|
||||
assert_eq!(tested_set.rank_if_exists(rank), None);
|
||||
assert_eq!(tested_set.rank(rank), 1);
|
||||
assert_eq!(tested_set.select(tested_set.rank(rank)), 10);
|
||||
}
|
||||
assert_eq!(tested_set.rank(10), 1);
|
||||
assert_eq!(tested_set.select(tested_set.rank(10)), 10);
|
||||
}
|
||||
|
||||
@@ -15,9 +15,7 @@ fn test_optional_index_with_num_docs(num_docs: u32) {
|
||||
let mut dataframe_writer = ColumnarWriter::default();
|
||||
dataframe_writer.record_numerical(100, "score", 80i64);
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer
|
||||
.serialize(num_docs, None, &mut buffer)
|
||||
.unwrap();
|
||||
dataframe_writer.serialize(num_docs, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar.num_columns(), 1);
|
||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("score").unwrap();
|
||||
|
||||
@@ -3,28 +3,39 @@ use std::io::Write;
|
||||
|
||||
use common::{CountingWriter, OwnedBytes};
|
||||
|
||||
use super::multivalued_index::SerializableMultivalueIndex;
|
||||
use super::OptionalIndex;
|
||||
use crate::column_index::multivalued_index::serialize_multivalued_index;
|
||||
use crate::column_index::optional_index::serialize_optional_index;
|
||||
use crate::column_index::ColumnIndex;
|
||||
use crate::iterable::Iterable;
|
||||
use crate::{Cardinality, RowId};
|
||||
use crate::{Cardinality, RowId, Version};
|
||||
|
||||
/// Borrowed data needed to serialize an optional index: the row ids to store and the total
/// number of rows. Both fields are passed on to `serialize_optional_index`.
pub struct SerializableOptionalIndex<'a> {
|
||||
/// Iterable over the non-null row ids.
pub non_null_row_ids: Box<dyn Iterable<RowId> + 'a>,
|
||||
/// Total number of rows.
pub num_rows: RowId,
|
||||
}
|
||||
|
||||
impl<'a> From<&'a OptionalIndex> for SerializableOptionalIndex<'a> {
|
||||
/// Borrows the optional index itself as the row-id iterable and takes `num_rows` from
/// `num_docs()`.
fn from(optional_index: &'a OptionalIndex) -> Self {
|
||||
SerializableOptionalIndex {
|
||||
non_null_row_ids: Box::new(optional_index),
|
||||
num_rows: optional_index.num_docs(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum SerializableColumnIndex<'a> {
|
||||
Full,
|
||||
Optional {
|
||||
non_null_row_ids: Box<dyn Iterable<RowId> + 'a>,
|
||||
num_rows: RowId,
|
||||
},
|
||||
// TODO remove the Arc<dyn> apart from serialization this is not
|
||||
// dynamic at all.
|
||||
Multivalued(Box<dyn Iterable<RowId> + 'a>),
|
||||
Optional(SerializableOptionalIndex<'a>),
|
||||
Multivalued(SerializableMultivalueIndex<'a>),
|
||||
}
|
||||
|
||||
impl<'a> SerializableColumnIndex<'a> {
|
||||
pub fn get_cardinality(&self) -> Cardinality {
|
||||
match self {
|
||||
SerializableColumnIndex::Full => Cardinality::Full,
|
||||
SerializableColumnIndex::Optional { .. } => Cardinality::Optional,
|
||||
SerializableColumnIndex::Optional(_) => Cardinality::Optional,
|
||||
SerializableColumnIndex::Multivalued(_) => Cardinality::Multivalued,
|
||||
}
|
||||
}
|
||||
@@ -40,12 +51,12 @@ pub fn serialize_column_index(
|
||||
output.write_all(&[cardinality])?;
|
||||
match column_index {
|
||||
SerializableColumnIndex::Full => {}
|
||||
SerializableColumnIndex::Optional {
|
||||
SerializableColumnIndex::Optional(SerializableOptionalIndex {
|
||||
non_null_row_ids,
|
||||
num_rows,
|
||||
} => serialize_optional_index(non_null_row_ids.as_ref(), num_rows, &mut output)?,
|
||||
}) => serialize_optional_index(non_null_row_ids.as_ref(), num_rows, &mut output)?,
|
||||
SerializableColumnIndex::Multivalued(multivalued_index) => {
|
||||
serialize_multivalued_index(&*multivalued_index, &mut output)?
|
||||
serialize_multivalued_index(&multivalued_index, &mut output)?
|
||||
}
|
||||
}
|
||||
let column_index_num_bytes = output.written_bytes() as u32;
|
||||
@@ -53,7 +64,10 @@ pub fn serialize_column_index(
|
||||
}
|
||||
|
||||
/// Open a serialized column index.
|
||||
pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex> {
|
||||
pub fn open_column_index(
|
||||
mut bytes: OwnedBytes,
|
||||
format_version: Version,
|
||||
) -> io::Result<ColumnIndex> {
|
||||
if bytes.is_empty() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::UnexpectedEof,
|
||||
@@ -70,7 +84,8 @@ pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex> {
|
||||
Ok(ColumnIndex::Optional(optional_index))
|
||||
}
|
||||
Cardinality::Multivalued => {
|
||||
let multivalue_index = super::multivalued_index::open_multivalued_index(bytes)?;
|
||||
let multivalue_index =
|
||||
super::multivalued_index::open_multivalued_index(bytes, format_version)?;
|
||||
Ok(ColumnIndex::Multivalued(multivalue_index))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -34,6 +34,7 @@ fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats {
|
||||
fn value_iter() -> impl Iterator<Item = u64> {
|
||||
0..20_000
|
||||
}
|
||||
|
||||
fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues {
|
||||
let mut bytes = Vec::new();
|
||||
let stats = compute_stats(data.iter().cloned());
|
||||
@@ -41,10 +42,13 @@ fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues
|
||||
for val in data {
|
||||
codec_serializer.collect(*val);
|
||||
}
|
||||
codec_serializer.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes);
|
||||
codec_serializer
|
||||
.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes)
|
||||
.unwrap();
|
||||
|
||||
Codec::load(OwnedBytes::new(bytes)).unwrap()
|
||||
}
|
||||
|
||||
fn bench_get<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
|
||||
let col = get_reader_for_bench::<Codec>(data);
|
||||
b.iter(|| {
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
use core::fmt;
|
||||
use std::fmt::{Display, Formatter};
|
||||
|
||||
use crate::InvalidData;
|
||||
|
||||
pub const VERSION_FOOTER_NUM_BYTES: usize = MAGIC_BYTES.len() + std::mem::size_of::<u32>();
|
||||
@@ -8,7 +11,7 @@ const MAGIC_BYTES: [u8; 4] = [2, 113, 119, 66];
|
||||
|
||||
pub fn footer() -> [u8; VERSION_FOOTER_NUM_BYTES] {
|
||||
let mut footer_bytes = [0u8; VERSION_FOOTER_NUM_BYTES];
|
||||
footer_bytes[0..4].copy_from_slice(&Version::V1.to_bytes());
|
||||
footer_bytes[0..4].copy_from_slice(&CURRENT_VERSION.to_bytes());
|
||||
footer_bytes[4..8].copy_from_slice(&MAGIC_BYTES[..]);
|
||||
footer_bytes
|
||||
}
|
||||
@@ -20,10 +23,22 @@ pub fn parse_footer(footer_bytes: [u8; VERSION_FOOTER_NUM_BYTES]) -> Result<Vers
|
||||
Version::try_from_bytes(footer_bytes[0..4].try_into().unwrap())
|
||||
}
|
||||
|
||||
pub const CURRENT_VERSION: Version = Version::V2;
|
||||
|
||||
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
|
||||
#[repr(u32)]
|
||||
pub enum Version {
|
||||
V1 = 1u32,
|
||||
V2 = 2u32,
|
||||
}
|
||||
|
||||
/// Formats the version as the lowercase string `v1` or `v2`.
impl Display for Version {
|
||||
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
|
||||
match self {
|
||||
Version::V1 => write!(f, "v1"),
|
||||
Version::V2 => write!(f, "v2"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Version {
|
||||
@@ -35,6 +50,7 @@ impl Version {
|
||||
let code = u32::from_le_bytes(bytes);
|
||||
match code {
|
||||
1u32 => Ok(Version::V1),
|
||||
2u32 => Ok(Version::V2),
|
||||
_ => Err(InvalidData),
|
||||
}
|
||||
}
|
||||
@@ -47,9 +63,9 @@ mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_footer_dserialization() {
|
||||
fn test_footer_deserialization() {
|
||||
let parsed_version: Version = parse_footer(footer()).unwrap();
|
||||
assert_eq!(Version::V1, parsed_version);
|
||||
assert_eq!(Version::V2, parsed_version);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -63,11 +79,10 @@ mod tests {
|
||||
for &i in &version_to_tests {
|
||||
let version_res = Version::try_from_bytes(i.to_le_bytes());
|
||||
if let Ok(version) = version_res {
|
||||
assert_eq!(version, Version::V1);
|
||||
assert_eq!(version.to_bytes(), i.to_le_bytes());
|
||||
valid_versions.insert(i);
|
||||
}
|
||||
}
|
||||
assert_eq!(valid_versions.len(), 1);
|
||||
assert_eq!(valid_versions.len(), 2);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,6 @@ use std::io;
|
||||
use std::net::Ipv6Addr;
|
||||
use std::sync::Arc;
|
||||
|
||||
use itertools::Itertools;
|
||||
pub use merge_mapping::{MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};
|
||||
|
||||
use super::writer::ColumnarSerializer;
|
||||
@@ -371,20 +370,8 @@ fn is_empty_after_merge(
|
||||
true
|
||||
}
|
||||
ColumnIndex::Multivalued(multivalued_index) => {
|
||||
for (doc_id, (start_index, end_index)) in multivalued_index
|
||||
.start_index_column
|
||||
.iter()
|
||||
.tuple_windows()
|
||||
.enumerate()
|
||||
{
|
||||
let doc_id = doc_id as u32;
|
||||
if start_index == end_index {
|
||||
// There are no values in this document
|
||||
continue;
|
||||
}
|
||||
// The document contains values and is present in the alive bitset.
|
||||
// The column is therefore not empty.
|
||||
if alive_bitset.contains(doc_id) {
|
||||
for alive_docid in alive_bitset.iter() {
|
||||
if !multivalued_index.range(alive_docid).is_empty() {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
use itertools::Itertools;
|
||||
|
||||
use super::*;
|
||||
use crate::{Cardinality, ColumnarWriter, HasAssociatedColumnType, RowId};
|
||||
|
||||
@@ -12,7 +14,7 @@ fn make_columnar<T: Into<NumericalValue> + HasAssociatedColumnType + Copy>(
|
||||
}
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer
|
||||
.serialize(vals.len() as RowId, None, &mut buffer)
|
||||
.serialize(vals.len() as RowId, &mut buffer)
|
||||
.unwrap();
|
||||
ColumnarReader::open(buffer).unwrap()
|
||||
}
|
||||
@@ -157,9 +159,7 @@ fn make_numerical_columnar_multiple_columns(
|
||||
.max()
|
||||
.unwrap_or(0u32);
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer
|
||||
.serialize(num_rows, None, &mut buffer)
|
||||
.unwrap();
|
||||
dataframe_writer.serialize(num_rows, &mut buffer).unwrap();
|
||||
ColumnarReader::open(buffer).unwrap()
|
||||
}
|
||||
|
||||
@@ -182,9 +182,7 @@ fn make_byte_columnar_multiple_columns(
|
||||
}
|
||||
}
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer
|
||||
.serialize(num_rows, None, &mut buffer)
|
||||
.unwrap();
|
||||
dataframe_writer.serialize(num_rows, &mut buffer).unwrap();
|
||||
ColumnarReader::open(buffer).unwrap()
|
||||
}
|
||||
|
||||
@@ -203,9 +201,7 @@ fn make_text_columnar_multiple_columns(columns: &[(&str, &[&[&str]])]) -> Column
|
||||
.max()
|
||||
.unwrap_or(0u32);
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer
|
||||
.serialize(num_rows, None, &mut buffer)
|
||||
.unwrap();
|
||||
dataframe_writer.serialize(num_rows, &mut buffer).unwrap();
|
||||
ColumnarReader::open(buffer).unwrap()
|
||||
}
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ mod reader;
|
||||
mod writer;
|
||||
|
||||
pub use column_type::{ColumnType, HasAssociatedColumnType};
|
||||
pub use format_version::{Version, CURRENT_VERSION};
|
||||
#[cfg(test)]
|
||||
pub(crate) use merge::ColumnTypeCategory;
|
||||
pub use merge::{merge_columnar, MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};
|
||||
|
||||
@@ -6,7 +6,7 @@ use sstable::{Dictionary, RangeSSTable};
|
||||
|
||||
use crate::columnar::{format_version, ColumnType};
|
||||
use crate::dynamic_column::DynamicColumnHandle;
|
||||
use crate::RowId;
|
||||
use crate::{RowId, Version};
|
||||
|
||||
fn io_invalid_data(msg: String) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::InvalidData, msg)
|
||||
@@ -19,6 +19,7 @@ pub struct ColumnarReader {
|
||||
column_dictionary: Dictionary<RangeSSTable>,
|
||||
column_data: FileSlice,
|
||||
num_rows: RowId,
|
||||
format_version: Version,
|
||||
}
|
||||
|
||||
impl fmt::Debug for ColumnarReader {
|
||||
@@ -53,6 +54,7 @@ impl fmt::Debug for ColumnarReader {
|
||||
fn read_all_columns_in_stream(
|
||||
mut stream: sstable::Streamer<'_, RangeSSTable>,
|
||||
column_data: &FileSlice,
|
||||
format_version: Version,
|
||||
) -> io::Result<Vec<DynamicColumnHandle>> {
|
||||
let mut results = Vec::new();
|
||||
while stream.advance() {
|
||||
@@ -67,6 +69,7 @@ fn read_all_columns_in_stream(
|
||||
let dynamic_column_handle = DynamicColumnHandle {
|
||||
file_slice,
|
||||
column_type,
|
||||
format_version,
|
||||
};
|
||||
results.push(dynamic_column_handle);
|
||||
}
|
||||
@@ -88,7 +91,7 @@ impl ColumnarReader {
|
||||
let num_rows = u32::deserialize(&mut &footer_bytes[8..12])?;
|
||||
let version_footer_bytes: [u8; format_version::VERSION_FOOTER_NUM_BYTES] =
|
||||
footer_bytes[12..].try_into().unwrap();
|
||||
let _version = format_version::parse_footer(version_footer_bytes)?;
|
||||
let format_version = format_version::parse_footer(version_footer_bytes)?;
|
||||
let (column_data, sstable) =
|
||||
file_slice_without_sstable_len.split_from_end(sstable_len as usize);
|
||||
let column_dictionary = Dictionary::open(sstable)?;
|
||||
@@ -96,6 +99,7 @@ impl ColumnarReader {
|
||||
column_dictionary,
|
||||
column_data,
|
||||
num_rows,
|
||||
format_version,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -126,6 +130,7 @@ impl ColumnarReader {
|
||||
let column_handle = DynamicColumnHandle {
|
||||
file_slice,
|
||||
column_type,
|
||||
format_version: self.format_version,
|
||||
};
|
||||
Some((column_name, column_handle))
|
||||
} else {
|
||||
@@ -167,7 +172,7 @@ impl ColumnarReader {
|
||||
.stream_for_column_range(column_name)
|
||||
.into_stream_async()
|
||||
.await?;
|
||||
read_all_columns_in_stream(stream, &self.column_data)
|
||||
read_all_columns_in_stream(stream, &self.column_data, self.format_version)
|
||||
}
|
||||
|
||||
/// Get all columns for the given column name.
|
||||
@@ -176,7 +181,7 @@ impl ColumnarReader {
|
||||
/// different types.
|
||||
pub fn read_columns(&self, column_name: &str) -> io::Result<Vec<DynamicColumnHandle>> {
|
||||
let stream = self.stream_for_column_range(column_name).into_stream()?;
|
||||
read_all_columns_in_stream(stream, &self.column_data)
|
||||
read_all_columns_in_stream(stream, &self.column_data, self.format_version)
|
||||
}
|
||||
|
||||
/// Return the number of columns in the columnar.
|
||||
@@ -195,7 +200,7 @@ mod tests {
|
||||
columnar_writer.record_column_type("col1", ColumnType::Str, false);
|
||||
columnar_writer.record_column_type("col2", ColumnType::U64, false);
|
||||
let mut buffer = Vec::new();
|
||||
columnar_writer.serialize(1, None, &mut buffer).unwrap();
|
||||
columnar_writer.serialize(1, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
let columns = columnar.list_columns().unwrap();
|
||||
assert_eq!(columns.len(), 2);
|
||||
@@ -211,7 +216,7 @@ mod tests {
|
||||
columnar_writer.record_column_type("count", ColumnType::U64, false);
|
||||
columnar_writer.record_numerical(1, "count", 1u64);
|
||||
let mut buffer = Vec::new();
|
||||
columnar_writer.serialize(2, None, &mut buffer).unwrap();
|
||||
columnar_writer.serialize(2, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
let columns = columnar.list_columns().unwrap();
|
||||
assert_eq!(columns.len(), 1);
|
||||
|
||||
@@ -41,31 +41,10 @@ impl ColumnWriter {
|
||||
pub(super) fn operation_iterator<'a, V: SymbolValue>(
|
||||
&self,
|
||||
arena: &MemoryArena,
|
||||
old_to_new_ids_opt: Option<&[RowId]>,
|
||||
buffer: &'a mut Vec<u8>,
|
||||
) -> impl Iterator<Item = ColumnOperation<V>> + 'a {
|
||||
buffer.clear();
|
||||
self.values.read_to_end(arena, buffer);
|
||||
if let Some(old_to_new_ids) = old_to_new_ids_opt {
|
||||
// TODO avoid the extra deserialization / serialization.
|
||||
let mut sorted_ops: Vec<(RowId, ColumnOperation<V>)> = Vec::new();
|
||||
let mut new_doc = 0u32;
|
||||
let mut cursor = &buffer[..];
|
||||
for op in std::iter::from_fn(|| ColumnOperation::<V>::deserialize(&mut cursor)) {
|
||||
if let ColumnOperation::NewDoc(doc) = &op {
|
||||
new_doc = old_to_new_ids[*doc as usize];
|
||||
sorted_ops.push((new_doc, ColumnOperation::NewDoc(new_doc)));
|
||||
} else {
|
||||
sorted_ops.push((new_doc, op));
|
||||
}
|
||||
}
|
||||
// stable sort is crucial here.
|
||||
sorted_ops.sort_by_key(|(new_doc_id, _)| *new_doc_id);
|
||||
buffer.clear();
|
||||
for (_, op) in sorted_ops {
|
||||
buffer.extend_from_slice(op.serialize().as_ref());
|
||||
}
|
||||
}
|
||||
let mut cursor: &[u8] = &buffer[..];
|
||||
std::iter::from_fn(move || ColumnOperation::deserialize(&mut cursor))
|
||||
}
|
||||
@@ -231,11 +210,9 @@ impl NumericalColumnWriter {
|
||||
pub(super) fn operation_iterator<'a>(
|
||||
self,
|
||||
arena: &MemoryArena,
|
||||
old_to_new_ids: Option<&[RowId]>,
|
||||
buffer: &'a mut Vec<u8>,
|
||||
) -> impl Iterator<Item = ColumnOperation<NumericalValue>> + 'a {
|
||||
self.column_writer
|
||||
.operation_iterator(arena, old_to_new_ids, buffer)
|
||||
self.column_writer.operation_iterator(arena, buffer)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -277,11 +254,9 @@ impl StrOrBytesColumnWriter {
|
||||
pub(super) fn operation_iterator<'a>(
|
||||
&self,
|
||||
arena: &MemoryArena,
|
||||
old_to_new_ids: Option<&[RowId]>,
|
||||
byte_buffer: &'a mut Vec<u8>,
|
||||
) -> impl Iterator<Item = ColumnOperation<UnorderedId>> + 'a {
|
||||
self.column_writer
|
||||
.operation_iterator(arena, old_to_new_ids, byte_buffer)
|
||||
self.column_writer.operation_iterator(arena, byte_buffer)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -8,11 +8,12 @@ use std::net::Ipv6Addr;
|
||||
|
||||
use column_operation::ColumnOperation;
|
||||
pub(crate) use column_writers::CompatibleNumericalTypes;
|
||||
use common::json_path_writer::JSON_END_OF_PATH;
|
||||
use common::CountingWriter;
|
||||
pub(crate) use serializer::ColumnarSerializer;
|
||||
use stacker::{Addr, ArenaHashMap, MemoryArena};
|
||||
|
||||
use crate::column_index::SerializableColumnIndex;
|
||||
use crate::column_index::{SerializableColumnIndex, SerializableOptionalIndex};
|
||||
use crate::column_values::{MonotonicallyMappableToU128, MonotonicallyMappableToU64};
|
||||
use crate::columnar::column_type::ColumnType;
|
||||
use crate::columnar::writer::column_writers::{
|
||||
@@ -43,7 +44,7 @@ struct SpareBuffers {
|
||||
/// columnar_writer.record_str(1u32 /* doc id */, "product_name", "Apple");
|
||||
/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10.5f64); //< uh oh we ended up mixing integer and floats.
|
||||
/// let mut wrt: Vec<u8> = Vec::new();
|
||||
/// columnar_writer.serialize(2u32, None, &mut wrt).unwrap();
|
||||
/// columnar_writer.serialize(2u32, &mut wrt).unwrap();
|
||||
/// ```
|
||||
#[derive(Default)]
|
||||
pub struct ColumnarWriter {
|
||||
@@ -75,63 +76,6 @@ impl ColumnarWriter {
|
||||
.sum::<usize>()
|
||||
}
|
||||
|
||||
/// Returns the list of doc ids from 0..num_docs sorted by the `sort_field`
|
||||
/// column.
|
||||
///
|
||||
/// If the column is multivalued, use the first value for scoring.
|
||||
/// If no value is associated to a specific row, the document is assigned
|
||||
/// the lowest possible score.
|
||||
///
|
||||
/// The sort applied is stable.
|
||||
pub fn sort_order(&self, sort_field: &str, num_docs: RowId, reversed: bool) -> Vec<u32> {
|
||||
let Some(numerical_col_writer) = self
|
||||
.numerical_field_hash_map
|
||||
.get::<NumericalColumnWriter>(sort_field.as_bytes())
|
||||
.or_else(|| {
|
||||
self.datetime_field_hash_map
|
||||
.get::<NumericalColumnWriter>(sort_field.as_bytes())
|
||||
})
|
||||
else {
|
||||
return Vec::new();
|
||||
};
|
||||
let mut symbols_buffer = Vec::new();
|
||||
let mut values = Vec::new();
|
||||
let mut start_doc_check_fill = 0;
|
||||
let mut current_doc_opt: Option<RowId> = None;
|
||||
// Assumption: NewDoc will never call the same doc twice and is strictly increasing between
|
||||
// calls
|
||||
for op in numerical_col_writer.operation_iterator(&self.arena, None, &mut symbols_buffer) {
|
||||
match op {
|
||||
ColumnOperation::NewDoc(doc) => {
|
||||
current_doc_opt = Some(doc);
|
||||
}
|
||||
ColumnOperation::Value(numerical_value) => {
|
||||
if let Some(current_doc) = current_doc_opt {
|
||||
// Fill up with 0.0 since last doc
|
||||
values.extend((start_doc_check_fill..current_doc).map(|doc| (0.0, doc)));
|
||||
start_doc_check_fill = current_doc + 1;
|
||||
// handle multi values
|
||||
current_doc_opt = None;
|
||||
|
||||
let score: f32 = f64::coerce(numerical_value) as f32;
|
||||
values.push((score, current_doc));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for doc in values.len() as u32..num_docs {
|
||||
values.push((0.0f32, doc));
|
||||
}
|
||||
values.sort_by(|(left_score, _), (right_score, _)| {
|
||||
if reversed {
|
||||
right_score.total_cmp(left_score)
|
||||
} else {
|
||||
left_score.total_cmp(right_score)
|
||||
}
|
||||
});
|
||||
values.into_iter().map(|(_score, doc)| doc).collect()
|
||||
}
|
||||
|
||||
/// Records a column type. This is useful to bypass the coercion process,
|
||||
/// make sure an empty column is present in the resulting columnar, or set
|
||||
/// `sort_values_within_row`.
|
||||
@@ -302,13 +246,9 @@ impl ColumnarWriter {
|
||||
},
|
||||
);
|
||||
}
|
||||
pub fn serialize(
|
||||
&mut self,
|
||||
num_docs: RowId,
|
||||
old_to_new_row_ids: Option<&[RowId]>,
|
||||
wrt: &mut dyn io::Write,
|
||||
) -> io::Result<()> {
|
||||
pub fn serialize(&mut self, num_docs: RowId, wrt: &mut dyn io::Write) -> io::Result<()> {
|
||||
let mut serializer = ColumnarSerializer::new(wrt);
|
||||
|
||||
let mut columns: Vec<(&[u8], ColumnType, Addr)> = self
|
||||
.numerical_field_hash_map
|
||||
.iter()
|
||||
@@ -322,7 +262,7 @@ impl ColumnarWriter {
|
||||
columns.extend(
|
||||
self.bytes_field_hash_map
|
||||
.iter()
|
||||
.map(|(term, addr)| (term, ColumnType::Bytes, addr)),
|
||||
.map(|(column_name, addr)| (column_name, ColumnType::Bytes, addr)),
|
||||
);
|
||||
columns.extend(
|
||||
self.str_field_hash_map
|
||||
@@ -349,6 +289,12 @@ impl ColumnarWriter {
|
||||
let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries);
|
||||
let mut symbol_byte_buffer: Vec<u8> = Vec::new();
|
||||
for (column_name, column_type, addr) in columns {
|
||||
if column_name.contains(&JSON_END_OF_PATH) {
|
||||
// Tantivy reserves the `JSON_END_OF_PATH` byte for its JSON path encoding.
|
||||
// Column names containing that byte are simply ignored by the columnar (and the
|
||||
// inverted index).
|
||||
continue;
|
||||
}
|
||||
match column_type {
|
||||
ColumnType::Bool => {
|
||||
let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr);
|
||||
@@ -358,11 +304,7 @@ impl ColumnarWriter {
|
||||
serialize_bool_column(
|
||||
cardinality,
|
||||
num_docs,
|
||||
column_writer.operation_iterator(
|
||||
arena,
|
||||
old_to_new_row_ids,
|
||||
&mut symbol_byte_buffer,
|
||||
),
|
||||
column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
|
||||
buffers,
|
||||
&mut column_serializer,
|
||||
)?;
|
||||
@@ -376,11 +318,7 @@ impl ColumnarWriter {
|
||||
serialize_ip_addr_column(
|
||||
cardinality,
|
||||
num_docs,
|
||||
column_writer.operation_iterator(
|
||||
arena,
|
||||
old_to_new_row_ids,
|
||||
&mut symbol_byte_buffer,
|
||||
),
|
||||
column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
|
||||
buffers,
|
||||
&mut column_serializer,
|
||||
)?;
|
||||
@@ -405,11 +343,8 @@ impl ColumnarWriter {
|
||||
num_docs,
|
||||
str_or_bytes_column_writer.sort_values_within_row,
|
||||
dictionary_builder,
|
||||
str_or_bytes_column_writer.operation_iterator(
|
||||
arena,
|
||||
old_to_new_row_ids,
|
||||
&mut symbol_byte_buffer,
|
||||
),
|
||||
str_or_bytes_column_writer
|
||||
.operation_iterator(arena, &mut symbol_byte_buffer),
|
||||
buffers,
|
||||
&self.arena,
|
||||
&mut column_serializer,
|
||||
@@ -427,11 +362,7 @@ impl ColumnarWriter {
|
||||
cardinality,
|
||||
num_docs,
|
||||
numerical_type,
|
||||
numerical_column_writer.operation_iterator(
|
||||
arena,
|
||||
old_to_new_row_ids,
|
||||
&mut symbol_byte_buffer,
|
||||
),
|
||||
numerical_column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
|
||||
buffers,
|
||||
&mut column_serializer,
|
||||
)?;
|
||||
@@ -446,11 +377,7 @@ impl ColumnarWriter {
|
||||
cardinality,
|
||||
num_docs,
|
||||
NumericalType::I64,
|
||||
column_writer.operation_iterator(
|
||||
arena,
|
||||
old_to_new_row_ids,
|
||||
&mut symbol_byte_buffer,
|
||||
),
|
||||
column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
|
||||
buffers,
|
||||
&mut column_serializer,
|
||||
)?;
|
||||
@@ -635,16 +562,16 @@ fn send_to_serialize_column_mappable_to_u128<
|
||||
let optional_index_builder = value_index_builders.borrow_optional_index_builder();
|
||||
consume_operation_iterator(op_iterator, optional_index_builder, values);
|
||||
let optional_index = optional_index_builder.finish(num_rows);
|
||||
SerializableColumnIndex::Optional {
|
||||
SerializableColumnIndex::Optional(SerializableOptionalIndex {
|
||||
num_rows,
|
||||
non_null_row_ids: Box::new(optional_index),
|
||||
}
|
||||
})
|
||||
}
|
||||
Cardinality::Multivalued => {
|
||||
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
|
||||
consume_operation_iterator(op_iterator, multivalued_index_builder, values);
|
||||
let multivalued_index = multivalued_index_builder.finish(num_rows);
|
||||
SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
|
||||
let serializable_multivalued_index = multivalued_index_builder.finish(num_rows);
|
||||
SerializableColumnIndex::Multivalued(serializable_multivalued_index)
|
||||
}
|
||||
};
|
||||
crate::column::serialize_column_mappable_to_u128(
|
||||
@@ -655,15 +582,6 @@ fn send_to_serialize_column_mappable_to_u128<
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn sort_values_within_row_in_place(multivalued_index: &[RowId], values: &mut [u64]) {
|
||||
let mut start_index: usize = 0;
|
||||
for end_index in multivalued_index.iter().copied() {
|
||||
let end_index = end_index as usize;
|
||||
values[start_index..end_index].sort_unstable();
|
||||
start_index = end_index;
|
||||
}
|
||||
}
|
||||
|
||||
fn send_to_serialize_column_mappable_to_u64(
|
||||
op_iterator: impl Iterator<Item = ColumnOperation<u64>>,
|
||||
cardinality: Cardinality,
|
||||
@@ -687,19 +605,22 @@ fn send_to_serialize_column_mappable_to_u64(
|
||||
let optional_index_builder = value_index_builders.borrow_optional_index_builder();
|
||||
consume_operation_iterator(op_iterator, optional_index_builder, values);
|
||||
let optional_index = optional_index_builder.finish(num_rows);
|
||||
SerializableColumnIndex::Optional {
|
||||
SerializableColumnIndex::Optional(SerializableOptionalIndex {
|
||||
non_null_row_ids: Box::new(optional_index),
|
||||
num_rows,
|
||||
}
|
||||
})
|
||||
}
|
||||
Cardinality::Multivalued => {
|
||||
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
|
||||
consume_operation_iterator(op_iterator, multivalued_index_builder, values);
|
||||
let multivalued_index = multivalued_index_builder.finish(num_rows);
|
||||
let serializable_multivalued_index = multivalued_index_builder.finish(num_rows);
|
||||
if sort_values_within_row {
|
||||
sort_values_within_row_in_place(multivalued_index, values);
|
||||
sort_values_within_row_in_place(
|
||||
serializable_multivalued_index.start_offsets.boxed_iter(),
|
||||
values,
|
||||
);
|
||||
}
|
||||
SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
|
||||
SerializableColumnIndex::Multivalued(serializable_multivalued_index)
|
||||
}
|
||||
};
|
||||
crate::column::serialize_column_mappable_to_u64(
|
||||
@@ -710,6 +631,18 @@ fn send_to_serialize_column_mappable_to_u64(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn sort_values_within_row_in_place(
|
||||
multivalued_index: impl Iterator<Item = RowId>,
|
||||
values: &mut [u64],
|
||||
) {
|
||||
let mut start_index: usize = 0;
|
||||
for end_index in multivalued_index {
|
||||
let end_index = end_index as usize;
|
||||
values[start_index..end_index].sort_unstable();
|
||||
start_index = end_index;
|
||||
}
|
||||
}
|
||||
|
||||
fn coerce_numerical_symbol<T>(
|
||||
operation_iterator: impl Iterator<Item = ColumnOperation<NumericalValue>>,
|
||||
) -> impl Iterator<Item = ColumnOperation<u64>>
|
||||
@@ -757,7 +690,7 @@ mod tests {
|
||||
assert_eq!(column_writer.get_cardinality(3), Cardinality::Full);
|
||||
let mut buffer = Vec::new();
|
||||
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
|
||||
.operation_iterator(&arena, None, &mut buffer)
|
||||
.operation_iterator(&arena, &mut buffer)
|
||||
.collect();
|
||||
assert_eq!(symbols.len(), 6);
|
||||
assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));
|
||||
@@ -786,7 +719,7 @@ mod tests {
|
||||
assert_eq!(column_writer.get_cardinality(3), Cardinality::Optional);
|
||||
let mut buffer = Vec::new();
|
||||
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
|
||||
.operation_iterator(&arena, None, &mut buffer)
|
||||
.operation_iterator(&arena, &mut buffer)
|
||||
.collect();
|
||||
assert_eq!(symbols.len(), 4);
|
||||
assert!(matches!(symbols[0], ColumnOperation::NewDoc(1u32)));
|
||||
@@ -809,7 +742,7 @@ mod tests {
|
||||
assert_eq!(column_writer.get_cardinality(2), Cardinality::Optional);
|
||||
let mut buffer = Vec::new();
|
||||
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
|
||||
.operation_iterator(&arena, None, &mut buffer)
|
||||
.operation_iterator(&arena, &mut buffer)
|
||||
.collect();
|
||||
assert_eq!(symbols.len(), 2);
|
||||
assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));
|
||||
@@ -828,7 +761,7 @@ mod tests {
|
||||
assert_eq!(column_writer.get_cardinality(1), Cardinality::Multivalued);
|
||||
let mut buffer = Vec::new();
|
||||
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
|
||||
.operation_iterator(&arena, None, &mut buffer)
|
||||
.operation_iterator(&arena, &mut buffer)
|
||||
.collect();
|
||||
assert_eq!(symbols.len(), 3);
|
||||
assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
use common::json_path_writer::JSON_END_OF_PATH;
|
||||
use common::{BinarySerializable, CountingWriter};
|
||||
use sstable::value::RangeValueWriter;
|
||||
use sstable::RangeSSTable;
|
||||
@@ -18,13 +19,8 @@ pub struct ColumnarSerializer<W: io::Write> {
|
||||
/// code.
|
||||
fn prepare_key(key: &[u8], column_type: ColumnType, buffer: &mut Vec<u8>) {
|
||||
buffer.clear();
|
||||
// Convert 0 bytes to '0' string, as 0 bytes are reserved for the end of the path.
|
||||
if key.contains(&0u8) {
|
||||
buffer.extend(key.iter().map(|&b| if b == 0 { b'0' } else { b }));
|
||||
} else {
|
||||
buffer.extend_from_slice(key);
|
||||
}
|
||||
buffer.push(0u8);
|
||||
buffer.extend_from_slice(key);
|
||||
buffer.push(JSON_END_OF_PATH);
|
||||
buffer.push(column_type.to_code());
|
||||
}
|
||||
|
||||
@@ -97,18 +93,3 @@ impl<'a, W: io::Write> io::Write for ColumnSerializer<'a, W> {
|
||||
self.columnar_serializer.wrt.write_all(buf)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_prepare_key_bytes() {
|
||||
let mut buffer: Vec<u8> = b"somegarbage".to_vec();
|
||||
prepare_key(b"root\0child", ColumnType::Str, &mut buffer);
|
||||
assert_eq!(buffer.len(), 12);
|
||||
assert_eq!(&buffer[..10], b"root0child");
|
||||
assert_eq!(buffer[10], 0u8);
|
||||
assert_eq!(buffer[11], ColumnType::Str.to_code());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use crate::column_index::{SerializableMultivalueIndex, SerializableOptionalIndex};
|
||||
use crate::iterable::Iterable;
|
||||
use crate::RowId;
|
||||
|
||||
@@ -59,31 +60,47 @@ impl IndexBuilder for OptionalIndexBuilder {
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct MultivaluedIndexBuilder {
|
||||
start_offsets: Vec<RowId>,
|
||||
doc_with_values: Vec<RowId>,
|
||||
start_offsets: Vec<u32>,
|
||||
total_num_vals_seen: u32,
|
||||
current_row: RowId,
|
||||
current_row_has_value: bool,
|
||||
}
|
||||
|
||||
impl MultivaluedIndexBuilder {
|
||||
pub fn finish(&mut self, num_docs: RowId) -> &[u32] {
|
||||
self.start_offsets
|
||||
.resize(num_docs as usize + 1, self.total_num_vals_seen);
|
||||
&self.start_offsets[..]
|
||||
pub fn finish(&mut self, num_docs: RowId) -> SerializableMultivalueIndex<'_> {
|
||||
self.start_offsets.push(self.total_num_vals_seen);
|
||||
let non_null_row_ids: Box<dyn Iterable<RowId>> = Box::new(&self.doc_with_values[..]);
|
||||
SerializableMultivalueIndex {
|
||||
doc_ids_with_values: SerializableOptionalIndex {
|
||||
non_null_row_ids,
|
||||
num_rows: num_docs,
|
||||
},
|
||||
start_offsets: Box::new(&self.start_offsets[..]),
|
||||
}
|
||||
}
|
||||
|
||||
fn reset(&mut self) {
|
||||
self.doc_with_values.clear();
|
||||
self.start_offsets.clear();
|
||||
self.start_offsets.push(0u32);
|
||||
self.total_num_vals_seen = 0;
|
||||
self.current_row = 0;
|
||||
self.current_row_has_value = false;
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexBuilder for MultivaluedIndexBuilder {
|
||||
fn record_row(&mut self, row_id: RowId) {
|
||||
self.start_offsets
|
||||
.resize(row_id as usize + 1, self.total_num_vals_seen);
|
||||
self.current_row = row_id;
|
||||
self.current_row_has_value = false;
|
||||
}
|
||||
|
||||
fn record_value(&mut self) {
|
||||
if !self.current_row_has_value {
|
||||
self.current_row_has_value = true;
|
||||
self.doc_with_values.push(self.current_row);
|
||||
self.start_offsets.push(self.total_num_vals_seen);
|
||||
}
|
||||
self.total_num_vals_seen += 1;
|
||||
}
|
||||
}
|
||||
@@ -141,6 +158,32 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn test_multivalued_value_index_builder_simple() {
    // Records two values on row 0 of a single-row column and returns the start offsets
    // exposed by `finish`.
    fn two_values_on_first_row(builder: &mut MultivaluedIndexBuilder) -> Vec<u32> {
        builder.record_row(0u32);
        builder.record_value();
        builder.record_value();
        let start_offsets: Vec<u32> = builder.finish(1u32).start_offsets.boxed_iter().collect();
        start_offsets
    }

    let mut builder = MultivaluedIndexBuilder::default();
    let start_offsets = two_values_on_first_row(&mut builder);
    assert_eq!(&start_offsets, &[0, 2]);

    // The same sequence replayed after a `reset` must yield the same offsets.
    builder.reset();
    let start_offsets = two_values_on_first_row(&mut builder);
    assert_eq!(&start_offsets, &[0, 2]);
}
|
||||
|
||||
#[test]
|
||||
fn test_multivalued_value_index_builder() {
|
||||
let mut multivalued_value_index_builder = MultivaluedIndexBuilder::default();
|
||||
@@ -149,17 +192,15 @@ mod tests {
|
||||
multivalued_value_index_builder.record_value();
|
||||
multivalued_value_index_builder.record_row(2u32);
|
||||
multivalued_value_index_builder.record_value();
|
||||
assert_eq!(
|
||||
multivalued_value_index_builder.finish(4u32).to_vec(),
|
||||
vec![0, 0, 2, 3, 3]
|
||||
);
|
||||
multivalued_value_index_builder.reset();
|
||||
multivalued_value_index_builder.record_row(2u32);
|
||||
multivalued_value_index_builder.record_value();
|
||||
multivalued_value_index_builder.record_value();
|
||||
assert_eq!(
|
||||
multivalued_value_index_builder.finish(4u32).to_vec(),
|
||||
vec![0, 0, 0, 2, 2]
|
||||
);
|
||||
let SerializableMultivalueIndex {
|
||||
doc_ids_with_values,
|
||||
start_offsets,
|
||||
} = multivalued_value_index_builder.finish(4u32);
|
||||
assert_eq!(doc_ids_with_values.num_rows, 4u32);
|
||||
let doc_ids_with_values: Vec<u32> =
|
||||
doc_ids_with_values.non_null_row_ids.boxed_iter().collect();
|
||||
assert_eq!(&doc_ids_with_values, &[1u32, 2u32]);
|
||||
let start_offsets: Vec<u32> = start_offsets.boxed_iter().collect();
|
||||
assert_eq!(&start_offsets[..], &[0, 2, 3]);
|
||||
}
|
||||
}
|
||||
|
||||
183
columnar/src/compat_tests.rs
Normal file
183
columnar/src/compat_tests.rs
Normal file
@@ -0,0 +1,183 @@
|
||||
use std::path::PathBuf;
|
||||
|
||||
use itertools::Itertools;
|
||||
|
||||
use crate::{
|
||||
merge_columnar, Cardinality, Column, ColumnarReader, DynamicColumn, StackMergeOrder,
|
||||
CURRENT_VERSION,
|
||||
};
|
||||
|
||||
const NUM_DOCS: u32 = u16::MAX as u32;
|
||||
|
||||
fn generate_columnar(num_docs: u32, value_offset: u64) -> Vec<u8> {
|
||||
use crate::ColumnarWriter;
|
||||
|
||||
let mut columnar_writer = ColumnarWriter::default();
|
||||
|
||||
for i in 0..num_docs {
|
||||
if i % 100 == 0 {
|
||||
columnar_writer.record_numerical(i, "sparse", value_offset + i as u64);
|
||||
}
|
||||
if i % 5 == 0 {
|
||||
columnar_writer.record_numerical(i, "dense", value_offset + i as u64);
|
||||
}
|
||||
columnar_writer.record_numerical(i, "full", value_offset + i as u64);
|
||||
columnar_writer.record_numerical(i, "multi", value_offset + i as u64);
|
||||
columnar_writer.record_numerical(i, "multi", value_offset + i as u64);
|
||||
}
|
||||
|
||||
let mut wrt: Vec<u8> = Vec::new();
|
||||
columnar_writer.serialize(num_docs, &mut wrt).unwrap();
|
||||
|
||||
wrt
|
||||
}
|
||||
|
||||
#[test]
/// Writes a columnar for the CURRENT_VERSION to disk.
///
/// Nothing is written if the file for that version already exists.
fn create_format() {
    let file_path = path_for_version(&CURRENT_VERSION.to_string());
    if !PathBuf::from(&file_path).exists() {
        std::fs::write(file_path, generate_columnar(NUM_DOCS, 0)).unwrap();
    }
}
|
||||
|
||||
/// Returns the relative path of the columnar fixture file for the given format `version`.
fn path_for_version(version: &str) -> String {
    ["./compat_tests_data/", version, ".columnar"].concat()
}
|
||||
|
||||
#[test]
/// Runs the compatibility checks against the columnar file stored for "v1".
fn test_format_v1() {
    test_format(&path_for_version("v1"));
}
|
||||
|
||||
#[test]
/// Runs the compatibility checks against the columnar file stored for "v2".
fn test_format_v2() {
    test_format(&path_for_version("v2"));
}
|
||||
|
||||
fn test_format(path: &str) {
|
||||
let file_content = std::fs::read(path).unwrap();
|
||||
let reader = ColumnarReader::open(file_content).unwrap();
|
||||
|
||||
check_columns(&reader);
|
||||
|
||||
// Test merge
|
||||
let reader2 = ColumnarReader::open(generate_columnar(NUM_DOCS, NUM_DOCS as u64)).unwrap();
|
||||
let columnar_readers = vec![&reader, &reader2];
|
||||
let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
|
||||
let mut out = Vec::new();
|
||||
merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
|
||||
let reader = ColumnarReader::open(out).unwrap();
|
||||
check_columns(&reader);
|
||||
}
|
||||
|
||||
fn check_columns(reader: &ColumnarReader) {
|
||||
let column = open_column(reader, "full");
|
||||
check_column(&column, |doc_id| vec![(doc_id, doc_id as u64).into()]);
|
||||
assert_eq!(column.get_cardinality(), Cardinality::Full);
|
||||
|
||||
let column = open_column(reader, "multi");
|
||||
check_column(&column, |doc_id| {
|
||||
vec![
|
||||
(doc_id * 2, doc_id as u64).into(),
|
||||
(doc_id * 2 + 1, doc_id as u64).into(),
|
||||
]
|
||||
});
|
||||
assert_eq!(column.get_cardinality(), Cardinality::Multivalued);
|
||||
|
||||
let column = open_column(reader, "sparse");
|
||||
check_column(&column, |doc_id| {
|
||||
if doc_id % 100 == 0 {
|
||||
vec![(doc_id / 100, doc_id as u64).into()]
|
||||
} else {
|
||||
vec![]
|
||||
}
|
||||
});
|
||||
assert_eq!(column.get_cardinality(), Cardinality::Optional);
|
||||
|
||||
let column = open_column(reader, "dense");
|
||||
check_column(&column, |doc_id| {
|
||||
if doc_id % 5 == 0 {
|
||||
vec![(doc_id / 5, doc_id as u64).into()]
|
||||
} else {
|
||||
vec![]
|
||||
}
|
||||
});
|
||||
assert_eq!(column.get_cardinality(), Cardinality::Optional);
|
||||
}
|
||||
|
||||
/// An expected column entry: a value together with the row id it is expected at.
struct RowIdAndValue {
    row_id: u32,
    value: u64,
}

// Lets expectations be written as plain `(row_id, value)` tuples.
impl From<(u32, u64)> for RowIdAndValue {
    fn from(pair: (u32, u64)) -> Self {
        RowIdAndValue {
            row_id: pair.0,
            value: pair.1,
        }
    }
}
|
||||
|
||||
/// Checks the content of `column` against `expected` for a handful of docs.
///
/// `expected(doc)` returns the `(row_id, value)` entries the column should hold for `doc`,
/// in order. For each probed doc the first value, all values, the row ids of the doc, and
/// the doc id -> row id conversions for the doc and its successor are compared.
///
/// The probed docs are the first doc, the last two docs and doc 65000. Doc 65000 is probed
/// unconditionally, so callers are expected to pass columns with more than 65000 docs.
fn check_column<F: Fn(u32) -> Vec<RowIdAndValue>>(column: &Column<u64>, expected: F) {
    let num_docs = column.num_docs();
    let test_doc = |doc: u32| {
        // `expected(doc)` is evaluated once and shared by all the checks on `doc`.
        let expected_for_doc = expected(doc);
        let expected_values = expected_for_doc.iter().map(|x| x.value).collect_vec();
        let expected_doc_row_ids = expected_for_doc.iter().map(|x| x.row_id).collect_vec();

        assert_eq!(column.first(doc), expected_values.first().copied());
        assert_eq!(column.values_for_doc(doc).collect_vec(), expected_values);

        let mut row_ids = Vec::new();
        column.row_ids_for_docs(&[doc], &mut vec![], &mut row_ids);
        assert_eq!(row_ids, expected_doc_row_ids);

        // Docid rowid conversion
        // The successor is clamped to the last doc, so the last doc is paired with itself.
        let next_doc = (doc + 1).min(num_docs - 1);
        let mut row_ids = Vec::new();
        column
            .index
            .docids_to_rowids(&[doc, next_doc], &mut vec![], &mut row_ids);
        let expected_rowids = expected_doc_row_ids
            .iter()
            .copied()
            .chain(expected(next_doc).iter().map(|x| x.row_id))
            .collect_vec();
        assert_eq!(row_ids, expected_rowids);

        let rowid_range = column.index.docid_range_to_rowids(doc..next_doc + 1);
        if expected_rowids.is_empty() {
            assert!(rowid_range.is_empty());
        } else {
            assert_eq!(
                rowid_range,
                expected_rowids[0]..expected_rowids.last().unwrap() + 1
            );
        }
    };
    test_doc(0);
    test_doc(num_docs - 1);
    test_doc(num_docs - 2);
    test_doc(65000);
}
|
||||
|
||||
fn open_column(reader: &ColumnarReader, name: &str) -> Column<u64> {
|
||||
let column = reader.read_columns(name).unwrap()[0]
|
||||
.open()
|
||||
.unwrap()
|
||||
.coerce_numerical(crate::NumericalType::U64)
|
||||
.unwrap();
|
||||
let DynamicColumn::U64(column) = column else {
|
||||
panic!();
|
||||
};
|
||||
column
|
||||
}
|
||||
@@ -8,7 +8,7 @@ use common::{ByteCount, DateTime, HasLen, OwnedBytes};
|
||||
use crate::column::{BytesColumn, Column, StrColumn};
|
||||
use crate::column_values::{monotonic_map_column, StrictlyMonotonicFn};
|
||||
use crate::columnar::ColumnType;
|
||||
use crate::{Cardinality, ColumnIndex, ColumnValues, NumericalType};
|
||||
use crate::{Cardinality, ColumnIndex, ColumnValues, NumericalType, Version};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub enum DynamicColumn {
|
||||
@@ -232,6 +232,7 @@ static_dynamic_conversions!(Column<Ipv6Addr>, IpAddr);
|
||||
pub struct DynamicColumnHandle {
|
||||
pub(crate) file_slice: FileSlice,
|
||||
pub(crate) column_type: ColumnType,
|
||||
pub(crate) format_version: Version,
|
||||
}
|
||||
|
||||
impl DynamicColumnHandle {
|
||||
@@ -260,11 +261,15 @@ impl DynamicColumnHandle {
|
||||
let column_bytes = self.file_slice.read_bytes()?;
|
||||
match self.column_type {
|
||||
ColumnType::Str | ColumnType::Bytes => {
|
||||
let column: BytesColumn = crate::column::open_column_bytes(column_bytes)?;
|
||||
let column: BytesColumn =
|
||||
crate::column::open_column_bytes(column_bytes, self.format_version)?;
|
||||
Ok(Some(column.term_ord_column))
|
||||
}
|
||||
ColumnType::IpAddr => {
|
||||
let column = crate::column::open_column_u128_as_compact_u64(column_bytes)?;
|
||||
let column = crate::column::open_column_u128_as_compact_u64(
|
||||
column_bytes,
|
||||
self.format_version,
|
||||
)?;
|
||||
Ok(Some(column))
|
||||
}
|
||||
ColumnType::Bool
|
||||
@@ -272,7 +277,8 @@ impl DynamicColumnHandle {
|
||||
| ColumnType::U64
|
||||
| ColumnType::F64
|
||||
| ColumnType::DateTime => {
|
||||
let column = crate::column::open_column_u64::<u64>(column_bytes)?;
|
||||
let column =
|
||||
crate::column::open_column_u64::<u64>(column_bytes, self.format_version)?;
|
||||
Ok(Some(column))
|
||||
}
|
||||
}
|
||||
@@ -280,15 +286,31 @@ impl DynamicColumnHandle {
|
||||
|
||||
fn open_internal(&self, column_bytes: OwnedBytes) -> io::Result<DynamicColumn> {
|
||||
let dynamic_column: DynamicColumn = match self.column_type {
|
||||
ColumnType::Bytes => crate::column::open_column_bytes(column_bytes)?.into(),
|
||||
ColumnType::Str => crate::column::open_column_str(column_bytes)?.into(),
|
||||
ColumnType::I64 => crate::column::open_column_u64::<i64>(column_bytes)?.into(),
|
||||
ColumnType::U64 => crate::column::open_column_u64::<u64>(column_bytes)?.into(),
|
||||
ColumnType::F64 => crate::column::open_column_u64::<f64>(column_bytes)?.into(),
|
||||
ColumnType::Bool => crate::column::open_column_u64::<bool>(column_bytes)?.into(),
|
||||
ColumnType::IpAddr => crate::column::open_column_u128::<Ipv6Addr>(column_bytes)?.into(),
|
||||
ColumnType::Bytes => {
|
||||
crate::column::open_column_bytes(column_bytes, self.format_version)?.into()
|
||||
}
|
||||
ColumnType::Str => {
|
||||
crate::column::open_column_str(column_bytes, self.format_version)?.into()
|
||||
}
|
||||
ColumnType::I64 => {
|
||||
crate::column::open_column_u64::<i64>(column_bytes, self.format_version)?.into()
|
||||
}
|
||||
ColumnType::U64 => {
|
||||
crate::column::open_column_u64::<u64>(column_bytes, self.format_version)?.into()
|
||||
}
|
||||
ColumnType::F64 => {
|
||||
crate::column::open_column_u64::<f64>(column_bytes, self.format_version)?.into()
|
||||
}
|
||||
ColumnType::Bool => {
|
||||
crate::column::open_column_u64::<bool>(column_bytes, self.format_version)?.into()
|
||||
}
|
||||
ColumnType::IpAddr => {
|
||||
crate::column::open_column_u128::<Ipv6Addr>(column_bytes, self.format_version)?
|
||||
.into()
|
||||
}
|
||||
ColumnType::DateTime => {
|
||||
crate::column::open_column_u64::<DateTime>(column_bytes)?.into()
|
||||
crate::column::open_column_u64::<DateTime>(column_bytes, self.format_version)?
|
||||
.into()
|
||||
}
|
||||
};
|
||||
Ok(dynamic_column)
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::{ColumnValues, RowId};
|
||||
|
||||
pub trait Iterable<T = u64> {
|
||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_>;
|
||||
@@ -17,3 +20,9 @@ where Range<T>: Iterator<Item = T>
|
||||
Box::new(self.clone())
|
||||
}
|
||||
}
|
||||
|
||||
/// Lets a shared column of row ids be consumed as an `Iterable` of `u64`.
impl Iterable for Arc<dyn crate::ColumnValues<RowId>> {
    /// Yields every row id of the column, widened to `u64`.
    fn boxed_iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
        let widened = self.iter().map(u64::from);
        Box::new(widened)
    }
}
|
||||
|
||||
@@ -48,7 +48,7 @@ pub use column_values::{
|
||||
};
|
||||
pub use columnar::{
|
||||
merge_columnar, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType,
|
||||
MergeRowOrder, ShuffleMergeOrder, StackMergeOrder,
|
||||
MergeRowOrder, ShuffleMergeOrder, StackMergeOrder, Version, CURRENT_VERSION,
|
||||
};
|
||||
use sstable::VoidSSTable;
|
||||
pub use value::{NumericalType, NumericalValue};
|
||||
@@ -131,3 +131,6 @@ impl Cardinality {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
#[cfg(test)]
|
||||
mod compat_tests;
|
||||
|
||||
@@ -21,7 +21,7 @@ fn test_dataframe_writer_str() {
|
||||
dataframe_writer.record_str(1u32, "my_string", "hello");
|
||||
dataframe_writer.record_str(3u32, "my_string", "helloeee");
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer.serialize(5, None, &mut buffer).unwrap();
|
||||
dataframe_writer.serialize(5, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar.num_columns(), 1);
|
||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
|
||||
@@ -35,7 +35,7 @@ fn test_dataframe_writer_bytes() {
|
||||
dataframe_writer.record_bytes(1u32, "my_string", b"hello");
|
||||
dataframe_writer.record_bytes(3u32, "my_string", b"helloeee");
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer.serialize(5, None, &mut buffer).unwrap();
|
||||
dataframe_writer.serialize(5, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar.num_columns(), 1);
|
||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
|
||||
@@ -49,7 +49,7 @@ fn test_dataframe_writer_bool() {
|
||||
dataframe_writer.record_bool(1u32, "bool.value", false);
|
||||
dataframe_writer.record_bool(3u32, "bool.value", true);
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer.serialize(5, None, &mut buffer).unwrap();
|
||||
dataframe_writer.serialize(5, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar.num_columns(), 1);
|
||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("bool.value").unwrap();
|
||||
@@ -74,12 +74,12 @@ fn test_dataframe_writer_u64_multivalued() {
|
||||
dataframe_writer.record_numerical(6u32, "divisor", 2u64);
|
||||
dataframe_writer.record_numerical(6u32, "divisor", 3u64);
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer.serialize(7, None, &mut buffer).unwrap();
|
||||
dataframe_writer.serialize(7, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar.num_columns(), 1);
|
||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("divisor").unwrap();
|
||||
assert_eq!(cols.len(), 1);
|
||||
assert_eq!(cols[0].num_bytes(), 29);
|
||||
assert_eq!(cols[0].num_bytes(), 50);
|
||||
let dyn_i64_col = cols[0].open().unwrap();
|
||||
let DynamicColumn::I64(divisor_col) = dyn_i64_col else {
|
||||
panic!();
|
||||
@@ -97,7 +97,7 @@ fn test_dataframe_writer_ip_addr() {
|
||||
dataframe_writer.record_ip_addr(1, "ip_addr", Ipv6Addr::from_u128(1001));
|
||||
dataframe_writer.record_ip_addr(3, "ip_addr", Ipv6Addr::from_u128(1050));
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer.serialize(5, None, &mut buffer).unwrap();
|
||||
dataframe_writer.serialize(5, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar.num_columns(), 1);
|
||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("ip_addr").unwrap();
|
||||
@@ -128,7 +128,7 @@ fn test_dataframe_writer_numerical() {
|
||||
dataframe_writer.record_numerical(2u32, "srical.value", NumericalValue::U64(13u64));
|
||||
dataframe_writer.record_numerical(4u32, "srical.value", NumericalValue::U64(15u64));
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer.serialize(6, None, &mut buffer).unwrap();
|
||||
dataframe_writer.serialize(6, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar.num_columns(), 1);
|
||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("srical.value").unwrap();
|
||||
@@ -153,46 +153,6 @@ fn test_dataframe_writer_numerical() {
|
||||
assert_eq!(column_i64.first(6), None); //< we can change the spec for that one.
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dataframe_sort_by_full() {
|
||||
let mut dataframe_writer = ColumnarWriter::default();
|
||||
dataframe_writer.record_numerical(0u32, "value", NumericalValue::U64(1));
|
||||
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(2));
|
||||
let data = dataframe_writer.sort_order("value", 2, false);
|
||||
assert_eq!(data, vec![0, 1]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dataframe_sort_by_opt() {
|
||||
let mut dataframe_writer = ColumnarWriter::default();
|
||||
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(3));
|
||||
dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(2));
|
||||
let data = dataframe_writer.sort_order("value", 5, false);
|
||||
// 0, 2, 4 is 0.0
|
||||
assert_eq!(data, vec![0, 2, 4, 3, 1]);
|
||||
let data = dataframe_writer.sort_order("value", 5, true);
|
||||
assert_eq!(
|
||||
data,
|
||||
vec![4, 2, 0, 3, 1].into_iter().rev().collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dataframe_sort_by_multi() {
|
||||
let mut dataframe_writer = ColumnarWriter::default();
|
||||
// valid for sort
|
||||
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(2));
|
||||
// those are ignored for sort
|
||||
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(4));
|
||||
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(4));
|
||||
// valid for sort
|
||||
dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(3));
|
||||
// ignored, would change sort order
|
||||
dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(1));
|
||||
let data = dataframe_writer.sort_order("value", 4, false);
|
||||
assert_eq!(data, vec![0, 2, 1, 3]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dictionary_encoded_str() {
|
||||
let mut buffer = Vec::new();
|
||||
@@ -201,7 +161,7 @@ fn test_dictionary_encoded_str() {
|
||||
columnar_writer.record_str(3, "my.column", "c");
|
||||
columnar_writer.record_str(3, "my.column2", "different_column!");
|
||||
columnar_writer.record_str(4, "my.column", "b");
|
||||
columnar_writer.serialize(5, None, &mut buffer).unwrap();
|
||||
columnar_writer.serialize(5, &mut buffer).unwrap();
|
||||
let columnar_reader = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar_reader.num_columns(), 2);
|
||||
let col_handles = columnar_reader.read_columns("my.column").unwrap();
|
||||
@@ -235,7 +195,7 @@ fn test_dictionary_encoded_bytes() {
|
||||
columnar_writer.record_bytes(3, "my.column", b"c");
|
||||
columnar_writer.record_bytes(3, "my.column2", b"different_column!");
|
||||
columnar_writer.record_bytes(4, "my.column", b"b");
|
||||
columnar_writer.serialize(5, None, &mut buffer).unwrap();
|
||||
columnar_writer.serialize(5, &mut buffer).unwrap();
|
||||
let columnar_reader = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar_reader.num_columns(), 2);
|
||||
let col_handles = columnar_reader.read_columns("my.column").unwrap();
|
||||
@@ -344,7 +304,7 @@ fn column_value_strategy() -> impl Strategy<Value = ColumnValue> {
|
||||
ip_addr_byte
|
||||
))),
|
||||
1 => any::<bool>().prop_map(ColumnValue::Bool),
|
||||
1 => (0_679_723_993i64..1_679_723_995i64)
|
||||
1 => (679_723_993i64..1_679_723_995i64)
|
||||
.prop_map(|val| { ColumnValue::DateTime(DateTime::from_timestamp_secs(val)) })
|
||||
]
|
||||
}
|
||||
@@ -369,26 +329,12 @@ fn columnar_docs_strategy() -> impl Strategy<Value = Vec<Vec<(&'static str, Colu
|
||||
.prop_flat_map(|num_docs| proptest::collection::vec(doc_strategy(), num_docs))
|
||||
}
|
||||
|
||||
fn columnar_docs_and_mapping_strategy(
|
||||
) -> impl Strategy<Value = (Vec<Vec<(&'static str, ColumnValue)>>, Vec<RowId>)> {
|
||||
columnar_docs_strategy().prop_flat_map(|docs| {
|
||||
permutation_strategy(docs.len()).prop_map(move |permutation| (docs.clone(), permutation))
|
||||
})
|
||||
}
|
||||
|
||||
fn permutation_strategy(n: usize) -> impl Strategy<Value = Vec<RowId>> {
|
||||
Just((0u32..n as RowId).collect()).prop_shuffle()
|
||||
}
|
||||
|
||||
fn permutation_and_subset_strategy(n: usize) -> impl Strategy<Value = Vec<usize>> {
|
||||
let vals: Vec<usize> = (0..n).collect();
|
||||
subsequence(vals, 0..=n).prop_shuffle()
|
||||
}
|
||||
|
||||
fn build_columnar_with_mapping(
|
||||
docs: &[Vec<(&'static str, ColumnValue)>],
|
||||
old_to_new_row_ids_opt: Option<&[RowId]>,
|
||||
) -> ColumnarReader {
|
||||
fn build_columnar_with_mapping(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader {
|
||||
let num_docs = docs.len() as u32;
|
||||
let mut buffer = Vec::new();
|
||||
let mut columnar_writer = ColumnarWriter::default();
|
||||
@@ -416,15 +362,13 @@ fn build_columnar_with_mapping(
|
||||
}
|
||||
}
|
||||
}
|
||||
columnar_writer
|
||||
.serialize(num_docs, old_to_new_row_ids_opt, &mut buffer)
|
||||
.unwrap();
|
||||
columnar_writer.serialize(num_docs, &mut buffer).unwrap();
|
||||
|
||||
ColumnarReader::open(buffer).unwrap()
|
||||
}
|
||||
|
||||
fn build_columnar(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader {
|
||||
build_columnar_with_mapping(docs, None)
|
||||
build_columnar_with_mapping(docs)
|
||||
}
|
||||
|
||||
fn assert_columnar_eq_strict(left: &ColumnarReader, right: &ColumnarReader) {
|
||||
@@ -448,6 +392,7 @@ fn assert_columnar_eq(
|
||||
}
|
||||
}
|
||||
|
||||
#[track_caller]
|
||||
fn assert_column_eq<T: Copy + PartialOrd + Debug + Send + Sync + 'static>(
|
||||
left: &Column<T>,
|
||||
right: &Column<T>,
|
||||
@@ -683,54 +628,6 @@ proptest! {
|
||||
}
|
||||
}
|
||||
|
||||
// Same as `test_single_columnar_builder_proptest` but with a shuffling mapping.
|
||||
proptest! {
|
||||
#![proptest_config(ProptestConfig::with_cases(500))]
|
||||
#[test]
|
||||
fn test_single_columnar_builder_with_shuffle_proptest((docs, mapping) in columnar_docs_and_mapping_strategy()) {
|
||||
let columnar = build_columnar_with_mapping(&docs[..], Some(&mapping));
|
||||
assert_eq!(columnar.num_rows() as usize, docs.len());
|
||||
let mut expected_columns: HashMap<(&str, ColumnTypeCategory), HashMap<u32, Vec<&ColumnValue>> > = Default::default();
|
||||
for (doc_id, doc_vals) in docs.iter().enumerate() {
|
||||
for (col_name, col_val) in doc_vals {
|
||||
expected_columns
|
||||
.entry((col_name, col_val.column_type_category()))
|
||||
.or_default()
|
||||
.entry(mapping[doc_id])
|
||||
.or_default()
|
||||
.push(col_val);
|
||||
}
|
||||
}
|
||||
let column_list = columnar.list_columns().unwrap();
|
||||
assert_eq!(expected_columns.len(), column_list.len());
|
||||
for (column_name, column) in column_list {
|
||||
let dynamic_column = column.open().unwrap();
|
||||
let col_category: ColumnTypeCategory = dynamic_column.column_type().into();
|
||||
let expected_col_values: &HashMap<u32, Vec<&ColumnValue>> = expected_columns.get(&(column_name.as_str(), col_category)).unwrap();
|
||||
for _doc_id in 0..columnar.num_rows() {
|
||||
match &dynamic_column {
|
||||
DynamicColumn::Bool(col) =>
|
||||
assert_column_values(col, expected_col_values),
|
||||
DynamicColumn::I64(col) =>
|
||||
assert_column_values(col, expected_col_values),
|
||||
DynamicColumn::U64(col) =>
|
||||
assert_column_values(col, expected_col_values),
|
||||
DynamicColumn::F64(col) =>
|
||||
assert_column_values(col, expected_col_values),
|
||||
DynamicColumn::IpAddr(col) =>
|
||||
assert_column_values(col, expected_col_values),
|
||||
DynamicColumn::DateTime(col) =>
|
||||
assert_column_values(col, expected_col_values),
|
||||
DynamicColumn::Bytes(col) =>
|
||||
assert_bytes_column_values(col, expected_col_values, false),
|
||||
DynamicColumn::Str(col) =>
|
||||
assert_bytes_column_values(col, expected_col_values, true),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This tests create 2 or 3 random small columnar and attempts to merge them.
|
||||
// It compares the resulting merged dataframe with what would have been obtained by building the
|
||||
// dataframe from the concatenated rows to begin with.
|
||||
@@ -844,24 +741,68 @@ fn columnar_docs_and_remap(
|
||||
proptest! {
|
||||
#![proptest_config(ProptestConfig::with_cases(1000))]
|
||||
#[test]
|
||||
fn test_columnar_merge_and_remap_proptest((columnar_docs, shuffle_merge_order) in columnar_docs_and_remap()) {
|
||||
let shuffled_rows: Vec<Vec<(&'static str, ColumnValue)>> = shuffle_merge_order.iter()
|
||||
.map(|row_addr| columnar_docs[row_addr.segment_ord as usize][row_addr.row_id as usize].clone())
|
||||
.collect();
|
||||
let expected_merged_columnar = build_columnar(&shuffled_rows[..]);
|
||||
let columnar_readers: Vec<ColumnarReader> = columnar_docs.iter()
|
||||
.map(|docs| build_columnar(&docs[..]))
|
||||
.collect::<Vec<_>>();
|
||||
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let segment_num_rows: Vec<RowId> = columnar_docs.iter().map(|docs| docs.len() as RowId).collect();
|
||||
let shuffle_merge_order = ShuffleMergeOrder::for_test(&segment_num_rows, shuffle_merge_order);
|
||||
crate::merge_columnar(&columnar_readers_arr[..], &[], shuffle_merge_order.into(), &mut output).unwrap();
|
||||
let merged_columnar = ColumnarReader::open(output).unwrap();
|
||||
assert_columnar_eq(&merged_columnar, &expected_merged_columnar, true);
|
||||
fn test_columnar_merge_and_remap_proptest((columnar_docs, shuffle_merge_order) in
|
||||
columnar_docs_and_remap()) {
|
||||
test_columnar_merge_and_remap(columnar_docs, shuffle_merge_order);
|
||||
}
|
||||
}
|
||||
|
||||
fn test_columnar_merge_and_remap(
|
||||
columnar_docs: Vec<Vec<Vec<(&'static str, ColumnValue)>>>,
|
||||
shuffle_merge_order: Vec<RowAddr>,
|
||||
) {
|
||||
let shuffled_rows: Vec<Vec<(&'static str, ColumnValue)>> = shuffle_merge_order
|
||||
.iter()
|
||||
.map(|row_addr| {
|
||||
columnar_docs[row_addr.segment_ord as usize][row_addr.row_id as usize].clone()
|
||||
})
|
||||
.collect();
|
||||
let expected_merged_columnar = build_columnar(&shuffled_rows[..]);
|
||||
let columnar_readers: Vec<ColumnarReader> = columnar_docs
|
||||
.iter()
|
||||
.map(|docs| build_columnar(&docs[..]))
|
||||
.collect::<Vec<_>>();
|
||||
let columnar_readers_ref: Vec<&ColumnarReader> = columnar_readers.iter().collect();
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let segment_num_rows: Vec<RowId> = columnar_docs
|
||||
.iter()
|
||||
.map(|docs| docs.len() as RowId)
|
||||
.collect();
|
||||
let shuffle_merge_order = ShuffleMergeOrder::for_test(&segment_num_rows, shuffle_merge_order);
|
||||
crate::merge_columnar(
|
||||
&columnar_readers_ref[..],
|
||||
&[],
|
||||
shuffle_merge_order.into(),
|
||||
&mut output,
|
||||
)
|
||||
.unwrap();
|
||||
let merged_columnar = ColumnarReader::open(output).unwrap();
|
||||
assert_columnar_eq(&merged_columnar, &expected_merged_columnar, true);
|
||||
}
|
||||
|
||||
/// Regression case: a single segment whose first doc holds the value `0` twice in column `c1`
/// and whose second doc is empty, merged with the two rows swapped.
#[test]
fn test_columnar_merge_and_remap_bug_1() {
    let zero_in_c1 = || ("c1", ColumnValue::Numerical(NumericalValue::U64(0)));
    let first_doc = vec![zero_in_c1(), zero_in_c1()];
    let second_doc = vec![];
    let columnar_docs = vec![vec![first_doc, second_doc]];

    // Row 1 first, then row 0, both taken from segment 0.
    let shuffle_merge_order: Vec<RowAddr> = [1, 0]
        .into_iter()
        .map(|row_id| RowAddr {
            segment_ord: 0,
            row_id,
        })
        .collect();

    test_columnar_merge_and_remap(columnar_docs, shuffle_merge_order);
}
|
||||
|
||||
#[test]
|
||||
fn test_columnar_merge_empty() {
|
||||
let columnar_reader_1 = build_columnar(&[]);
|
||||
|
||||
@@ -9,7 +9,6 @@ documentation = "https://docs.rs/tantivy_common/"
|
||||
homepage = "https://github.com/quickwit-oss/tantivy"
|
||||
repository = "https://github.com/quickwit-oss/tantivy"
|
||||
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
@@ -20,5 +19,7 @@ time = { version = "0.3.10", features = ["serde-well-known"] }
|
||||
serde = { version = "1.0.136", features = ["derive"] }
|
||||
|
||||
[dev-dependencies]
|
||||
binggan = "0.8.1"
|
||||
proptest = "1.0.0"
|
||||
rand = "0.8.4"
|
||||
|
||||
|
||||
@@ -1,39 +1,64 @@
|
||||
#![feature(test)]
|
||||
use binggan::{black_box, BenchRunner};
|
||||
use rand::seq::IteratorRandom;
|
||||
use rand::thread_rng;
|
||||
use tantivy_common::{serialize_vint_u32, BitSet, TinySet};
|
||||
|
||||
extern crate test;
|
||||
fn bench_vint() {
|
||||
let mut runner = BenchRunner::new();
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use rand::seq::IteratorRandom;
|
||||
use rand::thread_rng;
|
||||
use tantivy_common::serialize_vint_u32;
|
||||
use test::Bencher;
|
||||
let vals: Vec<u32> = (0..20_000).collect();
|
||||
runner.bench_function("bench_vint", move |_| {
|
||||
let mut out = 0u64;
|
||||
for val in vals.iter().cloned() {
|
||||
let mut buf = [0u8; 8];
|
||||
serialize_vint_u32(val, &mut buf);
|
||||
out += u64::from(buf[0]);
|
||||
}
|
||||
black_box(out);
|
||||
});
|
||||
|
||||
#[bench]
|
||||
fn bench_vint(b: &mut Bencher) {
|
||||
let vals: Vec<u32> = (0..20_000).collect();
|
||||
b.iter(|| {
|
||||
let mut out = 0u64;
|
||||
for val in vals.iter().cloned() {
|
||||
let mut buf = [0u8; 8];
|
||||
serialize_vint_u32(val, &mut buf);
|
||||
out += u64::from(buf[0]);
|
||||
}
|
||||
out
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_vint_rand(b: &mut Bencher) {
|
||||
let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);
|
||||
b.iter(|| {
|
||||
let mut out = 0u64;
|
||||
for val in vals.iter().cloned() {
|
||||
let mut buf = [0u8; 8];
|
||||
serialize_vint_u32(val, &mut buf);
|
||||
out += u64::from(buf[0]);
|
||||
}
|
||||
out
|
||||
});
|
||||
}
|
||||
let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);
|
||||
runner.bench_function("bench_vint_rand", move |_| {
|
||||
let mut out = 0u64;
|
||||
for val in vals.iter().cloned() {
|
||||
let mut buf = [0u8; 8];
|
||||
serialize_vint_u32(val, &mut buf);
|
||||
out += u64::from(buf[0]);
|
||||
}
|
||||
black_box(out);
|
||||
});
|
||||
}
|
||||
|
||||
fn bench_bitset() {
|
||||
let mut runner = BenchRunner::new();
|
||||
|
||||
runner.bench_function("bench_tinyset_pop", move |_| {
|
||||
let mut tinyset = TinySet::singleton(black_box(31u32));
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
black_box(tinyset);
|
||||
});
|
||||
|
||||
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
|
||||
runner.bench_function("bench_tinyset_sum", move |_| {
|
||||
assert_eq!(black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
|
||||
});
|
||||
|
||||
let v = [10u32, 14u32, 21u32];
|
||||
runner.bench_function("bench_tinyarr_sum", move |_| {
|
||||
black_box(v.iter().cloned().sum::<u32>());
|
||||
});
|
||||
|
||||
runner.bench_function("bench_bitset_initialize", move |_| {
|
||||
black_box(BitSet::with_max_value(1_000_000));
|
||||
});
|
||||
}
|
||||
|
||||
fn main() {
|
||||
bench_vint();
|
||||
bench_bitset();
|
||||
}
|
||||
|
||||
@@ -696,43 +696,3 @@ mod tests {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use test;
|
||||
|
||||
use super::{BitSet, TinySet};
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyset_pop(b: &mut test::Bencher) {
|
||||
b.iter(|| {
|
||||
let mut tinyset = TinySet::singleton(test::black_box(31u32));
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyset_sum(b: &mut test::Bencher) {
|
||||
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
|
||||
b.iter(|| {
|
||||
assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyarr_sum(b: &mut test::Bencher) {
|
||||
let v = [10u32, 14u32, 21u32];
|
||||
b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_bitset_initialize(b: &mut test::Bencher) {
|
||||
b.iter(|| BitSet::with_max_value(1_000_000));
|
||||
}
|
||||
}
|
||||
|
||||
BIN
doc/assets/images/paradedb.png
Normal file
BIN
doc/assets/images/paradedb.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 30 KiB |
@@ -7,6 +7,11 @@
|
||||
- [Other](#other)
|
||||
- [Usage](#usage)
|
||||
|
||||
# Index Sorting has been removed!
|
||||
More information here:
|
||||
|
||||
https://github.com/quickwit-oss/tantivy/issues/2352
|
||||
|
||||
# Index Sorting
|
||||
|
||||
Tantivy allows you to sort the index according to a property.
|
||||
|
||||
@@ -19,14 +19,13 @@ use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
|
||||
use tempfile::TempDir;
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// Normally you would use `MMapDirectory` instead to persist data on disk.
|
||||
// https://docs.rs/tantivy/latest/tantivy/directory/struct.MmapDirectory.html
|
||||
// But for this example, we will use a temporary directory `TempDir`.
|
||||
// Let's create a temporary directory for the
|
||||
// sake of this example
|
||||
let index_path = TempDir::new()?;
|
||||
|
||||
// # Defining the schema
|
||||
//
|
||||
// The Tantivy index requires a schema.
|
||||
// The Tantivy index requires a very strict schema.
|
||||
// The schema declares which fields are in the index,
|
||||
// and for each field, its type and "the way it should
|
||||
// be indexed".
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
use std::ops::Bound;
|
||||
|
||||
// # Searching a range on an indexed int field.
|
||||
//
|
||||
// Below is an example of creating an indexed integer field in your schema
|
||||
@@ -5,7 +7,7 @@
|
||||
use tantivy::collector::Count;
|
||||
use tantivy::query::RangeQuery;
|
||||
use tantivy::schema::{Schema, INDEXED};
|
||||
use tantivy::{doc, Index, IndexWriter, Result};
|
||||
use tantivy::{doc, Index, IndexWriter, Result, Term};
|
||||
|
||||
fn main() -> Result<()> {
|
||||
// For the sake of simplicity, this schema will only have 1 field
|
||||
@@ -27,7 +29,10 @@ fn main() -> Result<()> {
|
||||
reader.reload()?;
|
||||
let searcher = reader.searcher();
|
||||
// The end is excluded i.e. here we are searching up to 1969
|
||||
let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960..1970);
|
||||
let docs_in_the_sixties = RangeQuery::new(
|
||||
Bound::Included(Term::from_field_u64(year_field, 1960)),
|
||||
Bound::Excluded(Term::from_field_u64(year_field, 1970)),
|
||||
);
|
||||
// Uses a Count collector to sum the total number of docs in the range
|
||||
let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
|
||||
assert_eq!(num_60s_books, 10);
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use std::borrow::Cow;
|
||||
use std::iter::once;
|
||||
|
||||
use nom::branch::alt;
|
||||
@@ -19,7 +20,7 @@ use crate::Occur;
|
||||
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to
|
||||
// special characters.
|
||||
const SPECIAL_CHARS: &[char] = &[
|
||||
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '!', '\\', '*', ' ',
|
||||
'+', '^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '!', '\\', '*', ' ',
|
||||
];
|
||||
|
||||
/// consume a field name followed by colon. Return the field name with escape sequence
|
||||
@@ -41,36 +42,92 @@ fn field_name(inp: &str) -> IResult<&str, String> {
|
||||
)(inp)
|
||||
}
|
||||
|
||||
/// Characters that must be escaped with a backslash to appear inside a word.
const ESCAPE_IN_WORD: &[char] = &['^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '\\'];

/// Resolves the backslash escape sequences of `source`.
///
/// - A backslash followed by a character that requires escaping (whitespace, `-`, or any of
///   `ESCAPE_IN_WORD`) is replaced by that character alone.
/// - A backslash followed by any other character is not an escape sequence: both the
///   backslash and the character are kept.
/// - A dangling backslash at the very end of `source` escapes nothing and is kept as is,
///   instead of being silently dropped.
fn interpret_escape(source: &str) -> String {
    let require_escape = |c: char| c.is_whitespace() || ESCAPE_IN_WORD.contains(&c) || c == '-';

    // The result is never longer than the input.
    let mut res = String::with_capacity(source.len());
    let mut chars = source.chars();
    while let Some(c) = chars.next() {
        if c != '\\' {
            res.push(c);
            continue;
        }
        match chars.next() {
            Some(escaped) => {
                if !require_escape(escaped) {
                    // we re-add the escape sequence
                    res.push('\\');
                }
                res.push(escaped);
            }
            // Nothing left to escape: keep the backslash literally.
            None => res.push('\\'),
        }
    }
    res
}
|
||||
|
||||
/// Consume a word outside of any context.
|
||||
// TODO should support escape sequences
|
||||
fn word(inp: &str) -> IResult<&str, &str> {
|
||||
fn word(inp: &str) -> IResult<&str, Cow<str>> {
|
||||
map_res(
|
||||
recognize(tuple((
|
||||
satisfy(|c| {
|
||||
!c.is_whitespace()
|
||||
&& !['-', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
|
||||
}),
|
||||
many0(satisfy(|c: char| {
|
||||
!c.is_whitespace() && ![':', '^', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
|
||||
})),
|
||||
alt((
|
||||
preceded(char('\\'), anychar),
|
||||
satisfy(|c| !c.is_whitespace() && !ESCAPE_IN_WORD.contains(&c) && c != '-'),
|
||||
)),
|
||||
many0(alt((
|
||||
preceded(char('\\'), anychar),
|
||||
satisfy(|c: char| !c.is_whitespace() && !ESCAPE_IN_WORD.contains(&c)),
|
||||
))),
|
||||
))),
|
||||
|s| match s {
|
||||
"OR" | "AND" | "NOT" | "IN" => Err(Error::new(inp, ErrorKind::Tag)),
|
||||
_ => Ok(s),
|
||||
s if s.contains('\\') => Ok(Cow::Owned(interpret_escape(s))),
|
||||
s => Ok(Cow::Borrowed(s)),
|
||||
},
|
||||
)(inp)
|
||||
}
|
||||
|
||||
fn word_infallible(delimiter: &str) -> impl Fn(&str) -> JResult<&str, Option<&str>> + '_ {
|
||||
|inp| {
|
||||
opt_i_err(
|
||||
preceded(
|
||||
multispace0,
|
||||
recognize(many1(satisfy(|c| {
|
||||
!c.is_whitespace() && !delimiter.contains(c)
|
||||
}))),
|
||||
fn word_infallible(
|
||||
delimiter: &str,
|
||||
emit_error: bool,
|
||||
) -> impl Fn(&str) -> JResult<&str, Option<Cow<str>>> + '_ {
|
||||
// `emit_error` is set when an unescaped `:` inside the word should be reported as an error
|
||||
|
||||
move |inp| {
|
||||
map(
|
||||
opt_i_err(
|
||||
preceded(
|
||||
multispace0,
|
||||
recognize(many1(alt((
|
||||
preceded(char::<&str, _>('\\'), anychar),
|
||||
satisfy(|c| !c.is_whitespace() && !delimiter.contains(c)),
|
||||
)))),
|
||||
),
|
||||
"expected word",
|
||||
),
|
||||
"expected word",
|
||||
|(opt_s, mut errors)| match opt_s {
|
||||
Some(s) => {
|
||||
if emit_error
|
||||
&& (s
|
||||
.as_bytes()
|
||||
.windows(2)
|
||||
.any(|window| window[0] != b'\\' && window[1] == b':')
|
||||
|| s.starts_with(':'))
|
||||
{
|
||||
errors.push(LenientErrorInternal {
|
||||
pos: inp.len(),
|
||||
message: "parsed possible invalid field as term".to_string(),
|
||||
});
|
||||
}
|
||||
if s.contains('\\') {
|
||||
(Some(Cow::Owned(interpret_escape(s))), errors)
|
||||
} else {
|
||||
(Some(Cow::Borrowed(s)), errors)
|
||||
}
|
||||
}
|
||||
None => (None, errors),
|
||||
},
|
||||
)(inp)
|
||||
}
|
||||
}
|
||||
@@ -159,7 +216,7 @@ fn simple_term_infallible(
|
||||
(value((), char('\'')), simple_quotes),
|
||||
),
|
||||
// numbers are parsed with words in this case, as we allow string starting with a -
|
||||
map(word_infallible(delimiter), |(text, errors)| {
|
||||
map(word_infallible(delimiter, true), |(text, errors)| {
|
||||
(text.map(|text| (Delimiter::None, text.to_string())), errors)
|
||||
}),
|
||||
)(inp)
|
||||
@@ -322,15 +379,6 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
|
||||
|((field_name, _, leaf), mut errors)| {
|
||||
(
|
||||
leaf.map(|leaf| {
|
||||
if matches!(&leaf, UserInputLeaf::Literal(literal)
|
||||
if literal.phrase.contains(':') && literal.delimiter == Delimiter::None)
|
||||
&& field_name.is_none()
|
||||
{
|
||||
errors.push(LenientErrorInternal {
|
||||
pos: inp.len(),
|
||||
message: "parsed possible invalid field as term".to_string(),
|
||||
});
|
||||
}
|
||||
if matches!(&leaf, UserInputLeaf::Literal(literal)
|
||||
if literal.phrase == "NOT" && literal.delimiter == Delimiter::None)
|
||||
&& field_name.is_none()
|
||||
@@ -449,20 +497,20 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
|
||||
tuple_infallible((
|
||||
opt_i(anychar),
|
||||
space0_infallible,
|
||||
word_infallible("]}"),
|
||||
word_infallible("]}", false),
|
||||
space1_infallible,
|
||||
opt_i_err(
|
||||
terminated(tag("TO"), alt((value((), multispace1), value((), eof)))),
|
||||
"missing keyword TO",
|
||||
),
|
||||
word_infallible("]}"),
|
||||
word_infallible("]}", false),
|
||||
opt_i_err(one_of("]}"), "missing range delimiter"),
|
||||
)),
|
||||
|(
|
||||
(lower_bound_kind, _multispace0, lower, _multispace1, to, upper, upper_bound_kind),
|
||||
errs,
|
||||
)| {
|
||||
let lower_bound = match (lower_bound_kind, lower) {
|
||||
let lower_bound = match (lower_bound_kind, lower.as_deref()) {
|
||||
(_, Some("*")) => UserInputBound::Unbounded,
|
||||
(_, None) => UserInputBound::Unbounded,
|
||||
// if it is some, TO was actually the bound (i.e. [TO TO something])
|
||||
@@ -471,7 +519,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
|
||||
(Some('{'), Some(bound)) => UserInputBound::Exclusive(bound.to_string()),
|
||||
_ => unreachable!("precondition failed, range did not start with [ or {{"),
|
||||
};
|
||||
let upper_bound = match (upper_bound_kind, upper) {
|
||||
let upper_bound = match (upper_bound_kind, upper.as_deref()) {
|
||||
(_, Some("*")) => UserInputBound::Unbounded,
|
||||
(_, None) => UserInputBound::Unbounded,
|
||||
(Some(']'), Some(bound)) => UserInputBound::Inclusive(bound.to_string()),
|
||||
@@ -488,7 +536,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
|
||||
(
|
||||
(
|
||||
value((), tag(">=")),
|
||||
map(word_infallible(""), |(bound, err)| {
|
||||
map(word_infallible("", false), |(bound, err)| {
|
||||
(
|
||||
(
|
||||
bound
|
||||
@@ -502,7 +550,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
|
||||
),
|
||||
(
|
||||
value((), tag("<=")),
|
||||
map(word_infallible(""), |(bound, err)| {
|
||||
map(word_infallible("", false), |(bound, err)| {
|
||||
(
|
||||
(
|
||||
UserInputBound::Unbounded,
|
||||
@@ -516,7 +564,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
|
||||
),
|
||||
(
|
||||
value((), tag(">")),
|
||||
map(word_infallible(""), |(bound, err)| {
|
||||
map(word_infallible("", false), |(bound, err)| {
|
||||
(
|
||||
(
|
||||
bound
|
||||
@@ -530,7 +578,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
|
||||
),
|
||||
(
|
||||
value((), tag("<")),
|
||||
map(word_infallible(""), |(bound, err)| {
|
||||
map(word_infallible("", false), |(bound, err)| {
|
||||
(
|
||||
(
|
||||
UserInputBound::Unbounded,
|
||||
@@ -1157,6 +1205,12 @@ mod test {
|
||||
test_parse_query_to_ast_helper("weight: <= 70", "\"weight\":{\"*\" TO \"70\"]");
|
||||
|
||||
test_parse_query_to_ast_helper("weight: <= 70.5", "\"weight\":{\"*\" TO \"70.5\"]");
|
||||
|
||||
test_parse_query_to_ast_helper(">a", "{\"a\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper(">=a", "[\"a\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper("<a", "{\"*\" TO \"a\"}");
|
||||
test_parse_query_to_ast_helper("<=a", "{\"*\" TO \"a\"]");
|
||||
test_parse_query_to_ast_helper("<=bsd", "{\"*\" TO \"bsd\"]");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -1590,5 +1644,21 @@ mod test {
|
||||
r#"myfield:'hello\"happy\'tax'"#,
|
||||
r#""myfield":'hello"happy'tax'"#,
|
||||
);
|
||||
// we don't process escape sequence for chars which don't require it
|
||||
test_parse_query_to_ast_helper(r#"abc\*"#, r#"abc\*"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_queries_with_colons() {
|
||||
test_parse_query_to_ast_helper(r#""abc:def""#, r#""abc:def""#);
|
||||
test_parse_query_to_ast_helper(r#"'abc:def'"#, r#"'abc:def'"#);
|
||||
test_parse_query_to_ast_helper(r#"abc\:def"#, r#"abc:def"#);
|
||||
test_parse_query_to_ast_helper(r#""abc\:def""#, r#""abc:def""#);
|
||||
test_parse_query_to_ast_helper(r#"'abc\:def'"#, r#"'abc:def'"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_invalid_field() {
|
||||
test_is_parse_err(r#"!bc:def"#, "!bc:def");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -34,8 +34,9 @@ use super::bucket::{
|
||||
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
|
||||
};
|
||||
use super::metric::{
|
||||
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
|
||||
PercentilesAggregationReq, StatsAggregation, SumAggregation, TopHitsAggregation,
|
||||
AverageAggregation, CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation,
|
||||
MaxAggregation, MinAggregation, PercentilesAggregationReq, StatsAggregation, SumAggregation,
|
||||
TopHitsAggregationReq,
|
||||
};
|
||||
|
||||
/// The top-level aggregation request structure, which contains [`Aggregation`] and their user
|
||||
@@ -146,6 +147,11 @@ pub enum AggregationVariants {
|
||||
/// extracted values.
|
||||
#[serde(rename = "stats")]
|
||||
Stats(StatsAggregation),
|
||||
/// Computes a collection of extended statistics (`min`, `max`, `sum`, `count`, `avg`,
|
||||
/// `sum_of_squares`, `variance`, `variance_sampling`, `std_deviation`,
|
||||
/// `std_deviation_sampling`) over the extracted values.
|
||||
#[serde(rename = "extended_stats")]
|
||||
ExtendedStats(ExtendedStatsAggregation),
|
||||
/// Computes the sum of the extracted values.
|
||||
#[serde(rename = "sum")]
|
||||
Sum(SumAggregation),
|
||||
@@ -154,7 +160,10 @@ pub enum AggregationVariants {
|
||||
Percentiles(PercentilesAggregationReq),
|
||||
/// Finds the top k values matching some order
|
||||
#[serde(rename = "top_hits")]
|
||||
TopHits(TopHitsAggregation),
|
||||
TopHits(TopHitsAggregationReq),
|
||||
/// Computes an estimate of the number of unique values
|
||||
#[serde(rename = "cardinality")]
|
||||
Cardinality(CardinalityAggregationReq),
|
||||
}
|
||||
|
||||
impl AggregationVariants {
|
||||
@@ -170,9 +179,11 @@ impl AggregationVariants {
|
||||
AggregationVariants::Max(max) => vec![max.field_name()],
|
||||
AggregationVariants::Min(min) => vec![min.field_name()],
|
||||
AggregationVariants::Stats(stats) => vec![stats.field_name()],
|
||||
AggregationVariants::ExtendedStats(extended_stats) => vec![extended_stats.field_name()],
|
||||
AggregationVariants::Sum(sum) => vec![sum.field_name()],
|
||||
AggregationVariants::Percentiles(per) => vec![per.field_name()],
|
||||
AggregationVariants::TopHits(top_hits) => top_hits.field_names(),
|
||||
AggregationVariants::Cardinality(per) => vec![per.field_name()],
|
||||
}
|
||||
}
|
||||
|
||||
@@ -197,6 +208,12 @@ impl AggregationVariants {
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
pub(crate) fn as_top_hits(&self) -> Option<&TopHitsAggregationReq> {
|
||||
match &self {
|
||||
AggregationVariants::TopHits(top_hits) => Some(top_hits),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn as_percentile(&self) -> Option<&PercentilesAggregationReq> {
|
||||
match &self {
|
||||
|
||||
@@ -11,8 +11,8 @@ use super::bucket::{
|
||||
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
|
||||
};
|
||||
use super::metric::{
|
||||
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation, StatsAggregation,
|
||||
SumAggregation,
|
||||
AverageAggregation, CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation,
|
||||
MaxAggregation, MinAggregation, StatsAggregation, SumAggregation,
|
||||
};
|
||||
use super::segment_agg_result::AggregationLimits;
|
||||
use super::VecWithNames;
|
||||
@@ -162,6 +162,11 @@ impl AggregationWithAccessor {
|
||||
field: ref field_name,
|
||||
ref missing,
|
||||
..
|
||||
})
|
||||
| Cardinality(CardinalityAggregationReq {
|
||||
field: ref field_name,
|
||||
ref missing,
|
||||
..
|
||||
}) => {
|
||||
let str_dict_column = reader.fast_fields().str(field_name)?;
|
||||
let allowed_column_types = [
|
||||
@@ -276,6 +281,10 @@ impl AggregationWithAccessor {
|
||||
field: ref field_name,
|
||||
..
|
||||
})
|
||||
| ExtendedStats(ExtendedStatsAggregation {
|
||||
field: ref field_name,
|
||||
..
|
||||
})
|
||||
| Sum(SumAggregation {
|
||||
field: ref field_name,
|
||||
..
|
||||
|
||||
@@ -8,7 +8,9 @@ use rustc_hash::FxHashMap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::bucket::GetDocCount;
|
||||
use super::metric::{PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult};
|
||||
use super::metric::{
|
||||
ExtendedStats, PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult,
|
||||
};
|
||||
use super::{AggregationError, Key};
|
||||
use crate::TantivyError;
|
||||
|
||||
@@ -88,12 +90,16 @@ pub enum MetricResult {
|
||||
Min(SingleMetricResult),
|
||||
/// Stats metric result.
|
||||
Stats(Stats),
|
||||
/// ExtendedStats metric result.
|
||||
ExtendedStats(Box<ExtendedStats>),
|
||||
/// Sum metric result.
|
||||
Sum(SingleMetricResult),
|
||||
/// Percentiles metric result.
|
||||
Percentiles(PercentilesMetricResult),
|
||||
/// Top hits metric result
|
||||
TopHits(TopHitsMetricResult),
|
||||
/// Cardinality metric result
|
||||
Cardinality(SingleMetricResult),
|
||||
}
|
||||
|
||||
impl MetricResult {
|
||||
@@ -104,6 +110,7 @@ impl MetricResult {
|
||||
MetricResult::Max(max) => Ok(max.value),
|
||||
MetricResult::Min(min) => Ok(min.value),
|
||||
MetricResult::Stats(stats) => stats.get_value(agg_property),
|
||||
MetricResult::ExtendedStats(extended_stats) => extended_stats.get_value(agg_property),
|
||||
MetricResult::Sum(sum) => Ok(sum.value),
|
||||
MetricResult::Percentiles(_) => Err(TantivyError::AggregationError(
|
||||
AggregationError::InvalidRequest("percentiles can't be used to order".to_string()),
|
||||
@@ -111,6 +118,7 @@ impl MetricResult {
|
||||
MetricResult::TopHits(_) => Err(TantivyError::AggregationError(
|
||||
AggregationError::InvalidRequest("top_hits can't be used to order".to_string()),
|
||||
)),
|
||||
MetricResult::Cardinality(card) => Ok(card.value),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -110,6 +110,16 @@ fn test_aggregation_flushing(
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"cardinality_string_id":{
|
||||
"cardinality": {
|
||||
"field": "string_id"
|
||||
}
|
||||
},
|
||||
"cardinality_score":{
|
||||
"cardinality": {
|
||||
"field": "score"
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
@@ -212,6 +222,9 @@ fn test_aggregation_flushing(
|
||||
)
|
||||
);
|
||||
|
||||
assert_eq!(res["cardinality_string_id"]["value"], 2.0);
|
||||
assert_eq!(res["cardinality_score"]["value"], 80.0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -926,10 +939,10 @@ fn test_aggregation_on_json_object_mixed_types() {
|
||||
},
|
||||
"termagg": {
|
||||
"buckets": [
|
||||
{ "doc_count": 1, "key": 10.0, "min_price": { "value": 10.0 } },
|
||||
{ "doc_count": 1, "key": 10.0, "key_as_string": "10", "min_price": { "value": 10.0 } },
|
||||
{ "doc_count": 3, "key": "blue", "min_price": { "value": 5.0 } },
|
||||
{ "doc_count": 2, "key": "red", "min_price": { "value": 1.0 } },
|
||||
{ "doc_count": 1, "key": -20.5, "min_price": { "value": -20.5 } },
|
||||
{ "doc_count": 1, "key": -20.5, "key_as_string": "-20.5", "min_price": { "value": -20.5 } },
|
||||
{ "doc_count": 2, "key": 1.0, "key_as_string": "true", "min_price": { "value": null } },
|
||||
],
|
||||
"sum_other_doc_count": 0
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
use std::fmt::Debug;
|
||||
use std::io;
|
||||
use std::net::Ipv6Addr;
|
||||
|
||||
use columnar::column_values::CompactSpaceU64Accessor;
|
||||
use columnar::{
|
||||
BytesColumn, ColumnType, MonotonicallyMappableToU128, MonotonicallyMappableToU64, StrColumn,
|
||||
};
|
||||
use columnar::{ColumnType, Dictionary, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
|
||||
use rustc_hash::FxHashMap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
@@ -466,49 +465,66 @@ impl SegmentTermCollector {
|
||||
};
|
||||
|
||||
if self.column_type == ColumnType::Str {
|
||||
let fallback_dict = Dictionary::empty();
|
||||
let term_dict = agg_with_accessor
|
||||
.str_dict_column
|
||||
.as_ref()
|
||||
.cloned()
|
||||
.unwrap_or_else(|| {
|
||||
StrColumn::wrap(BytesColumn::empty(agg_with_accessor.accessor.num_docs()))
|
||||
});
|
||||
let mut buffer = String::new();
|
||||
for (term_id, doc_count) in entries {
|
||||
let intermediate_entry = into_intermediate_bucket_entry(term_id, doc_count)?;
|
||||
// Special case for missing key
|
||||
if term_id == u64::MAX {
|
||||
let missing_key = self
|
||||
.req
|
||||
.missing
|
||||
.as_ref()
|
||||
.expect("Found placeholder term_id but `missing` is None");
|
||||
match missing_key {
|
||||
Key::Str(missing) => {
|
||||
buffer.clear();
|
||||
buffer.push_str(missing);
|
||||
dict.insert(
|
||||
IntermediateKey::Str(buffer.to_string()),
|
||||
intermediate_entry,
|
||||
);
|
||||
}
|
||||
Key::F64(val) => {
|
||||
buffer.push_str(&val.to_string());
|
||||
dict.insert(IntermediateKey::F64(*val), intermediate_entry);
|
||||
}
|
||||
.map(|el| el.dictionary())
|
||||
.unwrap_or_else(|| &fallback_dict);
|
||||
let mut buffer = Vec::new();
|
||||
|
||||
// special case for missing key
|
||||
if let Some(index) = entries.iter().position(|value| value.0 == u64::MAX) {
|
||||
let entry = entries[index];
|
||||
let intermediate_entry = into_intermediate_bucket_entry(entry.0, entry.1)?;
|
||||
let missing_key = self
|
||||
.req
|
||||
.missing
|
||||
.as_ref()
|
||||
.expect("Found placeholder term_id but `missing` is None");
|
||||
match missing_key {
|
||||
Key::Str(missing) => {
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(missing.as_bytes());
|
||||
dict.insert(
|
||||
IntermediateKey::Str(
|
||||
String::from_utf8(buffer.to_vec())
|
||||
.expect("could not convert to String"),
|
||||
),
|
||||
intermediate_entry,
|
||||
);
|
||||
}
|
||||
} else {
|
||||
if !term_dict.ord_to_str(term_id, &mut buffer)? {
|
||||
return Err(TantivyError::InternalError(format!(
|
||||
"Couldn't find term_id {term_id} in dict"
|
||||
)));
|
||||
Key::F64(val) => {
|
||||
dict.insert(IntermediateKey::F64(*val), intermediate_entry);
|
||||
}
|
||||
dict.insert(IntermediateKey::Str(buffer.to_string()), intermediate_entry);
|
||||
}
|
||||
|
||||
entries.swap_remove(index);
|
||||
}
|
||||
|
||||
// Sort by term ord
|
||||
entries.sort_unstable_by_key(|bucket| bucket.0);
|
||||
let mut idx = 0;
|
||||
term_dict.sorted_ords_to_term_cb(
|
||||
entries.iter().map(|(term_id, _)| *term_id),
|
||||
|term| {
|
||||
let entry = entries[idx];
|
||||
let intermediate_entry = into_intermediate_bucket_entry(entry.0, entry.1)
|
||||
.map_err(|err| io::Error::new(io::ErrorKind::Other, err))?;
|
||||
dict.insert(
|
||||
IntermediateKey::Str(
|
||||
String::from_utf8(term.to_vec()).expect("could not convert to String"),
|
||||
),
|
||||
intermediate_entry,
|
||||
);
|
||||
idx += 1;
|
||||
Ok(())
|
||||
},
|
||||
)?;
|
||||
|
||||
if self.req.min_doc_count == 0 {
|
||||
// TODO: Handle rev streaming for descending sorting by keys
|
||||
let mut stream = term_dict.dictionary().stream()?;
|
||||
let mut stream = term_dict.stream()?;
|
||||
let empty_sub_aggregation = IntermediateAggregationResults::empty_from_req(
|
||||
agg_with_accessor.agg.sub_aggregation(),
|
||||
);
|
||||
|
||||
@@ -19,13 +19,14 @@ use super::bucket::{
|
||||
GetDocCount, Order, OrderTarget, RangeAggregation, TermsAggregation,
|
||||
};
|
||||
use super::metric::{
|
||||
IntermediateAverage, IntermediateCount, IntermediateMax, IntermediateMin, IntermediateStats,
|
||||
IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
|
||||
IntermediateAverage, IntermediateCount, IntermediateExtendedStats, IntermediateMax,
|
||||
IntermediateMin, IntermediateStats, IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
|
||||
};
|
||||
use super::segment_agg_result::AggregationLimits;
|
||||
use super::{format_date, AggregationError, Key, SerializedKey};
|
||||
use crate::aggregation::agg_result::{AggregationResults, BucketEntries, BucketEntry};
|
||||
use crate::aggregation::bucket::TermsAggregationInternal;
|
||||
use crate::aggregation::metric::CardinalityCollector;
|
||||
use crate::TantivyError;
|
||||
|
||||
/// Contains the intermediate aggregation result, which is optimized to be merged with other
|
||||
@@ -215,6 +216,9 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
|
||||
Stats(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Stats(
|
||||
IntermediateStats::default(),
|
||||
)),
|
||||
ExtendedStats(_) => IntermediateAggregationResult::Metric(
|
||||
IntermediateMetricResult::ExtendedStats(IntermediateExtendedStats::default()),
|
||||
),
|
||||
Sum(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Sum(
|
||||
IntermediateSum::default(),
|
||||
)),
|
||||
@@ -222,7 +226,10 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
|
||||
IntermediateMetricResult::Percentiles(PercentilesCollector::default()),
|
||||
),
|
||||
TopHits(ref req) => IntermediateAggregationResult::Metric(
|
||||
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req.clone())),
|
||||
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req)),
|
||||
),
|
||||
Cardinality(_) => IntermediateAggregationResult::Metric(
|
||||
IntermediateMetricResult::Cardinality(CardinalityCollector::default()),
|
||||
),
|
||||
}
|
||||
}
|
||||
@@ -282,10 +289,14 @@ pub enum IntermediateMetricResult {
|
||||
Min(IntermediateMin),
|
||||
/// Intermediate stats result.
|
||||
Stats(IntermediateStats),
|
||||
/// Intermediate extended stats result.
|
||||
ExtendedStats(IntermediateExtendedStats),
|
||||
/// Intermediate sum result.
|
||||
Sum(IntermediateSum),
|
||||
/// Intermediate top_hits result
|
||||
TopHits(TopHitsTopNComputer),
|
||||
/// Intermediate cardinality result
|
||||
Cardinality(CardinalityCollector),
|
||||
}
|
||||
|
||||
impl IntermediateMetricResult {
|
||||
@@ -306,6 +317,9 @@ impl IntermediateMetricResult {
|
||||
IntermediateMetricResult::Stats(intermediate_stats) => {
|
||||
MetricResult::Stats(intermediate_stats.finalize())
|
||||
}
|
||||
IntermediateMetricResult::ExtendedStats(intermediate_stats) => {
|
||||
MetricResult::ExtendedStats(intermediate_stats.finalize())
|
||||
}
|
||||
IntermediateMetricResult::Sum(intermediate_sum) => {
|
||||
MetricResult::Sum(intermediate_sum.finalize().into())
|
||||
}
|
||||
@@ -316,6 +330,9 @@ impl IntermediateMetricResult {
|
||||
IntermediateMetricResult::TopHits(top_hits) => {
|
||||
MetricResult::TopHits(top_hits.into_final_result())
|
||||
}
|
||||
IntermediateMetricResult::Cardinality(cardinality) => {
|
||||
MetricResult::Cardinality(cardinality.finalize().into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -346,6 +363,12 @@ impl IntermediateMetricResult {
|
||||
) => {
|
||||
stats_left.merge_fruits(stats_right);
|
||||
}
|
||||
(
|
||||
IntermediateMetricResult::ExtendedStats(extended_stats_left),
|
||||
IntermediateMetricResult::ExtendedStats(extended_stats_right),
|
||||
) => {
|
||||
extended_stats_left.merge_fruits(extended_stats_right);
|
||||
}
|
||||
(IntermediateMetricResult::Sum(sum_left), IntermediateMetricResult::Sum(sum_right)) => {
|
||||
sum_left.merge_fruits(sum_right);
|
||||
}
|
||||
@@ -358,6 +381,12 @@ impl IntermediateMetricResult {
|
||||
(IntermediateMetricResult::TopHits(left), IntermediateMetricResult::TopHits(right)) => {
|
||||
left.merge_fruits(right)?;
|
||||
}
|
||||
(
|
||||
IntermediateMetricResult::Cardinality(left),
|
||||
IntermediateMetricResult::Cardinality(right),
|
||||
) => {
|
||||
left.merge_fruits(right)?;
|
||||
}
|
||||
_ => {
|
||||
panic!("incompatible fruit types in tree or missing merge_fruits handler");
|
||||
}
|
||||
@@ -570,6 +599,7 @@ impl IntermediateTermBucketResult {
|
||||
let val = if key { "true" } else { "false" };
|
||||
Some(val.to_string())
|
||||
}
|
||||
IntermediateKey::F64(val) => Some(val.to_string()),
|
||||
_ => None,
|
||||
};
|
||||
Ok(BucketEntry {
|
||||
|
||||
466
src/aggregation/metric/cardinality.rs
Normal file
466
src/aggregation/metric/cardinality.rs
Normal file
@@ -0,0 +1,466 @@
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::{BuildHasher, Hasher};
|
||||
|
||||
use columnar::column_values::CompactSpaceU64Accessor;
|
||||
use columnar::Dictionary;
|
||||
use common::f64_to_u64;
|
||||
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
|
||||
use rustc_hash::FxHashSet;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::aggregation::agg_req_with_accessor::{
|
||||
AggregationWithAccessor, AggregationsWithAccessor,
|
||||
};
|
||||
use crate::aggregation::intermediate_agg_result::{
|
||||
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
|
||||
};
|
||||
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
|
||||
use crate::aggregation::*;
|
||||
use crate::TantivyError;
|
||||
|
||||
/// [`BuildHasher`] that hands out [`DefaultHasher`]s which have already been fed a one-byte salt.
#[derive(Clone, Debug, Serialize, Deserialize)]
struct BuildSaltedHasher {
    /// Byte written into every new hasher before any value is hashed.
    salt: u8,
}
|
||||
|
||||
impl BuildHasher for BuildSaltedHasher {
|
||||
type Hasher = DefaultHasher;
|
||||
|
||||
fn build_hasher(&self) -> Self::Hasher {
|
||||
let mut hasher = DefaultHasher::new();
|
||||
hasher.write_u8(self.salt);
|
||||
|
||||
hasher
|
||||
}
|
||||
}
|
||||
|
||||
/// # Cardinality
///
/// The cardinality aggregation allows for computing an estimate
/// of the number of different values in a data set based on the
/// HyperLogLog++ algorithm. This is particularly useful for understanding the
/// uniqueness of values in a large dataset where counting each unique value
/// individually would be computationally expensive.
///
/// For example, you might use a cardinality aggregation to estimate the number
/// of unique visitors to a website by aggregating on a field that contains
/// user IDs or session IDs.
///
/// To use the cardinality aggregation, you'll need to provide a field to
/// aggregate on. The following example demonstrates a request for the cardinality
/// of the "user_id" field:
///
/// ```JSON
/// {
///     "cardinality": {
///         "field": "user_id"
///     }
/// }
/// ```
///
/// This request will return an estimate of the number of unique values in the
/// "user_id" field.
///
/// ## Missing Values
///
/// The `missing` parameter defines how documents that are missing a value should be treated.
/// By default, documents without a value for the specified field are ignored. However, you can
/// specify a default value for these documents using the `missing` parameter. This can be useful
/// when you want to include documents with missing values in the aggregation.
///
/// For example, the following request treats documents with missing values in the "user_id"
/// field as if they had a value of "unknown":
///
/// ```JSON
/// {
///     "cardinality": {
///         "field": "user_id",
///         "missing": "unknown"
///     }
/// }
/// ```
///
/// ## Estimation Accuracy
///
/// The cardinality aggregation provides an approximate count, which is usually
/// accurate within a small error range. This trade-off allows for efficient
/// computation even on very large datasets.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct CardinalityAggregationReq {
    /// The field name to compute the cardinality on.
    pub field: String,
    /// The missing parameter defines how documents that are missing a value should be treated.
    /// By default they will be ignored but it is also possible to treat them as if they had a
    /// value. Examples in JSON format:
    /// { "field": "my_numbers", "missing": "10.0" }
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub missing: Option<Key>,
}
|
||||
|
||||
impl CardinalityAggregationReq {
|
||||
/// Creates a new [`CardinalityAggregationReq`] instance from a field name.
|
||||
pub fn from_field_name(field_name: String) -> Self {
|
||||
Self {
|
||||
field: field_name,
|
||||
missing: None,
|
||||
}
|
||||
}
|
||||
/// Returns the field name the aggregation is computed on.
|
||||
pub fn field_name(&self) -> &str {
|
||||
&self.field
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-segment collector for the cardinality aggregation.
#[derive(Clone, Debug, PartialEq)]
pub(crate) struct SegmentCardinalityCollector {
    /// Sketch that receives the values of this segment.
    cardinality: CardinalityCollector,
    /// Term ordinals seen so far. Only filled for `ColumnType::Str` columns; the ordinals are
    /// resolved to their terms when the intermediate result is built.
    entries: FxHashSet<u64>,
    /// Type of the aggregated column; decides how collected values are fed to the sketch.
    column_type: ColumnType,
    /// Position of this aggregation in the key/value lists of `AggregationsWithAccessor`.
    accessor_idx: usize,
    /// Replacement key taken from the request for documents without a value, if any.
    missing: Option<Key>,
}
|
||||
|
||||
impl SegmentCardinalityCollector {
|
||||
pub fn from_req(column_type: ColumnType, accessor_idx: usize, missing: &Option<Key>) -> Self {
|
||||
Self {
|
||||
cardinality: CardinalityCollector::new(column_type as u8),
|
||||
entries: Default::default(),
|
||||
column_type,
|
||||
accessor_idx,
|
||||
missing: missing.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
fn fetch_block_with_field(
|
||||
&mut self,
|
||||
docs: &[crate::DocId],
|
||||
agg_accessor: &mut AggregationWithAccessor,
|
||||
) {
|
||||
if let Some(missing) = agg_accessor.missing_value_for_accessor {
|
||||
agg_accessor.column_block_accessor.fetch_block_with_missing(
|
||||
docs,
|
||||
&agg_accessor.accessor,
|
||||
missing,
|
||||
);
|
||||
} else {
|
||||
agg_accessor
|
||||
.column_block_accessor
|
||||
.fetch_block(docs, &agg_accessor.accessor);
|
||||
}
|
||||
}
|
||||
|
||||
fn into_intermediate_metric_result(
|
||||
mut self,
|
||||
agg_with_accessor: &AggregationWithAccessor,
|
||||
) -> crate::Result<IntermediateMetricResult> {
|
||||
if self.column_type == ColumnType::Str {
|
||||
let fallback_dict = Dictionary::empty();
|
||||
let dict = agg_with_accessor
|
||||
.str_dict_column
|
||||
.as_ref()
|
||||
.map(|el| el.dictionary())
|
||||
.unwrap_or_else(|| &fallback_dict);
|
||||
let mut has_missing = false;
|
||||
|
||||
// TODO: replace FxHashSet with something that allows iterating in order
|
||||
// (e.g. sparse bitvec)
|
||||
let mut term_ids = Vec::new();
|
||||
for term_ord in self.entries.into_iter() {
|
||||
if term_ord == u64::MAX {
|
||||
has_missing = true;
|
||||
} else {
|
||||
// we can reasonably exclude values above u32::MAX
|
||||
term_ids.push(term_ord as u32);
|
||||
}
|
||||
}
|
||||
term_ids.sort_unstable();
|
||||
dict.sorted_ords_to_term_cb(term_ids.iter().map(|term| *term as u64), |term| {
|
||||
self.cardinality.sketch.insert_any(&term);
|
||||
Ok(())
|
||||
})?;
|
||||
if has_missing {
|
||||
let missing_key = self
|
||||
.missing
|
||||
.as_ref()
|
||||
.expect("Found placeholder term_ord but `missing` is None");
|
||||
match missing_key {
|
||||
Key::Str(missing) => {
|
||||
self.cardinality.sketch.insert_any(&missing);
|
||||
}
|
||||
Key::F64(val) => {
|
||||
let val = f64_to_u64(*val);
|
||||
self.cardinality.sketch.insert_any(&val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(IntermediateMetricResult::Cardinality(self.cardinality))
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentAggregationCollector for SegmentCardinalityCollector {
|
||||
fn add_intermediate_aggregation_result(
|
||||
self: Box<Self>,
|
||||
agg_with_accessor: &AggregationsWithAccessor,
|
||||
results: &mut IntermediateAggregationResults,
|
||||
) -> crate::Result<()> {
|
||||
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
|
||||
let agg_with_accessor = &agg_with_accessor.aggs.values[self.accessor_idx];
|
||||
|
||||
let intermediate_result = self.into_intermediate_metric_result(agg_with_accessor)?;
|
||||
results.push(
|
||||
name,
|
||||
IntermediateAggregationResult::Metric(intermediate_result),
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(
|
||||
&mut self,
|
||||
doc: crate::DocId,
|
||||
agg_with_accessor: &mut AggregationsWithAccessor,
|
||||
) -> crate::Result<()> {
|
||||
self.collect_block(&[doc], agg_with_accessor)
|
||||
}
|
||||
|
||||
fn collect_block(
|
||||
&mut self,
|
||||
docs: &[crate::DocId],
|
||||
agg_with_accessor: &mut AggregationsWithAccessor,
|
||||
) -> crate::Result<()> {
|
||||
let bucket_agg_accessor = &mut agg_with_accessor.aggs.values[self.accessor_idx];
|
||||
self.fetch_block_with_field(docs, bucket_agg_accessor);
|
||||
|
||||
let col_block_accessor = &bucket_agg_accessor.column_block_accessor;
|
||||
if self.column_type == ColumnType::Str {
|
||||
for term_ord in col_block_accessor.iter_vals() {
|
||||
self.entries.insert(term_ord);
|
||||
}
|
||||
} else if self.column_type == ColumnType::IpAddr {
|
||||
let compact_space_accessor = bucket_agg_accessor
|
||||
.accessor
|
||||
.values
|
||||
.clone()
|
||||
.downcast_arc::<CompactSpaceU64Accessor>()
|
||||
.map_err(|_| {
|
||||
TantivyError::AggregationError(
|
||||
crate::aggregation::AggregationError::InternalError(
|
||||
"Type mismatch: Could not downcast to CompactSpaceU64Accessor"
|
||||
.to_string(),
|
||||
),
|
||||
)
|
||||
})?;
|
||||
for val in col_block_accessor.iter_vals() {
|
||||
let val: u128 = compact_space_accessor.compact_to_u128(val as u32);
|
||||
self.cardinality.sketch.insert_any(&val);
|
||||
}
|
||||
} else {
|
||||
for val in col_block_accessor.iter_vals() {
|
||||
self.cardinality.sketch.insert_any(&val);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
/// The cardinality collector used during segment collection and for merging results.
pub struct CardinalityCollector {
    /// HyperLogLog++ sketch holding the values inserted so far.
    sketch: HyperLogLogPlus<u64, BuildSaltedHasher>,
}
|
||||
impl Default for CardinalityCollector {
    /// Returns an empty collector created with salt `0`.
    fn default() -> Self {
        Self::new(0)
    }
}
|
||||
|
||||
impl PartialEq for CardinalityCollector {
|
||||
fn eq(&self, _other: &Self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl CardinalityCollector {
|
||||
/// Compute the final cardinality estimate.
|
||||
pub fn finalize(self) -> Option<f64> {
|
||||
Some(self.sketch.clone().count().trunc())
|
||||
}
|
||||
|
||||
fn new(salt: u8) -> Self {
|
||||
Self {
|
||||
sketch: HyperLogLogPlus::new(16, BuildSaltedHasher { salt }).unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn merge_fruits(&mut self, right: CardinalityCollector) -> crate::Result<()> {
|
||||
self.sketch.merge(&right.sketch).map_err(|err| {
|
||||
TantivyError::AggregationError(AggregationError::InternalError(format!(
|
||||
"Error while merging cardinality {err:?}"
|
||||
)))
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use std::net::IpAddr;
|
||||
use std::str::FromStr;
|
||||
|
||||
use columnar::MonotonicallyMappableToU64;
|
||||
|
||||
use crate::aggregation::agg_req::Aggregations;
|
||||
use crate::aggregation::tests::{exec_request, get_test_index_from_terms};
|
||||
use crate::schema::{IntoIpv6Addr, Schema, FAST};
|
||||
use crate::Index;
|
||||
|
||||
// With no terms indexed, the cardinality must be reported as exactly zero.
#[test]
fn cardinality_aggregation_test_empty_index() -> crate::Result<()> {
    let no_terms = vec![];
    let index = get_test_index_from_terms(false, &no_terms)?;

    let request = json!({
        "cardinality": {
            "cardinality": {
                "field": "string_id",
            }
        },
    });
    let agg_req: Aggregations = serde_json::from_value(request).unwrap();

    let response = exec_request(agg_req, &index)?;
    assert_eq!(response["cardinality"]["value"], 0.0);

    Ok(())
}
|
||||
|
||||
// Runs the shared string-term scenario with `merge_segments = true`.
#[test]
fn cardinality_aggregation_test_single_segment() -> crate::Result<()> {
    let merge_segments = true;
    cardinality_aggregation_test_merge_segment(merge_segments)
}
|
||||
// Runs the shared string-term scenario with `merge_segments = false`.
#[test]
fn cardinality_aggregation_test() -> crate::Result<()> {
    let merge_segments = false;
    cardinality_aggregation_test_merge_segment(merge_segments)
}
|
||||
fn cardinality_aggregation_test_merge_segment(merge_segments: bool) -> crate::Result<()> {
|
||||
let segment_and_terms = vec![
|
||||
vec!["terma"],
|
||||
vec!["termb"],
|
||||
vec!["termc"],
|
||||
vec!["terma"],
|
||||
vec!["terma"],
|
||||
vec!["terma"],
|
||||
vec!["termb"],
|
||||
vec!["terma"],
|
||||
];
|
||||
let index = get_test_index_from_terms(merge_segments, &segment_and_terms)?;
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"cardinality": {
|
||||
"cardinality": {
|
||||
"field": "string_id",
|
||||
}
|
||||
},
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
assert_eq!(res["cardinality"]["value"], 3.0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Three documents carry distinct ids and a fourth carries none; with `missing` set to 0
// the expected cardinality is 4.
#[test]
fn cardinality_aggregation_u64() -> crate::Result<()> {
    let mut schema_builder = Schema::builder();
    let id_field = schema_builder.add_u64_field("id", FAST);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    {
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(id_field => 1u64))?;
        index_writer.add_document(doc!(id_field => 2u64))?;
        index_writer.add_document(doc!(id_field => 3u64))?;
        // Document without an `id` value.
        index_writer.add_document(doc!())?;
        index_writer.commit()?;
    }

    let request = json!({
        "cardinality": {
            "cardinality": {
                "field": "id",
                "missing": 0u64
            },
        }
    });
    let agg_req: Aggregations = serde_json::from_value(request).unwrap();

    let response = exec_request(agg_req, &index)?;
    assert_eq!(response["cardinality"]["value"], 4.0);

    Ok(())
}
|
||||
|
||||
// The IPv6 loopback address is indexed twice and an IPv4 address once, so two distinct
// addresses are expected.
#[test]
fn cardinality_aggregation_ip_addr() -> crate::Result<()> {
    let mut schema_builder = Schema::builder();
    let field = schema_builder.add_ip_addr_field("ip_field", FAST);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    {
        let mut index_writer = index.writer_for_tests()?;
        // IpV6 loopback
        let loopback_v6 = IpAddr::from_str("::1").unwrap().into_ipv6_addr();
        index_writer.add_document(doc!(field=>loopback_v6))?;
        index_writer.add_document(doc!(field=>loopback_v6))?;
        // IpV4
        let loopback_v4 = IpAddr::from_str("127.0.0.1").unwrap().into_ipv6_addr();
        index_writer.add_document(doc!(field=>loopback_v4))?;
        index_writer.commit()?;
    }

    let request = json!({
        "cardinality": {
            "cardinality": {
                "field": "ip_field"
            },
        }
    });
    let agg_req: Aggregations = serde_json::from_value(request).unwrap();

    let response = exec_request(agg_req, &index)?;
    assert_eq!(response["cardinality"]["value"], 2.0);

    Ok(())
}
|
||||
|
||||
// Stores two booleans and two integers under the same JSON path and expects all four
// values to be counted as distinct.
#[test]
fn cardinality_aggregation_json() -> crate::Result<()> {
    let mut schema_builder = Schema::builder();
    let field = schema_builder.add_json_field("json", FAST);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    {
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(field => json!({"value": false})))?;
        index_writer.add_document(doc!(field => json!({"value": true})))?;
        index_writer.add_document(doc!(field => json!({"value": i64::from_u64(0u64)})))?;
        index_writer.add_document(doc!(field => json!({"value": i64::from_u64(1u64)})))?;
        index_writer.commit()?;
    }

    let request = json!({
        "cardinality": {
            "cardinality": {
                "field": "json.value"
            },
        }
    });
    let agg_req: Aggregations = serde_json::from_value(request).unwrap();

    let response = exec_request(agg_req, &index)?;
    assert_eq!(response["cardinality"]["value"], 4.0);

    Ok(())
}
|
||||
}
|
||||
1181
src/aggregation/metric/extended_stats.rs
Normal file
1181
src/aggregation/metric/extended_stats.rs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -17,7 +17,9 @@
|
||||
//! - [Percentiles](PercentilesAggregationReq)
|
||||
|
||||
mod average;
|
||||
mod cardinality;
|
||||
mod count;
|
||||
mod extended_stats;
|
||||
mod max;
|
||||
mod min;
|
||||
mod percentiles;
|
||||
@@ -28,7 +30,9 @@ mod top_hits;
|
||||
use std::collections::HashMap;
|
||||
|
||||
pub use average::*;
|
||||
pub use cardinality::*;
|
||||
pub use count::*;
|
||||
pub use extended_stats::*;
|
||||
pub use max::*;
|
||||
pub use min::*;
|
||||
pub use percentiles::*;
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
use std::fmt::Debug;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::*;
|
||||
@@ -85,13 +87,15 @@ impl Stats {
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateStats {
|
||||
/// The number of extracted values.
|
||||
count: u64,
|
||||
pub(crate) count: u64,
|
||||
/// The sum of the extracted values.
|
||||
sum: f64,
|
||||
pub(crate) sum: f64,
|
||||
/// delta for sum needed for [Kahan algorithm for summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
|
||||
pub(crate) delta: f64,
|
||||
/// The min value.
|
||||
min: f64,
|
||||
pub(crate) min: f64,
|
||||
/// The max value.
|
||||
max: f64,
|
||||
pub(crate) max: f64,
|
||||
}
|
||||
|
||||
impl Default for IntermediateStats {
|
||||
@@ -99,6 +103,7 @@ impl Default for IntermediateStats {
|
||||
Self {
|
||||
count: 0,
|
||||
sum: 0.0,
|
||||
delta: 0.0,
|
||||
min: f64::MAX,
|
||||
max: f64::MIN,
|
||||
}
|
||||
@@ -109,7 +114,13 @@ impl IntermediateStats {
|
||||
/// Merges the other stats intermediate result into self.
|
||||
pub fn merge_fruits(&mut self, other: IntermediateStats) {
|
||||
self.count += other.count;
|
||||
self.sum += other.sum;
|
||||
|
||||
// kahan algorithm for sum
|
||||
let y = other.sum - (self.delta + other.delta);
|
||||
let t = self.sum + y;
|
||||
self.delta = (t - self.sum) - y;
|
||||
self.sum = t;
|
||||
|
||||
self.min = self.min.min(other.min);
|
||||
self.max = self.max.max(other.max);
|
||||
}
|
||||
@@ -141,9 +152,15 @@ impl IntermediateStats {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn collect(&mut self, value: f64) {
|
||||
pub(in crate::aggregation::metric) fn collect(&mut self, value: f64) {
|
||||
self.count += 1;
|
||||
self.sum += value;
|
||||
|
||||
// kahan algorithm for sum
|
||||
let y = value - self.delta;
|
||||
let t = self.sum + y;
|
||||
self.delta = (t - self.sum) - y;
|
||||
self.sum = t;
|
||||
|
||||
self.min = self.min.min(value);
|
||||
self.max = self.max.max(value);
|
||||
}
|
||||
@@ -288,7 +305,6 @@ impl SegmentAggregationCollector for SegmentStatsCollector {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::aggregation::agg_req::{Aggregation, Aggregations};
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::collections::HashMap;
|
||||
use std::net::Ipv6Addr;
|
||||
|
||||
use columnar::{ColumnarReader, DynamicColumn};
|
||||
use columnar::{Column, ColumnType, ColumnarReader, DynamicColumn};
|
||||
use common::json_path_writer::JSON_PATH_SEGMENT_SEP_STR;
|
||||
use common::DateTime;
|
||||
use regex::Regex;
|
||||
@@ -89,7 +89,7 @@ use crate::{DocAddress, DocId, SegmentOrdinal};
|
||||
/// }
|
||||
/// ```
|
||||
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
|
||||
pub struct TopHitsAggregation {
|
||||
pub struct TopHitsAggregationReq {
|
||||
sort: Vec<KeyOrder>,
|
||||
size: usize,
|
||||
from: Option<usize>,
|
||||
@@ -164,7 +164,7 @@ fn unsupported_err(parameter: &str) -> crate::Result<()> {
|
||||
))
|
||||
}
|
||||
|
||||
impl TopHitsAggregation {
|
||||
impl TopHitsAggregationReq {
|
||||
/// Validate and resolve field retrieval parameters
|
||||
pub fn validate_and_resolve_field_names(
|
||||
&mut self,
|
||||
@@ -431,7 +431,7 @@ impl Eq for DocSortValuesAndFields {}
|
||||
/// The TopHitsCollector used for collecting over segments and merging results.
|
||||
#[derive(Clone, Serialize, Deserialize, Debug)]
|
||||
pub struct TopHitsTopNComputer {
|
||||
req: TopHitsAggregation,
|
||||
req: TopHitsAggregationReq,
|
||||
top_n: TopNComputer<DocSortValuesAndFields, DocAddress, false>,
|
||||
}
|
||||
|
||||
@@ -443,10 +443,10 @@ impl std::cmp::PartialEq for TopHitsTopNComputer {
|
||||
|
||||
impl TopHitsTopNComputer {
|
||||
/// Create a new TopHitsCollector
|
||||
pub fn new(req: TopHitsAggregation) -> Self {
|
||||
pub fn new(req: &TopHitsAggregationReq) -> Self {
|
||||
Self {
|
||||
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
|
||||
req,
|
||||
req: req.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -491,18 +491,16 @@ impl TopHitsTopNComputer {
|
||||
pub(crate) struct TopHitsSegmentCollector {
|
||||
segment_ordinal: SegmentOrdinal,
|
||||
accessor_idx: usize,
|
||||
req: TopHitsAggregation,
|
||||
top_n: TopNComputer<Vec<DocValueAndOrder>, DocAddress, false>,
|
||||
}
|
||||
|
||||
impl TopHitsSegmentCollector {
|
||||
pub fn from_req(
|
||||
req: &TopHitsAggregation,
|
||||
req: &TopHitsAggregationReq,
|
||||
accessor_idx: usize,
|
||||
segment_ordinal: SegmentOrdinal,
|
||||
) -> Self {
|
||||
Self {
|
||||
req: req.clone(),
|
||||
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
|
||||
segment_ordinal,
|
||||
accessor_idx,
|
||||
@@ -511,14 +509,13 @@ impl TopHitsSegmentCollector {
|
||||
fn into_top_hits_collector(
|
||||
self,
|
||||
value_accessors: &HashMap<String, Vec<DynamicColumn>>,
|
||||
req: &TopHitsAggregationReq,
|
||||
) -> TopHitsTopNComputer {
|
||||
let mut top_hits_computer = TopHitsTopNComputer::new(self.req.clone());
|
||||
let mut top_hits_computer = TopHitsTopNComputer::new(req);
|
||||
let top_results = self.top_n.into_vec();
|
||||
|
||||
for res in top_results {
|
||||
let doc_value_fields = self
|
||||
.req
|
||||
.get_document_field_data(value_accessors, res.doc.doc_id);
|
||||
let doc_value_fields = req.get_document_field_data(value_accessors, res.doc.doc_id);
|
||||
top_hits_computer.collect(
|
||||
DocSortValuesAndFields {
|
||||
sorts: res.feature,
|
||||
@@ -530,34 +527,15 @@ impl TopHitsSegmentCollector {
|
||||
|
||||
top_hits_computer
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentAggregationCollector for TopHitsSegmentCollector {
|
||||
fn add_intermediate_aggregation_result(
|
||||
self: Box<Self>,
|
||||
agg_with_accessor: &crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
|
||||
results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
|
||||
) -> crate::Result<()> {
|
||||
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
|
||||
|
||||
let value_accessors = &agg_with_accessor.aggs.values[self.accessor_idx].value_accessors;
|
||||
|
||||
let intermediate_result =
|
||||
IntermediateMetricResult::TopHits(self.into_top_hits_collector(value_accessors));
|
||||
results.push(
|
||||
name,
|
||||
IntermediateAggregationResult::Metric(intermediate_result),
|
||||
)
|
||||
}
|
||||
|
||||
fn collect(
|
||||
/// TODO add a specialized variant for a single sort field
|
||||
fn collect_with(
|
||||
&mut self,
|
||||
doc_id: crate::DocId,
|
||||
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
|
||||
req: &TopHitsAggregationReq,
|
||||
accessors: &[(Column<u64>, ColumnType)],
|
||||
) -> crate::Result<()> {
|
||||
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
|
||||
let sorts: Vec<DocValueAndOrder> = self
|
||||
.req
|
||||
let sorts: Vec<DocValueAndOrder> = req
|
||||
.sort
|
||||
.iter()
|
||||
.enumerate()
|
||||
@@ -582,15 +560,62 @@ impl SegmentAggregationCollector for TopHitsSegmentCollector {
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentAggregationCollector for TopHitsSegmentCollector {
|
||||
fn add_intermediate_aggregation_result(
|
||||
self: Box<Self>,
|
||||
agg_with_accessor: &crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
|
||||
results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
|
||||
) -> crate::Result<()> {
|
||||
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
|
||||
|
||||
let value_accessors = &agg_with_accessor.aggs.values[self.accessor_idx].value_accessors;
|
||||
let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
|
||||
.agg
|
||||
.agg
|
||||
.as_top_hits()
|
||||
.expect("aggregation request must be of type top hits");
|
||||
|
||||
let intermediate_result = IntermediateMetricResult::TopHits(
|
||||
self.into_top_hits_collector(value_accessors, tophits_req),
|
||||
);
|
||||
results.push(
|
||||
name,
|
||||
IntermediateAggregationResult::Metric(intermediate_result),
|
||||
)
|
||||
}
|
||||
|
||||
/// TODO: Consider a caching layer to reduce the call overhead
|
||||
fn collect(
|
||||
&mut self,
|
||||
doc_id: crate::DocId,
|
||||
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
|
||||
) -> crate::Result<()> {
|
||||
let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
|
||||
.agg
|
||||
.agg
|
||||
.as_top_hits()
|
||||
.expect("aggregation request must be of type top hits");
|
||||
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
|
||||
self.collect_with(doc_id, tophits_req, accessors)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect_block(
|
||||
&mut self,
|
||||
docs: &[crate::DocId],
|
||||
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
|
||||
) -> crate::Result<()> {
|
||||
let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
|
||||
.agg
|
||||
.agg
|
||||
.as_top_hits()
|
||||
.expect("aggregation request must be of type top hits");
|
||||
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
|
||||
// TODO: Consider getting fields with the column block accessor.
|
||||
for doc in docs {
|
||||
self.collect(*doc, agg_with_accessor)?;
|
||||
self.collect_with(*doc, tophits_req, accessors)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -44,11 +44,14 @@
|
||||
//! - [Metric](metric)
|
||||
//! - [Average](metric::AverageAggregation)
|
||||
//! - [Stats](metric::StatsAggregation)
|
||||
//! - [ExtendedStats](metric::ExtendedStatsAggregation)
|
||||
//! - [Min](metric::MinAggregation)
|
||||
//! - [Max](metric::MaxAggregation)
|
||||
//! - [Sum](metric::SumAggregation)
|
||||
//! - [Count](metric::CountAggregation)
|
||||
//! - [Percentiles](metric::PercentilesAggregationReq)
|
||||
//! - [Cardinality](metric::CardinalityAggregationReq)
|
||||
//! - [TopHits](metric::TopHitsAggregationReq)
|
||||
//!
|
||||
//! # Example
|
||||
//! Compute the average metric, by building [`agg_req::Aggregations`], which is built from an
|
||||
|
||||
@@ -11,12 +11,15 @@ use super::agg_req_with_accessor::{AggregationWithAccessor, AggregationsWithAcce
|
||||
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
|
||||
use super::intermediate_agg_result::IntermediateAggregationResults;
|
||||
use super::metric::{
|
||||
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
|
||||
AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
|
||||
SegmentPercentilesCollector, SegmentStatsCollector, SegmentStatsType, StatsAggregation,
|
||||
SumAggregation,
|
||||
};
|
||||
use crate::aggregation::bucket::TermMissingAgg;
|
||||
use crate::aggregation::metric::TopHitsSegmentCollector;
|
||||
use crate::aggregation::metric::{
|
||||
CardinalityAggregationReq, SegmentCardinalityCollector, SegmentExtendedStatsCollector,
|
||||
TopHitsSegmentCollector,
|
||||
};
|
||||
|
||||
pub(crate) trait SegmentAggregationCollector: CollectorClone + Debug {
|
||||
fn add_intermediate_aggregation_result(
|
||||
@@ -148,6 +151,9 @@ pub(crate) fn build_single_agg_segment_collector(
|
||||
accessor_idx,
|
||||
*missing,
|
||||
))),
|
||||
ExtendedStats(ExtendedStatsAggregation { missing, sigma, .. }) => Ok(Box::new(
|
||||
SegmentExtendedStatsCollector::from_req(req.field_type, *sigma, accessor_idx, *missing),
|
||||
)),
|
||||
Sum(SumAggregation { missing, .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
|
||||
req.field_type,
|
||||
SegmentStatsType::Sum,
|
||||
@@ -166,6 +172,9 @@ pub(crate) fn build_single_agg_segment_collector(
|
||||
accessor_idx,
|
||||
req.segment_ordinal,
|
||||
))),
|
||||
Cardinality(CardinalityAggregationReq { missing, .. }) => Ok(Box::new(
|
||||
SegmentCardinalityCollector::from_req(req.field_type, accessor_idx, missing),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -871,7 +871,10 @@ mod tests {
|
||||
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::{DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score, SegmentReader};
|
||||
use crate::{
|
||||
assert_nearly_equals, DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score,
|
||||
SegmentReader,
|
||||
};
|
||||
|
||||
fn make_index() -> crate::Result<Index> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
@@ -195,7 +195,7 @@ mod tests {
|
||||
let (tx, rx) = crossbeam_channel::bounded::<()>(0);
|
||||
let rx = Arc::new(rx);
|
||||
let executor = Executor::multi_thread(3, "search-test").unwrap();
|
||||
for i in 0..1000 {
|
||||
for _ in 0..1000 {
|
||||
let counter_clone: Arc<AtomicU64> = counter.clone();
|
||||
let other_counter_clone: Arc<AtomicU64> = other_counter.clone();
|
||||
|
||||
@@ -203,18 +203,18 @@ mod tests {
|
||||
let rx_clone2 = rx.clone();
|
||||
let fut = executor.spawn_blocking(move || {
|
||||
counter_clone.fetch_add(1, Ordering::SeqCst);
|
||||
let () = rx_clone.recv().unwrap();
|
||||
let _ = rx_clone.recv();
|
||||
});
|
||||
futures.push(fut);
|
||||
let other_fut = executor.spawn_blocking(move || {
|
||||
other_counter_clone.fetch_add(1, Ordering::SeqCst);
|
||||
let () = rx_clone2.recv().unwrap();
|
||||
let _ = rx_clone2.recv();
|
||||
});
|
||||
other_futures.push(other_fut);
|
||||
}
|
||||
|
||||
// We execute 100 futures.
|
||||
for i in 0..100 {
|
||||
for _ in 0..100 {
|
||||
tx.send(()).unwrap();
|
||||
}
|
||||
|
||||
@@ -226,7 +226,7 @@ mod tests {
|
||||
drop(other_futures);
|
||||
|
||||
// We execute 100 futures.
|
||||
for i in 0..100 {
|
||||
for _ in 0..100 {
|
||||
tx.send(()).unwrap();
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use common::json_path_writer::JSON_PATH_SEGMENT_SEP;
|
||||
use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
|
||||
use common::{replace_in_place, JsonPathWriter};
|
||||
use rustc_hash::FxHashMap;
|
||||
|
||||
@@ -83,6 +83,9 @@ fn index_json_object<'a, V: Value<'a>>(
|
||||
positions_per_path: &mut IndexingPositionsPerPath,
|
||||
) {
|
||||
for (json_path_segment, json_value_visitor) in json_visitor {
|
||||
if json_path_segment.as_bytes().contains(&JSON_END_OF_PATH) {
|
||||
continue;
|
||||
}
|
||||
json_path_writer.push(json_path_segment);
|
||||
index_json_value(
|
||||
doc,
|
||||
|
||||
@@ -127,7 +127,7 @@ mod tests {
|
||||
fast_field_writers
|
||||
.add_document(&doc!(*FIELD=>2u64))
|
||||
.unwrap();
|
||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
||||
fast_field_writers.serialize(&mut write).unwrap();
|
||||
write.terminate().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
@@ -178,7 +178,7 @@ mod tests {
|
||||
fast_field_writers
|
||||
.add_document(&doc!(*FIELD=>215u64))
|
||||
.unwrap();
|
||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
||||
fast_field_writers.serialize(&mut write).unwrap();
|
||||
write.terminate().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
@@ -211,7 +211,7 @@ mod tests {
|
||||
.add_document(&doc!(*FIELD=>100_000u64))
|
||||
.unwrap();
|
||||
}
|
||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
||||
fast_field_writers.serialize(&mut write).unwrap();
|
||||
write.terminate().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
@@ -243,7 +243,7 @@ mod tests {
|
||||
.add_document(&doc!(*FIELD=>5_000_000_000_000_000_000u64 + doc_id))
|
||||
.unwrap();
|
||||
}
|
||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
||||
fast_field_writers.serialize(&mut write).unwrap();
|
||||
write.terminate().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
@@ -276,7 +276,7 @@ mod tests {
|
||||
doc.add_i64(i64_field, i);
|
||||
fast_field_writers.add_document(&doc).unwrap();
|
||||
}
|
||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
||||
fast_field_writers.serialize(&mut write).unwrap();
|
||||
write.terminate().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
@@ -315,7 +315,7 @@ mod tests {
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
|
||||
let doc = TantivyDocument::default();
|
||||
fast_field_writers.add_document(&doc).unwrap();
|
||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
||||
fast_field_writers.serialize(&mut write).unwrap();
|
||||
write.terminate().unwrap();
|
||||
}
|
||||
|
||||
@@ -348,7 +348,7 @@ mod tests {
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
|
||||
let doc = TantivyDocument::default();
|
||||
fast_field_writers.add_document(&doc).unwrap();
|
||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
||||
fast_field_writers.serialize(&mut write).unwrap();
|
||||
write.terminate().unwrap();
|
||||
}
|
||||
|
||||
@@ -385,7 +385,7 @@ mod tests {
|
||||
for &x in &permutation {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>x)).unwrap();
|
||||
}
|
||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
||||
fast_field_writers.serialize(&mut write).unwrap();
|
||||
write.terminate().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
@@ -770,7 +770,7 @@ mod tests {
|
||||
fast_field_writers
|
||||
.add_document(&doc!(field=>false))
|
||||
.unwrap();
|
||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
||||
fast_field_writers.serialize(&mut write).unwrap();
|
||||
write.terminate().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
@@ -802,7 +802,7 @@ mod tests {
|
||||
.add_document(&doc!(field=>false))
|
||||
.unwrap();
|
||||
}
|
||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
||||
fast_field_writers.serialize(&mut write).unwrap();
|
||||
write.terminate().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
@@ -827,7 +827,7 @@ mod tests {
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
|
||||
let doc = TantivyDocument::default();
|
||||
fast_field_writers.add_document(&doc).unwrap();
|
||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
||||
fast_field_writers.serialize(&mut write).unwrap();
|
||||
write.terminate().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
@@ -855,7 +855,7 @@ mod tests {
|
||||
for doc in docs {
|
||||
fast_field_writers.add_document(doc).unwrap();
|
||||
}
|
||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
||||
fast_field_writers.serialize(&mut write).unwrap();
|
||||
write.terminate().unwrap();
|
||||
}
|
||||
Ok(directory)
|
||||
|
||||
@@ -4,7 +4,6 @@ use columnar::{ColumnarWriter, NumericalValue};
|
||||
use common::{DateTimePrecision, JsonPathWriter};
|
||||
use tokenizer_api::Token;
|
||||
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::schema::document::{Document, ReferenceValue, ReferenceValueLeaf, Value};
|
||||
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema, Type};
|
||||
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
|
||||
@@ -106,16 +105,6 @@ impl FastFieldsWriter {
|
||||
self.columnar_writer.mem_usage()
|
||||
}
|
||||
|
||||
pub(crate) fn sort_order(
|
||||
&self,
|
||||
sort_field: &str,
|
||||
num_docs: DocId,
|
||||
reversed: bool,
|
||||
) -> Vec<DocId> {
|
||||
self.columnar_writer
|
||||
.sort_order(sort_field, num_docs, reversed)
|
||||
}
|
||||
|
||||
/// Indexes all of the fastfields of a new document.
|
||||
pub fn add_document<D: Document>(&mut self, doc: &D) -> crate::Result<()> {
|
||||
let doc_id = self.num_docs;
|
||||
@@ -233,16 +222,9 @@ impl FastFieldsWriter {
|
||||
|
||||
/// Serializes all of the `FastFieldWriter`s by pushing them in
|
||||
/// order to the fast field serializer.
|
||||
pub fn serialize(
|
||||
mut self,
|
||||
wrt: &mut dyn io::Write,
|
||||
doc_id_map_opt: Option<&DocIdMapping>,
|
||||
) -> io::Result<()> {
|
||||
pub fn serialize(mut self, wrt: &mut dyn io::Write) -> io::Result<()> {
|
||||
let num_docs = self.num_docs;
|
||||
let old_to_new_row_ids =
|
||||
doc_id_map_opt.map(|doc_id_mapping| doc_id_mapping.old_to_new_ids());
|
||||
self.columnar_writer
|
||||
.serialize(num_docs, old_to_new_row_ids, wrt)?;
|
||||
self.columnar_writer.serialize(num_docs, wrt)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -392,7 +374,7 @@ mod tests {
|
||||
}
|
||||
let mut buffer = Vec::new();
|
||||
columnar_writer
|
||||
.serialize(json_docs.len() as DocId, None, &mut buffer)
|
||||
.serialize(json_docs.len() as DocId, &mut buffer)
|
||||
.unwrap();
|
||||
ColumnarReader::open(buffer).unwrap()
|
||||
}
|
||||
|
||||
@@ -77,7 +77,7 @@ mod tests {
|
||||
let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
|
||||
fieldnorm_writers.record(2u32, *TXT_FIELD, 5);
|
||||
fieldnorm_writers.record(3u32, *TXT_FIELD, 3);
|
||||
fieldnorm_writers.serialize(serializer, None)?;
|
||||
fieldnorm_writers.serialize(serializer)?;
|
||||
}
|
||||
let file = directory.open_read(path)?;
|
||||
{
|
||||
|
||||
@@ -2,7 +2,6 @@ use std::cmp::Ordering;
|
||||
use std::{io, iter};
|
||||
|
||||
use super::{fieldnorm_to_id, FieldNormsSerializer};
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::schema::{Field, Schema};
|
||||
use crate::DocId;
|
||||
|
||||
@@ -92,11 +91,7 @@ impl FieldNormsWriter {
|
||||
}
|
||||
|
||||
/// Serialize the seen fieldnorm values to the serializer for all fields.
|
||||
pub fn serialize(
|
||||
&self,
|
||||
mut fieldnorms_serializer: FieldNormsSerializer,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> io::Result<()> {
|
||||
pub fn serialize(&self, mut fieldnorms_serializer: FieldNormsSerializer) -> io::Result<()> {
|
||||
for (field, fieldnorms_buffer) in self.fieldnorms_buffers.iter().enumerate().filter_map(
|
||||
|(field_id, fieldnorms_buffer_opt)| {
|
||||
fieldnorms_buffer_opt.as_ref().map(|fieldnorms_buffer| {
|
||||
@@ -104,12 +99,7 @@ impl FieldNormsWriter {
|
||||
})
|
||||
},
|
||||
) {
|
||||
if let Some(doc_id_map) = doc_id_map {
|
||||
let remapped_fieldnorm_buffer = doc_id_map.remap(fieldnorms_buffer);
|
||||
fieldnorms_serializer.serialize_field(field, &remapped_fieldnorm_buffer)?;
|
||||
} else {
|
||||
fieldnorms_serializer.serialize_field(field, fieldnorms_buffer)?;
|
||||
}
|
||||
fieldnorms_serializer.serialize_field(field, fieldnorms_buffer)?;
|
||||
}
|
||||
fieldnorms_serializer.close()?;
|
||||
Ok(())
|
||||
|
||||
@@ -7,7 +7,7 @@ use rand::{thread_rng, Rng};
|
||||
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
|
||||
use crate::schema::*;
|
||||
#[allow(deprecated)]
|
||||
use crate::{doc, schema, Index, IndexSettings, IndexSortByField, IndexWriter, Order, Searcher};
|
||||
use crate::{doc, schema, Index, IndexWriter, Searcher};
|
||||
|
||||
fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
|
||||
assert!(searcher.segment_readers().len() < 20);
|
||||
@@ -65,71 +65,6 @@ fn get_num_iterations() -> usize {
|
||||
.map(|str| str.parse().unwrap())
|
||||
.unwrap_or(2000)
|
||||
}
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn test_functional_indexing_sorted() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
let id_field = schema_builder.add_u64_field("id", INDEXED | FAST);
|
||||
let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
|
||||
let text_field_options = TextOptions::default()
|
||||
.set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
|
||||
)
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text_field", text_field_options);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let mut index_builder = Index::builder().schema(schema);
|
||||
index_builder = index_builder.settings(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "id".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
});
|
||||
let index = index_builder.create_from_tempdir().unwrap();
|
||||
|
||||
let reader = index.reader()?;
|
||||
|
||||
let mut rng = thread_rng();
|
||||
|
||||
let mut index_writer: IndexWriter =
|
||||
index.writer_with_num_threads(3, 3 * MEMORY_BUDGET_NUM_BYTES_MIN)?;
|
||||
|
||||
let mut committed_docs: HashSet<u64> = HashSet::new();
|
||||
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
|
||||
|
||||
for _ in 0..get_num_iterations() {
|
||||
let random_val = rng.gen_range(0..20);
|
||||
if random_val == 0 {
|
||||
index_writer.commit()?;
|
||||
committed_docs.extend(&uncommitted_docs);
|
||||
uncommitted_docs.clear();
|
||||
reader.reload()?;
|
||||
let searcher = reader.searcher();
|
||||
// check that everything is correct.
|
||||
check_index_content(
|
||||
&searcher,
|
||||
&committed_docs.iter().cloned().collect::<Vec<u64>>(),
|
||||
)?;
|
||||
} else if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
|
||||
let doc_id_term = Term::from_field_u64(id_field, random_val);
|
||||
index_writer.delete_term(doc_id_term);
|
||||
} else {
|
||||
uncommitted_docs.insert(random_val);
|
||||
let mut doc = TantivyDocument::new();
|
||||
doc.add_u64(id_field, random_val);
|
||||
for i in 1u64..10u64 {
|
||||
doc.add_u64(multiples_field, random_val * i);
|
||||
}
|
||||
doc.add_text(text_field, get_text());
|
||||
index_writer.add_document(doc)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod \
|
||||
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, \
|
||||
|
||||
@@ -20,7 +20,7 @@ use crate::indexer::segment_updater::save_metas;
|
||||
use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
|
||||
use crate::reader::{IndexReader, IndexReaderBuilder};
|
||||
use crate::schema::document::Document;
|
||||
use crate::schema::{Field, FieldType, Schema, Type};
|
||||
use crate::schema::{Field, FieldType, Schema};
|
||||
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
|
||||
use crate::SegmentReader;
|
||||
|
||||
@@ -232,31 +232,7 @@ impl IndexBuilder {
|
||||
}
|
||||
|
||||
fn validate(&self) -> crate::Result<()> {
|
||||
if let Some(schema) = self.schema.as_ref() {
|
||||
if let Some(sort_by_field) = self.index_settings.sort_by_field.as_ref() {
|
||||
let schema_field = schema.get_field(&sort_by_field.field).map_err(|_| {
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"Field to sort index {} not found in schema",
|
||||
sort_by_field.field
|
||||
))
|
||||
})?;
|
||||
let entry = schema.get_field_entry(schema_field);
|
||||
if !entry.is_fast() {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"Field {} is no fast field. Field needs to be a single value fast field \
|
||||
to be used to sort an index",
|
||||
sort_by_field.field
|
||||
)));
|
||||
}
|
||||
let supported_field_types = [Type::I64, Type::U64, Type::F64, Type::Date];
|
||||
let field_type = entry.field_type().value_type();
|
||||
if !supported_field_types.contains(&field_type) {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"Unsupported field type in sort_by_field: {field_type:?}. Supported field \
|
||||
types: {supported_field_types:?} ",
|
||||
)));
|
||||
}
|
||||
}
|
||||
if let Some(_schema) = self.schema.as_ref() {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(TantivyError::InvalidArgument(
|
||||
|
||||
@@ -249,10 +249,6 @@ fn is_true(val: &bool) -> bool {
|
||||
/// index, like presort documents.
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
|
||||
pub struct IndexSettings {
|
||||
/// Sorts the documents by information
|
||||
/// provided in `IndexSortByField`
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub sort_by_field: Option<IndexSortByField>,
|
||||
/// The `Compressor` used to compress the doc store.
|
||||
#[serde(default)]
|
||||
pub docstore_compression: Compressor,
|
||||
@@ -275,7 +271,6 @@ fn default_docstore_blocksize() -> usize {
|
||||
impl Default for IndexSettings {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
sort_by_field: None,
|
||||
docstore_compression: Compressor::default(),
|
||||
docstore_blocksize: default_docstore_blocksize(),
|
||||
docstore_compress_dedicated_thread: true,
|
||||
@@ -283,22 +278,6 @@ impl Default for IndexSettings {
|
||||
}
|
||||
}
|
||||
|
||||
/// Settings to presort the documents in an index
|
||||
///
|
||||
/// Presorting documents can greatly improve performance
|
||||
/// in some scenarios, by applying top n
|
||||
/// optimizations.
|
||||
#[deprecated(
|
||||
since = "0.22.0",
|
||||
note = "We plan to remove index sorting in `0.23`. If you need index sorting, please comment on the related issue https://github.com/quickwit-oss/tantivy/issues/2352 and explain your use case."
|
||||
)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
|
||||
pub struct IndexSortByField {
|
||||
/// The field to sort the documents by
|
||||
pub field: String,
|
||||
/// The order to sort the documents by
|
||||
pub order: Order,
|
||||
}
|
||||
/// The order to sort by
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
|
||||
pub enum Order {
|
||||
@@ -417,7 +396,7 @@ mod tests {
|
||||
use crate::store::Compressor;
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
use crate::store::ZstdCompressor;
|
||||
use crate::{IndexSettings, IndexSortByField, Order};
|
||||
use crate::IndexSettings;
|
||||
|
||||
#[test]
|
||||
fn test_serialize_metas() {
|
||||
@@ -427,13 +406,7 @@ mod tests {
|
||||
schema_builder.build()
|
||||
};
|
||||
let index_metas = IndexMeta {
|
||||
index_settings: IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "text".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
..Default::default()
|
||||
},
|
||||
index_settings: IndexSettings::default(),
|
||||
segments: Vec::new(),
|
||||
schema,
|
||||
opstamp: 0u64,
|
||||
@@ -442,7 +415,7 @@ mod tests {
|
||||
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
|
||||
assert_eq!(
|
||||
json,
|
||||
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
|
||||
r#"{"index_settings":{"docstore_compression":"lz4","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
|
||||
);
|
||||
|
||||
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
|
||||
@@ -461,10 +434,6 @@ mod tests {
|
||||
};
|
||||
let index_metas = IndexMeta {
|
||||
index_settings: IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "text".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
docstore_compression: crate::store::Compressor::Zstd(ZstdCompressor {
|
||||
compression_level: Some(4),
|
||||
}),
|
||||
@@ -479,7 +448,7 @@ mod tests {
|
||||
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
|
||||
assert_eq!(
|
||||
json,
|
||||
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd(compression_level=4)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
|
||||
r#"{"index_settings":{"docstore_compression":"zstd(compression_level=4)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
|
||||
);
|
||||
|
||||
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
|
||||
@@ -491,35 +460,35 @@ mod tests {
|
||||
#[test]
|
||||
#[cfg(all(feature = "lz4-compression", feature = "zstd-compression"))]
|
||||
fn test_serialize_metas_invalid_comp() {
|
||||
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zsstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
|
||||
let json = r#"{"index_settings":{"docstore_compression":"zsstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
|
||||
|
||||
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"unknown variant `zsstd`, expected one of `none`, `lz4`, `zstd`, \
|
||||
`zstd(compression_level=5)` at line 1 column 96"
|
||||
`zstd(compression_level=5)` at line 1 column 49"
|
||||
.to_string()
|
||||
);
|
||||
|
||||
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd(bla=10)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
|
||||
let json = r#"{"index_settings":{"docstore_compression":"zstd(bla=10)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
|
||||
|
||||
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"unknown zstd option \"bla\" at line 1 column 103".to_string()
|
||||
"unknown zstd option \"bla\" at line 1 column 56".to_string()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(not(feature = "zstd-compression"))]
|
||||
fn test_serialize_metas_unsupported_comp() {
|
||||
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
|
||||
let json = r#"{"index_settings":{"docstore_compression":"zstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
|
||||
|
||||
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"unsupported variant `zstd`, please enable Tantivy's `zstd-compression` feature at \
|
||||
line 1 column 95"
|
||||
line 1 column 48"
|
||||
.to_string()
|
||||
);
|
||||
}
|
||||
@@ -531,7 +500,6 @@ mod tests {
|
||||
assert_eq!(
|
||||
index_settings,
|
||||
IndexSettings {
|
||||
sort_by_field: None,
|
||||
docstore_compression: Compressor::default(),
|
||||
docstore_compress_dedicated_thread: true,
|
||||
docstore_blocksize: 16_384
|
||||
|
||||
@@ -12,7 +12,7 @@ mod segment_reader;
|
||||
|
||||
pub use self::index::{Index, IndexBuilder};
|
||||
pub(crate) use self::index_meta::SegmentMetaInventory;
|
||||
pub use self::index_meta::{IndexMeta, IndexSettings, IndexSortByField, Order, SegmentMeta};
|
||||
pub use self::index_meta::{IndexMeta, IndexSettings, Order, SegmentMeta};
|
||||
pub use self::inverted_index_reader::InvertedIndexReader;
|
||||
pub use self::segment::Segment;
|
||||
pub use self::segment_component::SegmentComponent;
|
||||
|
||||
@@ -3,15 +3,12 @@
|
||||
|
||||
use common::ReadOnlyBitSet;
|
||||
|
||||
use super::SegmentWriter;
|
||||
use crate::schema::{Field, Schema};
|
||||
use crate::{DocAddress, DocId, IndexSortByField, TantivyError};
|
||||
use crate::DocAddress;
|
||||
|
||||
#[derive(Copy, Clone, Eq, PartialEq)]
|
||||
pub enum MappingType {
|
||||
Stacked,
|
||||
StackedWithDeletes,
|
||||
Shuffled,
|
||||
}
|
||||
|
||||
/// Struct to provide mapping from new doc_id to old doc_id and segment.
|
||||
@@ -46,537 +43,4 @@ impl SegmentDocIdMapping {
|
||||
pub(crate) fn iter_old_doc_addrs(&self) -> impl Iterator<Item = DocAddress> + '_ {
|
||||
self.new_doc_id_to_old_doc_addr.iter().copied()
|
||||
}
|
||||
|
||||
/// This flags means the segments are simply stacked in the order of their ordinal.
|
||||
/// e.g. [(0, 1), .. (n, 1), (0, 2)..., (m, 2)]
|
||||
///
|
||||
/// The different segment may present some deletes, in which case it is expressed by skipping a
|
||||
/// `DocId`. [(0, 1), (0, 3)] <--- here doc_id=0 and doc_id=1 have been deleted
|
||||
///
|
||||
/// Being trivial is equivalent to having the `new_doc_id_to_old_doc_addr` array sorted.
|
||||
///
|
||||
/// This allows for some optimization.
|
||||
pub(crate) fn is_trivial(&self) -> bool {
|
||||
match self.mapping_type {
|
||||
MappingType::Stacked | MappingType::StackedWithDeletes => true,
|
||||
MappingType::Shuffled => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Struct to provide mapping from old doc_id to new doc_id and vice versa within a segment.
|
||||
pub struct DocIdMapping {
|
||||
new_doc_id_to_old: Vec<DocId>,
|
||||
old_doc_id_to_new: Vec<DocId>,
|
||||
}
|
||||
|
||||
impl DocIdMapping {
|
||||
pub fn from_new_id_to_old_id(new_doc_id_to_old: Vec<DocId>) -> Self {
|
||||
let max_doc = new_doc_id_to_old.len();
|
||||
let old_max_doc = new_doc_id_to_old
|
||||
.iter()
|
||||
.cloned()
|
||||
.max()
|
||||
.map(|n| n + 1)
|
||||
.unwrap_or(0);
|
||||
let mut old_doc_id_to_new = vec![0; old_max_doc as usize];
|
||||
for i in 0..max_doc {
|
||||
old_doc_id_to_new[new_doc_id_to_old[i] as usize] = i as DocId;
|
||||
}
|
||||
DocIdMapping {
|
||||
new_doc_id_to_old,
|
||||
old_doc_id_to_new,
|
||||
}
|
||||
}
|
||||
|
||||
/// returns the new doc_id for the old doc_id
|
||||
pub fn get_new_doc_id(&self, doc_id: DocId) -> DocId {
|
||||
self.old_doc_id_to_new[doc_id as usize]
|
||||
}
|
||||
/// returns the old doc_id for the new doc_id
|
||||
pub fn get_old_doc_id(&self, doc_id: DocId) -> DocId {
|
||||
self.new_doc_id_to_old[doc_id as usize]
|
||||
}
|
||||
/// iterate over old doc_ids in order of the new doc_ids
|
||||
pub fn iter_old_doc_ids(&self) -> impl Iterator<Item = DocId> + Clone + '_ {
|
||||
self.new_doc_id_to_old.iter().cloned()
|
||||
}
|
||||
|
||||
pub fn old_to_new_ids(&self) -> &[DocId] {
|
||||
&self.old_doc_id_to_new[..]
|
||||
}
|
||||
|
||||
/// Remaps a given array to the new doc ids.
|
||||
pub fn remap<T: Copy>(&self, els: &[T]) -> Vec<T> {
|
||||
self.new_doc_id_to_old
|
||||
.iter()
|
||||
.map(|old_doc| els[*old_doc as usize])
|
||||
.collect()
|
||||
}
|
||||
pub fn num_new_doc_ids(&self) -> usize {
|
||||
self.new_doc_id_to_old.len()
|
||||
}
|
||||
pub fn num_old_doc_ids(&self) -> usize {
|
||||
self.old_doc_id_to_new.len()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn expect_field_id_for_sort_field(
|
||||
schema: &Schema,
|
||||
sort_by_field: &IndexSortByField,
|
||||
) -> crate::Result<Field> {
|
||||
schema.get_field(&sort_by_field.field).map_err(|_| {
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"field to sort index by not found: {:?}",
|
||||
sort_by_field.field
|
||||
))
|
||||
})
|
||||
}
|
||||
|
||||
// Generates a document mapping in the form of [index new doc_id] -> old doc_id
|
||||
// TODO detect if field is already sorted and discard mapping
|
||||
pub(crate) fn get_doc_id_mapping_from_field(
|
||||
sort_by_field: IndexSortByField,
|
||||
segment_writer: &SegmentWriter,
|
||||
) -> crate::Result<DocIdMapping> {
|
||||
let schema = segment_writer.segment_serializer.segment().schema();
|
||||
expect_field_id_for_sort_field(&schema, &sort_by_field)?; // for now expect
|
||||
let new_doc_id_to_old = segment_writer.fast_field_writers.sort_order(
|
||||
sort_by_field.field.as_str(),
|
||||
segment_writer.max_doc(),
|
||||
sort_by_field.order.is_desc(),
|
||||
);
|
||||
// create new doc_id to old doc_id index (used in fast_field_writers)
|
||||
Ok(DocIdMapping::from_new_id_to_old_id(new_doc_id_to_old))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests_indexsorting {
|
||||
use common::DateTime;
|
||||
|
||||
use crate::collector::TopDocs;
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::indexer::NoMergePolicy;
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::*;
|
||||
use crate::{DocAddress, Index, IndexBuilder, IndexSettings, IndexSortByField, Order};
|
||||
|
||||
fn create_test_index(
|
||||
index_settings: Option<IndexSettings>,
|
||||
text_field_options: TextOptions,
|
||||
) -> crate::Result<Index> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
let my_text_field = schema_builder.add_text_field("text_field", text_field_options);
|
||||
let my_string_field = schema_builder.add_text_field("string_field", STRING | STORED);
|
||||
let my_number =
|
||||
schema_builder.add_u64_field("my_number", NumericOptions::default().set_fast());
|
||||
|
||||
let multi_numbers =
|
||||
schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast());
|
||||
|
||||
let schema = schema_builder.build();
|
||||
let mut index_builder = Index::builder().schema(schema);
|
||||
if let Some(settings) = index_settings {
|
||||
index_builder = index_builder.settings(settings);
|
||||
}
|
||||
let index = index_builder.create_in_ram()?;
|
||||
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(my_number=>40_u64))?;
|
||||
index_writer.add_document(
|
||||
doc!(my_number=>20_u64, multi_numbers => 5_u64, multi_numbers => 6_u64),
|
||||
)?;
|
||||
index_writer.add_document(doc!(my_number=>100_u64))?;
|
||||
index_writer.add_document(
|
||||
doc!(my_number=>10_u64, my_string_field=> "blublub", my_text_field => "some text"),
|
||||
)?;
|
||||
index_writer.add_document(doc!(my_number=>30_u64, multi_numbers => 3_u64 ))?;
|
||||
index_writer.commit()?;
|
||||
Ok(index)
|
||||
}
|
||||
fn get_text_options() -> TextOptions {
|
||||
TextOptions::default().set_indexing_options(
|
||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::Basic),
|
||||
)
|
||||
}
|
||||
#[test]
|
||||
fn test_sort_index_test_text_field() -> crate::Result<()> {
|
||||
// there are different serializers for different settings in postings/recorder.rs
|
||||
// test remapping for all of them
|
||||
let options = vec![
|
||||
get_text_options(),
|
||||
get_text_options().set_indexing_options(
|
||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
|
||||
),
|
||||
get_text_options().set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions),
|
||||
),
|
||||
];
|
||||
|
||||
for option in options {
|
||||
// let options = get_text_options();
|
||||
// no index_sort
|
||||
let index = create_test_index(None, option.clone())?;
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
|
||||
let query = QueryParser::for_index(&index, vec![my_text_field]).parse_query("text")?;
|
||||
let top_docs: Vec<(f32, DocAddress)> =
|
||||
searcher.search(&query, &TopDocs::with_limit(3))?;
|
||||
assert_eq!(
|
||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
|
||||
vec![3]
|
||||
);
|
||||
|
||||
// sort by field asc
|
||||
let index = create_test_index(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
option.clone(),
|
||||
)?;
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let query = QueryParser::for_index(&index, vec![my_text_field]).parse_query("text")?;
|
||||
let top_docs: Vec<(f32, DocAddress)> =
|
||||
searcher.search(&query, &TopDocs::with_limit(3))?;
|
||||
assert_eq!(
|
||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
|
||||
vec![0]
|
||||
);
|
||||
|
||||
// test new field norm mapping
|
||||
{
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
let fieldnorm_reader = searcher
|
||||
.segment_reader(0)
|
||||
.get_fieldnorms_reader(my_text_field)?;
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 2); // some text
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
|
||||
}
|
||||
// sort by field desc
|
||||
let index = create_test_index(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
option.clone(),
|
||||
)?;
|
||||
let my_string_field = index.schema().get_field("text_field").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
|
||||
let query =
|
||||
QueryParser::for_index(&index, vec![my_string_field]).parse_query("text")?;
|
||||
let top_docs: Vec<(f32, DocAddress)> =
|
||||
searcher.search(&query, &TopDocs::with_limit(3))?;
|
||||
assert_eq!(
|
||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
|
||||
vec![4]
|
||||
);
|
||||
// test new field norm mapping
|
||||
{
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
let fieldnorm_reader = searcher
|
||||
.segment_reader(0)
|
||||
.get_fieldnorms_reader(my_text_field)?;
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(2), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(3), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(4), 2); // some text
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
#[test]
|
||||
fn test_sort_index_get_documents() -> crate::Result<()> {
|
||||
// default baseline
|
||||
let index = create_test_index(None, get_text_options())?;
|
||||
let my_string_field = index.schema().get_field("string_field").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
{
|
||||
assert!(searcher
|
||||
.doc::<TantivyDocument>(DocAddress::new(0, 0))?
|
||||
.get_first(my_string_field)
|
||||
.is_none());
|
||||
assert_eq!(
|
||||
searcher
|
||||
.doc::<TantivyDocument>(DocAddress::new(0, 3))?
|
||||
.get_first(my_string_field)
|
||||
.unwrap()
|
||||
.as_str(),
|
||||
Some("blublub")
|
||||
);
|
||||
}
|
||||
// sort by field asc
|
||||
let index = create_test_index(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
get_text_options(),
|
||||
)?;
|
||||
let my_string_field = index.schema().get_field("string_field").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
{
|
||||
assert_eq!(
|
||||
searcher
|
||||
.doc::<TantivyDocument>(DocAddress::new(0, 0))?
|
||||
.get_first(my_string_field)
|
||||
.unwrap()
|
||||
.as_str(),
|
||||
Some("blublub")
|
||||
);
|
||||
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
|
||||
assert!(doc.get_first(my_string_field).is_none());
|
||||
}
|
||||
// sort by field desc
|
||||
let index = create_test_index(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
get_text_options(),
|
||||
)?;
|
||||
let my_string_field = index.schema().get_field("string_field").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
{
|
||||
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
|
||||
assert_eq!(
|
||||
doc.get_first(my_string_field).unwrap().as_str(),
|
||||
Some("blublub")
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sort_index_test_string_field() -> crate::Result<()> {
|
||||
let index = create_test_index(None, get_text_options())?;
|
||||
let my_string_field = index.schema().get_field("string_field").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
|
||||
let query = QueryParser::for_index(&index, vec![my_string_field]).parse_query("blublub")?;
|
||||
let top_docs: Vec<(f32, DocAddress)> = searcher.search(&query, &TopDocs::with_limit(3))?;
|
||||
assert_eq!(
|
||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
|
||||
vec![3]
|
||||
);
|
||||
|
||||
let index = create_test_index(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
get_text_options(),
|
||||
)?;
|
||||
let my_string_field = index.schema().get_field("string_field").unwrap();
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let query = QueryParser::for_index(&index, vec![my_string_field]).parse_query("blublub")?;
|
||||
let top_docs: Vec<(f32, DocAddress)> = searcher.search(&query, &TopDocs::with_limit(3))?;
|
||||
assert_eq!(
|
||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
|
||||
vec![0]
|
||||
);
|
||||
|
||||
// test new field norm mapping
|
||||
{
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
let fieldnorm_reader = searcher
|
||||
.segment_reader(0)
|
||||
.get_fieldnorms_reader(my_text_field)?;
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 2); // some text
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
|
||||
}
|
||||
// sort by field desc
|
||||
let index = create_test_index(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
get_text_options(),
|
||||
)?;
|
||||
let my_string_field = index.schema().get_field("string_field").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
|
||||
let query = QueryParser::for_index(&index, vec![my_string_field]).parse_query("blublub")?;
|
||||
let top_docs: Vec<(f32, DocAddress)> = searcher.search(&query, &TopDocs::with_limit(3))?;
|
||||
assert_eq!(
|
||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
|
||||
vec![4]
|
||||
);
|
||||
// test new field norm mapping
|
||||
{
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
let fieldnorm_reader = searcher
|
||||
.segment_reader(0)
|
||||
.get_fieldnorms_reader(my_text_field)?;
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(2), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(3), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(4), 2); // some text
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sort_index_fast_field() -> crate::Result<()> {
|
||||
let index = create_test_index(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
get_text_options(),
|
||||
)?;
|
||||
assert_eq!(
|
||||
index.settings().sort_by_field.as_ref().unwrap().field,
|
||||
"my_number".to_string()
|
||||
);
|
||||
|
||||
let searcher = index.reader()?.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
|
||||
let fast_field = fast_fields
|
||||
.u64("my_number")
|
||||
.unwrap()
|
||||
.first_or_default_col(999);
|
||||
assert_eq!(fast_field.get_val(0), 10u64);
|
||||
assert_eq!(fast_field.get_val(1), 20u64);
|
||||
assert_eq!(fast_field.get_val(2), 30u64);
|
||||
|
||||
let multifield = fast_fields.u64("multi_numbers").unwrap();
|
||||
let vals: Vec<u64> = multifield.values_for_doc(0u32).collect();
|
||||
assert_eq!(vals, &[] as &[u64]);
|
||||
let vals: Vec<_> = multifield.values_for_doc(1u32).collect();
|
||||
assert_eq!(vals, &[5, 6]);
|
||||
|
||||
let vals: Vec<_> = multifield.values_for_doc(2u32).collect();
|
||||
assert_eq!(vals, &[3]);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_with_sort_by_date_field() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let date_field = schema_builder.add_date_field("date", INDEXED | STORED | FAST);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let settings = IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "date".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let index = Index::builder()
|
||||
.schema(schema)
|
||||
.settings(settings)
|
||||
.create_in_ram()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
date_field => DateTime::from_timestamp_secs(1000),
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
date_field => DateTime::from_timestamp_secs(999),
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
date_field => DateTime::from_timestamp_secs(1001),
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let searcher = index.reader()?.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
|
||||
let fast_field = fast_fields
|
||||
.date("date")
|
||||
.unwrap()
|
||||
.first_or_default_col(DateTime::from_timestamp_secs(0));
|
||||
assert_eq!(fast_field.get_val(0), DateTime::from_timestamp_secs(1001));
|
||||
assert_eq!(fast_field.get_val(1), DateTime::from_timestamp_secs(1000));
|
||||
assert_eq!(fast_field.get_val(2), DateTime::from_timestamp_secs(999));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_doc_mapping() {
|
||||
let doc_mapping = DocIdMapping::from_new_id_to_old_id(vec![3, 2, 5]);
|
||||
assert_eq!(doc_mapping.get_old_doc_id(0), 3);
|
||||
assert_eq!(doc_mapping.get_old_doc_id(1), 2);
|
||||
assert_eq!(doc_mapping.get_old_doc_id(2), 5);
|
||||
assert_eq!(doc_mapping.get_new_doc_id(0), 0);
|
||||
assert_eq!(doc_mapping.get_new_doc_id(1), 0);
|
||||
assert_eq!(doc_mapping.get_new_doc_id(2), 1);
|
||||
assert_eq!(doc_mapping.get_new_doc_id(3), 0);
|
||||
assert_eq!(doc_mapping.get_new_doc_id(4), 0);
|
||||
assert_eq!(doc_mapping.get_new_doc_id(5), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_doc_mapping_remap() {
|
||||
let doc_mapping = DocIdMapping::from_new_id_to_old_id(vec![2, 8, 3]);
|
||||
assert_eq!(
|
||||
&doc_mapping.remap(&[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000]),
|
||||
&[2000, 8000, 3000]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_text_sort() -> crate::Result<()> {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
schema_builder.add_text_field("id", STRING | FAST | STORED);
|
||||
schema_builder.add_text_field("name", TEXT | STORED);
|
||||
|
||||
let resp = IndexBuilder::new()
|
||||
.schema(schema_builder.build())
|
||||
.settings(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "id".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
..Default::default()
|
||||
})
|
||||
.create_in_ram();
|
||||
assert!(resp
|
||||
.unwrap_err()
|
||||
.to_string()
|
||||
.contains("Unsupported field type"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
147
src/indexer/merge_index_test.rs
Normal file
147
src/indexer/merge_index_test.rs
Normal file
@@ -0,0 +1,147 @@
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::collector::TopDocs;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::index::Index;
|
||||
use crate::postings::Postings;
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::{
|
||||
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
|
||||
TextFieldIndexing, TextOptions,
|
||||
};
|
||||
use crate::{DocAddress, DocSet, IndexSettings, IndexWriter, Term};
|
||||
|
||||
fn create_test_index(index_settings: Option<IndexSettings>) -> crate::Result<Index> {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let int_options = NumericOptions::default()
|
||||
.set_fast()
|
||||
.set_stored()
|
||||
.set_indexed();
|
||||
let int_field = schema_builder.add_u64_field("intval", int_options);
|
||||
|
||||
let bytes_options = BytesOptions::default().set_fast().set_indexed();
|
||||
let bytes_field = schema_builder.add_bytes_field("bytes", bytes_options);
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
|
||||
let multi_numbers =
|
||||
schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast());
|
||||
let text_field_options = TextOptions::default()
|
||||
.set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
|
||||
)
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text_field", text_field_options);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let mut index_builder = Index::builder().schema(schema);
|
||||
if let Some(settings) = index_settings {
|
||||
index_builder = index_builder.settings(settings);
|
||||
}
|
||||
let index = index_builder.create_in_ram()?;
|
||||
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
// segment 1 - range 1-3
|
||||
index_writer.add_document(doc!(int_field=>1_u64))?;
|
||||
index_writer.add_document(
|
||||
doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")),
|
||||
)?;
|
||||
index_writer.add_document(
|
||||
doc!(int_field=>1_u64, text_field=> "deleteme", text_field => "ok text more text"),
|
||||
)?;
|
||||
index_writer.add_document(
|
||||
doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64, text_field => "ok text more text"),
|
||||
)?;
|
||||
|
||||
index_writer.commit()?;
|
||||
index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64))?;
|
||||
|
||||
let in_val = 1u64;
|
||||
index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme" , text_field => "ok text more text", facet_field=> Facet::from("/book/crime")))?;
|
||||
index_writer.commit()?;
|
||||
let int_vals = [10u64, 5];
|
||||
index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1]
|
||||
doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
|
||||
)?;
|
||||
index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"))?;
|
||||
index_writer.add_document(
|
||||
doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5],text_field => "the biggest num")
|
||||
)?;
|
||||
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "deleteme"));
|
||||
index_writer.commit()?;
|
||||
}
|
||||
|
||||
// Merging the segments
|
||||
{
|
||||
let segment_ids = index.searchable_segment_ids()?;
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_writer.wait_merging_threads()?;
|
||||
}
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
/// Checks the merged test index: it must consist of a single segment, text
/// queries must resolve to the expected doc ids, and the postings of the term
/// `text` must report the expected frequencies and positions.
#[test]
fn test_merge_index() {
    let index = create_test_index(Some(IndexSettings::default())).unwrap();
    let my_text_field = index.schema().get_field("text_field").unwrap();

    // A single searcher serves both the query checks and the postings checks.
    let searcher = index.reader().unwrap().searcher();
    assert_eq!(searcher.segment_readers().len(), 1);
    let segment_reader = searcher.segment_readers().last().unwrap();

    // Query results.
    {
        let do_search = |term: &str| {
            let query = QueryParser::for_index(&index, vec![my_text_field])
                .parse_query(term)
                .unwrap();
            let top_docs: Vec<(f32, DocAddress)> =
                searcher.search(&query, &TopDocs::with_limit(3)).unwrap();

            top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
        };

        assert_eq!(do_search("some"), vec![1]);
        assert_eq!(do_search("blubber"), vec![3]);
        assert_eq!(do_search("biggest"), vec![4]);
    }

    // postings file
    {
        let term_a = Term::from_field_text(my_text_field, "text");
        let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
        let mut postings = inverted_index
            .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
            .unwrap()
            .unwrap();
        assert_eq!(postings.doc_freq(), 2);

        // Used only when the segment reader exposes no alive bitset.
        let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
        assert_eq!(
            postings.doc_freq_given_deletes(
                segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
            ),
            2
        );

        // First document containing `text`.
        assert_eq!(postings.term_freq(), 1);
        let mut output = vec![];
        postings.positions(&mut output);
        assert_eq!(output, vec![1]);

        // Second document containing `text`.
        postings.advance();
        assert_eq!(postings.term_freq(), 2);
        postings.positions(&mut output);
        assert_eq!(output, vec![1, 3]);
    }
}
|
||||
}
|
||||
@@ -1,8 +1,7 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use columnar::{
|
||||
ColumnType, ColumnValues, ColumnarReader, MergeRowOrder, RowAddr, ShuffleMergeOrder,
|
||||
StackMergeOrder,
|
||||
ColumnType, ColumnarReader, MergeRowOrder, RowAddr, ShuffleMergeOrder, StackMergeOrder,
|
||||
};
|
||||
use common::ReadOnlyBitSet;
|
||||
use itertools::Itertools;
|
||||
@@ -11,7 +10,7 @@ use measure_time::debug_time;
|
||||
use crate::directory::WritePtr;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::fastfield::{AliveBitSet, FastFieldNotAvailableError};
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
|
||||
use crate::index::{Segment, SegmentComponent, SegmentReader};
|
||||
use crate::indexer::doc_id_mapping::{MappingType, SegmentDocIdMapping};
|
||||
@@ -20,9 +19,7 @@ use crate::postings::{InvertedIndexSerializer, Postings, SegmentPostings};
|
||||
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema};
|
||||
use crate::store::StoreWriter;
|
||||
use crate::termdict::{TermMerger, TermOrdinal};
|
||||
use crate::{
|
||||
DocAddress, DocId, IndexSettings, IndexSortByField, InvertedIndexReader, Order, SegmentOrdinal,
|
||||
};
|
||||
use crate::{DocAddress, DocId, InvertedIndexReader};
|
||||
|
||||
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
|
||||
///
|
||||
@@ -80,7 +77,6 @@ fn estimate_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::
|
||||
}
|
||||
|
||||
pub struct IndexMerger {
|
||||
index_settings: IndexSettings,
|
||||
schema: Schema,
|
||||
pub(crate) readers: Vec<SegmentReader>,
|
||||
max_doc: u32,
|
||||
@@ -116,7 +112,7 @@ fn convert_to_merge_order(
|
||||
) -> MergeRowOrder {
|
||||
match doc_id_mapping.mapping_type() {
|
||||
MappingType::Stacked => MergeRowOrder::Stack(StackMergeOrder::stack(columnars)),
|
||||
MappingType::StackedWithDeletes | MappingType::Shuffled => {
|
||||
MappingType::StackedWithDeletes => {
|
||||
// RUST/LLVM is amazing. The following conversion is actually a no-op:
|
||||
// no allocation, no copy.
|
||||
let new_row_id_to_old_row_id: Vec<RowAddr> = doc_id_mapping
|
||||
@@ -149,13 +145,9 @@ fn extract_fast_field_required_columns(schema: &Schema) -> Vec<(String, ColumnTy
|
||||
}
|
||||
|
||||
impl IndexMerger {
|
||||
pub fn open(
|
||||
schema: Schema,
|
||||
index_settings: IndexSettings,
|
||||
segments: &[Segment],
|
||||
) -> crate::Result<IndexMerger> {
|
||||
pub fn open(schema: Schema, segments: &[Segment]) -> crate::Result<IndexMerger> {
|
||||
let alive_bitset = segments.iter().map(|_| None).collect_vec();
|
||||
Self::open_with_custom_alive_set(schema, index_settings, segments, alive_bitset)
|
||||
Self::open_with_custom_alive_set(schema, segments, alive_bitset)
|
||||
}
|
||||
|
||||
// Create merge with a custom delete set.
|
||||
@@ -172,7 +164,6 @@ impl IndexMerger {
|
||||
// segments and partitions them e.g. by a value in a field.
|
||||
pub fn open_with_custom_alive_set(
|
||||
schema: Schema,
|
||||
index_settings: IndexSettings,
|
||||
segments: &[Segment],
|
||||
alive_bitset_opt: Vec<Option<AliveBitSet>>,
|
||||
) -> crate::Result<IndexMerger> {
|
||||
@@ -186,9 +177,6 @@ impl IndexMerger {
|
||||
}
|
||||
|
||||
let max_doc = readers.iter().map(|reader| reader.num_docs()).sum();
|
||||
if let Some(sort_by_field) = index_settings.sort_by_field.as_ref() {
|
||||
readers = Self::sort_readers_by_min_sort_field(readers, sort_by_field)?;
|
||||
}
|
||||
// sort segments by their natural sort setting
|
||||
if max_doc >= MAX_DOC_LIMIT {
|
||||
let err_msg = format!(
|
||||
@@ -198,37 +186,12 @@ impl IndexMerger {
|
||||
return Err(crate::TantivyError::InvalidArgument(err_msg));
|
||||
}
|
||||
Ok(IndexMerger {
|
||||
index_settings,
|
||||
schema,
|
||||
readers,
|
||||
max_doc,
|
||||
})
|
||||
}
|
||||
|
||||
fn sort_readers_by_min_sort_field(
|
||||
readers: Vec<SegmentReader>,
|
||||
sort_by_field: &IndexSortByField,
|
||||
) -> crate::Result<Vec<SegmentReader>> {
|
||||
// presort the readers by their min_values, so that when they are disjunct, we can use
|
||||
// the regular merge logic (implicitly sorted)
|
||||
let mut readers_with_min_sort_values = readers
|
||||
.into_iter()
|
||||
.map(|reader| {
|
||||
let accessor = Self::get_sort_field_accessor(&reader, sort_by_field)?;
|
||||
Ok((reader, accessor.min_value()))
|
||||
})
|
||||
.collect::<crate::Result<Vec<_>>>()?;
|
||||
if sort_by_field.order.is_asc() {
|
||||
readers_with_min_sort_values.sort_by_key(|(_, min_val)| *min_val);
|
||||
} else {
|
||||
readers_with_min_sort_values.sort_by_key(|(_, min_val)| std::cmp::Reverse(*min_val));
|
||||
}
|
||||
Ok(readers_with_min_sort_values
|
||||
.into_iter()
|
||||
.map(|(reader, _)| reader)
|
||||
.collect())
|
||||
}
|
||||
|
||||
fn write_fieldnorms(
|
||||
&self,
|
||||
mut fieldnorms_serializer: FieldNormsSerializer,
|
||||
@@ -276,128 +239,6 @@ impl IndexMerger {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Checks if the readers are disjunct for their sort property and in the correct order to be
|
||||
/// able to just stack them.
|
||||
pub(crate) fn is_disjunct_and_sorted_on_sort_property(
|
||||
&self,
|
||||
sort_by_field: &IndexSortByField,
|
||||
) -> crate::Result<bool> {
|
||||
let reader_ordinal_and_field_accessors =
|
||||
self.get_reader_with_sort_field_accessor(sort_by_field)?;
|
||||
|
||||
let everything_is_in_order = reader_ordinal_and_field_accessors
|
||||
.into_iter()
|
||||
.map(|(_, col)| Arc::new(col))
|
||||
.tuple_windows()
|
||||
.all(|(field_accessor1, field_accessor2)| {
|
||||
if sort_by_field.order.is_asc() {
|
||||
field_accessor1.max_value() <= field_accessor2.min_value()
|
||||
} else {
|
||||
field_accessor1.min_value() >= field_accessor2.max_value()
|
||||
}
|
||||
});
|
||||
Ok(everything_is_in_order)
|
||||
}
|
||||
|
||||
pub(crate) fn get_sort_field_accessor(
|
||||
reader: &SegmentReader,
|
||||
sort_by_field: &IndexSortByField,
|
||||
) -> crate::Result<Arc<dyn ColumnValues>> {
|
||||
reader.schema().get_field(&sort_by_field.field)?;
|
||||
let (value_accessor, _column_type) = reader
|
||||
.fast_fields()
|
||||
.u64_lenient(&sort_by_field.field)?
|
||||
.ok_or_else(|| FastFieldNotAvailableError {
|
||||
field_name: sort_by_field.field.to_string(),
|
||||
})?;
|
||||
Ok(value_accessor.first_or_default_col(0u64))
|
||||
}
|
||||
/// Collecting value_accessors into a vec to bind the lifetime.
|
||||
pub(crate) fn get_reader_with_sort_field_accessor(
|
||||
&self,
|
||||
sort_by_field: &IndexSortByField,
|
||||
) -> crate::Result<Vec<(SegmentOrdinal, Arc<dyn ColumnValues>)>> {
|
||||
let reader_ordinal_and_field_accessors = self
|
||||
.readers
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(reader_ordinal, _)| reader_ordinal as SegmentOrdinal)
|
||||
.map(|reader_ordinal: SegmentOrdinal| {
|
||||
let value_accessor = Self::get_sort_field_accessor(
|
||||
&self.readers[reader_ordinal as usize],
|
||||
sort_by_field,
|
||||
)?;
|
||||
Ok((reader_ordinal, value_accessor))
|
||||
})
|
||||
.collect::<crate::Result<Vec<_>>>()?;
|
||||
Ok(reader_ordinal_and_field_accessors)
|
||||
}
|
||||
|
||||
/// Generates the doc_id mapping where position in the vec=new
|
||||
/// doc_id.
|
||||
/// ReaderWithOrdinal will include the ordinal position of the
|
||||
/// reader in self.readers.
|
||||
pub(crate) fn generate_doc_id_mapping_with_sort_by_field(
|
||||
&self,
|
||||
sort_by_field: &IndexSortByField,
|
||||
) -> crate::Result<SegmentDocIdMapping> {
|
||||
let reader_ordinal_and_field_accessors =
|
||||
self.get_reader_with_sort_field_accessor(sort_by_field)?;
|
||||
// Loading the field accessor on demand causes a 15x regression
|
||||
|
||||
// create iterators over segment/sort_accessor/doc_id tuple
|
||||
let doc_id_reader_pair =
|
||||
reader_ordinal_and_field_accessors
|
||||
.iter()
|
||||
.map(|(reader_ord, ff_reader)| {
|
||||
let reader = &self.readers[*reader_ord as usize];
|
||||
reader
|
||||
.doc_ids_alive()
|
||||
.map(move |doc_id| (doc_id, reader_ord, ff_reader))
|
||||
});
|
||||
|
||||
let total_num_new_docs = self
|
||||
.readers
|
||||
.iter()
|
||||
.map(|reader| reader.num_docs() as usize)
|
||||
.sum();
|
||||
|
||||
let mut sorted_doc_ids: Vec<DocAddress> = Vec::with_capacity(total_num_new_docs);
|
||||
|
||||
// create iterator tuple of (old doc_id, reader) in order of the new doc_ids
|
||||
sorted_doc_ids.extend(
|
||||
doc_id_reader_pair
|
||||
.into_iter()
|
||||
.kmerge_by(|a, b| {
|
||||
let val1 = a.2.get_val(a.0);
|
||||
let val2 = b.2.get_val(b.0);
|
||||
if sort_by_field.order == Order::Asc {
|
||||
val1 < val2
|
||||
} else {
|
||||
val1 > val2
|
||||
}
|
||||
})
|
||||
.map(|(doc_id, &segment_ord, _)| DocAddress {
|
||||
doc_id,
|
||||
segment_ord,
|
||||
}),
|
||||
);
|
||||
|
||||
let alive_bitsets: Vec<Option<ReadOnlyBitSet>> = self
|
||||
.readers
|
||||
.iter()
|
||||
.map(|segment_reader| {
|
||||
let alive_bitset = segment_reader.alive_bitset()?;
|
||||
Some(alive_bitset.bitset().clone())
|
||||
})
|
||||
.collect();
|
||||
Ok(SegmentDocIdMapping::new(
|
||||
sorted_doc_ids,
|
||||
MappingType::Shuffled,
|
||||
alive_bitsets,
|
||||
))
|
||||
}
|
||||
|
||||
/// Creates a mapping if the segments are stacked. this is helpful to merge codelines between
|
||||
/// index sorting and the others
|
||||
pub(crate) fn get_doc_id_from_concatenated_data(&self) -> crate::Result<SegmentDocIdMapping> {
|
||||
@@ -515,7 +356,6 @@ impl IndexMerger {
|
||||
);
|
||||
|
||||
let mut segment_postings_containing_the_term: Vec<(usize, SegmentPostings)> = vec![];
|
||||
let mut doc_id_and_positions = vec![];
|
||||
|
||||
while merged_terms.advance() {
|
||||
segment_postings_containing_the_term.clear();
|
||||
@@ -611,37 +451,13 @@ impl IndexMerger {
|
||||
0u32
|
||||
};
|
||||
|
||||
// if doc_id_mapping exists, the doc_ids are reordered, they are
|
||||
// not just stacked. The field serializer expects monotonically increasing
|
||||
// doc_ids, so we collect and sort them first, before writing.
|
||||
//
|
||||
// I think this is not strictly necessary, it would be possible to
|
||||
// avoid the loading into a vec via some form of kmerge, but then the merge
|
||||
// logic would deviate much more from the stacking case (unsorted index)
|
||||
if !doc_id_mapping.is_trivial() {
|
||||
doc_id_and_positions.push((
|
||||
remapped_doc_id,
|
||||
term_freq,
|
||||
positions_buffer.to_vec(),
|
||||
));
|
||||
} else {
|
||||
let delta_positions = delta_computer.compute_delta(&positions_buffer);
|
||||
field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions);
|
||||
}
|
||||
let delta_positions = delta_computer.compute_delta(&positions_buffer);
|
||||
field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions);
|
||||
}
|
||||
|
||||
doc = segment_postings.advance();
|
||||
}
|
||||
}
|
||||
if !doc_id_mapping.is_trivial() {
|
||||
doc_id_and_positions.sort_unstable_by_key(|&(doc_id, _, _)| doc_id);
|
||||
|
||||
for (doc_id, term_freq, positions) in &doc_id_and_positions {
|
||||
let delta_positions = delta_computer.compute_delta(positions);
|
||||
field_serializer.write_doc(*doc_id, *term_freq, delta_positions);
|
||||
}
|
||||
doc_id_and_positions.clear();
|
||||
}
|
||||
// closing the term.
|
||||
field_serializer.close_term()?;
|
||||
}
|
||||
@@ -670,47 +486,13 @@ impl IndexMerger {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_storable_fields(
|
||||
&self,
|
||||
store_writer: &mut StoreWriter,
|
||||
doc_id_mapping: &SegmentDocIdMapping,
|
||||
) -> crate::Result<()> {
|
||||
fn write_storable_fields(&self, store_writer: &mut StoreWriter) -> crate::Result<()> {
|
||||
debug_time!("write-storable-fields");
|
||||
debug!("write-storable-field");
|
||||
|
||||
if !doc_id_mapping.is_trivial() {
|
||||
debug!("non-trivial-doc-id-mapping");
|
||||
|
||||
let store_readers: Vec<_> = self
|
||||
.readers
|
||||
.iter()
|
||||
.map(|reader| reader.get_store_reader(50))
|
||||
.collect::<Result<_, _>>()?;
|
||||
|
||||
let mut document_iterators: Vec<_> = store_readers
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, store)| store.iter_raw(self.readers[i].alive_bitset()))
|
||||
.collect();
|
||||
|
||||
for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
|
||||
let doc_bytes_it = &mut document_iterators[old_doc_addr.segment_ord as usize];
|
||||
if let Some(doc_bytes_res) = doc_bytes_it.next() {
|
||||
let doc_bytes = doc_bytes_res?;
|
||||
store_writer.store_bytes(&doc_bytes)?;
|
||||
} else {
|
||||
return Err(DataCorruption::comment_only(format!(
|
||||
"unexpected missing document in docstore on merge, doc address \
|
||||
{old_doc_addr:?}",
|
||||
))
|
||||
.into());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
debug!("trivial-doc-id-mapping");
|
||||
for reader in &self.readers {
|
||||
let store_reader = reader.get_store_reader(1)?;
|
||||
if reader.has_deletes()
|
||||
for reader in &self.readers {
|
||||
let store_reader = reader.get_store_reader(1)?;
|
||||
if reader.has_deletes()
|
||||
// If there is not enough data in the store, we avoid stacking in order to
|
||||
// avoid creating many small blocks in the doc store. Once we have 5 full blocks,
|
||||
// we start stacking. In the worst case 2/7 of the blocks would be very small.
|
||||
@@ -726,14 +508,13 @@ impl IndexMerger {
|
||||
// take 7 in order to not walk over all checkpoints.
|
||||
|| store_reader.block_checkpoints().take(7).count() < 6
|
||||
|| store_reader.decompressor() != store_writer.compressor().into()
|
||||
{
|
||||
for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) {
|
||||
let doc_bytes = doc_bytes_res?;
|
||||
store_writer.store_bytes(&doc_bytes)?;
|
||||
}
|
||||
} else {
|
||||
store_writer.stack(store_reader)?;
|
||||
{
|
||||
for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) {
|
||||
let doc_bytes = doc_bytes_res?;
|
||||
store_writer.store_bytes(&doc_bytes)?;
|
||||
}
|
||||
} else {
|
||||
store_writer.stack(store_reader)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
@@ -745,18 +526,7 @@ impl IndexMerger {
|
||||
/// # Returns
|
||||
/// The number of documents in the resulting segment.
|
||||
pub fn write(&self, mut serializer: SegmentSerializer) -> crate::Result<u32> {
|
||||
let doc_id_mapping = if let Some(sort_by_field) = self.index_settings.sort_by_field.as_ref()
|
||||
{
|
||||
// If the documents are already sorted and stackable, we ignore the mapping and execute
|
||||
// it as if there was no sorting
|
||||
if self.is_disjunct_and_sorted_on_sort_property(sort_by_field)? {
|
||||
self.get_doc_id_from_concatenated_data()?
|
||||
} else {
|
||||
self.generate_doc_id_mapping_with_sort_by_field(sort_by_field)?
|
||||
}
|
||||
} else {
|
||||
self.get_doc_id_from_concatenated_data()?
|
||||
};
|
||||
let doc_id_mapping = self.get_doc_id_from_concatenated_data()?;
|
||||
debug!("write-fieldnorms");
|
||||
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
|
||||
self.write_fieldnorms(fieldnorms_serializer, &doc_id_mapping)?;
|
||||
@@ -773,7 +543,7 @@ impl IndexMerger {
|
||||
)?;
|
||||
|
||||
debug!("write-storagefields");
|
||||
self.write_storable_fields(serializer.get_store_writer(), &doc_id_mapping)?;
|
||||
self.write_storable_fields(serializer.get_store_writer())?;
|
||||
debug!("write-fastfields");
|
||||
self.write_fast_fields(serializer.get_fast_field_write(), doc_id_mapping)?;
|
||||
|
||||
@@ -787,6 +557,8 @@ impl IndexMerger {
|
||||
mod tests {
|
||||
|
||||
use columnar::Column;
|
||||
use proptest::prop_oneof;
|
||||
use proptest::strategy::Strategy;
|
||||
use schema::FAST;
|
||||
|
||||
use crate::collector::tests::{
|
||||
@@ -794,6 +566,7 @@ mod tests {
|
||||
};
|
||||
use crate::collector::{Count, FacetCollector};
|
||||
use crate::index::{Index, SegmentId};
|
||||
use crate::indexer::NoMergePolicy;
|
||||
use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
|
||||
use crate::schema::{
|
||||
Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term,
|
||||
@@ -802,7 +575,7 @@ mod tests {
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::{
|
||||
assert_nearly_equals, schema, DateTime, DocAddress, DocId, DocSet, IndexSettings,
|
||||
IndexSortByField, IndexWriter, Order, Searcher,
|
||||
IndexWriter, Searcher,
|
||||
};
|
||||
|
||||
#[test]
|
||||
@@ -1275,60 +1048,6 @@ mod tests {
|
||||
test_merge_facets(None, true)
|
||||
}
|
||||
|
||||
/// Runs the facet merge test on an index sorted by `intval` in ascending
/// order, once with overlapping and once with disjunct segment value ranges.
#[test]
fn test_merge_facets_sort_asc() {
    let sorted_asc = || {
        Some(IndexSettings {
            sort_by_field: Some(IndexSortByField {
                field: "intval".to_string(),
                // Ascending, as the test name states; descending order is
                // covered by `test_merge_facets_sort_desc`.
                order: Order::Asc,
            }),
            ..Default::default()
        })
    };

    // In the merge case this will go through the doc_id mapping code
    test_merge_facets(sorted_asc(), true);
    // In the merge case this will not go through the doc_id mapping code, because the data
    // is sorted and disjunct
    test_merge_facets(sorted_asc(), false);
}
|
||||
|
||||
/// Runs the facet merge test on an index sorted by `intval` in descending
/// order, once with overlapping and once with disjunct segment value ranges.
#[test]
fn test_merge_facets_sort_desc() {
    // `true`: overlapping ranges, the merge goes through the doc_id mapping code.
    // `false`: the data is sorted and disjunct, so the doc_id mapping code is not used.
    for force_segment_value_overlap in [true, false] {
        let sorted_desc = IndexSettings {
            sort_by_field: Some(IndexSortByField {
                field: "intval".to_string(),
                order: Order::Desc,
            }),
            ..Default::default()
        };
        test_merge_facets(Some(sorted_desc), force_segment_value_overlap);
    }
}
|
||||
|
||||
// force_segment_value_overlap forces the int value for sorting to have overlapping min and max
|
||||
// ranges between segments so that merge algorithm can't apply certain optimizations
|
||||
fn test_merge_facets(index_settings: Option<IndexSettings>, force_segment_value_overlap: bool) {
|
||||
@@ -1531,6 +1250,112 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// A single step of a randomly generated indexing scenario.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum IndexingOp {
    /// Index a document that carries no value.
    ZeroVal,
    /// Index a document that carries `val` once.
    OneVal { val: u64 },
    /// Index a document that carries `val` twice.
    TwoVal { val: u64 },
    /// Commit the documents indexed so far.
    Commit,
}
|
||||
|
||||
fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> {
|
||||
prop_oneof![
|
||||
(0u64..1u64).prop_map(|_| IndexingOp::ZeroVal),
|
||||
(0u64..1u64).prop_map(|val| IndexingOp::OneVal { val }),
|
||||
(0u64..1u64).prop_map(|val| IndexingOp::TwoVal { val }),
|
||||
(0u64..1u64).prop_map(|_| IndexingOp::Commit),
|
||||
]
|
||||
}
|
||||
|
||||
use proptest::prelude::*;
|
||||
proptest! {
    // Feeds sequences of 1 to 19 generated operations to `test_merge_int_fields`
    // and requires every run to succeed.
    #[test]
    fn test_merge_columnar_int_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..20)) {
        let outcome = test_merge_int_fields(ops.as_slice());
        assert!(outcome.is_ok());
    }
}
|
||||
fn test_merge_int_fields(ops: &[IndexingOp]) -> crate::Result<()> {
|
||||
if ops.iter().all(|op| *op == IndexingOp::Commit) {
|
||||
return Ok(());
|
||||
}
|
||||
let expected_doc_and_vals: Vec<(u32, Vec<u64>)> = ops
|
||||
.iter()
|
||||
.filter(|op| *op != &IndexingOp::Commit)
|
||||
.map(|op| match op {
|
||||
IndexingOp::ZeroVal => vec![],
|
||||
IndexingOp::OneVal { val } => vec![*val],
|
||||
IndexingOp::TwoVal { val } => vec![*val, *val],
|
||||
IndexingOp::Commit => unreachable!(),
|
||||
})
|
||||
.enumerate()
|
||||
.map(|(id, val)| (id as u32, val))
|
||||
.collect();
|
||||
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let int_options = NumericOptions::default().set_fast().set_indexed();
|
||||
let int_field = schema_builder.add_u64_field("intvals", int_options);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
|
||||
let mut doc = TantivyDocument::default();
|
||||
for &val in int_vals {
|
||||
doc.add_u64(int_field, val);
|
||||
}
|
||||
index_writer.add_document(doc).unwrap();
|
||||
};
|
||||
|
||||
for op in ops {
|
||||
match op {
|
||||
IndexingOp::ZeroVal => index_doc(&mut index_writer, &[]),
|
||||
IndexingOp::OneVal { val } => index_doc(&mut index_writer, &[*val]),
|
||||
IndexingOp::TwoVal { val } => index_doc(&mut index_writer, &[*val, *val]),
|
||||
IndexingOp::Commit => {
|
||||
index_writer.commit().expect("commit failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
index_writer.commit().expect("commit failed");
|
||||
}
|
||||
{
|
||||
let mut segment_ids = index.searchable_segment_ids()?;
|
||||
segment_ids.sort();
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_writer.wait_merging_threads()?;
|
||||
}
|
||||
let reader = index.reader()?;
|
||||
reader.reload()?;
|
||||
|
||||
let mut vals: Vec<u64> = Vec::new();
|
||||
let mut test_vals = move |col: &Column<u64>, doc: DocId, expected: &[u64]| {
|
||||
vals.clear();
|
||||
vals.extend(col.values_for_doc(doc));
|
||||
assert_eq!(&vals[..], expected);
|
||||
};
|
||||
|
||||
let mut test_col = move |col: &Column<u64>, column_expected: &[(u32, Vec<u64>)]| {
|
||||
for (doc_id, vals) in column_expected.iter() {
|
||||
test_vals(col, *doc_id, vals);
|
||||
}
|
||||
};
|
||||
|
||||
{
|
||||
let searcher = reader.searcher();
|
||||
let segment = searcher.segment_reader(0u32);
|
||||
let col = segment
|
||||
.fast_fields()
|
||||
.column_opt::<u64>("intvals")
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
|
||||
test_col(&col, &expected_doc_and_vals);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_multivalued_int_fields_simple() -> crate::Result<()> {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
|
||||
@@ -1,579 +0,0 @@
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::collector::TopDocs;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::index::Index;
|
||||
use crate::postings::Postings;
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::{
|
||||
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
|
||||
TextFieldIndexing, TextOptions, Value,
|
||||
};
|
||||
use crate::{
|
||||
DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, TantivyDocument,
|
||||
Term,
|
||||
};
|
||||
|
||||
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let int_options = NumericOptions::default().set_fast().set_indexed();
|
||||
let int_field = schema_builder.add_u64_field("intval", int_options);
|
||||
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let mut index_builder = Index::builder().schema(schema);
|
||||
if let Some(settings) = index_settings {
|
||||
index_builder = index_builder.settings(settings);
|
||||
}
|
||||
let index = index_builder.create_in_ram().unwrap();
|
||||
|
||||
{
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
|
||||
index_writer
|
||||
.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")))
|
||||
.unwrap();
|
||||
index_writer
|
||||
.add_document(doc!(int_field=>6_u64, facet_field=> Facet::from("/crime")))
|
||||
.unwrap();
|
||||
index_writer.commit().unwrap();
|
||||
index_writer
|
||||
.add_document(doc!(int_field=>5_u64, facet_field=> Facet::from("/fanta")))
|
||||
.unwrap();
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
|
||||
// Merging the segments
|
||||
{
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
|
||||
assert!(index_writer.merge(&segment_ids).wait().is_ok());
|
||||
assert!(index_writer.wait_merging_threads().is_ok());
|
||||
}
|
||||
index
|
||||
}
|
||||
|
||||
// force_disjunct_segment_sort_values forces the field, by which the index is sorted have
|
||||
// disjunct ranges between segments, e.g. values in segment [1-3] [10 - 20] [50 - 500]
|
||||
fn create_test_index(
|
||||
index_settings: Option<IndexSettings>,
|
||||
force_disjunct_segment_sort_values: bool,
|
||||
) -> crate::Result<Index> {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let int_options = NumericOptions::default()
|
||||
.set_fast()
|
||||
.set_stored()
|
||||
.set_indexed();
|
||||
let int_field = schema_builder.add_u64_field("intval", int_options);
|
||||
|
||||
let bytes_options = BytesOptions::default().set_fast().set_indexed();
|
||||
let bytes_field = schema_builder.add_bytes_field("bytes", bytes_options);
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
|
||||
let multi_numbers =
|
||||
schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast());
|
||||
let text_field_options = TextOptions::default()
|
||||
.set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
|
||||
)
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text_field", text_field_options);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let mut index_builder = Index::builder().schema(schema);
|
||||
if let Some(settings) = index_settings {
|
||||
index_builder = index_builder.settings(settings);
|
||||
}
|
||||
let index = index_builder.create_in_ram()?;
|
||||
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
// segment 1 - range 1-3
|
||||
index_writer.add_document(doc!(int_field=>1_u64))?;
|
||||
index_writer.add_document(
|
||||
doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")),
|
||||
)?;
|
||||
index_writer.add_document(
|
||||
doc!(int_field=>1_u64, text_field=> "deleteme", text_field => "ok text more text"),
|
||||
)?;
|
||||
index_writer.add_document(
|
||||
doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64, text_field => "ok text more text"),
|
||||
)?;
|
||||
|
||||
index_writer.commit()?;
|
||||
// segment 2 - range 1-20 , with force_disjunct_segment_sort_values 10-20
|
||||
index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64))?;
|
||||
|
||||
let in_val = if force_disjunct_segment_sort_values {
|
||||
10_u64
|
||||
} else {
|
||||
1
|
||||
};
|
||||
index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme" , text_field => "ok text more text", facet_field=> Facet::from("/book/crime")))?;
|
||||
index_writer.commit()?;
|
||||
// segment 3 - range 5-1000, with force_disjunct_segment_sort_values 50-1000
|
||||
let int_vals = if force_disjunct_segment_sort_values {
|
||||
[100_u64, 50]
|
||||
} else {
|
||||
[10, 5]
|
||||
};
|
||||
index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1]
|
||||
doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
|
||||
)?;
|
||||
index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"))?;
|
||||
index_writer.add_document(
|
||||
doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5],text_field => "the biggest num")
|
||||
)?;
|
||||
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "deleteme"));
|
||||
index_writer.commit()?;
|
||||
}
|
||||
|
||||
// Merging the segments
|
||||
{
|
||||
let segment_ids = index.searchable_segment_ids()?;
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_writer.wait_merging_threads()?;
|
||||
}
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
/// Calls `create_test_index_posting_list_issue` with an index sorted by
/// `intval` in descending order.
///
/// The test contains no assertions of its own: it passes as long as the call
/// does not panic.
#[test]
fn test_merge_sorted_postinglist_sort_issue() {
    let sort_by_intval_desc = IndexSortByField {
        field: "intval".to_string(),
        order: Order::Desc,
    };
    let settings = IndexSettings {
        sort_by_field: Some(sort_by_intval_desc),
        ..Default::default()
    };
    create_test_index_posting_list_issue(Some(settings));
}
|
||||
|
||||
/// Runs the descending-sort merge checks without forcing disjunct
/// per-segment sort values.
#[test]
fn test_merge_sorted_index_desc_not_disjunct() {
    let force_disjunct_segment_sort_values = false;
    test_merge_sorted_index_desc_(force_disjunct_segment_sort_values);
}
|
||||
|
||||
/// Runs the descending-sort merge checks with disjunct per-segment sort
/// values forced on.
#[test]
fn test_merge_sorted_index_desc_disjunct() {
    let force_disjunct_segment_sort_values = true;
    test_merge_sorted_index_desc_(force_disjunct_segment_sort_values);
}
|
||||
|
||||
fn test_merge_sorted_index_desc_(force_disjunct_segment_sort_values: bool) {
|
||||
let index = create_test_index(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
force_disjunct_segment_sort_values,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let int_field = index.schema().get_field("intval").unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let segment_reader = searcher.segment_readers().last().unwrap();
|
||||
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let fast_field = fast_fields.u64("intval").unwrap();
|
||||
assert_eq!(fast_field.first(5), Some(1u64));
|
||||
assert_eq!(fast_field.first(4), Some(2u64));
|
||||
assert_eq!(fast_field.first(3), Some(3u64));
|
||||
if force_disjunct_segment_sort_values {
|
||||
assert_eq!(fast_field.first(2), Some(20u64));
|
||||
assert_eq!(fast_field.first(1), Some(100u64));
|
||||
} else {
|
||||
assert_eq!(fast_field.first(2), Some(10u64));
|
||||
assert_eq!(fast_field.first(1), Some(20u64));
|
||||
}
|
||||
assert_eq!(fast_field.first(0), Some(1_000u64));
|
||||
|
||||
// test new field norm mapping
|
||||
{
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 3); // the biggest num
|
||||
if force_disjunct_segment_sort_values {
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 1); // blubber
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(2), 0);
|
||||
} else {
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(2), 1); // blubber
|
||||
}
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(3), 2); // some text
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(5), 0);
|
||||
}
|
||||
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
{
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
|
||||
let do_search = |term: &str| {
|
||||
let query = QueryParser::for_index(&index, vec![my_text_field])
|
||||
.parse_query(term)
|
||||
.unwrap();
|
||||
let top_docs: Vec<(f32, DocAddress)> =
|
||||
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
|
||||
|
||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
assert_eq!(do_search("some"), vec![3]);
|
||||
if force_disjunct_segment_sort_values {
|
||||
assert_eq!(do_search("blubber"), vec![1]);
|
||||
} else {
|
||||
assert_eq!(do_search("blubber"), vec![2]);
|
||||
}
|
||||
assert_eq!(do_search("biggest"), vec![0]);
|
||||
}
|
||||
|
||||
// postings file
|
||||
{
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
let term_a = Term::from_field_text(my_text_field, "text");
|
||||
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(postings.doc_freq(), 2);
|
||||
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
|
||||
assert_eq!(
|
||||
postings.doc_freq_given_deletes(
|
||||
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
|
||||
),
|
||||
2
|
||||
);
|
||||
|
||||
assert_eq!(postings.term_freq(), 1);
|
||||
let mut output = vec![];
|
||||
postings.positions(&mut output);
|
||||
assert_eq!(output, vec![1]);
|
||||
postings.advance();
|
||||
|
||||
assert_eq!(postings.term_freq(), 2);
|
||||
postings.positions(&mut output);
|
||||
assert_eq!(output, vec![1, 3]);
|
||||
}
|
||||
|
||||
// access doc store
|
||||
{
|
||||
let blubber_pos = if force_disjunct_segment_sort_values {
|
||||
1
|
||||
} else {
|
||||
2
|
||||
};
|
||||
let doc = searcher
|
||||
.doc::<TantivyDocument>(DocAddress::new(0, blubber_pos))
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
doc.get_first(my_text_field).unwrap().as_value().as_str(),
|
||||
Some("blubber")
|
||||
);
|
||||
let doc = searcher
|
||||
.doc::<TantivyDocument>(DocAddress::new(0, 0))
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
doc.get_first(int_field).unwrap().as_value().as_u64(),
|
||||
Some(1000)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks the test index built with default settings (no sort field).
///
/// Verifies that a single segment remains, that searches return the expected
/// doc ids, and that the postings of the term "text" carry the expected
/// frequencies and positions.
#[test]
fn test_merge_unsorted_index() {
    let index = create_test_index(Some(IndexSettings::default()), false).unwrap();
    let my_text_field = index.schema().get_field("text_field").unwrap();

    // Keep this searcher alive: `segment_reader` borrows from it.
    let reader = index.reader().unwrap();
    let merged_searcher = reader.searcher();
    assert_eq!(merged_searcher.segment_readers().len(), 1);
    let segment_reader = merged_searcher.segment_readers().last().unwrap();

    let searcher = index.reader().unwrap().searcher();
    {
        // Returns the doc ids of the (at most three) top hits for `term`.
        let do_search = |term: &str| {
            let parser = QueryParser::for_index(&index, vec![my_text_field]);
            let query = parser.parse_query(term).unwrap();
            let top_docs: Vec<(f32, DocAddress)> =
                searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
            top_docs
                .into_iter()
                .map(|(_score, doc_address)| doc_address.doc_id)
                .collect::<Vec<_>>()
        };

        assert_eq!(do_search("some"), vec![1]);
        assert_eq!(do_search("blubber"), vec![3]);
        assert_eq!(do_search("biggest"), vec![4]);
    }

    // postings file
    {
        let term_a = Term::from_field_text(my_text_field, "text");
        let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
        let mut postings = inverted_index
            .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
            .unwrap()
            .unwrap();

        assert_eq!(postings.doc_freq(), 2);
        let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
        let alive_bitset = segment_reader.alive_bitset().unwrap_or(&fallback_bitset);
        assert_eq!(postings.doc_freq_given_deletes(alive_bitset), 2);

        // First posting.
        assert_eq!(postings.term_freq(), 1);
        let mut output = vec![];
        postings.positions(&mut output);
        assert_eq!(output, vec![1]);

        // Second posting.
        postings.advance();
        assert_eq!(postings.term_freq(), 2);
        postings.positions(&mut output);
        assert_eq!(output, vec![1, 3]);
    }
}
|
||||
|
||||
// #[test]
|
||||
// fn test_merge_sorted_index_asc() {
|
||||
// let index = create_test_index(
|
||||
// Some(IndexSettings {
|
||||
// sort_by_field: Some(IndexSortByField {
|
||||
// field: "intval".to_string(),
|
||||
// order: Order::Asc,
|
||||
// }),
|
||||
// ..Default::default()
|
||||
// }),
|
||||
// false,
|
||||
// )
|
||||
// .unwrap();
|
||||
|
||||
// let int_field = index.schema().get_field("intval").unwrap();
|
||||
// let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
|
||||
// let bytes_field = index.schema().get_field("bytes").unwrap();
|
||||
// let reader = index.reader().unwrap();
|
||||
// let searcher = reader.searcher();
|
||||
// assert_eq!(searcher.segment_readers().len(), 1);
|
||||
// let segment_reader = searcher.segment_readers().last().unwrap();
|
||||
|
||||
// let fast_fields = segment_reader.fast_fields();
|
||||
// let fast_field = fast_fields.u64(int_field).unwrap();
|
||||
// assert_eq!(fast_field.get_val(0), 1u64);
|
||||
// assert_eq!(fast_field.get_val(1), 2u64);
|
||||
// assert_eq!(fast_field.get_val(2), 3u64);
|
||||
// assert_eq!(fast_field.get_val(3), 10u64);
|
||||
// assert_eq!(fast_field.get_val(4), 20u64);
|
||||
// assert_eq!(fast_field.get_val(5), 1_000u64);
|
||||
|
||||
// let get_vals = |fast_field: &MultiValuedFastFieldReader<u64>, doc_id: u32| -> Vec<u64> {
|
||||
// let mut vals = vec![];
|
||||
// fast_field.get_vals(doc_id, &mut vals);
|
||||
// vals
|
||||
// };
|
||||
// let fast_fields = segment_reader.fast_fields();
|
||||
// let fast_field = fast_fields.u64s(multi_numbers).unwrap();
|
||||
// assert_eq!(&get_vals(&fast_field, 0), &[] as &[u64]);
|
||||
// assert_eq!(&get_vals(&fast_field, 1), &[2, 3]);
|
||||
// assert_eq!(&get_vals(&fast_field, 2), &[3, 4]);
|
||||
// assert_eq!(&get_vals(&fast_field, 3), &[10, 11]);
|
||||
// assert_eq!(&get_vals(&fast_field, 4), &[20]);
|
||||
// assert_eq!(&get_vals(&fast_field, 5), &[1001, 1002]);
|
||||
|
||||
// let fast_field = fast_fields.bytes(bytes_field).unwrap();
|
||||
// assert_eq!(fast_field.get_bytes(0), &[] as &[u8]);
|
||||
// assert_eq!(fast_field.get_bytes(2), &[1, 2, 3]);
|
||||
// assert_eq!(fast_field.get_bytes(5), &[5, 5]);
|
||||
|
||||
// // test new field norm mapping
|
||||
// {
|
||||
// let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
// let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
|
||||
// assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
|
||||
// assert_eq!(fieldnorm_reader.fieldnorm(1), 4);
|
||||
// assert_eq!(fieldnorm_reader.fieldnorm(2), 2); // some text
|
||||
// assert_eq!(fieldnorm_reader.fieldnorm(3), 1);
|
||||
// assert_eq!(fieldnorm_reader.fieldnorm(5), 3); // the biggest num
|
||||
// }
|
||||
|
||||
// let searcher = index.reader().unwrap().searcher();
|
||||
// {
|
||||
// let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
|
||||
// let do_search = |term: &str| {
|
||||
// let query = QueryParser::for_index(&index, vec![my_text_field])
|
||||
// .parse_query(term)
|
||||
// .unwrap();
|
||||
// let top_docs: Vec<(f32, DocAddress)> =
|
||||
// searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
|
||||
|
||||
// top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
|
||||
// };
|
||||
|
||||
// assert_eq!(do_search("some"), vec![2]);
|
||||
// assert_eq!(do_search("blubber"), vec![3]);
|
||||
// assert_eq!(do_search("biggest"), vec![5]);
|
||||
// }
|
||||
|
||||
// // postings file
|
||||
// {
|
||||
// let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
// let term_a = Term::from_field_text(my_text_field, "text");
|
||||
// let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
|
||||
// let mut postings = inverted_index
|
||||
// .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
||||
// .unwrap()
|
||||
// .unwrap();
|
||||
|
||||
// assert_eq!(postings.doc_freq(), 2);
|
||||
// let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
|
||||
// assert_eq!(
|
||||
// postings.doc_freq_given_deletes(
|
||||
// segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
|
||||
// ),
|
||||
// 2
|
||||
// );
|
||||
|
||||
// let mut output = vec![];
|
||||
// postings.positions(&mut output);
|
||||
// assert_eq!(output, vec![1, 3]);
|
||||
// postings.advance();
|
||||
|
||||
// postings.positions(&mut output);
|
||||
// assert_eq!(output, vec![1]);
|
||||
// }
|
||||
|
||||
// // access doc store
|
||||
// {
|
||||
// let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
|
||||
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1));
|
||||
// let doc = searcher.doc(DocAddress::new(0, 1)).unwrap();
|
||||
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(2));
|
||||
// let doc = searcher.doc(DocAddress::new(0, 2)).unwrap();
|
||||
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(3));
|
||||
// let doc = searcher.doc(DocAddress::new(0, 3)).unwrap();
|
||||
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(10));
|
||||
// let doc = searcher.doc(DocAddress::new(0, 4)).unwrap();
|
||||
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(20));
|
||||
// let doc = searcher.doc(DocAddress::new(0, 5)).unwrap();
|
||||
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1_000));
|
||||
// }
|
||||
// }
|
||||
}
|
||||
|
||||
/// Benchmarks for merging sorted indices.
///
/// Only compiled for test builds with the `unstable` feature enabled, since it
/// relies on the `test` crate's `Bencher`.
#[cfg(all(test, feature = "unstable"))]
mod bench_sorted_index_merge {

    use test::{self, Bencher};

    use crate::index::Index;
    use crate::indexer::merger::IndexMerger;
    use crate::schema::{NumericOptions, Schema};
    use crate::{IndexSettings, IndexSortByField, IndexWriter, Order};

    /// Creates an in-RAM index with a single fast, indexed `u64` field named
    /// `intval`, filled with three committed batches of documents.
    ///
    /// `sort_by_field` is stored in the index settings as-is; `None` leaves the
    /// index unsorted.
    ///
    /// # Panics
    ///
    /// Panics if creating the index, adding a document, or committing fails.
    fn create_index(sort_by_field: Option<IndexSortByField>) -> Index {
        let mut schema_builder = Schema::builder();
        let int_options = NumericOptions::default().set_fast().set_indexed();
        let int_field = schema_builder.add_u64_field("intval", int_options);
        let schema = schema_builder.build();

        let index_builder = Index::builder().schema(schema).settings(IndexSettings {
            sort_by_field,
            ..Default::default()
        });
        let index = index_builder.create_in_ram().unwrap();

        {
            let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
            let index_doc = |index_writer: &mut IndexWriter, val: u64| {
                index_writer.add_document(doc!(int_field=>val)).unwrap();
            };
            // 3 commits, each with 10_001 values in the fast field: one leading
            // out-of-order value followed by 0..10_000.
            for _ in 0..3 {
                index_doc(&mut index_writer, 5000); // fix to make it unordered
                for i in 0..10_000 {
                    index_doc(&mut index_writer, i);
                }
                index_writer.commit().unwrap();
            }
        }
        index
    }

    //#[bench]
    // fn create_sorted_index_walk_overkmerge_on_merge_fastfield(
    // b: &mut Bencher,
    //) -> crate::Result<()> {
    // let sort_by_field = IndexSortByField {
    // field: "intval".to_string(),
    // order: Order::Desc,
    //};
    // let index = create_index(Some(sort_by_field.clone()));
    // let segments = index.searchable_segments().unwrap();
    // let merger: IndexMerger =
    // IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?;
    // let doc_id_mapping = merger.generate_doc_id_mapping(&sort_by_field).unwrap();
    // b.iter(|| {
    // let sorted_doc_ids = doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| {
    // let reader = &merger.readers[doc_addr.segment_ord as usize];
    // let u64_reader: Arc<dyn Column<u64>> = reader
    //.fast_fields()
    //.typed_fast_field_reader("intval")
    //.expect(
    //"Failed to find a reader for single fast field. This is a tantivy bug and \
    // it should never happen.",
    //);
    //(doc_addr.doc_id, reader, u64_reader)
    //});
    // add values in order of the new doc_ids
    // let mut val = 0;
    // for (doc_id, _reader, field_reader) in sorted_doc_ids {
    // val = field_reader.get_val(doc_id);
    //}

    // val
    //});

    // Ok(())
    //}

    /// Measures generating the doc id mapping for a merge sorted by `intval`
    /// in descending order.
    ///
    /// # Errors
    ///
    /// Returns an error if the searchable segments cannot be listed or the
    /// `IndexMerger` cannot be opened.
    #[bench]
    fn create_sorted_index_create_doc_id_mapping(b: &mut Bencher) -> crate::Result<()> {
        let sort_by_field = IndexSortByField {
            field: "intval".to_string(),
            order: Order::Desc,
        };
        let index = create_index(Some(sort_by_field.clone()));
        // let field = index.schema().get_field("intval").unwrap();
        let segments = index.searchable_segments()?;
        let merger: IndexMerger =
            IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?;
        // Return the mapping from the closure: `Bencher::iter` black-boxes the
        // returned value, so the measured work cannot be optimised away.
        b.iter(|| {
            merger
                .generate_doc_id_mapping_with_sort_by_field(&sort_by_field)
                .unwrap()
        });

        Ok(())
    }
}
|
||||
@@ -13,10 +13,10 @@ mod flat_map_with_buffer;
|
||||
pub(crate) mod index_writer;
|
||||
pub(crate) mod index_writer_status;
|
||||
mod log_merge_policy;
|
||||
mod merge_index_test;
|
||||
mod merge_operation;
|
||||
pub(crate) mod merge_policy;
|
||||
pub(crate) mod merger;
|
||||
mod merger_sorted_index_test;
|
||||
pub(crate) mod operation;
|
||||
pub(crate) mod prepared_commit;
|
||||
mod segment_entry;
|
||||
@@ -145,15 +145,27 @@ mod tests_mmap {
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn test_json_field_null_byte() {
|
||||
// Test when field name contains a zero byte, which has special meaning in tantivy.
|
||||
// As a workaround, we convert the zero byte to the ASCII character '0'.
|
||||
// https://github.com/quickwit-oss/tantivy/issues/2340
|
||||
// https://github.com/quickwit-oss/tantivy/issues/2193
|
||||
let field_name_in = "\u{0000}";
|
||||
let field_name_out = "0";
|
||||
test_json_field_name(field_name_in, field_name_out);
|
||||
fn test_json_field_null_byte_is_ignored() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let options = JsonObjectOptions::from(TEXT | FAST).set_expand_dots_enabled();
|
||||
let field = schema_builder.add_json_field("json", options);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
index_writer
|
||||
.add_document(doc!(field=>json!({"key": "test1", "invalidkey\u{0000}": "test2"})))
|
||||
.unwrap();
|
||||
index_writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let inv_indexer = segment_reader.inverted_index(field).unwrap();
|
||||
let term_dict = inv_indexer.terms();
|
||||
assert_eq!(term_dict.num_terms(), 1);
|
||||
let mut term_bytes = Vec::new();
|
||||
term_dict.ord_to_term(0, &mut term_bytes).unwrap();
|
||||
assert_eq!(term_bytes, b"key\0stest1");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_json_field_1byte() {
|
||||
// Test when field name contains a '1' byte, which has special meaning in tantivy.
|
||||
@@ -291,7 +303,7 @@ mod tests_mmap {
|
||||
Type::Str,
|
||||
),
|
||||
(format!("{field_name_out_internal}a"), Type::Str),
|
||||
(format!("{field_name_out_internal}"), Type::Str),
|
||||
(field_name_out_internal.to_string(), Type::Str),
|
||||
(format!("num{field_name_out_internal}"), Type::I64),
|
||||
];
|
||||
expected_fields.sort();
|
||||
|
||||
@@ -38,7 +38,8 @@ impl PathToUnorderedId {
|
||||
#[cold]
|
||||
fn insert_new_path(&mut self, path: &str) -> u32 {
|
||||
let next_id = self.map.len() as u32;
|
||||
self.map.insert(path.to_string(), next_id);
|
||||
let new_path = path.to_string();
|
||||
self.map.insert(new_path, next_id);
|
||||
next_id
|
||||
}
|
||||
|
||||
|
||||
@@ -18,27 +18,9 @@ pub struct SegmentSerializer {
|
||||
|
||||
impl SegmentSerializer {
|
||||
/// Creates a new `SegmentSerializer`.
|
||||
pub fn for_segment(
|
||||
mut segment: Segment,
|
||||
is_in_merge: bool,
|
||||
) -> crate::Result<SegmentSerializer> {
|
||||
// If the segment is going to be sorted, we stream the docs first to a temporary file.
|
||||
// In the merge case this is not necessary because we can kmerge the already sorted
|
||||
// segments
|
||||
let remapping_required = segment.index().settings().sort_by_field.is_some() && !is_in_merge;
|
||||
pub fn for_segment(mut segment: Segment) -> crate::Result<SegmentSerializer> {
|
||||
let settings = segment.index().settings().clone();
|
||||
let store_writer = if remapping_required {
|
||||
let store_write = segment.open_write(SegmentComponent::TempStore)?;
|
||||
StoreWriter::new(
|
||||
store_write,
|
||||
crate::store::Compressor::None,
|
||||
// We want fast random access on the docs, so we choose a small block size.
|
||||
// If this is zero, the skip index will contain too many checkpoints and
|
||||
// therefore will be relatively slow.
|
||||
16000,
|
||||
settings.docstore_compress_dedicated_thread,
|
||||
)?
|
||||
} else {
|
||||
let store_writer = {
|
||||
let store_write = segment.open_write(SegmentComponent::Store)?;
|
||||
StoreWriter::new(
|
||||
store_write,
|
||||
@@ -72,10 +54,6 @@ impl SegmentSerializer {
|
||||
&self.segment
|
||||
}
|
||||
|
||||
/// Returns a mutable reference to the segment held by this serializer.
pub fn segment_mut(&mut self) -> &mut Segment {
|
||||
&mut self.segment
|
||||
}
|
||||
|
||||
/// Accessor to the `PostingsSerializer`.
|
||||
pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer {
|
||||
&mut self.postings_serializer
|
||||
|
||||
@@ -115,11 +115,10 @@ fn merge(
|
||||
.collect();
|
||||
|
||||
// An IndexMerger is like a "view" of our merged segments.
|
||||
let merger: IndexMerger =
|
||||
IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?;
|
||||
let merger: IndexMerger = IndexMerger::open(index.schema(), &segments[..])?;
|
||||
|
||||
// ... we just serialize this index merger in our new segment to merge the segments.
|
||||
let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone(), true)?;
|
||||
let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone())?;
|
||||
|
||||
let num_docs = merger.write(segment_serializer)?;
|
||||
|
||||
@@ -220,13 +219,9 @@ pub fn merge_filtered_segments<T: Into<Box<dyn Directory>>>(
|
||||
)?;
|
||||
let merged_segment = merged_index.new_segment();
|
||||
let merged_segment_id = merged_segment.id();
|
||||
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
|
||||
merged_index.schema(),
|
||||
merged_index.settings().clone(),
|
||||
segments,
|
||||
filter_doc_ids,
|
||||
)?;
|
||||
let segment_serializer = SegmentSerializer::for_segment(merged_segment, true)?;
|
||||
let merger: IndexMerger =
|
||||
IndexMerger::open_with_custom_alive_set(merged_index.schema(), segments, filter_doc_ids)?;
|
||||
let segment_serializer = SegmentSerializer::for_segment(merged_segment)?;
|
||||
let num_docs = merger.write(segment_serializer)?;
|
||||
|
||||
let segment_meta = merged_index.new_segment_meta(merged_segment_id, num_docs);
|
||||
@@ -1067,7 +1062,6 @@ mod tests {
|
||||
)?;
|
||||
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
|
||||
merged_index.schema(),
|
||||
merged_index.settings().clone(),
|
||||
&segments[..],
|
||||
filter_segments,
|
||||
)?;
|
||||
@@ -1083,7 +1077,6 @@ mod tests {
|
||||
Index::create(RamDirectory::default(), target_schema, target_settings)?;
|
||||
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
|
||||
merged_index.schema(),
|
||||
merged_index.settings().clone(),
|
||||
&segments[..],
|
||||
filter_segments,
|
||||
)?;
|
||||
|
||||
@@ -3,7 +3,6 @@ use common::JsonPathWriter;
|
||||
use itertools::Itertools;
|
||||
use tokenizer_api::BoxTokenStream;
|
||||
|
||||
use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};
|
||||
use super::operation::AddOperation;
|
||||
use crate::fastfield::FastFieldsWriter;
|
||||
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
|
||||
@@ -16,7 +15,6 @@ use crate::postings::{
|
||||
};
|
||||
use crate::schema::document::{Document, Value};
|
||||
use crate::schema::{FieldEntry, FieldType, Schema, Term, DATE_TIME_PRECISION_INDEXED};
|
||||
use crate::store::{StoreReader, StoreWriter};
|
||||
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
|
||||
use crate::{DocId, Opstamp, TantivyError};
|
||||
|
||||
@@ -41,20 +39,6 @@ fn compute_initial_table_size(per_thread_memory_budget: usize) -> crate::Result<
|
||||
})
|
||||
}
|
||||
|
||||
fn remap_doc_opstamps(
|
||||
opstamps: Vec<Opstamp>,
|
||||
doc_id_mapping_opt: Option<&DocIdMapping>,
|
||||
) -> Vec<Opstamp> {
|
||||
if let Some(doc_id_mapping_opt) = doc_id_mapping_opt {
|
||||
doc_id_mapping_opt
|
||||
.iter_old_doc_ids()
|
||||
.map(|doc| opstamps[doc as usize])
|
||||
.collect()
|
||||
} else {
|
||||
opstamps
|
||||
}
|
||||
}
|
||||
|
||||
/// A `SegmentWriter` is in charge of creating segment index from a
|
||||
/// set of documents.
|
||||
///
|
||||
@@ -90,7 +74,7 @@ impl SegmentWriter {
|
||||
let tokenizer_manager = segment.index().tokenizers().clone();
|
||||
let tokenizer_manager_fast_field = segment.index().fast_field_tokenizer().clone();
|
||||
let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
|
||||
let segment_serializer = SegmentSerializer::for_segment(segment, false)?;
|
||||
let segment_serializer = SegmentSerializer::for_segment(segment)?;
|
||||
let per_field_postings_writers = PerFieldPostingsWriter::for_schema(&schema);
|
||||
let per_field_text_analyzers = schema
|
||||
.fields()
|
||||
@@ -139,15 +123,6 @@ impl SegmentWriter {
|
||||
/// be used afterwards.
|
||||
pub fn finalize(mut self) -> crate::Result<Vec<u64>> {
|
||||
self.fieldnorms_writer.fill_up_to_max_doc(self.max_doc);
|
||||
let mapping: Option<DocIdMapping> = self
|
||||
.segment_serializer
|
||||
.segment()
|
||||
.index()
|
||||
.settings()
|
||||
.sort_by_field
|
||||
.clone()
|
||||
.map(|sort_by_field| get_doc_id_mapping_from_field(sort_by_field, &self))
|
||||
.transpose()?;
|
||||
remap_and_write(
|
||||
self.schema,
|
||||
&self.per_field_postings_writers,
|
||||
@@ -155,10 +130,8 @@ impl SegmentWriter {
|
||||
self.fast_field_writers,
|
||||
&self.fieldnorms_writer,
|
||||
self.segment_serializer,
|
||||
mapping.as_ref(),
|
||||
)?;
|
||||
let doc_opstamps = remap_doc_opstamps(self.doc_opstamps, mapping.as_ref());
|
||||
Ok(doc_opstamps)
|
||||
Ok(self.doc_opstamps)
|
||||
}
|
||||
|
||||
/// Returns an estimation of the current memory usage of the segment writer.
|
||||
@@ -202,9 +175,8 @@ impl SegmentWriter {
|
||||
match field_entry.field_type() {
|
||||
FieldType::Facet(_) => {
|
||||
let mut facet_tokenizer = FacetTokenizer::default(); // this can be global
|
||||
for value_access in values {
|
||||
// Used to help with linting and type checking.
|
||||
let value = value_access as D::Value<'_>;
|
||||
for value in values {
|
||||
let value = value.as_value();
|
||||
|
||||
let facet_str = value.as_facet().ok_or_else(make_schema_error)?;
|
||||
let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
|
||||
@@ -220,15 +192,14 @@ impl SegmentWriter {
|
||||
}
|
||||
FieldType::Str(_) => {
|
||||
let mut indexing_position = IndexingPosition::default();
|
||||
for value_access in values {
|
||||
// Used to help with linting and type checking.
|
||||
let value = value_access as D::Value<'_>;
|
||||
for value in values {
|
||||
let value = value.as_value();
|
||||
|
||||
let mut token_stream = if let Some(text) = value.as_str() {
|
||||
let text_analyzer =
|
||||
&mut self.per_field_text_analyzers[field.field_id() as usize];
|
||||
text_analyzer.token_stream(text)
|
||||
} else if let Some(tok_str) = value.as_pre_tokenized_text() {
|
||||
} else if let Some(tok_str) = value.into_pre_tokenized_text() {
|
||||
BoxTokenStream::new(PreTokenizedStream::from(*tok_str.clone()))
|
||||
} else {
|
||||
continue;
|
||||
@@ -250,9 +221,8 @@ impl SegmentWriter {
|
||||
}
|
||||
FieldType::U64(_) => {
|
||||
let mut num_vals = 0;
|
||||
for value_access in values {
|
||||
// Used to help with linting and type checking.
|
||||
let value = value_access as D::Value<'_>;
|
||||
for value in values {
|
||||
let value = value.as_value();
|
||||
|
||||
num_vals += 1;
|
||||
let u64_val = value.as_u64().ok_or_else(make_schema_error)?;
|
||||
@@ -265,10 +235,8 @@ impl SegmentWriter {
|
||||
}
|
||||
FieldType::Date(_) => {
|
||||
let mut num_vals = 0;
|
||||
for value_access in values {
|
||||
// Used to help with linting and type checking.
|
||||
let value_access = value_access as D::Value<'_>;
|
||||
let value = value_access.as_value();
|
||||
for value in values {
|
||||
let value = value.as_value();
|
||||
|
||||
num_vals += 1;
|
||||
let date_val = value.as_datetime().ok_or_else(make_schema_error)?;
|
||||
@@ -282,9 +250,8 @@ impl SegmentWriter {
|
||||
}
|
||||
FieldType::I64(_) => {
|
||||
let mut num_vals = 0;
|
||||
for value_access in values {
|
||||
// Used to help with linting and type checking.
|
||||
let value = value_access as D::Value<'_>;
|
||||
for value in values {
|
||||
let value = value.as_value();
|
||||
|
||||
num_vals += 1;
|
||||
let i64_val = value.as_i64().ok_or_else(make_schema_error)?;
|
||||
@@ -297,10 +264,8 @@ impl SegmentWriter {
|
||||
}
|
||||
FieldType::F64(_) => {
|
||||
let mut num_vals = 0;
|
||||
for value_access in values {
|
||||
// Used to help with linting and type checking.
|
||||
let value = value_access as D::Value<'_>;
|
||||
|
||||
for value in values {
|
||||
let value = value.as_value();
|
||||
num_vals += 1;
|
||||
let f64_val = value.as_f64().ok_or_else(make_schema_error)?;
|
||||
term_buffer.set_f64(f64_val);
|
||||
@@ -312,10 +277,8 @@ impl SegmentWriter {
|
||||
}
|
||||
FieldType::Bool(_) => {
|
||||
let mut num_vals = 0;
|
||||
for value_access in values {
|
||||
// Used to help with linting and type checking.
|
||||
let value = value_access as D::Value<'_>;
|
||||
|
||||
for value in values {
|
||||
let value = value.as_value();
|
||||
num_vals += 1;
|
||||
let bool_val = value.as_bool().ok_or_else(make_schema_error)?;
|
||||
term_buffer.set_bool(bool_val);
|
||||
@@ -327,10 +290,8 @@ impl SegmentWriter {
|
||||
}
|
||||
FieldType::Bytes(_) => {
|
||||
let mut num_vals = 0;
|
||||
for value_access in values {
|
||||
// Used to help with linting and type checking.
|
||||
let value = value_access as D::Value<'_>;
|
||||
|
||||
for value in values {
|
||||
let value = value.as_value();
|
||||
num_vals += 1;
|
||||
let bytes = value.as_bytes().ok_or_else(make_schema_error)?;
|
||||
term_buffer.set_bytes(bytes);
|
||||
@@ -364,9 +325,8 @@ impl SegmentWriter {
|
||||
}
|
||||
FieldType::IpAddr(_) => {
|
||||
let mut num_vals = 0;
|
||||
for value_access in values {
|
||||
// Used to help with linting and type checking.
|
||||
let value = value_access as D::Value<'_>;
|
||||
for value in values {
|
||||
let value = value.as_value();
|
||||
|
||||
num_vals += 1;
|
||||
let ip_addr = value.as_ip_addr().ok_or_else(make_schema_error)?;
|
||||
@@ -432,11 +392,10 @@ fn remap_and_write(
|
||||
fast_field_writers: FastFieldsWriter,
|
||||
fieldnorms_writer: &FieldNormsWriter,
|
||||
mut serializer: SegmentSerializer,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> crate::Result<()> {
|
||||
debug!("remap-and-write");
|
||||
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
|
||||
fieldnorms_writer.serialize(fieldnorms_serializer, doc_id_map)?;
|
||||
fieldnorms_writer.serialize(fieldnorms_serializer)?;
|
||||
}
|
||||
let fieldnorm_data = serializer
|
||||
.segment()
|
||||
@@ -447,39 +406,10 @@ fn remap_and_write(
|
||||
schema,
|
||||
per_field_postings_writers,
|
||||
fieldnorm_readers,
|
||||
doc_id_map,
|
||||
serializer.get_postings_serializer(),
|
||||
)?;
|
||||
debug!("fastfield-serialize");
|
||||
fast_field_writers.serialize(serializer.get_fast_field_write(), doc_id_map)?;
|
||||
|
||||
// finalize temp docstore and create version, which reflects the doc_id_map
|
||||
if let Some(doc_id_map) = doc_id_map {
|
||||
debug!("resort-docstore");
|
||||
let store_write = serializer
|
||||
.segment_mut()
|
||||
.open_write(SegmentComponent::Store)?;
|
||||
let settings = serializer.segment().index().settings();
|
||||
let store_writer = StoreWriter::new(
|
||||
store_write,
|
||||
settings.docstore_compression,
|
||||
settings.docstore_blocksize,
|
||||
settings.docstore_compress_dedicated_thread,
|
||||
)?;
|
||||
let old_store_writer = std::mem::replace(&mut serializer.store_writer, store_writer);
|
||||
old_store_writer.close()?;
|
||||
let store_read = StoreReader::open(
|
||||
serializer
|
||||
.segment()
|
||||
.open_read(SegmentComponent::TempStore)?,
|
||||
1, /* The docstore is configured to have one doc per block, and each doc is accessed
|
||||
* only once: we don't need caching. */
|
||||
)?;
|
||||
for old_doc_id in doc_id_map.iter_old_doc_ids() {
|
||||
let doc_bytes = store_read.get_document_bytes(old_doc_id)?;
|
||||
serializer.get_store_writer().store_bytes(&doc_bytes)?;
|
||||
}
|
||||
}
|
||||
fast_field_writers.serialize(serializer.get_fast_field_write())?;
|
||||
|
||||
debug!("serializer-close");
|
||||
serializer.close()?;
|
||||
|
||||
24
src/lib.rs
24
src/lib.rs
@@ -222,8 +222,8 @@ pub use crate::core::{Executor, Searcher, SearcherGeneration};
|
||||
pub use crate::directory::Directory;
|
||||
#[allow(deprecated)] // Remove with index sorting
|
||||
pub use crate::index::{
|
||||
Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader, Order,
|
||||
Segment, SegmentMeta, SegmentReader,
|
||||
Index, IndexBuilder, IndexMeta, IndexSettings, InvertedIndexReader, Order, Segment,
|
||||
SegmentMeta, SegmentReader,
|
||||
};
|
||||
pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
|
||||
pub use crate::schema::{Document, TantivyDocument, Term};
|
||||
@@ -397,16 +397,20 @@ pub mod tests {
|
||||
#[macro_export]
|
||||
macro_rules! assert_nearly_equals {
|
||||
($left:expr, $right:expr) => {{
|
||||
match (&$left, &$right) {
|
||||
(left_val, right_val) => {
|
||||
assert_nearly_equals!($left, $right, 0.0005);
|
||||
}};
|
||||
($left:expr, $right:expr, $epsilon:expr) => {{
|
||||
match (&$left, &$right, &$epsilon) {
|
||||
(left_val, right_val, epsilon_val) => {
|
||||
let diff = (left_val - right_val).abs();
|
||||
let add = left_val.abs() + right_val.abs();
|
||||
if diff > 0.0005 * add {
|
||||
|
||||
if diff > *epsilon_val {
|
||||
panic!(
|
||||
r#"assertion failed: `(left ~= right)`
|
||||
left: `{:?}`,
|
||||
right: `{:?}`"#,
|
||||
&*left_val, &*right_val
|
||||
r#"assertion failed: `abs(left-right)>epsilon`
|
||||
left: `{:?}`,
|
||||
right: `{:?}`,
|
||||
epsilon: `{:?}`"#,
|
||||
&*left_val, &*right_val, &*epsilon_val
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
//! In "The beauty and the beast", the term "the" appears in position 0 and position 3.
|
||||
//! This information is useful to run phrase queries.
|
||||
//!
|
||||
//! The [position](crate::SegmentComponent::Positions) file contains all of the
|
||||
//! The [position](crate::index::SegmentComponent::Positions) file contains all of the
|
||||
//! bitpacked positions delta, for all terms of a given field, one term after the other.
|
||||
//!
|
||||
//! Each term is encoded independently.
|
||||
|
||||
@@ -3,7 +3,6 @@ use std::io;
|
||||
use common::json_path_writer::JSON_END_OF_PATH;
|
||||
use stacker::Addr;
|
||||
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::indexer::path_to_unordered_id::OrderedPathId;
|
||||
use crate::postings::postings_writer::SpecializedPostingsWriter;
|
||||
use crate::postings::recorder::{BufferLender, DocIdRecorder, Recorder};
|
||||
@@ -60,9 +59,8 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
|
||||
ordered_term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
|
||||
ordered_id_to_path: &[&str],
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
ctx: &IndexingContext,
|
||||
serializer: &mut FieldSerializer,
|
||||
) -> io::Result<()> {
|
||||
@@ -71,7 +69,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
||||
term_buffer.clear_with_field_and_type(Type::Json, Field::from_field_id(0));
|
||||
let mut prev_term_id = u32::MAX;
|
||||
let mut term_path_len = 0; // this will be set in the first iteration
|
||||
for (_field, path_id, term, addr) in term_addrs {
|
||||
for (_field, path_id, term, addr) in ordered_term_addrs {
|
||||
if prev_term_id != path_id.path_id() {
|
||||
term_buffer.truncate_value_bytes(0);
|
||||
term_buffer.append_path(ordered_id_to_path[path_id.path_id() as usize].as_bytes());
|
||||
@@ -87,7 +85,6 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
||||
SpecializedPostingsWriter::<Rec>::serialize_one_term(
|
||||
term_buffer.serialized_value_bytes(),
|
||||
*addr,
|
||||
doc_id_map,
|
||||
&mut buffer_lender,
|
||||
ctx,
|
||||
serializer,
|
||||
@@ -96,7 +93,6 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
||||
SpecializedPostingsWriter::<DocIdRecorder>::serialize_one_term(
|
||||
term_buffer.serialized_value_bytes(),
|
||||
*addr,
|
||||
doc_id_map,
|
||||
&mut buffer_lender,
|
||||
ctx,
|
||||
serializer,
|
||||
|
||||
@@ -15,6 +15,7 @@ pub trait Postings: DocSet + 'static {
|
||||
fn term_freq(&self) -> u32;
|
||||
|
||||
/// Returns the positions offset by a given value.
|
||||
/// It is not necessary to clear the `output` before calling this method.
|
||||
/// The output vector will be resized to the `term_freq`.
|
||||
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>);
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@ use std::ops::Range;
|
||||
use stacker::Addr;
|
||||
|
||||
use crate::fieldnorm::FieldNormReaders;
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::indexer::path_to_unordered_id::OrderedPathId;
|
||||
use crate::postings::recorder::{BufferLender, Recorder};
|
||||
use crate::postings::{
|
||||
@@ -50,7 +49,6 @@ pub(crate) fn serialize_postings(
|
||||
schema: Schema,
|
||||
per_field_postings_writers: &PerFieldPostingsWriter,
|
||||
fieldnorm_readers: FieldNormReaders,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
serializer: &mut InvertedIndexSerializer,
|
||||
) -> crate::Result<()> {
|
||||
// Replace unordered ids by ordered ids to be able to sort
|
||||
@@ -86,7 +84,6 @@ pub(crate) fn serialize_postings(
|
||||
postings_writer.serialize(
|
||||
&term_offsets[byte_offsets],
|
||||
&ordered_id_to_path,
|
||||
doc_id_map,
|
||||
&ctx,
|
||||
&mut field_serializer,
|
||||
)?;
|
||||
@@ -122,7 +119,6 @@ pub(crate) trait PostingsWriter: Send + Sync {
|
||||
&self,
|
||||
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
|
||||
ordered_id_to_path: &[&str],
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
ctx: &IndexingContext,
|
||||
serializer: &mut FieldSerializer,
|
||||
) -> io::Result<()>;
|
||||
@@ -187,7 +183,6 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
|
||||
pub(crate) fn serialize_one_term(
|
||||
term: &[u8],
|
||||
addr: Addr,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
buffer_lender: &mut BufferLender,
|
||||
ctx: &IndexingContext,
|
||||
serializer: &mut FieldSerializer,
|
||||
@@ -195,7 +190,7 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
|
||||
let recorder: Rec = ctx.term_index.read(addr);
|
||||
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
|
||||
serializer.new_term(term, term_doc_freq, recorder.has_term_freq())?;
|
||||
recorder.serialize(&ctx.arena, doc_id_map, serializer, buffer_lender);
|
||||
recorder.serialize(&ctx.arena, serializer, buffer_lender);
|
||||
serializer.close_term()?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -229,13 +224,12 @@ impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
|
||||
&self,
|
||||
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
|
||||
_ordered_id_to_path: &[&str],
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
ctx: &IndexingContext,
|
||||
serializer: &mut FieldSerializer,
|
||||
) -> io::Result<()> {
|
||||
let mut buffer_lender = BufferLender::default();
|
||||
for (_field, _path_id, term, addr) in term_addrs {
|
||||
Self::serialize_one_term(term, *addr, doc_id_map, &mut buffer_lender, ctx, serializer)?;
|
||||
Self::serialize_one_term(term, *addr, &mut buffer_lender, ctx, serializer)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use common::read_u32_vint;
|
||||
use stacker::{ExpUnrolledLinkedList, MemoryArena};
|
||||
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::FieldSerializer;
|
||||
use crate::DocId;
|
||||
|
||||
@@ -71,7 +70,6 @@ pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static {
|
||||
fn serialize(
|
||||
&self,
|
||||
arena: &MemoryArena,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
serializer: &mut FieldSerializer<'_>,
|
||||
buffer_lender: &mut BufferLender,
|
||||
);
|
||||
@@ -115,26 +113,15 @@ impl Recorder for DocIdRecorder {
|
||||
fn serialize(
|
||||
&self,
|
||||
arena: &MemoryArena,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
serializer: &mut FieldSerializer<'_>,
|
||||
buffer_lender: &mut BufferLender,
|
||||
) {
|
||||
let (buffer, doc_ids) = buffer_lender.lend_all();
|
||||
let buffer = buffer_lender.lend_u8();
|
||||
// TODO avoid reading twice.
|
||||
self.stack.read_to_end(arena, buffer);
|
||||
if let Some(doc_id_map) = doc_id_map {
|
||||
let iter = get_sum_reader(VInt32Reader::new(&buffer[..]));
|
||||
doc_ids.extend(iter.map(|old_doc_id| doc_id_map.get_new_doc_id(old_doc_id)));
|
||||
doc_ids.sort_unstable();
|
||||
|
||||
for doc in doc_ids {
|
||||
serializer.write_doc(*doc, 0u32, &[][..]);
|
||||
}
|
||||
} else {
|
||||
let iter = get_sum_reader(VInt32Reader::new(&buffer[..]));
|
||||
for doc_id in iter {
|
||||
serializer.write_doc(doc_id, 0u32, &[][..]);
|
||||
}
|
||||
let iter = get_sum_reader(VInt32Reader::new(&buffer[..]));
|
||||
for doc_id in iter {
|
||||
serializer.write_doc(doc_id, 0u32, &[][..]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -194,35 +181,18 @@ impl Recorder for TermFrequencyRecorder {
|
||||
fn serialize(
|
||||
&self,
|
||||
arena: &MemoryArena,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
serializer: &mut FieldSerializer<'_>,
|
||||
buffer_lender: &mut BufferLender,
|
||||
) {
|
||||
let buffer = buffer_lender.lend_u8();
|
||||
self.stack.read_to_end(arena, buffer);
|
||||
let mut u32_it = VInt32Reader::new(&buffer[..]);
|
||||
if let Some(doc_id_map) = doc_id_map {
|
||||
let mut doc_id_and_tf = vec![];
|
||||
let mut prev_doc = 0;
|
||||
while let Some(delta_doc_id) = u32_it.next() {
|
||||
let doc_id = prev_doc + delta_doc_id;
|
||||
prev_doc = doc_id;
|
||||
let term_freq = u32_it.next().unwrap_or(self.current_tf);
|
||||
doc_id_and_tf.push((doc_id_map.get_new_doc_id(doc_id), term_freq));
|
||||
}
|
||||
doc_id_and_tf.sort_unstable_by_key(|&(doc_id, _)| doc_id);
|
||||
|
||||
for (doc_id, tf) in doc_id_and_tf {
|
||||
serializer.write_doc(doc_id, tf, &[][..]);
|
||||
}
|
||||
} else {
|
||||
let mut prev_doc = 0;
|
||||
while let Some(delta_doc_id) = u32_it.next() {
|
||||
let doc_id = prev_doc + delta_doc_id;
|
||||
prev_doc = doc_id;
|
||||
let term_freq = u32_it.next().unwrap_or(self.current_tf);
|
||||
serializer.write_doc(doc_id, term_freq, &[][..]);
|
||||
}
|
||||
let mut prev_doc = 0;
|
||||
while let Some(delta_doc_id) = u32_it.next() {
|
||||
let doc_id = prev_doc + delta_doc_id;
|
||||
prev_doc = doc_id;
|
||||
let term_freq = u32_it.next().unwrap_or(self.current_tf);
|
||||
serializer.write_doc(doc_id, term_freq, &[][..]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -268,14 +238,12 @@ impl Recorder for TfAndPositionRecorder {
|
||||
fn serialize(
|
||||
&self,
|
||||
arena: &MemoryArena,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
serializer: &mut FieldSerializer<'_>,
|
||||
buffer_lender: &mut BufferLender,
|
||||
) {
|
||||
let (buffer_u8, buffer_positions) = buffer_lender.lend_all();
|
||||
self.stack.read_to_end(arena, buffer_u8);
|
||||
let mut u32_it = VInt32Reader::new(&buffer_u8[..]);
|
||||
let mut doc_id_and_positions = vec![];
|
||||
let mut prev_doc = 0;
|
||||
while let Some(delta_doc_id) = u32_it.next() {
|
||||
let doc_id = prev_doc + delta_doc_id;
|
||||
@@ -294,19 +262,7 @@ impl Recorder for TfAndPositionRecorder {
|
||||
}
|
||||
}
|
||||
}
|
||||
if let Some(doc_id_map) = doc_id_map {
|
||||
// this simple variant to remap may consume to much memory
|
||||
doc_id_and_positions
|
||||
.push((doc_id_map.get_new_doc_id(doc_id), buffer_positions.to_vec()));
|
||||
} else {
|
||||
serializer.write_doc(doc_id, buffer_positions.len() as u32, buffer_positions);
|
||||
}
|
||||
}
|
||||
if doc_id_map.is_some() {
|
||||
doc_id_and_positions.sort_unstable_by_key(|&(doc_id, _)| doc_id);
|
||||
for (doc_id, positions) in doc_id_and_positions {
|
||||
serializer.write_doc(doc_id, positions.len() as u32, &positions);
|
||||
}
|
||||
serializer.write_doc(doc_id, buffer_positions.len() as u32, buffer_positions);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -22,10 +22,7 @@ pub struct AllWeight;
|
||||
|
||||
impl Weight for AllWeight {
|
||||
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
||||
let all_scorer = AllScorer {
|
||||
doc: 0u32,
|
||||
max_doc: reader.max_doc(),
|
||||
};
|
||||
let all_scorer = AllScorer::new(reader.max_doc());
|
||||
Ok(Box::new(BoostScorer::new(all_scorer, boost)))
|
||||
}
|
||||
|
||||
@@ -43,6 +40,13 @@ pub struct AllScorer {
|
||||
max_doc: DocId,
|
||||
}
|
||||
|
||||
impl AllScorer {
|
||||
/// Creates a new AllScorer with `max_doc` docs.
|
||||
pub fn new(max_doc: DocId) -> AllScorer {
|
||||
AllScorer { doc: 0u32, max_doc }
|
||||
}
|
||||
}
|
||||
|
||||
impl DocSet for AllScorer {
|
||||
#[inline(always)]
|
||||
fn advance(&mut self) -> DocId {
|
||||
|
||||
@@ -66,6 +66,10 @@ use crate::schema::{IndexRecordOption, Term};
|
||||
/// Term::from_field_text(title, "diary"),
|
||||
/// IndexRecordOption::Basic,
|
||||
/// ));
|
||||
/// let cow_term_query: Box<dyn Query> = Box::new(TermQuery::new(
|
||||
/// Term::from_field_text(title, "cow"),
|
||||
/// IndexRecordOption::Basic
|
||||
/// ));
|
||||
/// // A TermQuery with "found" in the body
|
||||
/// let body_term_query: Box<dyn Query> = Box::new(TermQuery::new(
|
||||
/// Term::from_field_text(body, "found"),
|
||||
@@ -74,7 +78,7 @@ use crate::schema::{IndexRecordOption, Term};
|
||||
/// // TermQuery "diary" must and "girl" must not be present
|
||||
/// let queries_with_occurs1 = vec![
|
||||
/// (Occur::Must, diary_term_query.box_clone()),
|
||||
/// (Occur::MustNot, girl_term_query),
|
||||
/// (Occur::MustNot, girl_term_query.box_clone()),
|
||||
/// ];
|
||||
/// // Make a BooleanQuery equivalent to
|
||||
/// // title:+diary title:-girl
|
||||
@@ -82,15 +86,10 @@ use crate::schema::{IndexRecordOption, Term};
|
||||
/// let count1 = searcher.search(&diary_must_and_girl_mustnot, &Count)?;
|
||||
/// assert_eq!(count1, 1);
|
||||
///
|
||||
/// // TermQuery for "cow" in the title
|
||||
/// let cow_term_query: Box<dyn Query> = Box::new(TermQuery::new(
|
||||
/// Term::from_field_text(title, "cow"),
|
||||
/// IndexRecordOption::Basic,
|
||||
/// ));
|
||||
/// // "title:diary OR title:cow"
|
||||
/// let title_diary_or_cow = BooleanQuery::new(vec![
|
||||
/// (Occur::Should, diary_term_query.box_clone()),
|
||||
/// (Occur::Should, cow_term_query),
|
||||
/// (Occur::Should, cow_term_query.box_clone()),
|
||||
/// ]);
|
||||
/// let count2 = searcher.search(&title_diary_or_cow, &Count)?;
|
||||
/// assert_eq!(count2, 4);
|
||||
@@ -118,21 +117,38 @@ use crate::schema::{IndexRecordOption, Term};
|
||||
/// ]);
|
||||
/// let count4 = searcher.search(&nested_query, &Count)?;
|
||||
/// assert_eq!(count4, 1);
|
||||
///
|
||||
/// // You may call `with_minimum_required_clauses` to
|
||||
/// // specify the number of should clauses the returned documents must match.
|
||||
/// let minimum_required_query = BooleanQuery::with_minimum_required_clauses(vec![
|
||||
/// (Occur::Should, cow_term_query.box_clone()),
|
||||
/// (Occur::Should, girl_term_query.box_clone()),
|
||||
/// (Occur::Should, diary_term_query.box_clone()),
|
||||
/// ], 2);
|
||||
/// // Returns documents that contain "Diary Cow", "Diary Girl" or "Cow Girl".
|
||||
/// // Notice: "Diary" isn't "Dairy". ;-)
|
||||
/// let count5 = searcher.search(&minimum_required_query, &Count)?;
|
||||
/// assert_eq!(count5, 1);
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
#[derive(Debug)]
|
||||
pub struct BooleanQuery {
|
||||
subqueries: Vec<(Occur, Box<dyn Query>)>,
|
||||
minimum_number_should_match: usize,
|
||||
}
|
||||
|
||||
impl Clone for BooleanQuery {
|
||||
fn clone(&self) -> Self {
|
||||
self.subqueries
|
||||
let subqueries = self
|
||||
.subqueries
|
||||
.iter()
|
||||
.map(|(occur, subquery)| (*occur, subquery.box_clone()))
|
||||
.collect::<Vec<_>>()
|
||||
.into()
|
||||
.collect::<Vec<_>>();
|
||||
Self {
|
||||
subqueries,
|
||||
minimum_number_should_match: self.minimum_number_should_match,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -149,8 +165,9 @@ impl Query for BooleanQuery {
|
||||
.iter()
|
||||
.map(|(occur, subquery)| Ok((*occur, subquery.weight(enable_scoring)?)))
|
||||
.collect::<crate::Result<_>>()?;
|
||||
Ok(Box::new(BooleanWeight::new(
|
||||
Ok(Box::new(BooleanWeight::with_minimum_number_should_match(
|
||||
sub_weights,
|
||||
self.minimum_number_should_match,
|
||||
enable_scoring.is_scoring_enabled(),
|
||||
Box::new(SumWithCoordsCombiner::default),
|
||||
)))
|
||||
@@ -166,7 +183,41 @@ impl Query for BooleanQuery {
|
||||
impl BooleanQuery {
|
||||
/// Creates a new boolean query.
|
||||
pub fn new(subqueries: Vec<(Occur, Box<dyn Query>)>) -> BooleanQuery {
|
||||
BooleanQuery { subqueries }
|
||||
// If the bool query includes at least one should clause
|
||||
// and no Must or MustNot clauses, the default value is 1. Otherwise, the default value is
|
||||
// 0. This mirrors Elasticsearch's behavior.
|
||||
let mut minimum_required = 0;
|
||||
for (occur, _) in &subqueries {
|
||||
match occur {
|
||||
Occur::Should => minimum_required = 1,
|
||||
Occur::Must | Occur::MustNot => {
|
||||
minimum_required = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
Self::with_minimum_required_clauses(subqueries, minimum_required)
|
||||
}
|
||||
|
||||
/// Creates a new boolean query with the specified minimum number of should clauses that must match.
|
||||
pub fn with_minimum_required_clauses(
|
||||
subqueries: Vec<(Occur, Box<dyn Query>)>,
|
||||
minimum_number_should_match: usize,
|
||||
) -> BooleanQuery {
|
||||
BooleanQuery {
|
||||
subqueries,
|
||||
minimum_number_should_match,
|
||||
}
|
||||
}
|
||||
|
||||
/// Getter for `minimum_number_should_match`
|
||||
pub fn get_minimum_number_should_match(&self) -> usize {
|
||||
self.minimum_number_should_match
|
||||
}
|
||||
|
||||
/// Setter for `minimum_number_should_match`
|
||||
pub fn set_minimum_number_should_match(&mut self, minimum_number_should_match: usize) {
|
||||
self.minimum_number_should_match = minimum_number_should_match;
|
||||
}
|
||||
|
||||
/// Returns the intersection of the queries.
|
||||
@@ -181,6 +232,18 @@ impl BooleanQuery {
|
||||
BooleanQuery::new(subqueries)
|
||||
}
|
||||
|
||||
/// Returns the union of the queries, requiring at least `minimum_required_clauses` of them to match.
|
||||
pub fn union_with_minimum_required_clauses(
|
||||
queries: Vec<Box<dyn Query>>,
|
||||
minimum_required_clauses: usize,
|
||||
) -> BooleanQuery {
|
||||
let subqueries = queries
|
||||
.into_iter()
|
||||
.map(|sub_query| (Occur::Should, sub_query))
|
||||
.collect();
|
||||
BooleanQuery::with_minimum_required_clauses(subqueries, minimum_required_clauses)
|
||||
}
|
||||
|
||||
/// Helper method to create a boolean query matching a given list of terms.
|
||||
/// The resulting query is a disjunction of the terms.
|
||||
pub fn new_multiterms_query(terms: Vec<Term>) -> BooleanQuery {
|
||||
@@ -203,11 +266,13 @@ impl BooleanQuery {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashSet;
|
||||
|
||||
use super::BooleanQuery;
|
||||
use crate::collector::{Count, DocSetCollector};
|
||||
use crate::query::{QueryClone, QueryParser, TermQuery};
|
||||
use crate::schema::{IndexRecordOption, Schema, TEXT};
|
||||
use crate::{DocAddress, Index, Term};
|
||||
use crate::query::{Query, QueryClone, QueryParser, TermQuery};
|
||||
use crate::schema::{Field, IndexRecordOption, Schema, TEXT};
|
||||
use crate::{DocAddress, DocId, Index, Term};
|
||||
|
||||
fn create_test_index() -> crate::Result<Index> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -223,6 +288,73 @@ mod tests {
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_minimum_required() -> crate::Result<()> {
|
||||
fn create_test_index_with<T: IntoIterator<Item = &'static str>>(
|
||||
docs: T,
|
||||
) -> crate::Result<Index> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut writer = index.writer_for_tests()?;
|
||||
for doc in docs {
|
||||
writer.add_document(doc!(text => doc))?;
|
||||
}
|
||||
writer.commit()?;
|
||||
Ok(index)
|
||||
}
|
||||
fn create_boolean_query_with_mr<T: IntoIterator<Item = &'static str>>(
|
||||
queries: T,
|
||||
field: Field,
|
||||
mr: usize,
|
||||
) -> BooleanQuery {
|
||||
let terms = queries
|
||||
.into_iter()
|
||||
.map(|t| Term::from_field_text(field, t))
|
||||
.map(|t| TermQuery::new(t, IndexRecordOption::Basic))
|
||||
.map(|q| -> Box<dyn Query> { Box::new(q) })
|
||||
.collect();
|
||||
BooleanQuery::union_with_minimum_required_clauses(terms, mr)
|
||||
}
|
||||
fn check_doc_id<T: IntoIterator<Item = DocId>>(
|
||||
expected: T,
|
||||
actually: HashSet<DocAddress>,
|
||||
seg: u32,
|
||||
) {
|
||||
assert_eq!(
|
||||
actually,
|
||||
expected
|
||||
.into_iter()
|
||||
.map(|id| DocAddress::new(seg, id))
|
||||
.collect()
|
||||
);
|
||||
}
|
||||
let index = create_test_index_with(["a b c", "a c e", "d f g", "z z z", "c i b"])?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let text = index.schema().get_field("text").unwrap();
|
||||
// Documents containing 'a c', 'a z', 'a i', 'c z', 'c i' or 'z i' shall be returned.
|
||||
let q1 = create_boolean_query_with_mr(["a", "c", "z", "i"], text, 2);
|
||||
let docs = searcher.search(&q1, &DocSetCollector)?;
|
||||
check_doc_id([0, 1, 4], docs, 0);
|
||||
// Documents containing 'a b c', 'a b e', 'a c e' or 'b c e' shall be returned.
|
||||
let q2 = create_boolean_query_with_mr(["a", "b", "c", "e"], text, 3);
|
||||
let docs = searcher.search(&q2, &DocSetCollector)?;
|
||||
check_doc_id([0, 1], docs, 0);
|
||||
// Nothing matches since minimum_required exceeds the number of should clauses.
|
||||
let q3 = create_boolean_query_with_mr(["a", "b"], text, 3);
|
||||
let docs = searcher.search(&q3, &DocSetCollector)?;
|
||||
assert!(docs.is_empty());
|
||||
// When mr is set to zero or one, there is no difference from `Boolean::Union`.
|
||||
let q4 = create_boolean_query_with_mr(["a", "z"], text, 1);
|
||||
let docs = searcher.search(&q4, &DocSetCollector)?;
|
||||
check_doc_id([0, 1, 3], docs, 0);
|
||||
let q5 = create_boolean_query_with_mr(["a", "b"], text, 0);
|
||||
let docs = searcher.search(&q5, &DocSetCollector)?;
|
||||
check_doc_id([0, 1, 4], docs, 0);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union() -> crate::Result<()> {
|
||||
let index = create_test_index()?;
|
||||
|
||||
@@ -3,6 +3,7 @@ use std::collections::HashMap;
|
||||
use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
|
||||
use crate::index::SegmentReader;
|
||||
use crate::postings::FreqReadingOption;
|
||||
use crate::query::disjunction::Disjunction;
|
||||
use crate::query::explanation::does_not_match;
|
||||
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
|
||||
use crate::query::term_query::TermScorer;
|
||||
@@ -18,6 +19,26 @@ enum SpecializedScorer {
|
||||
Other(Box<dyn Scorer>),
|
||||
}
|
||||
|
||||
fn scorer_disjunction<TScoreCombiner>(
|
||||
scorers: Vec<Box<dyn Scorer>>,
|
||||
score_combiner: TScoreCombiner,
|
||||
minimum_match_required: usize,
|
||||
) -> Box<dyn Scorer>
|
||||
where
|
||||
TScoreCombiner: ScoreCombiner,
|
||||
{
|
||||
debug_assert!(!scorers.is_empty());
|
||||
debug_assert!(minimum_match_required > 1);
|
||||
if scorers.len() == 1 {
|
||||
return scorers.into_iter().next().unwrap(); // Safe unwrap.
|
||||
}
|
||||
Box::new(Disjunction::new(
|
||||
scorers,
|
||||
score_combiner,
|
||||
minimum_match_required,
|
||||
))
|
||||
}
|
||||
|
||||
fn scorer_union<TScoreCombiner>(
|
||||
scorers: Vec<Box<dyn Scorer>>,
|
||||
score_combiner_fn: impl Fn() -> TScoreCombiner,
|
||||
@@ -70,6 +91,7 @@ fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
|
||||
/// Weight associated to the `BoolQuery`.
|
||||
pub struct BooleanWeight<TScoreCombiner: ScoreCombiner> {
|
||||
weights: Vec<(Occur, Box<dyn Weight>)>,
|
||||
minimum_number_should_match: usize,
|
||||
scoring_enabled: bool,
|
||||
score_combiner_fn: Box<dyn Fn() -> TScoreCombiner + Sync + Send>,
|
||||
}
|
||||
@@ -85,6 +107,22 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
|
||||
weights,
|
||||
scoring_enabled,
|
||||
score_combiner_fn,
|
||||
minimum_number_should_match: 1,
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new boolean weight with the specified minimum number of should clauses that must match.
|
||||
pub fn with_minimum_number_should_match(
|
||||
weights: Vec<(Occur, Box<dyn Weight>)>,
|
||||
minimum_number_should_match: usize,
|
||||
scoring_enabled: bool,
|
||||
score_combiner_fn: Box<dyn Fn() -> TScoreCombiner + Sync + Send + 'static>,
|
||||
) -> BooleanWeight<TScoreCombiner> {
|
||||
BooleanWeight {
|
||||
weights,
|
||||
minimum_number_should_match,
|
||||
scoring_enabled,
|
||||
score_combiner_fn,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -111,43 +149,89 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
|
||||
score_combiner_fn: impl Fn() -> TComplexScoreCombiner,
|
||||
) -> crate::Result<SpecializedScorer> {
|
||||
let mut per_occur_scorers = self.per_occur_scorers(reader, boost)?;
|
||||
|
||||
let should_scorer_opt: Option<SpecializedScorer> = per_occur_scorers
|
||||
.remove(&Occur::Should)
|
||||
.map(|scorers| scorer_union(scorers, &score_combiner_fn));
|
||||
// Indicate how should clauses are combined with other clauses.
|
||||
enum CombinationMethod {
|
||||
Ignored,
|
||||
// Only contributes to final score.
|
||||
Optional(SpecializedScorer),
|
||||
// Must be matched.
|
||||
Required(Box<dyn Scorer>),
|
||||
}
|
||||
let mut must_scorers = per_occur_scorers.remove(&Occur::Must);
|
||||
let should_opt = if let Some(mut should_scorers) = per_occur_scorers.remove(&Occur::Should)
|
||||
{
|
||||
let num_of_should_scorers = should_scorers.len();
|
||||
if self.minimum_number_should_match > num_of_should_scorers {
|
||||
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
|
||||
}
|
||||
match self.minimum_number_should_match {
|
||||
0 => CombinationMethod::Optional(scorer_union(should_scorers, &score_combiner_fn)),
|
||||
1 => CombinationMethod::Required(into_box_scorer(
|
||||
scorer_union(should_scorers, &score_combiner_fn),
|
||||
&score_combiner_fn,
|
||||
)),
|
||||
n if num_of_should_scorers == n => {
|
||||
// When minimum_number_should_match equals the number of should clauses,
|
||||
// they are no different from must clauses.
|
||||
must_scorers = match must_scorers.take() {
|
||||
Some(mut must_scorers) => {
|
||||
must_scorers.append(&mut should_scorers);
|
||||
Some(must_scorers)
|
||||
}
|
||||
None => Some(should_scorers),
|
||||
};
|
||||
CombinationMethod::Ignored
|
||||
}
|
||||
_ => CombinationMethod::Required(scorer_disjunction(
|
||||
should_scorers,
|
||||
score_combiner_fn(),
|
||||
self.minimum_number_should_match,
|
||||
)),
|
||||
}
|
||||
} else {
|
||||
// No should clauses are provided.
|
||||
if self.minimum_number_should_match > 0 {
|
||||
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
|
||||
} else {
|
||||
CombinationMethod::Ignored
|
||||
}
|
||||
};
|
||||
let exclude_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
|
||||
.remove(&Occur::MustNot)
|
||||
.map(|scorers| scorer_union(scorers, DoNothingCombiner::default))
|
||||
.map(|specialized_scorer| {
|
||||
.map(|specialized_scorer: SpecializedScorer| {
|
||||
into_box_scorer(specialized_scorer, DoNothingCombiner::default)
|
||||
});
|
||||
|
||||
let must_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
|
||||
.remove(&Occur::Must)
|
||||
.map(intersect_scorers);
|
||||
|
||||
let positive_scorer: SpecializedScorer = match (should_scorer_opt, must_scorer_opt) {
|
||||
(Some(should_scorer), Some(must_scorer)) => {
|
||||
let positive_scorer = match (should_opt, must_scorers) {
|
||||
(CombinationMethod::Ignored, Some(must_scorers)) => {
|
||||
SpecializedScorer::Other(intersect_scorers(must_scorers))
|
||||
}
|
||||
(CombinationMethod::Optional(should_scorer), Some(must_scorers)) => {
|
||||
let must_scorer = intersect_scorers(must_scorers);
|
||||
if self.scoring_enabled {
|
||||
SpecializedScorer::Other(Box::new(RequiredOptionalScorer::<
|
||||
Box<dyn Scorer>,
|
||||
Box<dyn Scorer>,
|
||||
TComplexScoreCombiner,
|
||||
>::new(
|
||||
must_scorer,
|
||||
into_box_scorer(should_scorer, &score_combiner_fn),
|
||||
)))
|
||||
SpecializedScorer::Other(Box::new(
|
||||
RequiredOptionalScorer::<_, _, TScoreCombiner>::new(
|
||||
must_scorer,
|
||||
into_box_scorer(should_scorer, &score_combiner_fn),
|
||||
),
|
||||
))
|
||||
} else {
|
||||
SpecializedScorer::Other(must_scorer)
|
||||
}
|
||||
}
|
||||
(None, Some(must_scorer)) => SpecializedScorer::Other(must_scorer),
|
||||
(Some(should_scorer), None) => should_scorer,
|
||||
(None, None) => {
|
||||
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
|
||||
(CombinationMethod::Required(should_scorer), Some(mut must_scorers)) => {
|
||||
must_scorers.push(should_scorer);
|
||||
SpecializedScorer::Other(intersect_scorers(must_scorers))
|
||||
}
|
||||
(CombinationMethod::Ignored, None) => {
|
||||
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)))
|
||||
}
|
||||
(CombinationMethod::Required(should_scorer), None) => {
|
||||
SpecializedScorer::Other(should_scorer)
|
||||
}
|
||||
// Optional should clauses are promoted to required if no must scorers exist.
|
||||
(CombinationMethod::Optional(should_scorer), None) => should_scorer,
|
||||
};
|
||||
|
||||
if let Some(exclude_scorer) = exclude_scorer_opt {
|
||||
let positive_scorer_boxed = into_box_scorer(positive_scorer, &score_combiner_fn);
|
||||
Ok(SpecializedScorer::Other(Box::new(Exclude::new(
|
||||
|
||||
327
src/query/disjunction.rs
Normal file
327
src/query/disjunction.rs
Normal file
@@ -0,0 +1,327 @@
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
|
||||
use crate::query::score_combiner::DoNothingCombiner;
|
||||
use crate::query::{ScoreCombiner, Scorer};
|
||||
use crate::{DocId, DocSet, Score, TERMINATED};
|
||||
|
||||
/// `Disjunction` is responsible for merging `DocSet`s from multiple
|
||||
/// sources. Specifically, it takes the union of two or more `DocSet`s
|
||||
/// then filtering out elements that appear fewer times than a
|
||||
/// specified threshold.
|
||||
pub struct Disjunction<TScorer, TScoreCombiner = DoNothingCombiner> {
|
||||
chains: BinaryHeap<ScorerWrapper<TScorer>>,
|
||||
minimum_matches_required: usize,
|
||||
score_combiner: TScoreCombiner,
|
||||
|
||||
current_doc: DocId,
|
||||
current_score: Score,
|
||||
}
|
||||
|
||||
/// A wrapper around a `Scorer` that caches the current `doc_id` and implements the `DocSet` trait.
|
||||
/// Also, the `Ord` trait and it's family are implemented reversely. So that we can combine
|
||||
/// `std::BinaryHeap<ScorerWrapper<T>>` to gain a min-heap with current doc id as key.
|
||||
struct ScorerWrapper<T> {
|
||||
scorer: T,
|
||||
current_doc: DocId,
|
||||
}
|
||||
|
||||
impl<T: Scorer> ScorerWrapper<T> {
    /// Wraps `scorer`, caching the doc it is positioned on at that moment.
    fn new(scorer: T) -> Self {
        Self {
            // Read the doc before `scorer` is moved into the struct.
            current_doc: scorer.doc(),
            scorer,
        }
    }
}
|
||||
|
||||
impl<T: Scorer> PartialEq for ScorerWrapper<T> {
    /// Two wrappers are equal when they are positioned on the same doc.
    fn eq(&self, other: &Self) -> bool {
        self.current_doc == other.current_doc
    }
}
|
||||
|
||||
// Marker impl: `Ord` requires `Eq`, and equality on the cached doc id is a
// full equivalence relation.
impl<T: Scorer> Eq for ScorerWrapper<T> {}
|
||||
|
||||
impl<T: Scorer> PartialOrd for ScorerWrapper<T> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Scorer> Ord for ScorerWrapper<T> {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
self.doc().cmp(&other.doc()).reverse()
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Scorer> DocSet for ScorerWrapper<T> {
|
||||
fn advance(&mut self) -> DocId {
|
||||
let doc_id = self.scorer.advance();
|
||||
self.current_doc = doc_id;
|
||||
doc_id
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.scorer.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Disjunction<TScorer, TScoreCombiner> {
|
||||
pub fn new<T: IntoIterator<Item = TScorer>>(
|
||||
docsets: T,
|
||||
score_combiner: TScoreCombiner,
|
||||
minimum_matches_required: usize,
|
||||
) -> Self {
|
||||
debug_assert!(
|
||||
minimum_matches_required > 1,
|
||||
"union scorer works better if just one matches required"
|
||||
);
|
||||
let chains = docsets
|
||||
.into_iter()
|
||||
.map(|doc| ScorerWrapper::new(doc))
|
||||
.collect();
|
||||
let mut disjunction = Self {
|
||||
chains,
|
||||
score_combiner,
|
||||
current_doc: TERMINATED,
|
||||
minimum_matches_required,
|
||||
current_score: 0.0,
|
||||
};
|
||||
if minimum_matches_required > disjunction.chains.len() {
|
||||
return disjunction;
|
||||
}
|
||||
disjunction.advance();
|
||||
disjunction
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> DocSet
|
||||
for Disjunction<TScorer, TScoreCombiner>
|
||||
{
|
||||
fn advance(&mut self) -> DocId {
|
||||
let mut current_num_matches = 0;
|
||||
while let Some(mut candidate) = self.chains.pop() {
|
||||
let next = candidate.doc();
|
||||
if next != TERMINATED {
|
||||
// Peek next doc.
|
||||
if self.current_doc != next {
|
||||
if current_num_matches >= self.minimum_matches_required {
|
||||
self.chains.push(candidate);
|
||||
self.current_score = self.score_combiner.score();
|
||||
return self.current_doc;
|
||||
}
|
||||
// Reset current_num_matches and scores.
|
||||
current_num_matches = 0;
|
||||
self.current_doc = next;
|
||||
self.score_combiner.clear();
|
||||
}
|
||||
current_num_matches += 1;
|
||||
self.score_combiner.update(&mut candidate.scorer);
|
||||
candidate.advance();
|
||||
self.chains.push(candidate);
|
||||
}
|
||||
}
|
||||
if current_num_matches < self.minimum_matches_required {
|
||||
self.current_doc = TERMINATED;
|
||||
}
|
||||
self.current_score = self.score_combiner.score();
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.chains
|
||||
.iter()
|
||||
.map(|docset| docset.size_hint())
|
||||
.max()
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Scorer
|
||||
for Disjunction<TScorer, TScoreCombiner>
|
||||
{
|
||||
fn score(&mut self) -> Score {
|
||||
self.current_score
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::collections::BTreeMap;

    use super::Disjunction;
    use crate::query::score_combiner::DoNothingCombiner;
    use crate::query::{ConstScorer, Scorer, SumCombiner, VecDocSet};
    use crate::{DocId, DocSet, Score, TERMINATED};

    /// Reference implementation: returns, in increasing order, the elements
    /// that occur at least `pass_line` times across all `arrays`.
    fn conjunct<T: Ord + Copy>(arrays: &[Vec<T>], pass_line: usize) -> Vec<T> {
        let mut counts = BTreeMap::new();
        for &element in arrays.iter().flatten() {
            *counts.entry(element).or_insert(0) += 1;
        }
        counts
            .into_iter()
            .filter(|&(_, count)| count >= pass_line)
            .map(|(element, _)| element)
            .collect()
    }

    /// Checks that a `Disjunction` over `vals` yields exactly the docs computed
    /// by `conjunct`, and that counting the docs gives the same total.
    fn aux_test_conjunction(vals: Vec<Vec<u32>>, min_match: usize) {
        let mut union_expected = VecDocSet::from(conjunct(&vals, min_match));
        let make_scorer = || {
            let scorers = vals
                .iter()
                .map(|docs| ConstScorer::new(VecDocSet::from(docs.clone()), 1.0));
            Disjunction::new(scorers, DoNothingCombiner, min_match)
        };
        let mut scorer: Disjunction<_, DoNothingCombiner> = make_scorer();
        let mut num_docs = 0;
        while scorer.doc() != TERMINATED {
            assert_eq!(union_expected.doc(), scorer.doc());
            assert_eq!(union_expected.advance(), scorer.advance());
            num_docs += 1;
        }
        assert_eq!(union_expected.advance(), TERMINATED);
        assert_eq!(num_docs, make_scorer().count_including_deleted());
    }

    // The two `test_arg_check*` tests expect the `debug_assert!` in
    // `Disjunction::new` to reject thresholds that are not greater than 1.
    #[should_panic]
    #[test]
    fn test_arg_check1() {
        aux_test_conjunction(vec![], 0);
    }

    #[should_panic]
    #[test]
    fn test_arg_check2() {
        aux_test_conjunction(vec![], 1);
    }

    #[test]
    fn test_corner_case() {
        aux_test_conjunction(vec![], 2);
        aux_test_conjunction(vec![vec![]; 1000], 2);
        aux_test_conjunction(vec![vec![]; 100], usize::MAX);
        aux_test_conjunction(vec![vec![0xC0FFEE]; 10000], usize::MAX);
        aux_test_conjunction((1..10000u32).map(|i| vec![i]).collect::<Vec<_>>(), 2);
    }

    #[test]
    fn test_conjunction() {
        aux_test_conjunction(
            vec![
                vec![1, 3333, 100000000u32],
                vec![1, 2, 100000000u32],
                vec![1, 2, 100000000u32],
            ],
            2,
        );
        aux_test_conjunction(
            vec![vec![8], vec![3, 4, 0xC0FFEEu32], vec![1, 2, 100000000u32]],
            2,
        );
        aux_test_conjunction(
            vec![
                vec![1, 3333, 100000000u32],
                vec![1, 2, 100000000u32],
                vec![1, 2, 100000000u32],
            ],
            3,
        );
    }

    /// Test scorer that replays a fixed list of `(doc, score)` pairs in order.
    #[derive(Clone)]
    struct DummyScorer {
        cursor: usize,
        doc_scores: Vec<(DocId, f32)>,
    }

    impl DummyScorer {
        fn new(doc_score: Vec<(DocId, f32)>) -> Self {
            Self {
                cursor: 0,
                doc_scores: doc_score,
            }
        }
    }

    impl DocSet for DummyScorer {
        fn advance(&mut self) -> DocId {
            self.cursor += 1;
            self.doc()
        }

        fn doc(&self) -> DocId {
            // Past the end of the list the scorer reports `TERMINATED`.
            match self.doc_scores.get(self.cursor) {
                Some(&(doc, _)) => doc,
                None => TERMINATED,
            }
        }

        fn size_hint(&self) -> u32 {
            self.doc_scores.len() as u32
        }
    }

    impl Scorer for DummyScorer {
        fn score(&mut self) -> Score {
            // Past the end of the list the score is 0.
            match self.doc_scores.get(self.cursor) {
                Some(&(_, score)) => score,
                None => 0.0,
            }
        }
    }

    /// Builds a `DummyScorer` matching `docs`, each with a score of 1.
    fn unit_scorer(docs: &[DocId]) -> DummyScorer {
        DummyScorer::new(docs.iter().map(|&doc| (doc, 1f32)).collect())
    }

    #[test]
    fn test_score_calculate() {
        let mut scorer = Disjunction::new(
            vec![
                unit_scorer(&[1, 2]),
                unit_scorer(&[1, 3]),
                unit_scorer(&[1, 4]),
                unit_scorer(&[1, 2]),
                unit_scorer(&[1, 2]),
            ],
            SumCombiner::default(),
            3,
        );
        assert_eq!(scorer.score(), 5.0);
        assert_eq!(scorer.advance(), 2);
        assert_eq!(scorer.score(), 3.0);
    }

    #[test]
    fn test_score_calculate_corner_case() {
        let mut scorer = Disjunction::new(
            vec![
                unit_scorer(&[1, 2]),
                unit_scorer(&[1, 3]),
                unit_scorer(&[1, 3]),
            ],
            SumCombiner::default(),
            2,
        );
        assert_eq!(scorer.doc(), 1);
        assert_eq!(scorer.score(), 3.0);
        assert_eq!(scorer.advance(), 3);
        assert_eq!(scorer.score(), 2.0);
    }
}
|
||||
@@ -149,7 +149,7 @@ mod tests {
|
||||
use crate::query::exist_query::ExistsQuery;
|
||||
use crate::query::{BooleanQuery, RangeQuery};
|
||||
use crate::schema::{Facet, FacetOptions, Schema, FAST, INDEXED, STRING, TEXT};
|
||||
use crate::{Index, Searcher};
|
||||
use crate::{Index, Searcher, Term};
|
||||
|
||||
#[test]
|
||||
fn test_exists_query_simple() -> crate::Result<()> {
|
||||
@@ -188,9 +188,8 @@ mod tests {
|
||||
|
||||
// exercise seek
|
||||
let query = BooleanQuery::intersection(vec![
|
||||
Box::new(RangeQuery::new_u64_bounds(
|
||||
"all".to_string(),
|
||||
Bound::Included(50),
|
||||
Box::new(RangeQuery::new(
|
||||
Bound::Included(Term::from_field_u64(all_field, 50)),
|
||||
Bound::Unbounded,
|
||||
)),
|
||||
Box::new(ExistsQuery::new_exists_query("even".to_string())),
|
||||
@@ -198,10 +197,9 @@ mod tests {
|
||||
assert_eq!(searcher.search(&query, &Count)?, 25);
|
||||
|
||||
let query = BooleanQuery::intersection(vec![
|
||||
Box::new(RangeQuery::new_u64_bounds(
|
||||
"all".to_string(),
|
||||
Bound::Included(0),
|
||||
Bound::Excluded(50),
|
||||
Box::new(RangeQuery::new(
|
||||
Bound::Included(Term::from_field_u64(all_field, 0)),
|
||||
Bound::Included(Term::from_field_u64(all_field, 50)),
|
||||
)),
|
||||
Box::new(ExistsQuery::new_exists_query("odd".to_string())),
|
||||
]);
|
||||
|
||||
@@ -5,6 +5,7 @@ mod bm25;
|
||||
mod boolean_query;
|
||||
mod boost_query;
|
||||
mod const_score_query;
|
||||
mod disjunction;
|
||||
mod disjunction_max_query;
|
||||
mod empty_query;
|
||||
mod exclude;
|
||||
@@ -53,7 +54,7 @@ pub use self::phrase_prefix_query::PhrasePrefixQuery;
|
||||
pub use self::phrase_query::PhraseQuery;
|
||||
pub use self::query::{EnableScoring, Query, QueryClone};
|
||||
pub use self::query_parser::{QueryParser, QueryParserError};
|
||||
pub use self::range_query::{FastFieldRangeWeight, IPFastFieldRangeWeight, RangeQuery};
|
||||
pub use self::range_query::{FastFieldRangeWeight, RangeQuery};
|
||||
pub use self::regex_query::RegexQuery;
|
||||
pub use self::reqopt_scorer::RequiredOptionalScorer;
|
||||
pub use self::score_combiner::{
|
||||
|
||||
@@ -145,15 +145,7 @@ impl Query for PhrasePrefixQuery {
|
||||
Bound::Unbounded
|
||||
};
|
||||
|
||||
let mut range_query = RangeQuery::new_term_bounds(
|
||||
enable_scoring
|
||||
.schema()
|
||||
.get_field_name(self.field)
|
||||
.to_owned(),
|
||||
self.prefix.1.typ(),
|
||||
&Bound::Included(self.prefix.1.clone()),
|
||||
&end_term,
|
||||
);
|
||||
let mut range_query = RangeQuery::new(Bound::Included(self.prefix.1.clone()), end_term);
|
||||
range_query.limit(self.max_expansions as u64);
|
||||
range_query.weight(enable_scoring)
|
||||
}
|
||||
|
||||
@@ -97,6 +97,7 @@ pub struct PhrasePrefixScorer<TPostings: Postings> {
|
||||
suffixes: Vec<TPostings>,
|
||||
suffix_offset: u32,
|
||||
phrase_count: u32,
|
||||
suffix_position_buffer: Vec<u32>,
|
||||
}
|
||||
|
||||
impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
|
||||
@@ -140,6 +141,7 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
|
||||
suffixes,
|
||||
suffix_offset: (max_offset - suffix_pos) as u32,
|
||||
phrase_count: 0,
|
||||
suffix_position_buffer: Vec::with_capacity(100),
|
||||
};
|
||||
if phrase_prefix_scorer.doc() != TERMINATED && !phrase_prefix_scorer.matches_prefix() {
|
||||
phrase_prefix_scorer.advance();
|
||||
@@ -153,7 +155,6 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
|
||||
|
||||
fn matches_prefix(&mut self) -> bool {
|
||||
let mut count = 0;
|
||||
let mut positions = Vec::new();
|
||||
let current_doc = self.doc();
|
||||
let pos_matching = self.phrase_scorer.get_intersection();
|
||||
for suffix in &mut self.suffixes {
|
||||
@@ -162,8 +163,8 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
|
||||
}
|
||||
let doc = suffix.seek(current_doc);
|
||||
if doc == current_doc {
|
||||
suffix.positions_with_offset(self.suffix_offset, &mut positions);
|
||||
count += intersection_count(pos_matching, &positions);
|
||||
suffix.positions_with_offset(self.suffix_offset, &mut self.suffix_position_buffer);
|
||||
count += intersection_count(pos_matching, &self.suffix_position_buffer);
|
||||
}
|
||||
}
|
||||
self.phrase_count = count as u32;
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::fmt;
|
||||
use std::ops::Bound;
|
||||
|
||||
use crate::query::Occur;
|
||||
use crate::schema::{Field, Term, Type};
|
||||
use crate::schema::Term;
|
||||
use crate::Score;
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -14,14 +14,10 @@ pub enum LogicalLiteral {
|
||||
prefix: bool,
|
||||
},
|
||||
Range {
|
||||
field: String,
|
||||
value_type: Type,
|
||||
lower: Bound<Term>,
|
||||
upper: Bound<Term>,
|
||||
},
|
||||
Set {
|
||||
field: Field,
|
||||
value_type: Type,
|
||||
elements: Vec<Term>,
|
||||
},
|
||||
All,
|
||||
|
||||
@@ -790,8 +790,6 @@ impl QueryParser {
|
||||
let (field, json_path) = try_tuple!(self
|
||||
.split_full_path(&full_path)
|
||||
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let value_type = field_entry.field_type().value_type();
|
||||
let mut errors = Vec::new();
|
||||
let lower = match self.resolve_bound(field, json_path, &lower) {
|
||||
Ok(bound) => bound,
|
||||
@@ -812,12 +810,8 @@ impl QueryParser {
|
||||
// we failed to parse something. Either way, there is no point emitting it
|
||||
return (None, errors);
|
||||
}
|
||||
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Range {
|
||||
field: self.schema.get_field_name(field).to_string(),
|
||||
value_type,
|
||||
lower,
|
||||
upper,
|
||||
}));
|
||||
let logical_ast =
|
||||
LogicalAst::Leaf(Box::new(LogicalLiteral::Range { lower, upper }));
|
||||
(Some(logical_ast), errors)
|
||||
}
|
||||
UserInputLeaf::Set {
|
||||
@@ -832,17 +826,11 @@ impl QueryParser {
|
||||
let (field, json_path) = try_tuple!(self
|
||||
.split_full_path(&full_path)
|
||||
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let value_type = field_entry.field_type().value_type();
|
||||
let (elements, errors) = elements
|
||||
.into_iter()
|
||||
.map(|element| self.compute_boundary_term(field, json_path, &element))
|
||||
.partition_result();
|
||||
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set {
|
||||
elements,
|
||||
field,
|
||||
value_type,
|
||||
}));
|
||||
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set { elements }));
|
||||
(Some(logical_ast), errors)
|
||||
}
|
||||
UserInputLeaf::Exists { .. } => (
|
||||
@@ -890,14 +878,7 @@ fn convert_literal_to_query(
|
||||
Box::new(PhraseQuery::new_with_offset_and_slop(terms, slop))
|
||||
}
|
||||
}
|
||||
LogicalLiteral::Range {
|
||||
field,
|
||||
value_type,
|
||||
lower,
|
||||
upper,
|
||||
} => Box::new(RangeQuery::new_term_bounds(
|
||||
field, value_type, &lower, &upper,
|
||||
)),
|
||||
LogicalLiteral::Range { lower, upper } => Box::new(RangeQuery::new(lower, upper)),
|
||||
LogicalLiteral::Set { elements, .. } => Box::new(TermSetQuery::new(elements)),
|
||||
LogicalLiteral::All => Box::new(AllQuery),
|
||||
}
|
||||
@@ -1142,8 +1123,8 @@ mod test {
|
||||
let query = make_query_parser().parse_query("title:[A TO B]").unwrap();
|
||||
assert_eq!(
|
||||
format!("{query:?}"),
|
||||
"RangeQuery { field: \"title\", value_type: Str, lower_bound: Included([97]), \
|
||||
upper_bound: Included([98]), limit: None }"
|
||||
"RangeQuery { lower_bound: Included(Term(field=0, type=Str, \"a\")), upper_bound: \
|
||||
Included(Term(field=0, type=Str, \"b\")), limit: None }"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1821,7 +1802,8 @@ mod test {
|
||||
\"bad\"))], prefix: (2, Term(field=0, type=Str, \"wo\")), max_expansions: 50 }), \
|
||||
(Should, PhrasePrefixQuery { field: Field(1), phrase_terms: [(0, Term(field=1, \
|
||||
type=Str, \"big\")), (1, Term(field=1, type=Str, \"bad\"))], prefix: (2, \
|
||||
Term(field=1, type=Str, \"wo\")), max_expansions: 50 })] }"
|
||||
Term(field=1, type=Str, \"wo\")), max_expansions: 50 })], \
|
||||
minimum_number_should_match: 1 }"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1886,7 +1868,8 @@ mod test {
|
||||
format!("{query:?}"),
|
||||
"BooleanQuery { subqueries: [(Should, FuzzyTermQuery { term: Term(field=0, \
|
||||
type=Str, \"abc\"), distance: 1, transposition_cost_one: true, prefix: false }), \
|
||||
(Should, TermQuery(Term(field=1, type=Str, \"abc\")))] }"
|
||||
(Should, TermQuery(Term(field=1, type=Str, \"abc\")))], \
|
||||
minimum_number_should_match: 1 }"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1903,7 +1886,8 @@ mod test {
|
||||
format!("{query:?}"),
|
||||
"BooleanQuery { subqueries: [(Should, TermQuery(Term(field=0, type=Str, \
|
||||
\"abc\"))), (Should, FuzzyTermQuery { term: Term(field=1, type=Str, \"abc\"), \
|
||||
distance: 2, transposition_cost_one: false, prefix: true })] }"
|
||||
distance: 2, transposition_cost_one: false, prefix: true })], \
|
||||
minimum_number_should_match: 1 }"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -180,10 +180,12 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::ops::Bound;
|
||||
|
||||
use crate::collector::Count;
|
||||
use crate::directory::RamDirectory;
|
||||
use crate::query::RangeQuery;
|
||||
use crate::{schema, IndexBuilder, TantivyDocument};
|
||||
use crate::{schema, IndexBuilder, TantivyDocument, Term};
|
||||
|
||||
#[test]
|
||||
fn range_query_fast_optional_field_minimum() {
|
||||
@@ -218,10 +220,9 @@ mod tests {
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let query = RangeQuery::new_u64_bounds(
|
||||
"score".to_string(),
|
||||
std::ops::Bound::Included(70),
|
||||
std::ops::Bound::Unbounded,
|
||||
let query = RangeQuery::new(
|
||||
Bound::Included(Term::from_field_u64(score_field, 70)),
|
||||
Bound::Unbounded,
|
||||
);
|
||||
|
||||
let count = searcher.search(&query, &Count).unwrap();
|
||||
@@ -2,21 +2,19 @@ use std::ops::Bound;
|
||||
|
||||
use crate::schema::Type;
|
||||
|
||||
mod fast_field_range_query;
|
||||
mod fast_field_range_doc_set;
|
||||
mod range_query;
|
||||
mod range_query_ip_fastfield;
|
||||
mod range_query_u64_fastfield;
|
||||
|
||||
pub use self::range_query::RangeQuery;
|
||||
pub use self::range_query_ip_fastfield::IPFastFieldRangeWeight;
|
||||
pub use self::range_query_u64_fastfield::FastFieldRangeWeight;
|
||||
|
||||
// TODO is this correct?
|
||||
pub(crate) fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool {
|
||||
match typ {
|
||||
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
|
||||
Type::Str | Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
|
||||
Type::IpAddr => true,
|
||||
Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
|
||||
Type::Facet | Type::Bytes | Type::Json => false,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,21 +1,17 @@
|
||||
use std::io;
|
||||
use std::net::Ipv6Addr;
|
||||
use std::ops::{Bound, Range};
|
||||
use std::ops::Bound;
|
||||
|
||||
use columnar::MonotonicallyMappableToU128;
|
||||
use common::{BinarySerializable, BitSet};
|
||||
use common::BitSet;
|
||||
|
||||
use super::map_bound;
|
||||
use super::range_query_u64_fastfield::FastFieldRangeWeight;
|
||||
use crate::error::TantivyError;
|
||||
use crate::index::SegmentReader;
|
||||
use crate::query::explanation::does_not_match;
|
||||
use crate::query::range_query::range_query_ip_fastfield::IPFastFieldRangeWeight;
|
||||
use crate::query::range_query::{is_type_valid_for_fastfield_range_query, map_bound_res};
|
||||
use crate::query::range_query::is_type_valid_for_fastfield_range_query;
|
||||
use crate::query::{BitSetDocSet, ConstScorer, EnableScoring, Explanation, Query, Scorer, Weight};
|
||||
use crate::schema::{Field, IndexRecordOption, Term, Type};
|
||||
use crate::termdict::{TermDictionary, TermStreamer};
|
||||
use crate::{DateTime, DocId, Score};
|
||||
use crate::{DocId, Score};
|
||||
|
||||
/// `RangeQuery` matches all documents that have at least one term within a defined range.
|
||||
///
|
||||
@@ -40,8 +36,10 @@ use crate::{DateTime, DocId, Score};
|
||||
/// ```rust
|
||||
/// use tantivy::collector::Count;
|
||||
/// use tantivy::query::RangeQuery;
|
||||
/// use tantivy::Term;
|
||||
/// use tantivy::schema::{Schema, INDEXED};
|
||||
/// use tantivy::{doc, Index, IndexWriter};
|
||||
/// use std::ops::Bound;
|
||||
/// # fn test() -> tantivy::Result<()> {
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
/// let year_field = schema_builder.add_u64_field("year", INDEXED);
|
||||
@@ -59,7 +57,10 @@ use crate::{DateTime, DocId, Score};
|
||||
///
|
||||
/// let reader = index.reader()?;
|
||||
/// let searcher = reader.searcher();
|
||||
/// let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960..1970);
|
||||
/// let docs_in_the_sixties = RangeQuery::new(
|
||||
/// Bound::Included(Term::from_field_u64(year_field, 1960)),
|
||||
/// Bound::Excluded(Term::from_field_u64(year_field, 1970)),
|
||||
/// );
|
||||
/// let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
|
||||
/// assert_eq!(num_60s_books, 2285);
|
||||
/// Ok(())
|
||||
@@ -68,246 +69,46 @@ use crate::{DateTime, DocId, Score};
|
||||
/// ```
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RangeQuery {
|
||||
field: String,
|
||||
value_type: Type,
|
||||
lower_bound: Bound<Vec<u8>>,
|
||||
upper_bound: Bound<Vec<u8>>,
|
||||
lower_bound: Bound<Term>,
|
||||
upper_bound: Bound<Term>,
|
||||
limit: Option<u64>,
|
||||
}
|
||||
|
||||
/// Returns the inner value of a `Bound`
|
||||
pub(crate) fn inner_bound(val: &Bound<Term>) -> Option<&Term> {
|
||||
match val {
|
||||
Bound::Included(term) | Bound::Excluded(term) => Some(term),
|
||||
Bound::Unbounded => None,
|
||||
}
|
||||
}
|
||||
|
||||
impl RangeQuery {
|
||||
/// Creates a new `RangeQuery` from bounded start and end terms.
|
||||
///
|
||||
/// If the value type is not correct, something may go terribly wrong when
|
||||
/// the `Weight` object is created.
|
||||
pub fn new_term_bounds(
|
||||
field: String,
|
||||
value_type: Type,
|
||||
lower_bound: &Bound<Term>,
|
||||
upper_bound: &Bound<Term>,
|
||||
) -> RangeQuery {
|
||||
let verify_and_unwrap_term = |val: &Term| val.serialized_value_bytes().to_owned();
|
||||
pub fn new(lower_bound: Bound<Term>, upper_bound: Bound<Term>) -> RangeQuery {
|
||||
RangeQuery {
|
||||
field,
|
||||
value_type,
|
||||
lower_bound: map_bound(lower_bound, verify_and_unwrap_term),
|
||||
upper_bound: map_bound(upper_bound, verify_and_unwrap_term),
|
||||
lower_bound,
|
||||
upper_bound,
|
||||
limit: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new `RangeQuery` over a `i64` field.
|
||||
///
|
||||
/// If the field is not of the type `i64`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_i64(field: String, range: Range<i64>) -> RangeQuery {
|
||||
RangeQuery::new_i64_bounds(
|
||||
field,
|
||||
Bound::Included(range.start),
|
||||
Bound::Excluded(range.end),
|
||||
)
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `i64` field.
|
||||
///
|
||||
/// The two `Bound` arguments make it possible to create more complex
|
||||
/// ranges than semi-inclusive range.
|
||||
///
|
||||
/// If the field is not of the type `i64`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_i64_bounds(
|
||||
field: String,
|
||||
lower_bound: Bound<i64>,
|
||||
upper_bound: Bound<i64>,
|
||||
) -> RangeQuery {
|
||||
let make_term_val = |val: &i64| {
|
||||
Term::from_field_i64(Field::from_field_id(0), *val)
|
||||
.serialized_value_bytes()
|
||||
.to_owned()
|
||||
};
|
||||
RangeQuery {
|
||||
field,
|
||||
value_type: Type::I64,
|
||||
lower_bound: map_bound(&lower_bound, make_term_val),
|
||||
upper_bound: map_bound(&upper_bound, make_term_val),
|
||||
limit: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new `RangeQuery` over a `f64` field.
|
||||
///
|
||||
/// If the field is not of the type `f64`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_f64(field: String, range: Range<f64>) -> RangeQuery {
|
||||
RangeQuery::new_f64_bounds(
|
||||
field,
|
||||
Bound::Included(range.start),
|
||||
Bound::Excluded(range.end),
|
||||
)
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `f64` field.
|
||||
///
|
||||
/// The two `Bound` arguments make it possible to create more complex
|
||||
/// ranges than semi-inclusive range.
|
||||
///
|
||||
/// If the field is not of the type `f64`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_f64_bounds(
|
||||
field: String,
|
||||
lower_bound: Bound<f64>,
|
||||
upper_bound: Bound<f64>,
|
||||
) -> RangeQuery {
|
||||
let make_term_val = |val: &f64| {
|
||||
Term::from_field_f64(Field::from_field_id(0), *val)
|
||||
.serialized_value_bytes()
|
||||
.to_owned()
|
||||
};
|
||||
RangeQuery {
|
||||
field,
|
||||
value_type: Type::F64,
|
||||
lower_bound: map_bound(&lower_bound, make_term_val),
|
||||
upper_bound: map_bound(&upper_bound, make_term_val),
|
||||
limit: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `u64` field.
|
||||
///
|
||||
/// The two `Bound` arguments make it possible to create more complex
|
||||
/// ranges than semi-inclusive range.
|
||||
///
|
||||
/// If the field is not of the type `u64`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_u64_bounds(
|
||||
field: String,
|
||||
lower_bound: Bound<u64>,
|
||||
upper_bound: Bound<u64>,
|
||||
) -> RangeQuery {
|
||||
let make_term_val = |val: &u64| {
|
||||
Term::from_field_u64(Field::from_field_id(0), *val)
|
||||
.serialized_value_bytes()
|
||||
.to_owned()
|
||||
};
|
||||
RangeQuery {
|
||||
field,
|
||||
value_type: Type::U64,
|
||||
lower_bound: map_bound(&lower_bound, make_term_val),
|
||||
upper_bound: map_bound(&upper_bound, make_term_val),
|
||||
limit: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `ip` field.
|
||||
///
|
||||
/// If the field is not of the type `ip`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_ip_bounds(
|
||||
field: String,
|
||||
lower_bound: Bound<Ipv6Addr>,
|
||||
upper_bound: Bound<Ipv6Addr>,
|
||||
) -> RangeQuery {
|
||||
let make_term_val = |val: &Ipv6Addr| {
|
||||
Term::from_field_ip_addr(Field::from_field_id(0), *val)
|
||||
.serialized_value_bytes()
|
||||
.to_owned()
|
||||
};
|
||||
RangeQuery {
|
||||
field,
|
||||
value_type: Type::IpAddr,
|
||||
lower_bound: map_bound(&lower_bound, make_term_val),
|
||||
upper_bound: map_bound(&upper_bound, make_term_val),
|
||||
limit: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `u64` field.
|
||||
///
|
||||
/// If the field is not of the type `u64`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_u64(field: String, range: Range<u64>) -> RangeQuery {
|
||||
RangeQuery::new_u64_bounds(
|
||||
field,
|
||||
Bound::Included(range.start),
|
||||
Bound::Excluded(range.end),
|
||||
)
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `date` field.
|
||||
///
|
||||
/// The two `Bound` arguments make it possible to create more complex
|
||||
/// ranges than semi-inclusive range.
|
||||
///
|
||||
/// If the field is not of the type `date`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_date_bounds(
|
||||
field: String,
|
||||
lower_bound: Bound<DateTime>,
|
||||
upper_bound: Bound<DateTime>,
|
||||
) -> RangeQuery {
|
||||
let make_term_val = |val: &DateTime| {
|
||||
Term::from_field_date(Field::from_field_id(0), *val)
|
||||
.serialized_value_bytes()
|
||||
.to_owned()
|
||||
};
|
||||
RangeQuery {
|
||||
field,
|
||||
value_type: Type::Date,
|
||||
lower_bound: map_bound(&lower_bound, make_term_val),
|
||||
upper_bound: map_bound(&upper_bound, make_term_val),
|
||||
limit: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `date` field.
|
||||
///
|
||||
/// If the field is not of the type `date`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_date(field: String, range: Range<DateTime>) -> RangeQuery {
|
||||
RangeQuery::new_date_bounds(
|
||||
field,
|
||||
Bound::Included(range.start),
|
||||
Bound::Excluded(range.end),
|
||||
)
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `Str` field.
|
||||
///
|
||||
/// The two `Bound` arguments make it possible to create more complex
|
||||
/// ranges than semi-inclusive range.
|
||||
///
|
||||
/// If the field is not of the type `Str`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_str_bounds(
|
||||
field: String,
|
||||
lower_bound: Bound<&str>,
|
||||
upper_bound: Bound<&str>,
|
||||
) -> RangeQuery {
|
||||
let make_term_val = |val: &&str| val.as_bytes().to_vec();
|
||||
RangeQuery {
|
||||
field,
|
||||
value_type: Type::Str,
|
||||
lower_bound: map_bound(&lower_bound, make_term_val),
|
||||
upper_bound: map_bound(&upper_bound, make_term_val),
|
||||
limit: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `Str` field.
|
||||
///
|
||||
/// If the field is not of the type `Str`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_str(field: String, range: Range<&str>) -> RangeQuery {
|
||||
RangeQuery::new_str_bounds(
|
||||
field,
|
||||
Bound::Included(range.start),
|
||||
Bound::Excluded(range.end),
|
||||
)
|
||||
}
|
||||
|
||||
/// Field to search over
|
||||
pub fn field(&self) -> &str {
|
||||
&self.field
|
||||
pub fn field(&self) -> Field {
|
||||
self.get_term().field()
|
||||
}
|
||||
|
||||
/// The value type of the field
|
||||
pub fn value_type(&self) -> Type {
|
||||
self.get_term().typ()
|
||||
}
|
||||
|
||||
pub(crate) fn get_term(&self) -> &Term {
|
||||
inner_bound(&self.lower_bound)
|
||||
.or(inner_bound(&self.upper_bound))
|
||||
.expect("At least one bound must be set")
|
||||
}
|
||||
|
||||
/// Limit the number of term the `RangeQuery` will go through.
|
||||
@@ -319,70 +120,23 @@ impl RangeQuery {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if the type maps to a u64 fast field
|
||||
pub(crate) fn maps_to_u64_fastfield(typ: Type) -> bool {
|
||||
match typ {
|
||||
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
|
||||
Type::IpAddr => false,
|
||||
Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
|
||||
}
|
||||
}
|
||||
|
||||
impl Query for RangeQuery {
|
||||
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
|
||||
let schema = enable_scoring.schema();
|
||||
let field_type = schema
|
||||
.get_field_entry(schema.get_field(&self.field)?)
|
||||
.field_type();
|
||||
let value_type = field_type.value_type();
|
||||
if value_type != self.value_type {
|
||||
let err_msg = format!(
|
||||
"Create a range query of the type {:?}, when the field given was of type \
|
||||
{value_type:?}",
|
||||
self.value_type
|
||||
);
|
||||
return Err(TantivyError::SchemaError(err_msg));
|
||||
}
|
||||
let field_type = schema.get_field_entry(self.field()).field_type();
|
||||
|
||||
if field_type.is_fast() && is_type_valid_for_fastfield_range_query(self.value_type) {
|
||||
if field_type.is_ip_addr() {
|
||||
let parse_ip_from_bytes = |data: &Vec<u8>| {
|
||||
let ip_u128_bytes: [u8; 16] = data.as_slice().try_into().map_err(|_| {
|
||||
crate::TantivyError::InvalidArgument(
|
||||
"Expected 8 bytes for ip address".to_string(),
|
||||
)
|
||||
})?;
|
||||
let ip_u128 = u128::from_be_bytes(ip_u128_bytes);
|
||||
crate::Result::<Ipv6Addr>::Ok(Ipv6Addr::from_u128(ip_u128))
|
||||
};
|
||||
let lower_bound = map_bound_res(&self.lower_bound, parse_ip_from_bytes)?;
|
||||
let upper_bound = map_bound_res(&self.upper_bound, parse_ip_from_bytes)?;
|
||||
Ok(Box::new(IPFastFieldRangeWeight::new(
|
||||
self.field.to_string(),
|
||||
lower_bound,
|
||||
upper_bound,
|
||||
)))
|
||||
} else {
|
||||
// We run the range query on u64 value space for performance reasons and simplicity
|
||||
// assert the type maps to u64
|
||||
assert!(maps_to_u64_fastfield(self.value_type));
|
||||
let parse_from_bytes = |data: &Vec<u8>| {
|
||||
u64::from_be(BinarySerializable::deserialize(&mut &data[..]).unwrap())
|
||||
};
|
||||
|
||||
let lower_bound = map_bound(&self.lower_bound, parse_from_bytes);
|
||||
let upper_bound = map_bound(&self.upper_bound, parse_from_bytes);
|
||||
Ok(Box::new(FastFieldRangeWeight::new_u64_lenient(
|
||||
self.field.to_string(),
|
||||
lower_bound,
|
||||
upper_bound,
|
||||
)))
|
||||
}
|
||||
if field_type.is_fast() && is_type_valid_for_fastfield_range_query(self.value_type()) {
|
||||
Ok(Box::new(FastFieldRangeWeight::new(
|
||||
self.field(),
|
||||
self.lower_bound.clone(),
|
||||
self.upper_bound.clone(),
|
||||
)))
|
||||
} else {
|
||||
let verify_and_unwrap_term = |val: &Term| val.serialized_value_bytes().to_owned();
|
||||
Ok(Box::new(RangeWeight {
|
||||
field: self.field.to_string(),
|
||||
lower_bound: self.lower_bound.clone(),
|
||||
upper_bound: self.upper_bound.clone(),
|
||||
field: self.field(),
|
||||
lower_bound: map_bound(&self.lower_bound, verify_and_unwrap_term),
|
||||
upper_bound: map_bound(&self.upper_bound, verify_and_unwrap_term),
|
||||
limit: self.limit,
|
||||
}))
|
||||
}
|
||||
@@ -390,7 +144,7 @@ impl Query for RangeQuery {
|
||||
}
|
||||
|
||||
pub struct RangeWeight {
|
||||
field: String,
|
||||
field: Field,
|
||||
lower_bound: Bound<Vec<u8>>,
|
||||
upper_bound: Bound<Vec<u8>>,
|
||||
limit: Option<u64>,
|
||||
@@ -423,7 +177,7 @@ impl Weight for RangeWeight {
|
||||
let max_doc = reader.max_doc();
|
||||
let mut doc_bitset = BitSet::with_max_value(max_doc);
|
||||
|
||||
let inverted_index = reader.inverted_index(reader.schema().get_field(&self.field)?)?;
|
||||
let inverted_index = reader.inverted_index(self.field)?;
|
||||
let term_dict = inverted_index.terms();
|
||||
let mut term_range = self.term_range(term_dict)?;
|
||||
let mut processed_count = 0;
|
||||
@@ -477,7 +231,7 @@ mod tests {
|
||||
use crate::schema::{
|
||||
Field, IntoIpv6Addr, Schema, TantivyDocument, FAST, INDEXED, STORED, TEXT,
|
||||
};
|
||||
use crate::{Index, IndexWriter};
|
||||
use crate::{Index, IndexWriter, Term};
|
||||
|
||||
#[test]
|
||||
fn test_range_query_simple() -> crate::Result<()> {
|
||||
@@ -499,7 +253,10 @@ mod tests {
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960u64..1970u64);
|
||||
let docs_in_the_sixties = RangeQuery::new(
|
||||
Bound::Included(Term::from_field_u64(year_field, 1960)),
|
||||
Bound::Excluded(Term::from_field_u64(year_field, 1970)),
|
||||
);
|
||||
|
||||
// ... or `1960..=1969` if inclusive range is enabled.
|
||||
let count = searcher.search(&docs_in_the_sixties, &Count)?;
|
||||
@@ -530,7 +287,10 @@ mod tests {
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let mut docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960u64..1970u64);
|
||||
let mut docs_in_the_sixties = RangeQuery::new(
|
||||
Bound::Included(Term::from_field_u64(year_field, 1960)),
|
||||
Bound::Excluded(Term::from_field_u64(year_field, 1970)),
|
||||
);
|
||||
docs_in_the_sixties.limit(5);
|
||||
|
||||
// due to the limit and no docs in 1963, it's really only 1960..=1965
|
||||
@@ -575,29 +335,29 @@ mod tests {
|
||||
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_i64("intfield".to_string(), 10..11)),
|
||||
count_multiples(RangeQuery::new(
|
||||
Bound::Included(Term::from_field_i64(int_field, 10)),
|
||||
Bound::Excluded(Term::from_field_i64(int_field, 11)),
|
||||
)),
|
||||
9
|
||||
);
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_i64_bounds(
|
||||
"intfield".to_string(),
|
||||
Bound::Included(10),
|
||||
Bound::Included(11)
|
||||
count_multiples(RangeQuery::new(
|
||||
Bound::Included(Term::from_field_i64(int_field, 10)),
|
||||
Bound::Included(Term::from_field_i64(int_field, 11)),
|
||||
)),
|
||||
18
|
||||
);
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_i64_bounds(
|
||||
"intfield".to_string(),
|
||||
Bound::Excluded(9),
|
||||
Bound::Included(10)
|
||||
count_multiples(RangeQuery::new(
|
||||
Bound::Excluded(Term::from_field_i64(int_field, 9)),
|
||||
Bound::Included(Term::from_field_i64(int_field, 10)),
|
||||
)),
|
||||
9
|
||||
);
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_i64_bounds(
|
||||
"intfield".to_string(),
|
||||
Bound::Included(9),
|
||||
count_multiples(RangeQuery::new(
|
||||
Bound::Included(Term::from_field_i64(int_field, 9)),
|
||||
Bound::Unbounded
|
||||
)),
|
||||
91
|
||||
@@ -646,29 +406,29 @@ mod tests {
|
||||
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_f64("floatfield".to_string(), 10.0..11.0)),
|
||||
count_multiples(RangeQuery::new(
|
||||
Bound::Included(Term::from_field_f64(float_field, 10.0)),
|
||||
Bound::Excluded(Term::from_field_f64(float_field, 11.0)),
|
||||
)),
|
||||
9
|
||||
);
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_f64_bounds(
|
||||
"floatfield".to_string(),
|
||||
Bound::Included(10.0),
|
||||
Bound::Included(11.0)
|
||||
count_multiples(RangeQuery::new(
|
||||
Bound::Included(Term::from_field_f64(float_field, 10.0)),
|
||||
Bound::Included(Term::from_field_f64(float_field, 11.0)),
|
||||
)),
|
||||
18
|
||||
);
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_f64_bounds(
|
||||
"floatfield".to_string(),
|
||||
Bound::Excluded(9.0),
|
||||
Bound::Included(10.0)
|
||||
count_multiples(RangeQuery::new(
|
||||
Bound::Excluded(Term::from_field_f64(float_field, 9.0)),
|
||||
Bound::Included(Term::from_field_f64(float_field, 10.0)),
|
||||
)),
|
||||
9
|
||||
);
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_f64_bounds(
|
||||
"floatfield".to_string(),
|
||||
Bound::Included(9.0),
|
||||
count_multiples(RangeQuery::new(
|
||||
Bound::Included(Term::from_field_f64(float_field, 9.0)),
|
||||
Bound::Unbounded
|
||||
)),
|
||||
91
|
||||
|
||||
@@ -1,512 +0,0 @@
|
||||
//! IP Fastfields support efficient scanning for range queries.
|
||||
//! We use this variant only if the fastfield exists, otherwise the default in `range_query` is
|
||||
//! used, which uses the term dictionary + postings.
|
||||
|
||||
use std::net::Ipv6Addr;
|
||||
use std::ops::{Bound, RangeInclusive};
|
||||
|
||||
use columnar::{Column, MonotonicallyMappableToU128};
|
||||
|
||||
use crate::query::range_query::fast_field_range_query::RangeDocSet;
|
||||
use crate::query::{ConstScorer, EmptyScorer, Explanation, Scorer, Weight};
|
||||
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
|
||||
|
||||
/// `IPFastFieldRangeWeight` uses the ip address fast field to execute range queries.
|
||||
pub struct IPFastFieldRangeWeight {
|
||||
field: String,
|
||||
lower_bound: Bound<Ipv6Addr>,
|
||||
upper_bound: Bound<Ipv6Addr>,
|
||||
}
|
||||
|
||||
impl IPFastFieldRangeWeight {
|
||||
/// Creates a new IPFastFieldRangeWeight.
|
||||
pub fn new(field: String, lower_bound: Bound<Ipv6Addr>, upper_bound: Bound<Ipv6Addr>) -> Self {
|
||||
Self {
|
||||
field,
|
||||
lower_bound,
|
||||
upper_bound,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Weight for IPFastFieldRangeWeight {
|
||||
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
||||
let Some(ip_addr_column): Option<Column<Ipv6Addr>> =
|
||||
reader.fast_fields().column_opt(&self.field)?
|
||||
else {
|
||||
return Ok(Box::new(EmptyScorer));
|
||||
};
|
||||
let value_range = bound_to_value_range(
|
||||
&self.lower_bound,
|
||||
&self.upper_bound,
|
||||
ip_addr_column.min_value(),
|
||||
ip_addr_column.max_value(),
|
||||
);
|
||||
let docset = RangeDocSet::new(value_range, ip_addr_column);
|
||||
Ok(Box::new(ConstScorer::new(docset, boost)))
|
||||
}
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
|
||||
let mut scorer = self.scorer(reader, 1.0)?;
|
||||
if scorer.seek(doc) != doc {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"Document #({doc}) does not match"
|
||||
)));
|
||||
}
|
||||
let explanation = Explanation::new("Const", scorer.score());
|
||||
Ok(explanation)
|
||||
}
|
||||
}
|
||||
|
||||
fn bound_to_value_range(
|
||||
lower_bound: &Bound<Ipv6Addr>,
|
||||
upper_bound: &Bound<Ipv6Addr>,
|
||||
min_value: Ipv6Addr,
|
||||
max_value: Ipv6Addr,
|
||||
) -> RangeInclusive<Ipv6Addr> {
|
||||
let start_value = match lower_bound {
|
||||
Bound::Included(ip_addr) => *ip_addr,
|
||||
Bound::Excluded(ip_addr) => Ipv6Addr::from(ip_addr.to_u128() + 1),
|
||||
Bound::Unbounded => min_value,
|
||||
};
|
||||
|
||||
let end_value = match upper_bound {
|
||||
Bound::Included(ip_addr) => *ip_addr,
|
||||
Bound::Excluded(ip_addr) => Ipv6Addr::from(ip_addr.to_u128() - 1),
|
||||
Bound::Unbounded => max_value,
|
||||
};
|
||||
start_value..=end_value
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
use proptest::prelude::ProptestConfig;
|
||||
use proptest::strategy::Strategy;
|
||||
use proptest::{prop_oneof, proptest};
|
||||
|
||||
use super::*;
|
||||
use crate::collector::Count;
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::{Schema, FAST, INDEXED, STORED, STRING};
|
||||
use crate::{Index, IndexWriter};
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Doc {
|
||||
pub id: String,
|
||||
pub ip: Ipv6Addr,
|
||||
}
|
||||
|
||||
fn operation_strategy() -> impl Strategy<Value = Doc> {
|
||||
prop_oneof![
|
||||
(0u64..10_000u64).prop_map(doc_from_id_1),
|
||||
(1u64..10_000u64).prop_map(doc_from_id_2),
|
||||
]
|
||||
}
|
||||
|
||||
pub fn doc_from_id_1(id: u64) -> Doc {
|
||||
let id = id * 1000;
|
||||
Doc {
|
||||
// ip != id
|
||||
id: id.to_string(),
|
||||
ip: Ipv6Addr::from_u128(id as u128),
|
||||
}
|
||||
}
|
||||
fn doc_from_id_2(id: u64) -> Doc {
|
||||
let id = id * 1000;
|
||||
Doc {
|
||||
// ip != id
|
||||
id: (id - 1).to_string(),
|
||||
ip: Ipv6Addr::from_u128(id as u128),
|
||||
}
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#![proptest_config(ProptestConfig::with_cases(10))]
|
||||
#[test]
|
||||
fn test_ip_range_for_docs_prop(ops in proptest::collection::vec(operation_strategy(), 1..1000)) {
|
||||
assert!(test_ip_range_for_docs(&ops).is_ok());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ip_range_regression1() {
|
||||
let ops = &[doc_from_id_1(0)];
|
||||
assert!(test_ip_range_for_docs(ops).is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ip_range_regression2() {
|
||||
let ops = &[
|
||||
doc_from_id_1(52),
|
||||
doc_from_id_1(63),
|
||||
doc_from_id_1(12),
|
||||
doc_from_id_2(91),
|
||||
doc_from_id_2(33),
|
||||
];
|
||||
assert!(test_ip_range_for_docs(ops).is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ip_range_regression3() {
|
||||
let ops = &[doc_from_id_1(1), doc_from_id_1(2), doc_from_id_1(3)];
|
||||
assert!(test_ip_range_for_docs(ops).is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ip_range_regression3_simple() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
|
||||
let ip_addrs: Vec<Ipv6Addr> = [1000, 2000, 3000]
|
||||
.into_iter()
|
||||
.map(Ipv6Addr::from_u128)
|
||||
.collect();
|
||||
for &ip_addr in &ip_addrs {
|
||||
writer
|
||||
.add_document(doc!(ips_field=>ip_addr, ips_field=>ip_addr))
|
||||
.unwrap();
|
||||
}
|
||||
writer.commit().unwrap();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let range_weight = IPFastFieldRangeWeight {
|
||||
field: "ips".to_string(),
|
||||
lower_bound: Bound::Included(ip_addrs[1]),
|
||||
upper_bound: Bound::Included(ip_addrs[2]),
|
||||
};
|
||||
let count = range_weight.count(searcher.segment_reader(0)).unwrap();
|
||||
assert_eq!(count, 2);
|
||||
}
|
||||
|
||||
pub fn create_index_from_docs(docs: &[Doc]) -> Index {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let ip_field = schema_builder.add_ip_addr_field("ip", STORED | FAST);
|
||||
let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
|
||||
let text_field = schema_builder.add_text_field("id", STRING | STORED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(2, 60_000_000).unwrap();
|
||||
for doc in docs.iter() {
|
||||
index_writer
|
||||
.add_document(doc!(
|
||||
ips_field => doc.ip,
|
||||
ips_field => doc.ip,
|
||||
ip_field => doc.ip,
|
||||
text_field => doc.id.to_string(),
|
||||
))
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
index
|
||||
}
|
||||
|
||||
fn test_ip_range_for_docs(docs: &[Doc]) -> crate::Result<()> {
|
||||
let index = create_index_from_docs(docs);
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let get_num_hits = |query| searcher.search(&query, &Count).unwrap();
|
||||
let query_from_text = |text: &str| {
|
||||
QueryParser::for_index(&index, vec![])
|
||||
.parse_query(text)
|
||||
.unwrap()
|
||||
};
|
||||
|
||||
let gen_query_inclusive = |field: &str, ip_range: &RangeInclusive<Ipv6Addr>| {
|
||||
format!("{field}:[{} TO {}]", ip_range.start(), ip_range.end())
|
||||
};
|
||||
|
||||
let test_sample = |sample_docs: &[Doc]| {
|
||||
let mut ips: Vec<Ipv6Addr> = sample_docs.iter().map(|doc| doc.ip).collect();
|
||||
ips.sort();
|
||||
let ip_range = ips[0]..=ips[1];
|
||||
let expected_num_hits = docs
|
||||
.iter()
|
||||
.filter(|doc| (ips[0]..=ips[1]).contains(&doc.ip))
|
||||
.count();
|
||||
|
||||
let query = gen_query_inclusive("ip", &ip_range);
|
||||
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
|
||||
|
||||
let query = gen_query_inclusive("ips", &ip_range);
|
||||
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
|
||||
|
||||
// Intersection search
|
||||
let id_filter = sample_docs[0].id.to_string();
|
||||
let expected_num_hits = docs
|
||||
.iter()
|
||||
.filter(|doc| ip_range.contains(&doc.ip) && doc.id == id_filter)
|
||||
.count();
|
||||
let query = format!(
|
||||
"{} AND id:{}",
|
||||
gen_query_inclusive("ip", &ip_range),
|
||||
&id_filter
|
||||
);
|
||||
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
|
||||
|
||||
// Intersection search on multivalue ip field
|
||||
let id_filter = sample_docs[0].id.to_string();
|
||||
let query = format!(
|
||||
"{} AND id:{}",
|
||||
gen_query_inclusive("ips", &ip_range),
|
||||
&id_filter
|
||||
);
|
||||
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
|
||||
};
|
||||
|
||||
test_sample(&[docs[0].clone(), docs[0].clone()]);
|
||||
if docs.len() > 1 {
|
||||
test_sample(&[docs[0].clone(), docs[1].clone()]);
|
||||
test_sample(&[docs[1].clone(), docs[1].clone()]);
|
||||
}
|
||||
if docs.len() > 2 {
|
||||
test_sample(&[docs[1].clone(), docs[2].clone()]);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
use test::Bencher;
|
||||
|
||||
use super::tests::*;
|
||||
use super::*;
|
||||
use crate::collector::Count;
|
||||
use crate::query::QueryParser;
|
||||
use crate::Index;
|
||||
|
||||
fn get_index_0_to_100() -> Index {
|
||||
let mut rng = StdRng::from_seed([1u8; 32]);
|
||||
let num_vals = 100_000;
|
||||
let docs: Vec<_> = (0..num_vals)
|
||||
.map(|_i| {
|
||||
let id = if rng.gen_bool(0.01) {
|
||||
"veryfew".to_string() // 1%
|
||||
} else if rng.gen_bool(0.1) {
|
||||
"few".to_string() // 9%
|
||||
} else {
|
||||
"many".to_string() // 90%
|
||||
};
|
||||
Doc {
|
||||
id,
|
||||
// Multiply by 1000, so that we create many buckets in the compact space
|
||||
// The benches depend on this range to select n-percent of elements with the
|
||||
// methods below.
|
||||
ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
create_index_from_docs(&docs)
|
||||
}
|
||||
|
||||
fn get_90_percent() -> RangeInclusive<Ipv6Addr> {
|
||||
let start = Ipv6Addr::from_u128(0);
|
||||
let end = Ipv6Addr::from_u128(90 * 1000);
|
||||
start..=end
|
||||
}
|
||||
|
||||
fn get_10_percent() -> RangeInclusive<Ipv6Addr> {
|
||||
let start = Ipv6Addr::from_u128(0);
|
||||
let end = Ipv6Addr::from_u128(10 * 1000);
|
||||
start..=end
|
||||
}
|
||||
|
||||
fn get_1_percent() -> RangeInclusive<Ipv6Addr> {
|
||||
let start = Ipv6Addr::from_u128(10 * 1000);
|
||||
let end = Ipv6Addr::from_u128(10 * 1000);
|
||||
start..=end
|
||||
}
|
||||
|
||||
fn excute_query(
|
||||
field: &str,
|
||||
ip_range: RangeInclusive<Ipv6Addr>,
|
||||
suffix: &str,
|
||||
index: &Index,
|
||||
) -> usize {
|
||||
let gen_query_inclusive = |from: &Ipv6Addr, to: &Ipv6Addr| {
|
||||
format!(
|
||||
"{}:[{} TO {}] {}",
|
||||
field,
|
||||
&from.to_string(),
|
||||
&to.to_string(),
|
||||
suffix
|
||||
)
|
||||
};
|
||||
|
||||
let query = gen_query_inclusive(ip_range.start(), ip_range.end());
|
||||
let query_from_text = |text: &str| {
|
||||
QueryParser::for_index(index, vec![])
|
||||
.parse_query(text)
|
||||
.unwrap()
|
||||
};
|
||||
let query = query_from_text(&query);
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&query, &(Count)).unwrap()
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_90_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_10_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_1_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_10_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_1_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_1_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_1_percent(), "AND id:veryfew", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_10_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_90_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_90_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_90_percent(), "AND id:veryfew", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_90_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_10_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_1_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_10_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_1_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_1_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_1_percent(), "AND id:veryfew", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_10_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_90_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_90_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_90_percent(), "AND id:veryfew", &index));
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user