mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-04 00:02:55 +00:00
Compare commits
17 Commits
test_colum
...
agg_format
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b345c11786 | ||
|
|
7ebcc15b17 | ||
|
|
1b4076691f | ||
|
|
eab660873a | ||
|
|
232f37126e | ||
|
|
13e9885dfd | ||
|
|
56d79cb203 | ||
|
|
0f4c2e27cf | ||
|
|
f9ae295507 | ||
|
|
d9db5302d9 | ||
|
|
e453848134 | ||
|
|
59084143ef | ||
|
|
511b027350 | ||
|
|
322f47eb47 | ||
|
|
72f61ff89c | ||
|
|
a141c3ec59 | ||
|
|
e90e7a25ae |
4
.github/workflows/coverage.yml
vendored
4
.github/workflows/coverage.yml
vendored
@@ -15,11 +15,11 @@ jobs:
|
|||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
- name: Install Rust
|
- name: Install Rust
|
||||||
run: rustup toolchain install nightly-2024-04-10 --profile minimal --component llvm-tools-preview
|
run: rustup toolchain install nightly-2024-07-01 --profile minimal --component llvm-tools-preview
|
||||||
- uses: Swatinem/rust-cache@v2
|
- uses: Swatinem/rust-cache@v2
|
||||||
- uses: taiki-e/install-action@cargo-llvm-cov
|
- uses: taiki-e/install-action@cargo-llvm-cov
|
||||||
- name: Generate code coverage
|
- name: Generate code coverage
|
||||||
run: cargo +nightly-2024-04-10 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
|
run: cargo +nightly-2024-07-01 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
|
||||||
- name: Upload coverage to Codecov
|
- name: Upload coverage to Codecov
|
||||||
uses: codecov/codecov-action@v3
|
uses: codecov/codecov-action@v3
|
||||||
continue-on-error: true
|
continue-on-error: true
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
|
|||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
keywords = ["search", "information", "retrieval"]
|
keywords = ["search", "information", "retrieval"]
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
rust-version = "1.63"
|
rust-version = "1.66"
|
||||||
exclude = ["benches/*.json", "benches/*.txt"]
|
exclude = ["benches/*.json", "benches/*.txt"]
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
@@ -38,7 +38,7 @@ levenshtein_automata = "0.2.1"
|
|||||||
uuid = { version = "1.0.0", features = ["v4", "serde"] }
|
uuid = { version = "1.0.0", features = ["v4", "serde"] }
|
||||||
crossbeam-channel = "0.5.4"
|
crossbeam-channel = "0.5.4"
|
||||||
rust-stemmers = "1.2.0"
|
rust-stemmers = "1.2.0"
|
||||||
downcast-rs = "1.2.0"
|
downcast-rs = "1.2.1"
|
||||||
bitpacking = { version = "0.9.2", default-features = false, features = [
|
bitpacking = { version = "0.9.2", default-features = false, features = [
|
||||||
"bitpacker4x",
|
"bitpacker4x",
|
||||||
] }
|
] }
|
||||||
@@ -64,6 +64,7 @@ tantivy-bitpacker = { version = "0.6", path = "./bitpacker" }
|
|||||||
common = { version = "0.7", path = "./common/", package = "tantivy-common" }
|
common = { version = "0.7", path = "./common/", package = "tantivy-common" }
|
||||||
tokenizer-api = { version = "0.3", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
|
tokenizer-api = { version = "0.3", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
|
||||||
sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
|
sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
|
||||||
|
hyperloglogplus = { version = "0.4.1", features = ["const-loop"] }
|
||||||
futures-util = { version = "0.3.28", optional = true }
|
futures-util = { version = "0.3.28", optional = true }
|
||||||
fnv = "1.0.7"
|
fnv = "1.0.7"
|
||||||
|
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
|
|||||||
|
|
||||||
## Benchmark
|
## Benchmark
|
||||||
|
|
||||||
The following [benchmark](https://tantivy-search.github.io/bench/) breakdowns
|
The following [benchmark](https://tantivy-search.github.io/bench/) breaks down the
|
||||||
performance for different types of queries/collections.
|
performance for different types of queries/collections.
|
||||||
|
|
||||||
Your mileage WILL vary depending on the nature of queries and their load.
|
Your mileage WILL vary depending on the nature of queries and their load.
|
||||||
@@ -101,7 +101,8 @@ cargo test
|
|||||||
## Companies Using Tantivy
|
## Companies Using Tantivy
|
||||||
|
|
||||||
<p align="left">
|
<p align="left">
|
||||||
<img align="center" src="doc/assets/images/etsy.png" alt="Etsy" height="25" width="auto" />
|
<img align="center" src="doc/assets/images/etsy.png" alt="Etsy" height="25" width="auto" />
|
||||||
|
<img align="center" src="doc/assets/images/paradedb.png" alt="ParadeDB" height="25" width="auto" />
|
||||||
<img align="center" src="doc/assets/images/Nuclia.png#gh-light-mode-only" alt="Nuclia" height="25" width="auto" />
|
<img align="center" src="doc/assets/images/Nuclia.png#gh-light-mode-only" alt="Nuclia" height="25" width="auto" />
|
||||||
<img align="center" src="doc/assets/images/humanfirst.png#gh-light-mode-only" alt="Humanfirst.ai" height="30" width="auto" />
|
<img align="center" src="doc/assets/images/humanfirst.png#gh-light-mode-only" alt="Humanfirst.ai" height="30" width="auto" />
|
||||||
<img align="center" src="doc/assets/images/element.io.svg#gh-light-mode-only" alt="Element.io" height="25" width="auto" />
|
<img align="center" src="doc/assets/images/element.io.svg#gh-light-mode-only" alt="Element.io" height="25" width="auto" />
|
||||||
|
|||||||
@@ -51,10 +51,15 @@ fn bench_agg(mut group: InputGroup<Index>) {
|
|||||||
register!(group, percentiles_f64);
|
register!(group, percentiles_f64);
|
||||||
register!(group, terms_few);
|
register!(group, terms_few);
|
||||||
register!(group, terms_many);
|
register!(group, terms_many);
|
||||||
|
register!(group, terms_many_top_1000);
|
||||||
register!(group, terms_many_order_by_term);
|
register!(group, terms_many_order_by_term);
|
||||||
register!(group, terms_many_with_top_hits);
|
register!(group, terms_many_with_top_hits);
|
||||||
register!(group, terms_many_with_avg_sub_agg);
|
register!(group, terms_many_with_avg_sub_agg);
|
||||||
register!(group, terms_many_json_mixed_type_with_sub_agg_card);
|
register!(group, terms_many_json_mixed_type_with_avg_sub_agg);
|
||||||
|
|
||||||
|
register!(group, cardinality_agg);
|
||||||
|
register!(group, terms_few_with_cardinality_agg);
|
||||||
|
|
||||||
register!(group, range_agg);
|
register!(group, range_agg);
|
||||||
register!(group, range_agg_with_avg_sub_agg);
|
register!(group, range_agg_with_avg_sub_agg);
|
||||||
register!(group, range_agg_with_term_agg_few);
|
register!(group, range_agg_with_term_agg_few);
|
||||||
@@ -123,6 +128,33 @@ fn percentiles_f64(index: &Index) {
|
|||||||
});
|
});
|
||||||
execute_agg(index, agg_req);
|
execute_agg(index, agg_req);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn cardinality_agg(index: &Index) {
|
||||||
|
let agg_req = json!({
|
||||||
|
"cardinality": {
|
||||||
|
"cardinality": {
|
||||||
|
"field": "text_many_terms"
|
||||||
|
},
|
||||||
|
}
|
||||||
|
});
|
||||||
|
execute_agg(index, agg_req);
|
||||||
|
}
|
||||||
|
fn terms_few_with_cardinality_agg(index: &Index) {
|
||||||
|
let agg_req = json!({
|
||||||
|
"my_texts": {
|
||||||
|
"terms": { "field": "text_few_terms" },
|
||||||
|
"aggs": {
|
||||||
|
"cardinality": {
|
||||||
|
"cardinality": {
|
||||||
|
"field": "text_many_terms"
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
});
|
||||||
|
execute_agg(index, agg_req);
|
||||||
|
}
|
||||||
|
|
||||||
fn terms_few(index: &Index) {
|
fn terms_few(index: &Index) {
|
||||||
let agg_req = json!({
|
let agg_req = json!({
|
||||||
"my_texts": { "terms": { "field": "text_few_terms" } },
|
"my_texts": { "terms": { "field": "text_few_terms" } },
|
||||||
@@ -135,6 +167,12 @@ fn terms_many(index: &Index) {
|
|||||||
});
|
});
|
||||||
execute_agg(index, agg_req);
|
execute_agg(index, agg_req);
|
||||||
}
|
}
|
||||||
|
fn terms_many_top_1000(index: &Index) {
|
||||||
|
let agg_req = json!({
|
||||||
|
"my_texts": { "terms": { "field": "text_many_terms", "size": 1000 } },
|
||||||
|
});
|
||||||
|
execute_agg(index, agg_req);
|
||||||
|
}
|
||||||
fn terms_many_order_by_term(index: &Index) {
|
fn terms_many_order_by_term(index: &Index) {
|
||||||
let agg_req = json!({
|
let agg_req = json!({
|
||||||
"my_texts": { "terms": { "field": "text_many_terms", "order": { "_key": "desc" } } },
|
"my_texts": { "terms": { "field": "text_many_terms", "order": { "_key": "desc" } } },
|
||||||
@@ -171,7 +209,7 @@ fn terms_many_with_avg_sub_agg(index: &Index) {
|
|||||||
});
|
});
|
||||||
execute_agg(index, agg_req);
|
execute_agg(index, agg_req);
|
||||||
}
|
}
|
||||||
fn terms_many_json_mixed_type_with_sub_agg_card(index: &Index) {
|
fn terms_many_json_mixed_type_with_avg_sub_agg(index: &Index) {
|
||||||
let agg_req = json!({
|
let agg_req = json!({
|
||||||
"my_texts": {
|
"my_texts": {
|
||||||
"terms": { "field": "json.mixed_type" },
|
"terms": { "field": "json.mixed_type" },
|
||||||
@@ -268,6 +306,7 @@ fn range_agg_with_term_agg_many(index: &Index) {
|
|||||||
});
|
});
|
||||||
execute_agg(index, agg_req);
|
execute_agg(index, agg_req);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn histogram(index: &Index) {
|
fn histogram(index: &Index) {
|
||||||
let agg_req = json!({
|
let agg_req = json!({
|
||||||
"rangef64": {
|
"rangef64": {
|
||||||
|
|||||||
@@ -29,6 +29,10 @@ binggan = "0.8.1"
|
|||||||
name = "bench_merge"
|
name = "bench_merge"
|
||||||
harness = false
|
harness = false
|
||||||
|
|
||||||
|
[[bench]]
|
||||||
|
name = "bench_access"
|
||||||
|
harness = false
|
||||||
|
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
unstable = []
|
unstable = []
|
||||||
|
|||||||
67
columnar/benches/bench_access.rs
Normal file
67
columnar/benches/bench_access.rs
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
use binggan::{black_box, InputGroup};
|
||||||
|
use common::*;
|
||||||
|
use tantivy_columnar::Column;
|
||||||
|
|
||||||
|
pub mod common;
|
||||||
|
|
||||||
|
const NUM_DOCS: u32 = 2_000_000;
|
||||||
|
|
||||||
|
pub fn generate_columnar_and_open(card: Card, num_docs: u32) -> Column {
|
||||||
|
let reader = generate_columnar_with_name(card, num_docs, "price");
|
||||||
|
reader.read_columns("price").unwrap()[0]
|
||||||
|
.open_u64_lenient()
|
||||||
|
.unwrap()
|
||||||
|
.unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
let mut inputs = Vec::new();
|
||||||
|
|
||||||
|
let mut add_card = |card1: Card| {
|
||||||
|
inputs.push((
|
||||||
|
format!("{card1}"),
|
||||||
|
generate_columnar_and_open(card1, NUM_DOCS),
|
||||||
|
));
|
||||||
|
};
|
||||||
|
|
||||||
|
add_card(Card::MultiSparse);
|
||||||
|
add_card(Card::Multi);
|
||||||
|
add_card(Card::Sparse);
|
||||||
|
add_card(Card::Dense);
|
||||||
|
add_card(Card::Full);
|
||||||
|
|
||||||
|
bench_group(InputGroup::new_with_inputs(inputs));
|
||||||
|
}
|
||||||
|
|
||||||
|
fn bench_group(mut runner: InputGroup<Column>) {
|
||||||
|
runner.register("access_values_for_doc", |column| {
|
||||||
|
let mut sum = 0;
|
||||||
|
for i in 0..NUM_DOCS {
|
||||||
|
for value in column.values_for_doc(i) {
|
||||||
|
sum += value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
black_box(sum);
|
||||||
|
});
|
||||||
|
runner.register("access_first_vals", |column| {
|
||||||
|
let mut sum = 0;
|
||||||
|
const BLOCK_SIZE: usize = 32;
|
||||||
|
let mut docs = vec![0; BLOCK_SIZE];
|
||||||
|
let mut buffer = vec![None; BLOCK_SIZE];
|
||||||
|
for i in (0..NUM_DOCS).step_by(BLOCK_SIZE) {
|
||||||
|
// fill docs
|
||||||
|
for idx in 0..BLOCK_SIZE {
|
||||||
|
docs[idx] = idx as u32 + i;
|
||||||
|
}
|
||||||
|
|
||||||
|
column.first_vals(&docs, &mut buffer);
|
||||||
|
for val in buffer.iter() {
|
||||||
|
let Some(val) = val else { continue };
|
||||||
|
sum += *val;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
black_box(sum);
|
||||||
|
});
|
||||||
|
runner.run();
|
||||||
|
}
|
||||||
@@ -31,7 +31,7 @@ fn get_test_columns() -> Columns {
|
|||||||
}
|
}
|
||||||
let mut buffer: Vec<u8> = Vec::new();
|
let mut buffer: Vec<u8> = Vec::new();
|
||||||
dataframe_writer
|
dataframe_writer
|
||||||
.serialize(data.len() as u32, None, &mut buffer)
|
.serialize(data.len() as u32, &mut buffer)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||||
|
|
||||||
|
|||||||
@@ -1,62 +1,11 @@
|
|||||||
#![feature(test)]
|
pub mod common;
|
||||||
extern crate test;
|
|
||||||
|
|
||||||
use core::fmt;
|
|
||||||
use std::fmt::{Display, Formatter};
|
|
||||||
|
|
||||||
use binggan::{black_box, BenchRunner};
|
use binggan::{black_box, BenchRunner};
|
||||||
|
use common::{generate_columnar_with_name, Card};
|
||||||
use tantivy_columnar::*;
|
use tantivy_columnar::*;
|
||||||
|
|
||||||
enum Card {
|
|
||||||
Multi,
|
|
||||||
Sparse,
|
|
||||||
Dense,
|
|
||||||
}
|
|
||||||
impl Display for Card {
|
|
||||||
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
|
|
||||||
match self {
|
|
||||||
Card::Multi => write!(f, "multi"),
|
|
||||||
Card::Sparse => write!(f, "sparse"),
|
|
||||||
Card::Dense => write!(f, "dense"),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const NUM_DOCS: u32 = 100_000;
|
const NUM_DOCS: u32 = 100_000;
|
||||||
|
|
||||||
fn generate_columnar(card: Card, num_docs: u32) -> ColumnarReader {
|
|
||||||
use tantivy_columnar::ColumnarWriter;
|
|
||||||
|
|
||||||
let mut columnar_writer = ColumnarWriter::default();
|
|
||||||
|
|
||||||
match card {
|
|
||||||
Card::Multi => {
|
|
||||||
columnar_writer.record_numerical(0, "price", 10u64);
|
|
||||||
columnar_writer.record_numerical(0, "price", 10u64);
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
|
|
||||||
for i in 0..num_docs {
|
|
||||||
match card {
|
|
||||||
Card::Multi | Card::Sparse => {
|
|
||||||
if i % 8 == 0 {
|
|
||||||
columnar_writer.record_numerical(i, "price", i as u64);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Card::Dense => {
|
|
||||||
if i % 6 == 0 {
|
|
||||||
columnar_writer.record_numerical(i, "price", i as u64);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut wrt: Vec<u8> = Vec::new();
|
|
||||||
columnar_writer.serialize(num_docs, None, &mut wrt).unwrap();
|
|
||||||
|
|
||||||
ColumnarReader::open(wrt).unwrap()
|
|
||||||
}
|
|
||||||
fn main() {
|
fn main() {
|
||||||
let mut inputs = Vec::new();
|
let mut inputs = Vec::new();
|
||||||
|
|
||||||
@@ -64,16 +13,19 @@ fn main() {
|
|||||||
inputs.push((
|
inputs.push((
|
||||||
format!("merge_{card1}_and_{card2}"),
|
format!("merge_{card1}_and_{card2}"),
|
||||||
vec![
|
vec![
|
||||||
generate_columnar(card1, NUM_DOCS),
|
generate_columnar_with_name(card1, NUM_DOCS, "price"),
|
||||||
generate_columnar(card2, NUM_DOCS),
|
generate_columnar_with_name(card2, NUM_DOCS, "price"),
|
||||||
],
|
],
|
||||||
));
|
));
|
||||||
};
|
};
|
||||||
|
|
||||||
add_combo(Card::Multi, Card::Multi);
|
add_combo(Card::Multi, Card::Multi);
|
||||||
|
add_combo(Card::MultiSparse, Card::MultiSparse);
|
||||||
add_combo(Card::Dense, Card::Dense);
|
add_combo(Card::Dense, Card::Dense);
|
||||||
add_combo(Card::Sparse, Card::Sparse);
|
add_combo(Card::Sparse, Card::Sparse);
|
||||||
add_combo(Card::Sparse, Card::Dense);
|
add_combo(Card::Sparse, Card::Dense);
|
||||||
|
add_combo(Card::MultiSparse, Card::Dense);
|
||||||
|
add_combo(Card::MultiSparse, Card::Sparse);
|
||||||
add_combo(Card::Multi, Card::Dense);
|
add_combo(Card::Multi, Card::Dense);
|
||||||
add_combo(Card::Multi, Card::Sparse);
|
add_combo(Card::Multi, Card::Sparse);
|
||||||
|
|
||||||
@@ -84,16 +36,12 @@ fn main() {
|
|||||||
input_name,
|
input_name,
|
||||||
columnar_readers,
|
columnar_readers,
|
||||||
move |columnar_readers: &Vec<ColumnarReader>| {
|
move |columnar_readers: &Vec<ColumnarReader>| {
|
||||||
let mut out = vec![];
|
let mut out = Vec::new();
|
||||||
let columnar_readers = columnar_readers.iter().collect::<Vec<_>>();
|
let columnar_readers = columnar_readers.iter().collect::<Vec<_>>();
|
||||||
let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
|
let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
|
||||||
|
|
||||||
let _ = black_box(merge_columnar(
|
merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
|
||||||
&columnar_readers,
|
black_box(out);
|
||||||
&[],
|
|
||||||
merge_row_order.into(),
|
|
||||||
&mut out,
|
|
||||||
));
|
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
59
columnar/benches/common.rs
Normal file
59
columnar/benches/common.rs
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
extern crate tantivy_columnar;
|
||||||
|
|
||||||
|
use core::fmt;
|
||||||
|
use std::fmt::{Display, Formatter};
|
||||||
|
|
||||||
|
use tantivy_columnar::{ColumnarReader, ColumnarWriter};
|
||||||
|
|
||||||
|
pub enum Card {
|
||||||
|
MultiSparse,
|
||||||
|
Multi,
|
||||||
|
Sparse,
|
||||||
|
Dense,
|
||||||
|
Full,
|
||||||
|
}
|
||||||
|
impl Display for Card {
|
||||||
|
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
|
||||||
|
match self {
|
||||||
|
Card::MultiSparse => write!(f, "multi sparse 1/13"),
|
||||||
|
Card::Multi => write!(f, "multi 2x"),
|
||||||
|
Card::Sparse => write!(f, "sparse 1/13"),
|
||||||
|
Card::Dense => write!(f, "dense 1/12"),
|
||||||
|
Card::Full => write!(f, "full"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pub fn generate_columnar_with_name(card: Card, num_docs: u32, column_name: &str) -> ColumnarReader {
|
||||||
|
let mut columnar_writer = ColumnarWriter::default();
|
||||||
|
|
||||||
|
if let Card::MultiSparse = card {
|
||||||
|
columnar_writer.record_numerical(0, column_name, 10u64);
|
||||||
|
columnar_writer.record_numerical(0, column_name, 10u64);
|
||||||
|
}
|
||||||
|
|
||||||
|
for i in 0..num_docs {
|
||||||
|
match card {
|
||||||
|
Card::MultiSparse | Card::Sparse => {
|
||||||
|
if i % 13 == 0 {
|
||||||
|
columnar_writer.record_numerical(i, column_name, i as u64);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Card::Dense => {
|
||||||
|
if i % 12 == 0 {
|
||||||
|
columnar_writer.record_numerical(i, column_name, i as u64);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Card::Full => {
|
||||||
|
columnar_writer.record_numerical(i, column_name, i as u64);
|
||||||
|
}
|
||||||
|
Card::Multi => {
|
||||||
|
columnar_writer.record_numerical(i, column_name, i as u64);
|
||||||
|
columnar_writer.record_numerical(i, column_name, i as u64);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut wrt: Vec<u8> = Vec::new();
|
||||||
|
columnar_writer.serialize(num_docs, &mut wrt).unwrap();
|
||||||
|
ColumnarReader::open(wrt).unwrap()
|
||||||
|
}
|
||||||
BIN
columnar/compat_tests_data/v1.columnar
Normal file
BIN
columnar/compat_tests_data/v1.columnar
Normal file
Binary file not shown.
BIN
columnar/compat_tests_data/v2.columnar
Normal file
BIN
columnar/compat_tests_data/v2.columnar
Normal file
Binary file not shown.
@@ -136,7 +136,7 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
|
|||||||
.map(|value_row_id: RowId| self.values.get_val(value_row_id))
|
.map(|value_row_id: RowId| self.values.get_val(value_row_id))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the docids of values which are in the provided value range.
|
/// Get the docids of values which are in the provided value and docid range.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn get_docids_for_value_range(
|
pub fn get_docids_for_value_range(
|
||||||
&self,
|
&self,
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ use crate::column_values::{
|
|||||||
CodecType, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
|
CodecType, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
|
||||||
};
|
};
|
||||||
use crate::iterable::Iterable;
|
use crate::iterable::Iterable;
|
||||||
use crate::StrColumn;
|
use crate::{StrColumn, Version};
|
||||||
|
|
||||||
pub fn serialize_column_mappable_to_u128<T: MonotonicallyMappableToU128>(
|
pub fn serialize_column_mappable_to_u128<T: MonotonicallyMappableToU128>(
|
||||||
column_index: SerializableColumnIndex<'_>,
|
column_index: SerializableColumnIndex<'_>,
|
||||||
@@ -40,25 +40,9 @@ pub fn serialize_column_mappable_to_u64<T: MonotonicallyMappableToU64>(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn open_column_u64<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::Result<Column<T>> {
|
pub fn open_column_u64<T: MonotonicallyMappableToU64>(
|
||||||
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
|
|
||||||
let column_index_num_bytes = u32::from_le_bytes(
|
|
||||||
column_index_num_bytes_payload
|
|
||||||
.as_slice()
|
|
||||||
.try_into()
|
|
||||||
.unwrap(),
|
|
||||||
);
|
|
||||||
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
|
|
||||||
let column_index = crate::column_index::open_column_index(column_index_data)?;
|
|
||||||
let column_values = load_u64_based_column_values(column_values_data)?;
|
|
||||||
Ok(Column {
|
|
||||||
index: column_index,
|
|
||||||
values: column_values,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn open_column_u128<T: MonotonicallyMappableToU128>(
|
|
||||||
bytes: OwnedBytes,
|
bytes: OwnedBytes,
|
||||||
|
format_version: Version,
|
||||||
) -> io::Result<Column<T>> {
|
) -> io::Result<Column<T>> {
|
||||||
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
|
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
|
||||||
let column_index_num_bytes = u32::from_le_bytes(
|
let column_index_num_bytes = u32::from_le_bytes(
|
||||||
@@ -68,7 +52,27 @@ pub fn open_column_u128<T: MonotonicallyMappableToU128>(
|
|||||||
.unwrap(),
|
.unwrap(),
|
||||||
);
|
);
|
||||||
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
|
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
|
||||||
let column_index = crate::column_index::open_column_index(column_index_data)?;
|
let column_index = crate::column_index::open_column_index(column_index_data, format_version)?;
|
||||||
|
let column_values = load_u64_based_column_values(column_values_data)?;
|
||||||
|
Ok(Column {
|
||||||
|
index: column_index,
|
||||||
|
values: column_values,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn open_column_u128<T: MonotonicallyMappableToU128>(
|
||||||
|
bytes: OwnedBytes,
|
||||||
|
format_version: Version,
|
||||||
|
) -> io::Result<Column<T>> {
|
||||||
|
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
|
||||||
|
let column_index_num_bytes = u32::from_le_bytes(
|
||||||
|
column_index_num_bytes_payload
|
||||||
|
.as_slice()
|
||||||
|
.try_into()
|
||||||
|
.unwrap(),
|
||||||
|
);
|
||||||
|
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
|
||||||
|
let column_index = crate::column_index::open_column_index(column_index_data, format_version)?;
|
||||||
let column_values = crate::column_values::open_u128_mapped(column_values_data)?;
|
let column_values = crate::column_values::open_u128_mapped(column_values_data)?;
|
||||||
Ok(Column {
|
Ok(Column {
|
||||||
index: column_index,
|
index: column_index,
|
||||||
@@ -79,7 +83,10 @@ pub fn open_column_u128<T: MonotonicallyMappableToU128>(
|
|||||||
/// Open the column as u64.
|
/// Open the column as u64.
|
||||||
///
|
///
|
||||||
/// See [`open_u128_as_compact_u64`] for more details.
|
/// See [`open_u128_as_compact_u64`] for more details.
|
||||||
pub fn open_column_u128_as_compact_u64(bytes: OwnedBytes) -> io::Result<Column<u64>> {
|
pub fn open_column_u128_as_compact_u64(
|
||||||
|
bytes: OwnedBytes,
|
||||||
|
format_version: Version,
|
||||||
|
) -> io::Result<Column<u64>> {
|
||||||
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
|
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
|
||||||
let column_index_num_bytes = u32::from_le_bytes(
|
let column_index_num_bytes = u32::from_le_bytes(
|
||||||
column_index_num_bytes_payload
|
column_index_num_bytes_payload
|
||||||
@@ -88,7 +95,7 @@ pub fn open_column_u128_as_compact_u64(bytes: OwnedBytes) -> io::Result<Column<u
|
|||||||
.unwrap(),
|
.unwrap(),
|
||||||
);
|
);
|
||||||
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
|
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
|
||||||
let column_index = crate::column_index::open_column_index(column_index_data)?;
|
let column_index = crate::column_index::open_column_index(column_index_data, format_version)?;
|
||||||
let column_values = crate::column_values::open_u128_as_compact_u64(column_values_data)?;
|
let column_values = crate::column_values::open_u128_as_compact_u64(column_values_data)?;
|
||||||
Ok(Column {
|
Ok(Column {
|
||||||
index: column_index,
|
index: column_index,
|
||||||
@@ -96,19 +103,19 @@ pub fn open_column_u128_as_compact_u64(bytes: OwnedBytes) -> io::Result<Column<u
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn open_column_bytes(data: OwnedBytes) -> io::Result<BytesColumn> {
|
pub fn open_column_bytes(data: OwnedBytes, format_version: Version) -> io::Result<BytesColumn> {
|
||||||
let (body, dictionary_len_bytes) = data.rsplit(4);
|
let (body, dictionary_len_bytes) = data.rsplit(4);
|
||||||
let dictionary_len = u32::from_le_bytes(dictionary_len_bytes.as_slice().try_into().unwrap());
|
let dictionary_len = u32::from_le_bytes(dictionary_len_bytes.as_slice().try_into().unwrap());
|
||||||
let (dictionary_bytes, column_bytes) = body.split(dictionary_len as usize);
|
let (dictionary_bytes, column_bytes) = body.split(dictionary_len as usize);
|
||||||
let dictionary = Arc::new(Dictionary::from_bytes(dictionary_bytes)?);
|
let dictionary = Arc::new(Dictionary::from_bytes(dictionary_bytes)?);
|
||||||
let term_ord_column = crate::column::open_column_u64::<u64>(column_bytes)?;
|
let term_ord_column = crate::column::open_column_u64::<u64>(column_bytes, format_version)?;
|
||||||
Ok(BytesColumn {
|
Ok(BytesColumn {
|
||||||
dictionary,
|
dictionary,
|
||||||
term_ord_column,
|
term_ord_column,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn open_column_str(data: OwnedBytes) -> io::Result<StrColumn> {
|
pub fn open_column_str(data: OwnedBytes, format_version: Version) -> io::Result<StrColumn> {
|
||||||
let bytes_column = open_column_bytes(data)?;
|
let bytes_column = open_column_bytes(data, format_version)?;
|
||||||
Ok(StrColumn::wrap(bytes_column))
|
Ok(StrColumn::wrap(bytes_column))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -95,8 +95,12 @@ pub fn merge_column_index<'a>(
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
use common::OwnedBytes;
|
||||||
|
|
||||||
use crate::column_index::merge::detect_cardinality;
|
use crate::column_index::merge::detect_cardinality;
|
||||||
use crate::column_index::multivalued_index::MultiValueIndex;
|
use crate::column_index::multivalued_index::{
|
||||||
|
open_multivalued_index, serialize_multivalued_index, MultiValueIndex,
|
||||||
|
};
|
||||||
use crate::column_index::{merge_column_index, OptionalIndex, SerializableColumnIndex};
|
use crate::column_index::{merge_column_index, OptionalIndex, SerializableColumnIndex};
|
||||||
use crate::{
|
use crate::{
|
||||||
Cardinality, ColumnIndex, MergeRowOrder, RowAddr, RowId, ShuffleMergeOrder, StackMergeOrder,
|
Cardinality, ColumnIndex, MergeRowOrder, RowAddr, RowId, ShuffleMergeOrder, StackMergeOrder,
|
||||||
@@ -171,7 +175,11 @@ mod tests {
|
|||||||
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
|
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
|
||||||
panic!("Excpected a multivalued index")
|
panic!("Excpected a multivalued index")
|
||||||
};
|
};
|
||||||
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
|
let mut output = Vec::new();
|
||||||
|
serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();
|
||||||
|
let multivalue =
|
||||||
|
open_multivalued_index(OwnedBytes::new(output), crate::Version::V2).unwrap();
|
||||||
|
let start_indexes: Vec<RowId> = multivalue.get_start_index_column().iter().collect();
|
||||||
assert_eq!(&start_indexes, &[0, 3, 5]);
|
assert_eq!(&start_indexes, &[0, 3, 5]);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -200,11 +208,16 @@ mod tests {
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
.into();
|
.into();
|
||||||
|
|
||||||
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
|
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
|
||||||
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
|
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
|
||||||
panic!("Excpected a multivalued index")
|
panic!("Excpected a multivalued index")
|
||||||
};
|
};
|
||||||
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
|
let mut output = Vec::new();
|
||||||
|
serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();
|
||||||
|
let multivalue =
|
||||||
|
open_multivalued_index(OwnedBytes::new(output), crate::Version::V2).unwrap();
|
||||||
|
let start_indexes: Vec<RowId> = multivalue.get_start_index_column().iter().collect();
|
||||||
assert_eq!(&start_indexes, &[0, 3, 5, 6]);
|
assert_eq!(&start_indexes, &[0, 3, 5, 6]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
use std::iter;
|
use std::iter;
|
||||||
|
|
||||||
use crate::column_index::{SerializableColumnIndex, Set};
|
use crate::column_index::{
|
||||||
|
SerializableColumnIndex, SerializableMultivalueIndex, SerializableOptionalIndex, Set,
|
||||||
|
};
|
||||||
use crate::iterable::Iterable;
|
use crate::iterable::Iterable;
|
||||||
use crate::{Cardinality, ColumnIndex, RowId, ShuffleMergeOrder};
|
use crate::{Cardinality, ColumnIndex, RowId, ShuffleMergeOrder};
|
||||||
|
|
||||||
@@ -14,15 +16,24 @@ pub fn merge_column_index_shuffled<'a>(
|
|||||||
Cardinality::Optional => {
|
Cardinality::Optional => {
|
||||||
let non_null_row_ids =
|
let non_null_row_ids =
|
||||||
merge_column_index_shuffled_optional(column_indexes, shuffle_merge_order);
|
merge_column_index_shuffled_optional(column_indexes, shuffle_merge_order);
|
||||||
SerializableColumnIndex::Optional {
|
SerializableColumnIndex::Optional(SerializableOptionalIndex {
|
||||||
non_null_row_ids,
|
non_null_row_ids,
|
||||||
num_rows: shuffle_merge_order.num_rows(),
|
num_rows: shuffle_merge_order.num_rows(),
|
||||||
}
|
})
|
||||||
}
|
}
|
||||||
Cardinality::Multivalued => {
|
Cardinality::Multivalued => {
|
||||||
let multivalue_start_index =
|
let non_null_row_ids =
|
||||||
merge_column_index_shuffled_multivalued(column_indexes, shuffle_merge_order);
|
merge_column_index_shuffled_optional(column_indexes, shuffle_merge_order);
|
||||||
SerializableColumnIndex::Multivalued(multivalue_start_index)
|
SerializableColumnIndex::Multivalued(SerializableMultivalueIndex {
|
||||||
|
doc_ids_with_values: SerializableOptionalIndex {
|
||||||
|
non_null_row_ids,
|
||||||
|
num_rows: shuffle_merge_order.num_rows(),
|
||||||
|
},
|
||||||
|
start_offsets: merge_column_index_shuffled_multivalued(
|
||||||
|
column_indexes,
|
||||||
|
shuffle_merge_order,
|
||||||
|
),
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -102,11 +113,18 @@ fn iter_num_values<'a>(
|
|||||||
|
|
||||||
/// Transforms an iterator containing the number of vals per row (with `num_rows` elements)
|
/// Transforms an iterator containing the number of vals per row (with `num_rows` elements)
|
||||||
/// into a `start_offset` iterator starting at 0 and (with `num_rows + 1` element)
|
/// into a `start_offset` iterator starting at 0 and (with `num_rows + 1` element)
|
||||||
|
///
|
||||||
|
/// This will filter out rows with 0 values as these are covered by the optional index in the
|
||||||
|
/// multivalue index.
|
||||||
fn integrate_num_vals(num_vals: impl Iterator<Item = u32>) -> impl Iterator<Item = RowId> {
|
fn integrate_num_vals(num_vals: impl Iterator<Item = u32>) -> impl Iterator<Item = RowId> {
|
||||||
iter::once(0u32).chain(num_vals.scan(0, |state, num_vals| {
|
iter::once(0u32).chain(
|
||||||
*state += num_vals;
|
num_vals
|
||||||
Some(*state)
|
.filter(|num_vals| *num_vals != 0)
|
||||||
}))
|
.scan(0, |state, num_vals| {
|
||||||
|
*state += num_vals;
|
||||||
|
Some(*state)
|
||||||
|
}),
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Iterable<u32> for ShuffledMultivaluedIndex<'a> {
|
impl<'a> Iterable<u32> for ShuffledMultivaluedIndex<'a> {
|
||||||
@@ -134,7 +152,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_integrate_num_vals_several() {
|
fn test_integrate_num_vals_several() {
|
||||||
assert!(integrate_num_vals([3, 0, 10, 20].into_iter()).eq([0, 3, 3, 13, 33].into_iter()));
|
assert!(integrate_num_vals([3, 0, 10, 20].into_iter()).eq([0, 3, 13, 33].into_iter()));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -157,10 +175,10 @@ mod tests {
|
|||||||
Cardinality::Optional,
|
Cardinality::Optional,
|
||||||
&shuffle_merge_order,
|
&shuffle_merge_order,
|
||||||
);
|
);
|
||||||
let SerializableColumnIndex::Optional {
|
let SerializableColumnIndex::Optional(SerializableOptionalIndex {
|
||||||
non_null_row_ids,
|
non_null_row_ids,
|
||||||
num_rows,
|
num_rows,
|
||||||
} = serializable_index
|
}) = serializable_index
|
||||||
else {
|
else {
|
||||||
panic!()
|
panic!()
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
use std::iter;
|
use std::ops::Range;
|
||||||
|
|
||||||
use crate::column_index::{SerializableColumnIndex, Set};
|
use crate::column_index::multivalued_index::{MultiValueIndex, SerializableMultivalueIndex};
|
||||||
|
use crate::column_index::serialize::SerializableOptionalIndex;
|
||||||
|
use crate::column_index::SerializableColumnIndex;
|
||||||
use crate::iterable::Iterable;
|
use crate::iterable::Iterable;
|
||||||
use crate::{Cardinality, ColumnIndex, RowId, StackMergeOrder};
|
use crate::{Cardinality, ColumnIndex, RowId, StackMergeOrder};
|
||||||
|
|
||||||
@@ -15,23 +17,149 @@ pub fn merge_column_index_stacked<'a>(
|
|||||||
) -> SerializableColumnIndex<'a> {
|
) -> SerializableColumnIndex<'a> {
|
||||||
match cardinality_after_merge {
|
match cardinality_after_merge {
|
||||||
Cardinality::Full => SerializableColumnIndex::Full,
|
Cardinality::Full => SerializableColumnIndex::Full,
|
||||||
Cardinality::Optional => SerializableColumnIndex::Optional {
|
Cardinality::Optional => SerializableColumnIndex::Optional(SerializableOptionalIndex {
|
||||||
non_null_row_ids: Box::new(StackedOptionalIndex {
|
non_null_row_ids: Box::new(StackedOptionalIndex {
|
||||||
columns,
|
columns,
|
||||||
stack_merge_order,
|
stack_merge_order,
|
||||||
}),
|
}),
|
||||||
num_rows: stack_merge_order.num_rows(),
|
num_rows: stack_merge_order.num_rows(),
|
||||||
},
|
}),
|
||||||
Cardinality::Multivalued => {
|
Cardinality::Multivalued => {
|
||||||
let stacked_multivalued_index = StackedMultivaluedIndex {
|
let serializable_multivalue_index =
|
||||||
columns,
|
make_serializable_multivalued_index(columns, stack_merge_order);
|
||||||
stack_merge_order,
|
SerializableColumnIndex::Multivalued(serializable_multivalue_index)
|
||||||
};
|
|
||||||
SerializableColumnIndex::Multivalued(Box::new(stacked_multivalued_index))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct StackedDocIdsWithValues<'a> {
|
||||||
|
column_indexes: &'a [ColumnIndex],
|
||||||
|
stack_merge_order: &'a StackMergeOrder,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Iterable<u32> for StackedDocIdsWithValues<'_> {
|
||||||
|
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
||||||
|
Box::new((0..self.column_indexes.len()).flat_map(|i| {
|
||||||
|
let column_index = &self.column_indexes[i];
|
||||||
|
let doc_range = self.stack_merge_order.columnar_range(i);
|
||||||
|
get_doc_ids_with_values(column_index, doc_range)
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_doc_ids_with_values<'a>(
|
||||||
|
column_index: &'a ColumnIndex,
|
||||||
|
doc_range: Range<u32>,
|
||||||
|
) -> Box<dyn Iterator<Item = u32> + 'a> {
|
||||||
|
match column_index {
|
||||||
|
ColumnIndex::Empty { .. } => Box::new(0..0),
|
||||||
|
ColumnIndex::Full => Box::new(doc_range),
|
||||||
|
ColumnIndex::Optional(optional_index) => Box::new(
|
||||||
|
optional_index
|
||||||
|
.iter_rows()
|
||||||
|
.map(move |row| row + doc_range.start),
|
||||||
|
),
|
||||||
|
ColumnIndex::Multivalued(multivalued_index) => match multivalued_index {
|
||||||
|
MultiValueIndex::MultiValueIndexV1(multivalued_index) => {
|
||||||
|
Box::new((0..multivalued_index.num_docs()).filter_map(move |docid| {
|
||||||
|
let range = multivalued_index.range(docid);
|
||||||
|
if range.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(docid + doc_range.start)
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
MultiValueIndex::MultiValueIndexV2(multivalued_index) => Box::new(
|
||||||
|
multivalued_index
|
||||||
|
.optional_index
|
||||||
|
.iter_rows()
|
||||||
|
.map(move |row| row + doc_range.start),
|
||||||
|
),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn stack_doc_ids_with_values<'a>(
|
||||||
|
column_indexes: &'a [ColumnIndex],
|
||||||
|
stack_merge_order: &'a StackMergeOrder,
|
||||||
|
) -> SerializableOptionalIndex<'a> {
|
||||||
|
let num_rows = stack_merge_order.num_rows();
|
||||||
|
SerializableOptionalIndex {
|
||||||
|
non_null_row_ids: Box::new(StackedDocIdsWithValues {
|
||||||
|
column_indexes,
|
||||||
|
stack_merge_order,
|
||||||
|
}),
|
||||||
|
num_rows,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct StackedStartOffsets<'a> {
|
||||||
|
column_indexes: &'a [ColumnIndex],
|
||||||
|
stack_merge_order: &'a StackMergeOrder,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_num_values_iterator<'a>(
|
||||||
|
column_index: &'a ColumnIndex,
|
||||||
|
num_docs: u32,
|
||||||
|
) -> Box<dyn Iterator<Item = u32> + 'a> {
|
||||||
|
match column_index {
|
||||||
|
ColumnIndex::Empty { .. } => Box::new(std::iter::empty()),
|
||||||
|
ColumnIndex::Full => Box::new(std::iter::repeat(1u32).take(num_docs as usize)),
|
||||||
|
ColumnIndex::Optional(optional_index) => {
|
||||||
|
Box::new(std::iter::repeat(1u32).take(optional_index.num_non_nulls() as usize))
|
||||||
|
}
|
||||||
|
ColumnIndex::Multivalued(multivalued_index) => Box::new(
|
||||||
|
multivalued_index
|
||||||
|
.get_start_index_column()
|
||||||
|
.iter()
|
||||||
|
.scan(0u32, |previous_start_offset, current_start_offset| {
|
||||||
|
let num_vals = current_start_offset - *previous_start_offset;
|
||||||
|
*previous_start_offset = current_start_offset;
|
||||||
|
Some(num_vals)
|
||||||
|
})
|
||||||
|
.skip(1),
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Iterable<u32> for StackedStartOffsets<'a> {
|
||||||
|
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
||||||
|
let num_values_it = (0..self.column_indexes.len()).flat_map(|columnar_id| {
|
||||||
|
let num_docs = self.stack_merge_order.columnar_range(columnar_id).len() as u32;
|
||||||
|
let column_index = &self.column_indexes[columnar_id];
|
||||||
|
get_num_values_iterator(column_index, num_docs)
|
||||||
|
});
|
||||||
|
Box::new(std::iter::once(0u32).chain(num_values_it.into_iter().scan(
|
||||||
|
0u32,
|
||||||
|
|cumulated, el| {
|
||||||
|
*cumulated += el;
|
||||||
|
Some(*cumulated)
|
||||||
|
},
|
||||||
|
)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn stack_start_offsets<'a>(
|
||||||
|
column_indexes: &'a [ColumnIndex],
|
||||||
|
stack_merge_order: &'a StackMergeOrder,
|
||||||
|
) -> Box<dyn Iterable<u32> + 'a> {
|
||||||
|
Box::new(StackedStartOffsets {
|
||||||
|
column_indexes,
|
||||||
|
stack_merge_order,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn make_serializable_multivalued_index<'a>(
|
||||||
|
columns: &'a [ColumnIndex],
|
||||||
|
stack_merge_order: &'a StackMergeOrder,
|
||||||
|
) -> SerializableMultivalueIndex<'a> {
|
||||||
|
SerializableMultivalueIndex {
|
||||||
|
doc_ids_with_values: stack_doc_ids_with_values(columns, stack_merge_order),
|
||||||
|
start_offsets: stack_start_offsets(columns, stack_merge_order),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
struct StackedOptionalIndex<'a> {
|
struct StackedOptionalIndex<'a> {
|
||||||
columns: &'a [ColumnIndex],
|
columns: &'a [ColumnIndex],
|
||||||
stack_merge_order: &'a StackMergeOrder,
|
stack_merge_order: &'a StackMergeOrder,
|
||||||
@@ -62,87 +190,3 @@ impl<'a> Iterable<RowId> for StackedOptionalIndex<'a> {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Copy)]
|
|
||||||
struct StackedMultivaluedIndex<'a> {
|
|
||||||
columns: &'a [ColumnIndex],
|
|
||||||
stack_merge_order: &'a StackMergeOrder,
|
|
||||||
}
|
|
||||||
|
|
||||||
fn convert_column_opt_to_multivalued_index<'a>(
|
|
||||||
column_index_opt: &'a ColumnIndex,
|
|
||||||
num_rows: RowId,
|
|
||||||
) -> Box<dyn Iterator<Item = RowId> + 'a> {
|
|
||||||
match column_index_opt {
|
|
||||||
ColumnIndex::Empty { .. } => Box::new(iter::repeat(0u32).take(num_rows as usize + 1)),
|
|
||||||
ColumnIndex::Full => Box::new(0..num_rows + 1),
|
|
||||||
ColumnIndex::Optional(optional_index) => {
|
|
||||||
Box::new(
|
|
||||||
(0..num_rows)
|
|
||||||
// TODO optimize
|
|
||||||
.map(|row_id| optional_index.rank(row_id))
|
|
||||||
.chain(std::iter::once(optional_index.num_non_nulls())),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
ColumnIndex::Multivalued(multivalued_index) => multivalued_index.start_index_column.iter(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> Iterable<RowId> for StackedMultivaluedIndex<'a> {
|
|
||||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = RowId> + '_> {
|
|
||||||
let multivalued_indexes =
|
|
||||||
self.columns
|
|
||||||
.iter()
|
|
||||||
.enumerate()
|
|
||||||
.map(|(columnar_id, column_opt)| {
|
|
||||||
let num_rows =
|
|
||||||
self.stack_merge_order.columnar_range(columnar_id).len() as RowId;
|
|
||||||
convert_column_opt_to_multivalued_index(column_opt, num_rows)
|
|
||||||
});
|
|
||||||
stack_multivalued_indexes(multivalued_indexes)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Refactor me
|
|
||||||
fn stack_multivalued_indexes<'a>(
|
|
||||||
mut multivalued_indexes: impl Iterator<Item = Box<dyn Iterator<Item = RowId> + 'a>> + 'a,
|
|
||||||
) -> Box<dyn Iterator<Item = RowId> + 'a> {
|
|
||||||
let mut offset = 0;
|
|
||||||
let mut last_row_id = 0;
|
|
||||||
let mut current_it = multivalued_indexes.next();
|
|
||||||
Box::new(std::iter::from_fn(move || loop {
|
|
||||||
if let Some(row_id) = current_it.as_mut()?.next() {
|
|
||||||
last_row_id = offset + row_id;
|
|
||||||
return Some(last_row_id);
|
|
||||||
}
|
|
||||||
offset = last_row_id;
|
|
||||||
loop {
|
|
||||||
current_it = multivalued_indexes.next();
|
|
||||||
if current_it.as_mut()?.next().is_some() {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use crate::RowId;
|
|
||||||
|
|
||||||
fn it<'a>(row_ids: &'a [RowId]) -> Box<dyn Iterator<Item = RowId> + 'a> {
|
|
||||||
Box::new(row_ids.iter().copied())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_stack() {
|
|
||||||
let columns = [
|
|
||||||
it(&[0u32, 0u32]),
|
|
||||||
it(&[0u32, 1u32, 1u32, 4u32]),
|
|
||||||
it(&[0u32, 3u32, 5u32]),
|
|
||||||
it(&[0u32, 4u32]),
|
|
||||||
]
|
|
||||||
.into_iter();
|
|
||||||
let start_offsets: Vec<RowId> = super::stack_multivalued_indexes(columns).collect();
|
|
||||||
assert_eq!(start_offsets, &[0, 0, 1, 1, 4, 7, 9, 13]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -11,8 +11,11 @@ mod serialize;
|
|||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
|
|
||||||
pub use merge::merge_column_index;
|
pub use merge::merge_column_index;
|
||||||
|
pub(crate) use multivalued_index::SerializableMultivalueIndex;
|
||||||
pub use optional_index::{OptionalIndex, Set};
|
pub use optional_index::{OptionalIndex, Set};
|
||||||
pub use serialize::{open_column_index, serialize_column_index, SerializableColumnIndex};
|
pub use serialize::{
|
||||||
|
open_column_index, serialize_column_index, SerializableColumnIndex, SerializableOptionalIndex,
|
||||||
|
};
|
||||||
|
|
||||||
use crate::column_index::multivalued_index::MultiValueIndex;
|
use crate::column_index::multivalued_index::MultiValueIndex;
|
||||||
use crate::{Cardinality, DocId, RowId};
|
use crate::{Cardinality, DocId, RowId};
|
||||||
@@ -131,15 +134,41 @@ impl ColumnIndex {
|
|||||||
let row_end = optional_index.rank(doc_id_range.end);
|
let row_end = optional_index.rank(doc_id_range.end);
|
||||||
row_start..row_end
|
row_start..row_end
|
||||||
}
|
}
|
||||||
ColumnIndex::Multivalued(multivalued_index) => {
|
ColumnIndex::Multivalued(multivalued_index) => match multivalued_index {
|
||||||
let end_docid = doc_id_range.end.min(multivalued_index.num_docs() - 1) + 1;
|
MultiValueIndex::MultiValueIndexV1(index) => {
|
||||||
let start_docid = doc_id_range.start.min(end_docid);
|
let row_start = index.start_index_column.get_val(doc_id_range.start);
|
||||||
|
let row_end = index.start_index_column.get_val(doc_id_range.end);
|
||||||
|
row_start..row_end
|
||||||
|
}
|
||||||
|
MultiValueIndex::MultiValueIndexV2(index) => {
|
||||||
|
// In this case we will use the optional_index to select the next values
|
||||||
|
// that are valid. There are different cases to consider:
|
||||||
|
// Not exists below means does not exist in the optional
|
||||||
|
// index, because it has no values.
|
||||||
|
// * doc_id_range may cover a range of docids which are non existent
|
||||||
|
// => rank
|
||||||
|
// will give us the next document outside the range with a value. They both
|
||||||
|
// get the same rank and therefore return a zero range
|
||||||
|
//
|
||||||
|
// * doc_id_range.start and doc_id_range.end may not exist, but docids in
|
||||||
|
// between may have values
|
||||||
|
// => rank will give us the next document outside the range with a value.
|
||||||
|
//
|
||||||
|
// * doc_id_range.start may be not existent but doc_id_range.end may exist
|
||||||
|
// * doc_id_range.start may exist but doc_id_range.end may not exist
|
||||||
|
// * doc_id_range.start and doc_id_range.end may exist
|
||||||
|
// => rank on doc_id_range.end will give us the next value, which matches
|
||||||
|
// how the `start_index_column` works, so we get the value start of the next
|
||||||
|
// docid which we use to create the exclusive range.
|
||||||
|
//
|
||||||
|
let rank_start = index.optional_index.rank(doc_id_range.start);
|
||||||
|
let row_start = index.start_index_column.get_val(rank_start);
|
||||||
|
let rank_end = index.optional_index.rank(doc_id_range.end);
|
||||||
|
let row_end = index.start_index_column.get_val(rank_end);
|
||||||
|
|
||||||
let row_start = multivalued_index.start_index_column.get_val(start_docid);
|
row_start..row_end
|
||||||
let row_end = multivalued_index.start_index_column.get_val(end_docid);
|
}
|
||||||
|
},
|
||||||
row_start..row_end
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -3,64 +3,98 @@ use std::io::Write;
|
|||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use common::OwnedBytes;
|
use common::{CountingWriter, OwnedBytes};
|
||||||
|
|
||||||
|
use super::optional_index::{open_optional_index, serialize_optional_index};
|
||||||
|
use super::{OptionalIndex, SerializableOptionalIndex, Set};
|
||||||
use crate::column_values::{
|
use crate::column_values::{
|
||||||
load_u64_based_column_values, serialize_u64_based_column_values, CodecType, ColumnValues,
|
load_u64_based_column_values, serialize_u64_based_column_values, CodecType, ColumnValues,
|
||||||
};
|
};
|
||||||
use crate::iterable::Iterable;
|
use crate::iterable::Iterable;
|
||||||
use crate::{DocId, RowId};
|
use crate::{DocId, RowId, Version};
|
||||||
|
|
||||||
|
pub struct SerializableMultivalueIndex<'a> {
|
||||||
|
pub doc_ids_with_values: SerializableOptionalIndex<'a>,
|
||||||
|
pub start_offsets: Box<dyn Iterable<u32> + 'a>,
|
||||||
|
}
|
||||||
|
|
||||||
pub fn serialize_multivalued_index(
|
pub fn serialize_multivalued_index(
|
||||||
multivalued_index: &dyn Iterable<RowId>,
|
multivalued_index: &SerializableMultivalueIndex,
|
||||||
output: &mut impl Write,
|
output: &mut impl Write,
|
||||||
) -> io::Result<()> {
|
) -> io::Result<()> {
|
||||||
|
let SerializableMultivalueIndex {
|
||||||
|
doc_ids_with_values,
|
||||||
|
start_offsets,
|
||||||
|
} = multivalued_index;
|
||||||
|
let mut count_writer = CountingWriter::wrap(output);
|
||||||
|
let SerializableOptionalIndex {
|
||||||
|
non_null_row_ids,
|
||||||
|
num_rows,
|
||||||
|
} = doc_ids_with_values;
|
||||||
|
serialize_optional_index(&**non_null_row_ids, *num_rows, &mut count_writer)?;
|
||||||
|
let optional_len = count_writer.written_bytes() as u32;
|
||||||
|
let output = count_writer.finish();
|
||||||
serialize_u64_based_column_values(
|
serialize_u64_based_column_values(
|
||||||
multivalued_index,
|
&**start_offsets,
|
||||||
&[CodecType::Bitpacked, CodecType::Linear],
|
&[CodecType::Bitpacked, CodecType::Linear],
|
||||||
output,
|
output,
|
||||||
)?;
|
)?;
|
||||||
|
output.write_all(&optional_len.to_le_bytes())?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn open_multivalued_index(bytes: OwnedBytes) -> io::Result<MultiValueIndex> {
|
pub fn open_multivalued_index(
|
||||||
let start_index_column: Arc<dyn ColumnValues<RowId>> = load_u64_based_column_values(bytes)?;
|
bytes: OwnedBytes,
|
||||||
Ok(MultiValueIndex { start_index_column })
|
format_version: Version,
|
||||||
|
) -> io::Result<MultiValueIndex> {
|
||||||
|
match format_version {
|
||||||
|
Version::V1 => {
|
||||||
|
let start_index_column: Arc<dyn ColumnValues<RowId>> =
|
||||||
|
load_u64_based_column_values(bytes)?;
|
||||||
|
Ok(MultiValueIndex::MultiValueIndexV1(MultiValueIndexV1 {
|
||||||
|
start_index_column,
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
Version::V2 => {
|
||||||
|
let (body_bytes, optional_index_len) = bytes.rsplit(4);
|
||||||
|
let optional_index_len =
|
||||||
|
u32::from_le_bytes(optional_index_len.as_slice().try_into().unwrap());
|
||||||
|
let (optional_index_bytes, start_index_bytes) =
|
||||||
|
body_bytes.split(optional_index_len as usize);
|
||||||
|
let optional_index = open_optional_index(optional_index_bytes)?;
|
||||||
|
let start_index_column: Arc<dyn ColumnValues<RowId>> =
|
||||||
|
load_u64_based_column_values(start_index_bytes)?;
|
||||||
|
Ok(MultiValueIndex::MultiValueIndexV2(MultiValueIndexV2 {
|
||||||
|
optional_index,
|
||||||
|
start_index_column,
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
/// Index to resolve value range for given doc_id.
|
/// Index to resolve value range for given doc_id.
|
||||||
/// Starts at 0.
|
/// Starts at 0.
|
||||||
pub struct MultiValueIndex {
|
pub enum MultiValueIndex {
|
||||||
|
MultiValueIndexV1(MultiValueIndexV1),
|
||||||
|
MultiValueIndexV2(MultiValueIndexV2),
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
/// Index to resolve value range for given doc_id.
|
||||||
|
/// Starts at 0.
|
||||||
|
pub struct MultiValueIndexV1 {
|
||||||
pub start_index_column: Arc<dyn crate::ColumnValues<RowId>>,
|
pub start_index_column: Arc<dyn crate::ColumnValues<RowId>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl std::fmt::Debug for MultiValueIndex {
|
impl MultiValueIndexV1 {
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
|
||||||
f.debug_struct("MultiValuedIndex")
|
|
||||||
.field("num_rows", &self.start_index_column.num_vals())
|
|
||||||
.finish_non_exhaustive()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<Arc<dyn ColumnValues<RowId>>> for MultiValueIndex {
|
|
||||||
fn from(start_index_column: Arc<dyn ColumnValues<RowId>>) -> Self {
|
|
||||||
MultiValueIndex { start_index_column }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl MultiValueIndex {
|
|
||||||
pub fn for_test(start_offsets: &[RowId]) -> MultiValueIndex {
|
|
||||||
let mut buffer = Vec::new();
|
|
||||||
serialize_multivalued_index(&start_offsets, &mut buffer).unwrap();
|
|
||||||
let bytes = OwnedBytes::new(buffer);
|
|
||||||
open_multivalued_index(bytes).unwrap()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns `[start, end)`, such that the values associated with
|
/// Returns `[start, end)`, such that the values associated with
|
||||||
/// the given document are `start..end`.
|
/// the given document are `start..end`.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub(crate) fn range(&self, doc_id: DocId) -> Range<RowId> {
|
pub(crate) fn range(&self, doc_id: DocId) -> Range<RowId> {
|
||||||
|
if doc_id >= self.num_docs() {
|
||||||
|
return 0..0;
|
||||||
|
}
|
||||||
let start = self.start_index_column.get_val(doc_id);
|
let start = self.start_index_column.get_val(doc_id);
|
||||||
let end = self.start_index_column.get_val(doc_id + 1);
|
let end = self.start_index_column.get_val(doc_id + 1);
|
||||||
start..end
|
start..end
|
||||||
@@ -83,7 +117,6 @@ impl MultiValueIndex {
|
|||||||
///
|
///
|
||||||
/// TODO: Instead of a linear scan we can employ a exponential search into binary search to
|
/// TODO: Instead of a linear scan we can employ a exponential search into binary search to
|
||||||
/// match a docid to its value position.
|
/// match a docid to its value position.
|
||||||
#[allow(clippy::bool_to_int_with_if)]
|
|
||||||
pub(crate) fn select_batch_in_place(&self, docid_start: DocId, ranks: &mut Vec<u32>) {
|
pub(crate) fn select_batch_in_place(&self, docid_start: DocId, ranks: &mut Vec<u32>) {
|
||||||
if ranks.is_empty() {
|
if ranks.is_empty() {
|
||||||
return;
|
return;
|
||||||
@@ -111,11 +144,170 @@ impl MultiValueIndex {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
/// Index to resolve value range for given doc_id.
|
||||||
|
/// Starts at 0.
|
||||||
|
pub struct MultiValueIndexV2 {
|
||||||
|
pub optional_index: OptionalIndex,
|
||||||
|
pub start_index_column: Arc<dyn crate::ColumnValues<RowId>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Debug for MultiValueIndex {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||||
|
let index = match self {
|
||||||
|
MultiValueIndex::MultiValueIndexV1(idx) => &idx.start_index_column,
|
||||||
|
MultiValueIndex::MultiValueIndexV2(idx) => &idx.start_index_column,
|
||||||
|
};
|
||||||
|
f.debug_struct("MultiValuedIndex")
|
||||||
|
.field("num_rows", &index.num_vals())
|
||||||
|
.finish_non_exhaustive()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MultiValueIndex {
|
||||||
|
pub fn for_test(start_offsets: &[RowId]) -> MultiValueIndex {
|
||||||
|
assert!(!start_offsets.is_empty());
|
||||||
|
assert_eq!(start_offsets[0], 0);
|
||||||
|
let mut doc_with_values = Vec::new();
|
||||||
|
let mut compact_start_offsets: Vec<u32> = vec![0];
|
||||||
|
for doc in 0..start_offsets.len() - 1 {
|
||||||
|
if start_offsets[doc] < start_offsets[doc + 1] {
|
||||||
|
doc_with_values.push(doc as RowId);
|
||||||
|
compact_start_offsets.push(start_offsets[doc + 1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let serializable_multivalued_index = SerializableMultivalueIndex {
|
||||||
|
doc_ids_with_values: SerializableOptionalIndex {
|
||||||
|
non_null_row_ids: Box::new(&doc_with_values[..]),
|
||||||
|
num_rows: start_offsets.len() as u32 - 1,
|
||||||
|
},
|
||||||
|
start_offsets: Box::new(&compact_start_offsets[..]),
|
||||||
|
};
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
serialize_multivalued_index(&serializable_multivalued_index, &mut buffer).unwrap();
|
||||||
|
let bytes = OwnedBytes::new(buffer);
|
||||||
|
open_multivalued_index(bytes, Version::V2).unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_start_index_column(&self) -> &Arc<dyn crate::ColumnValues<RowId>> {
|
||||||
|
match self {
|
||||||
|
MultiValueIndex::MultiValueIndexV1(idx) => &idx.start_index_column,
|
||||||
|
MultiValueIndex::MultiValueIndexV2(idx) => &idx.start_index_column,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns `[start, end)` values range, such that the values associated with
|
||||||
|
/// the given document are `start..end`.
|
||||||
|
#[inline]
|
||||||
|
pub(crate) fn range(&self, doc_id: DocId) -> Range<RowId> {
|
||||||
|
match self {
|
||||||
|
MultiValueIndex::MultiValueIndexV1(idx) => idx.range(doc_id),
|
||||||
|
MultiValueIndex::MultiValueIndexV2(idx) => idx.range(doc_id),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the number of documents in the index.
|
||||||
|
#[inline]
|
||||||
|
pub fn num_docs(&self) -> u32 {
|
||||||
|
match self {
|
||||||
|
MultiValueIndex::MultiValueIndexV1(idx) => idx.start_index_column.num_vals() - 1,
|
||||||
|
MultiValueIndex::MultiValueIndexV2(idx) => idx.optional_index.num_docs(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
|
||||||
|
/// docids. Positions are converted inplace to docids.
|
||||||
|
///
|
||||||
|
/// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the
|
||||||
|
/// index.
|
||||||
|
///
|
||||||
|
/// Correctness: positions needs to be sorted. idx_reader needs to contain monotonically
|
||||||
|
/// increasing positions.
|
||||||
|
///
|
||||||
|
/// TODO: Instead of a linear scan we can employ an exponential search into binary search to
|
||||||
|
/// match a docid to its value position.
|
||||||
|
pub(crate) fn select_batch_in_place(&self, docid_start: DocId, ranks: &mut Vec<u32>) {
|
||||||
|
match self {
|
||||||
|
MultiValueIndex::MultiValueIndexV1(idx) => {
|
||||||
|
idx.select_batch_in_place(docid_start, ranks)
|
||||||
|
}
|
||||||
|
MultiValueIndex::MultiValueIndexV2(idx) => {
|
||||||
|
idx.select_batch_in_place(docid_start, ranks)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
impl MultiValueIndexV2 {
|
||||||
|
/// Returns `[start, end)`, such that the values associated with
|
||||||
|
/// the given document are `start..end`.
|
||||||
|
#[inline]
|
||||||
|
pub(crate) fn range(&self, doc_id: DocId) -> Range<RowId> {
|
||||||
|
let Some(rank) = self.optional_index.rank_if_exists(doc_id) else {
|
||||||
|
return 0..0;
|
||||||
|
};
|
||||||
|
let start = self.start_index_column.get_val(rank);
|
||||||
|
let end = self.start_index_column.get_val(rank + 1);
|
||||||
|
start..end
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the number of documents in the index.
|
||||||
|
#[inline]
|
||||||
|
pub fn num_docs(&self) -> u32 {
|
||||||
|
self.optional_index.num_docs()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
|
||||||
|
/// docids. Positions are converted inplace to docids.
|
||||||
|
///
|
||||||
|
/// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the
|
||||||
|
/// index.
|
||||||
|
///
|
||||||
|
/// Correctness: positions needs to be sorted. idx_reader needs to contain monotonically
|
||||||
|
/// increasing positions.
|
||||||
|
///
|
||||||
|
/// TODO: Instead of a linear scan we can employ an exponential search into binary search to
|
||||||
|
/// match a docid to its value position.
|
||||||
|
pub(crate) fn select_batch_in_place(&self, docid_start: DocId, ranks: &mut Vec<u32>) {
|
||||||
|
if ranks.is_empty() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
let mut cur_pos_in_idx = self.optional_index.rank(docid_start);
|
||||||
|
let mut last_doc = None;
|
||||||
|
|
||||||
|
assert!(cur_pos_in_idx <= ranks[0]);
|
||||||
|
|
||||||
|
let mut write_doc_pos = 0;
|
||||||
|
for i in 0..ranks.len() {
|
||||||
|
let pos = ranks[i];
|
||||||
|
loop {
|
||||||
|
let end = self.start_index_column.get_val(cur_pos_in_idx + 1);
|
||||||
|
if end > pos {
|
||||||
|
ranks[write_doc_pos] = cur_pos_in_idx;
|
||||||
|
write_doc_pos += if last_doc == Some(cur_pos_in_idx) {
|
||||||
|
0
|
||||||
|
} else {
|
||||||
|
1
|
||||||
|
};
|
||||||
|
last_doc = Some(cur_pos_in_idx);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
cur_pos_in_idx += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ranks.truncate(write_doc_pos);
|
||||||
|
|
||||||
|
for rank in ranks.iter_mut() {
|
||||||
|
*rank = self.optional_index.select(*rank);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
|
|
||||||
use super::MultiValueIndex;
|
use super::MultiValueIndex;
|
||||||
|
use crate::{ColumnarReader, DynamicColumn};
|
||||||
|
|
||||||
fn index_to_pos_helper(
|
fn index_to_pos_helper(
|
||||||
index: &MultiValueIndex,
|
index: &MultiValueIndex,
|
||||||
@@ -134,6 +326,7 @@ mod tests {
|
|||||||
let positions = &[10u32, 11, 15, 20, 21, 22];
|
let positions = &[10u32, 11, 15, 20, 21, 22];
|
||||||
assert_eq!(index_to_pos_helper(&index, 0..5, positions), vec![1, 3, 4]);
|
assert_eq!(index_to_pos_helper(&index, 0..5, positions), vec![1, 3, 4]);
|
||||||
assert_eq!(index_to_pos_helper(&index, 1..5, positions), vec![1, 3, 4]);
|
assert_eq!(index_to_pos_helper(&index, 1..5, positions), vec![1, 3, 4]);
|
||||||
|
|
||||||
assert_eq!(index_to_pos_helper(&index, 0..5, &[9]), vec![0]);
|
assert_eq!(index_to_pos_helper(&index, 0..5, &[9]), vec![0]);
|
||||||
assert_eq!(index_to_pos_helper(&index, 1..5, &[10]), vec![1]);
|
assert_eq!(index_to_pos_helper(&index, 1..5, &[10]), vec![1]);
|
||||||
assert_eq!(index_to_pos_helper(&index, 1..5, &[11]), vec![1]);
|
assert_eq!(index_to_pos_helper(&index, 1..5, &[11]), vec![1]);
|
||||||
@@ -141,4 +334,67 @@ mod tests {
|
|||||||
assert_eq!(index_to_pos_helper(&index, 2..5, &[12, 14]), vec![2]);
|
assert_eq!(index_to_pos_helper(&index, 2..5, &[12, 14]), vec![2]);
|
||||||
assert_eq!(index_to_pos_helper(&index, 2..5, &[12, 14, 15]), vec![2, 3]);
|
assert_eq!(index_to_pos_helper(&index, 2..5, &[12, 14, 15]), vec![2, 3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_range_to_rowids() {
|
||||||
|
use crate::ColumnarWriter;
|
||||||
|
|
||||||
|
let mut columnar_writer = ColumnarWriter::default();
|
||||||
|
|
||||||
|
// This column gets coerced to u64
|
||||||
|
columnar_writer.record_numerical(1, "full", u64::MAX);
|
||||||
|
columnar_writer.record_numerical(1, "full", u64::MAX);
|
||||||
|
|
||||||
|
columnar_writer.record_numerical(5, "full", u64::MAX);
|
||||||
|
columnar_writer.record_numerical(5, "full", u64::MAX);
|
||||||
|
|
||||||
|
let mut wrt: Vec<u8> = Vec::new();
|
||||||
|
columnar_writer.serialize(7, &mut wrt).unwrap();
|
||||||
|
|
||||||
|
let reader = ColumnarReader::open(wrt).unwrap();
|
||||||
|
// Open the column as u64
|
||||||
|
let column = reader.read_columns("full").unwrap()[0]
|
||||||
|
.open()
|
||||||
|
.unwrap()
|
||||||
|
.coerce_numerical(crate::NumericalType::U64)
|
||||||
|
.unwrap();
|
||||||
|
let DynamicColumn::U64(column) = column else {
|
||||||
|
panic!();
|
||||||
|
};
|
||||||
|
|
||||||
|
let row_id_range = column.index.docid_range_to_rowids(1..2);
|
||||||
|
assert_eq!(row_id_range, 0..2);
|
||||||
|
|
||||||
|
let row_id_range = column.index.docid_range_to_rowids(0..2);
|
||||||
|
assert_eq!(row_id_range, 0..2);
|
||||||
|
|
||||||
|
let row_id_range = column.index.docid_range_to_rowids(0..4);
|
||||||
|
assert_eq!(row_id_range, 0..2);
|
||||||
|
|
||||||
|
let row_id_range = column.index.docid_range_to_rowids(3..4);
|
||||||
|
assert_eq!(row_id_range, 2..2);
|
||||||
|
|
||||||
|
let row_id_range = column.index.docid_range_to_rowids(1..6);
|
||||||
|
assert_eq!(row_id_range, 0..4);
|
||||||
|
|
||||||
|
let row_id_range = column.index.docid_range_to_rowids(3..6);
|
||||||
|
assert_eq!(row_id_range, 2..4);
|
||||||
|
|
||||||
|
let row_id_range = column.index.docid_range_to_rowids(0..6);
|
||||||
|
assert_eq!(row_id_range, 0..4);
|
||||||
|
|
||||||
|
let row_id_range = column.index.docid_range_to_rowids(0..6);
|
||||||
|
assert_eq!(row_id_range, 0..4);
|
||||||
|
|
||||||
|
let check = |range, expected| {
|
||||||
|
let full_range = 0..=u64::MAX;
|
||||||
|
let mut docids = Vec::new();
|
||||||
|
column.get_docids_for_value_range(full_range, range, &mut docids);
|
||||||
|
assert_eq!(docids, expected);
|
||||||
|
};
|
||||||
|
|
||||||
|
// check(0..1, vec![]);
|
||||||
|
// check(0..2, vec![1]);
|
||||||
|
check(1..2, vec![1]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -86,8 +86,14 @@ pub struct OptionalIndex {
|
|||||||
block_metas: Arc<[BlockMeta]>,
|
block_metas: Arc<[BlockMeta]>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<'a> Iterable<u32> for &'a OptionalIndex {
|
||||||
|
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
||||||
|
Box::new(self.iter_rows())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl std::fmt::Debug for OptionalIndex {
|
impl std::fmt::Debug for OptionalIndex {
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||||
f.debug_struct("OptionalIndex")
|
f.debug_struct("OptionalIndex")
|
||||||
.field("num_rows", &self.num_rows)
|
.field("num_rows", &self.num_rows)
|
||||||
.field("num_non_null_rows", &self.num_non_null_rows)
|
.field("num_non_null_rows", &self.num_non_null_rows)
|
||||||
|
|||||||
@@ -28,10 +28,11 @@ pub trait Set<T> {
|
|||||||
/// Returns true if the elements is contained in the Set
|
/// Returns true if the elements is contained in the Set
|
||||||
fn contains(&self, el: T) -> bool;
|
fn contains(&self, el: T) -> bool;
|
||||||
|
|
||||||
/// Returns the number of rows in the set that are < `el`
|
/// Returns the element's rank (its position in the set).
|
||||||
|
/// If the set does not contain the element, it will return the next existing elements rank.
|
||||||
fn rank(&self, el: T) -> T;
|
fn rank(&self, el: T) -> T;
|
||||||
|
|
||||||
/// If the set contains `el` returns the element rank.
|
/// If the set contains `el`, returns the element's rank (its position in the set).
|
||||||
/// If the set does not contain the element, it returns `None`.
|
/// If the set does not contain the element, it returns `None`.
|
||||||
fn rank_if_exists(&self, el: T) -> Option<T>;
|
fn rank_if_exists(&self, el: T) -> Option<T>;
|
||||||
|
|
||||||
|
|||||||
@@ -22,8 +22,8 @@ fn test_set_helper<C: SetCodec<Item = u16>>(vals: &[u16]) -> usize {
|
|||||||
vals.iter().cloned().take_while(|v| *v < val).count() as u16
|
vals.iter().cloned().take_while(|v| *v < val).count() as u16
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
for rank in 0..vals.len() {
|
for (rank, val) in vals.iter().enumerate() {
|
||||||
assert_eq!(tested_set.select(rank as u16), vals[rank]);
|
assert_eq!(tested_set.select(rank as u16), *val);
|
||||||
}
|
}
|
||||||
buffer.len()
|
buffer.len()
|
||||||
}
|
}
|
||||||
@@ -107,3 +107,41 @@ fn test_simple_translate_codec_idx_to_original_idx_dense() {
|
|||||||
assert_eq!(i, select_cursor.select(i));
|
assert_eq!(i, select_cursor.select(i));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_simple_translate_idx_to_value_idx_dense() {
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
DenseBlockCodec::serialize([1, 10].iter().copied(), &mut buffer).unwrap();
|
||||||
|
let tested_set = DenseBlockCodec::open(buffer.as_slice());
|
||||||
|
assert!(tested_set.contains(1));
|
||||||
|
assert!(!tested_set.contains(2));
|
||||||
|
assert_eq!(tested_set.rank(0), 0);
|
||||||
|
assert_eq!(tested_set.rank(1), 0);
|
||||||
|
for rank in 2..10 {
|
||||||
|
// ranks that don't exist select the next highest one
|
||||||
|
assert_eq!(tested_set.rank_if_exists(rank), None);
|
||||||
|
assert_eq!(tested_set.rank(rank), 1);
|
||||||
|
}
|
||||||
|
assert_eq!(tested_set.rank(10), 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_simple_translate_idx_to_value_idx_sparse() {
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
SparseBlockCodec::serialize([1, 10].iter().copied(), &mut buffer).unwrap();
|
||||||
|
let tested_set = SparseBlockCodec::open(buffer.as_slice());
|
||||||
|
assert!(tested_set.contains(1));
|
||||||
|
assert!(!tested_set.contains(2));
|
||||||
|
assert_eq!(tested_set.rank(0), 0);
|
||||||
|
assert_eq!(tested_set.select(tested_set.rank(0)), 1);
|
||||||
|
assert_eq!(tested_set.rank(1), 0);
|
||||||
|
assert_eq!(tested_set.select(tested_set.rank(1)), 1);
|
||||||
|
for rank in 2..10 {
|
||||||
|
// ranks that don't exist select the next highest one
|
||||||
|
assert_eq!(tested_set.rank_if_exists(rank), None);
|
||||||
|
assert_eq!(tested_set.rank(rank), 1);
|
||||||
|
assert_eq!(tested_set.select(tested_set.rank(rank)), 10);
|
||||||
|
}
|
||||||
|
assert_eq!(tested_set.rank(10), 1);
|
||||||
|
assert_eq!(tested_set.select(tested_set.rank(10)), 10);
|
||||||
|
}
|
||||||
|
|||||||
@@ -15,9 +15,7 @@ fn test_optional_index_with_num_docs(num_docs: u32) {
|
|||||||
let mut dataframe_writer = ColumnarWriter::default();
|
let mut dataframe_writer = ColumnarWriter::default();
|
||||||
dataframe_writer.record_numerical(100, "score", 80i64);
|
dataframe_writer.record_numerical(100, "score", 80i64);
|
||||||
let mut buffer: Vec<u8> = Vec::new();
|
let mut buffer: Vec<u8> = Vec::new();
|
||||||
dataframe_writer
|
dataframe_writer.serialize(num_docs, &mut buffer).unwrap();
|
||||||
.serialize(num_docs, None, &mut buffer)
|
|
||||||
.unwrap();
|
|
||||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||||
assert_eq!(columnar.num_columns(), 1);
|
assert_eq!(columnar.num_columns(), 1);
|
||||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("score").unwrap();
|
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("score").unwrap();
|
||||||
|
|||||||
@@ -3,28 +3,39 @@ use std::io::Write;
|
|||||||
|
|
||||||
use common::{CountingWriter, OwnedBytes};
|
use common::{CountingWriter, OwnedBytes};
|
||||||
|
|
||||||
|
use super::multivalued_index::SerializableMultivalueIndex;
|
||||||
|
use super::OptionalIndex;
|
||||||
use crate::column_index::multivalued_index::serialize_multivalued_index;
|
use crate::column_index::multivalued_index::serialize_multivalued_index;
|
||||||
use crate::column_index::optional_index::serialize_optional_index;
|
use crate::column_index::optional_index::serialize_optional_index;
|
||||||
use crate::column_index::ColumnIndex;
|
use crate::column_index::ColumnIndex;
|
||||||
use crate::iterable::Iterable;
|
use crate::iterable::Iterable;
|
||||||
use crate::{Cardinality, RowId};
|
use crate::{Cardinality, RowId, Version};
|
||||||
|
|
||||||
|
pub struct SerializableOptionalIndex<'a> {
|
||||||
|
pub non_null_row_ids: Box<dyn Iterable<RowId> + 'a>,
|
||||||
|
pub num_rows: RowId,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> From<&'a OptionalIndex> for SerializableOptionalIndex<'a> {
|
||||||
|
fn from(optional_index: &'a OptionalIndex) -> Self {
|
||||||
|
SerializableOptionalIndex {
|
||||||
|
non_null_row_ids: Box::new(optional_index),
|
||||||
|
num_rows: optional_index.num_docs(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub enum SerializableColumnIndex<'a> {
|
pub enum SerializableColumnIndex<'a> {
|
||||||
Full,
|
Full,
|
||||||
Optional {
|
Optional(SerializableOptionalIndex<'a>),
|
||||||
non_null_row_ids: Box<dyn Iterable<RowId> + 'a>,
|
Multivalued(SerializableMultivalueIndex<'a>),
|
||||||
num_rows: RowId,
|
|
||||||
},
|
|
||||||
// TODO remove the Arc<dyn> apart from serialization this is not
|
|
||||||
// dynamic at all.
|
|
||||||
Multivalued(Box<dyn Iterable<RowId> + 'a>),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> SerializableColumnIndex<'a> {
|
impl<'a> SerializableColumnIndex<'a> {
|
||||||
pub fn get_cardinality(&self) -> Cardinality {
|
pub fn get_cardinality(&self) -> Cardinality {
|
||||||
match self {
|
match self {
|
||||||
SerializableColumnIndex::Full => Cardinality::Full,
|
SerializableColumnIndex::Full => Cardinality::Full,
|
||||||
SerializableColumnIndex::Optional { .. } => Cardinality::Optional,
|
SerializableColumnIndex::Optional(_) => Cardinality::Optional,
|
||||||
SerializableColumnIndex::Multivalued(_) => Cardinality::Multivalued,
|
SerializableColumnIndex::Multivalued(_) => Cardinality::Multivalued,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -40,12 +51,12 @@ pub fn serialize_column_index(
|
|||||||
output.write_all(&[cardinality])?;
|
output.write_all(&[cardinality])?;
|
||||||
match column_index {
|
match column_index {
|
||||||
SerializableColumnIndex::Full => {}
|
SerializableColumnIndex::Full => {}
|
||||||
SerializableColumnIndex::Optional {
|
SerializableColumnIndex::Optional(SerializableOptionalIndex {
|
||||||
non_null_row_ids,
|
non_null_row_ids,
|
||||||
num_rows,
|
num_rows,
|
||||||
} => serialize_optional_index(non_null_row_ids.as_ref(), num_rows, &mut output)?,
|
}) => serialize_optional_index(non_null_row_ids.as_ref(), num_rows, &mut output)?,
|
||||||
SerializableColumnIndex::Multivalued(multivalued_index) => {
|
SerializableColumnIndex::Multivalued(multivalued_index) => {
|
||||||
serialize_multivalued_index(&*multivalued_index, &mut output)?
|
serialize_multivalued_index(&multivalued_index, &mut output)?
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let column_index_num_bytes = output.written_bytes() as u32;
|
let column_index_num_bytes = output.written_bytes() as u32;
|
||||||
@@ -53,7 +64,10 @@ pub fn serialize_column_index(
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Open a serialized column index.
|
/// Open a serialized column index.
|
||||||
pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex> {
|
pub fn open_column_index(
|
||||||
|
mut bytes: OwnedBytes,
|
||||||
|
format_version: Version,
|
||||||
|
) -> io::Result<ColumnIndex> {
|
||||||
if bytes.is_empty() {
|
if bytes.is_empty() {
|
||||||
return Err(io::Error::new(
|
return Err(io::Error::new(
|
||||||
io::ErrorKind::UnexpectedEof,
|
io::ErrorKind::UnexpectedEof,
|
||||||
@@ -70,7 +84,8 @@ pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex> {
|
|||||||
Ok(ColumnIndex::Optional(optional_index))
|
Ok(ColumnIndex::Optional(optional_index))
|
||||||
}
|
}
|
||||||
Cardinality::Multivalued => {
|
Cardinality::Multivalued => {
|
||||||
let multivalue_index = super::multivalued_index::open_multivalued_index(bytes)?;
|
let multivalue_index =
|
||||||
|
super::multivalued_index::open_multivalued_index(bytes, format_version)?;
|
||||||
Ok(ColumnIndex::Multivalued(multivalue_index))
|
Ok(ColumnIndex::Multivalued(multivalue_index))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats {
|
|||||||
fn value_iter() -> impl Iterator<Item = u64> {
|
fn value_iter() -> impl Iterator<Item = u64> {
|
||||||
0..20_000
|
0..20_000
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues {
|
fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues {
|
||||||
let mut bytes = Vec::new();
|
let mut bytes = Vec::new();
|
||||||
let stats = compute_stats(data.iter().cloned());
|
let stats = compute_stats(data.iter().cloned());
|
||||||
@@ -41,10 +42,13 @@ fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues
|
|||||||
for val in data {
|
for val in data {
|
||||||
codec_serializer.collect(*val);
|
codec_serializer.collect(*val);
|
||||||
}
|
}
|
||||||
codec_serializer.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes);
|
codec_serializer
|
||||||
|
.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
Codec::load(OwnedBytes::new(bytes)).unwrap()
|
Codec::load(OwnedBytes::new(bytes)).unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn bench_get<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
|
fn bench_get<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
|
||||||
let col = get_reader_for_bench::<Codec>(data);
|
let col = get_reader_for_bench::<Codec>(data);
|
||||||
b.iter(|| {
|
b.iter(|| {
|
||||||
|
|||||||
@@ -1,3 +1,6 @@
|
|||||||
|
use core::fmt;
|
||||||
|
use std::fmt::{Display, Formatter};
|
||||||
|
|
||||||
use crate::InvalidData;
|
use crate::InvalidData;
|
||||||
|
|
||||||
pub const VERSION_FOOTER_NUM_BYTES: usize = MAGIC_BYTES.len() + std::mem::size_of::<u32>();
|
pub const VERSION_FOOTER_NUM_BYTES: usize = MAGIC_BYTES.len() + std::mem::size_of::<u32>();
|
||||||
@@ -8,7 +11,7 @@ const MAGIC_BYTES: [u8; 4] = [2, 113, 119, 66];
|
|||||||
|
|
||||||
pub fn footer() -> [u8; VERSION_FOOTER_NUM_BYTES] {
|
pub fn footer() -> [u8; VERSION_FOOTER_NUM_BYTES] {
|
||||||
let mut footer_bytes = [0u8; VERSION_FOOTER_NUM_BYTES];
|
let mut footer_bytes = [0u8; VERSION_FOOTER_NUM_BYTES];
|
||||||
footer_bytes[0..4].copy_from_slice(&Version::V1.to_bytes());
|
footer_bytes[0..4].copy_from_slice(&CURRENT_VERSION.to_bytes());
|
||||||
footer_bytes[4..8].copy_from_slice(&MAGIC_BYTES[..]);
|
footer_bytes[4..8].copy_from_slice(&MAGIC_BYTES[..]);
|
||||||
footer_bytes
|
footer_bytes
|
||||||
}
|
}
|
||||||
@@ -20,10 +23,22 @@ pub fn parse_footer(footer_bytes: [u8; VERSION_FOOTER_NUM_BYTES]) -> Result<Vers
|
|||||||
Version::try_from_bytes(footer_bytes[0..4].try_into().unwrap())
|
Version::try_from_bytes(footer_bytes[0..4].try_into().unwrap())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub const CURRENT_VERSION: Version = Version::V2;
|
||||||
|
|
||||||
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
|
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
|
||||||
#[repr(u32)]
|
#[repr(u32)]
|
||||||
pub enum Version {
|
pub enum Version {
|
||||||
V1 = 1u32,
|
V1 = 1u32,
|
||||||
|
V2 = 2u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Display for Version {
|
||||||
|
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
|
||||||
|
match self {
|
||||||
|
Version::V1 => write!(f, "v1"),
|
||||||
|
Version::V2 => write!(f, "v2"),
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Version {
|
impl Version {
|
||||||
@@ -35,6 +50,7 @@ impl Version {
|
|||||||
let code = u32::from_le_bytes(bytes);
|
let code = u32::from_le_bytes(bytes);
|
||||||
match code {
|
match code {
|
||||||
1u32 => Ok(Version::V1),
|
1u32 => Ok(Version::V1),
|
||||||
|
2u32 => Ok(Version::V2),
|
||||||
_ => Err(InvalidData),
|
_ => Err(InvalidData),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -47,9 +63,9 @@ mod tests {
|
|||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_footer_dserialization() {
|
fn test_footer_deserialization() {
|
||||||
let parsed_version: Version = parse_footer(footer()).unwrap();
|
let parsed_version: Version = parse_footer(footer()).unwrap();
|
||||||
assert_eq!(Version::V1, parsed_version);
|
assert_eq!(Version::V2, parsed_version);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -63,11 +79,10 @@ mod tests {
|
|||||||
for &i in &version_to_tests {
|
for &i in &version_to_tests {
|
||||||
let version_res = Version::try_from_bytes(i.to_le_bytes());
|
let version_res = Version::try_from_bytes(i.to_le_bytes());
|
||||||
if let Ok(version) = version_res {
|
if let Ok(version) = version_res {
|
||||||
assert_eq!(version, Version::V1);
|
|
||||||
assert_eq!(version.to_bytes(), i.to_le_bytes());
|
assert_eq!(version.to_bytes(), i.to_le_bytes());
|
||||||
valid_versions.insert(i);
|
valid_versions.insert(i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
assert_eq!(valid_versions.len(), 1);
|
assert_eq!(valid_versions.len(), 2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ use std::io;
|
|||||||
use std::net::Ipv6Addr;
|
use std::net::Ipv6Addr;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use itertools::Itertools;
|
|
||||||
pub use merge_mapping::{MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};
|
pub use merge_mapping::{MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};
|
||||||
|
|
||||||
use super::writer::ColumnarSerializer;
|
use super::writer::ColumnarSerializer;
|
||||||
@@ -371,20 +370,8 @@ fn is_empty_after_merge(
|
|||||||
true
|
true
|
||||||
}
|
}
|
||||||
ColumnIndex::Multivalued(multivalued_index) => {
|
ColumnIndex::Multivalued(multivalued_index) => {
|
||||||
for (doc_id, (start_index, end_index)) in multivalued_index
|
for alive_docid in alive_bitset.iter() {
|
||||||
.start_index_column
|
if !multivalued_index.range(alive_docid).is_empty() {
|
||||||
.iter()
|
|
||||||
.tuple_windows()
|
|
||||||
.enumerate()
|
|
||||||
{
|
|
||||||
let doc_id = doc_id as u32;
|
|
||||||
if start_index == end_index {
|
|
||||||
// There are no values in this document
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
// The document contains values and is present in the alive bitset.
|
|
||||||
// The column is therefore not empty.
|
|
||||||
if alive_bitset.contains(doc_id) {
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
use itertools::Itertools;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::{Cardinality, ColumnarWriter, HasAssociatedColumnType, RowId};
|
use crate::{Cardinality, ColumnarWriter, HasAssociatedColumnType, RowId};
|
||||||
|
|
||||||
@@ -12,7 +14,7 @@ fn make_columnar<T: Into<NumericalValue> + HasAssociatedColumnType + Copy>(
|
|||||||
}
|
}
|
||||||
let mut buffer: Vec<u8> = Vec::new();
|
let mut buffer: Vec<u8> = Vec::new();
|
||||||
dataframe_writer
|
dataframe_writer
|
||||||
.serialize(vals.len() as RowId, None, &mut buffer)
|
.serialize(vals.len() as RowId, &mut buffer)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
ColumnarReader::open(buffer).unwrap()
|
ColumnarReader::open(buffer).unwrap()
|
||||||
}
|
}
|
||||||
@@ -157,9 +159,7 @@ fn make_numerical_columnar_multiple_columns(
|
|||||||
.max()
|
.max()
|
||||||
.unwrap_or(0u32);
|
.unwrap_or(0u32);
|
||||||
let mut buffer: Vec<u8> = Vec::new();
|
let mut buffer: Vec<u8> = Vec::new();
|
||||||
dataframe_writer
|
dataframe_writer.serialize(num_rows, &mut buffer).unwrap();
|
||||||
.serialize(num_rows, None, &mut buffer)
|
|
||||||
.unwrap();
|
|
||||||
ColumnarReader::open(buffer).unwrap()
|
ColumnarReader::open(buffer).unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -182,9 +182,7 @@ fn make_byte_columnar_multiple_columns(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
let mut buffer: Vec<u8> = Vec::new();
|
let mut buffer: Vec<u8> = Vec::new();
|
||||||
dataframe_writer
|
dataframe_writer.serialize(num_rows, &mut buffer).unwrap();
|
||||||
.serialize(num_rows, None, &mut buffer)
|
|
||||||
.unwrap();
|
|
||||||
ColumnarReader::open(buffer).unwrap()
|
ColumnarReader::open(buffer).unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -203,9 +201,7 @@ fn make_text_columnar_multiple_columns(columns: &[(&str, &[&[&str]])]) -> Column
|
|||||||
.max()
|
.max()
|
||||||
.unwrap_or(0u32);
|
.unwrap_or(0u32);
|
||||||
let mut buffer: Vec<u8> = Vec::new();
|
let mut buffer: Vec<u8> = Vec::new();
|
||||||
dataframe_writer
|
dataframe_writer.serialize(num_rows, &mut buffer).unwrap();
|
||||||
.serialize(num_rows, None, &mut buffer)
|
|
||||||
.unwrap();
|
|
||||||
ColumnarReader::open(buffer).unwrap()
|
ColumnarReader::open(buffer).unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ mod reader;
|
|||||||
mod writer;
|
mod writer;
|
||||||
|
|
||||||
pub use column_type::{ColumnType, HasAssociatedColumnType};
|
pub use column_type::{ColumnType, HasAssociatedColumnType};
|
||||||
|
pub use format_version::{Version, CURRENT_VERSION};
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub(crate) use merge::ColumnTypeCategory;
|
pub(crate) use merge::ColumnTypeCategory;
|
||||||
pub use merge::{merge_columnar, MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};
|
pub use merge::{merge_columnar, MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ use sstable::{Dictionary, RangeSSTable};
|
|||||||
|
|
||||||
use crate::columnar::{format_version, ColumnType};
|
use crate::columnar::{format_version, ColumnType};
|
||||||
use crate::dynamic_column::DynamicColumnHandle;
|
use crate::dynamic_column::DynamicColumnHandle;
|
||||||
use crate::RowId;
|
use crate::{RowId, Version};
|
||||||
|
|
||||||
fn io_invalid_data(msg: String) -> io::Error {
|
fn io_invalid_data(msg: String) -> io::Error {
|
||||||
io::Error::new(io::ErrorKind::InvalidData, msg)
|
io::Error::new(io::ErrorKind::InvalidData, msg)
|
||||||
@@ -19,6 +19,7 @@ pub struct ColumnarReader {
|
|||||||
column_dictionary: Dictionary<RangeSSTable>,
|
column_dictionary: Dictionary<RangeSSTable>,
|
||||||
column_data: FileSlice,
|
column_data: FileSlice,
|
||||||
num_rows: RowId,
|
num_rows: RowId,
|
||||||
|
format_version: Version,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl fmt::Debug for ColumnarReader {
|
impl fmt::Debug for ColumnarReader {
|
||||||
@@ -53,6 +54,7 @@ impl fmt::Debug for ColumnarReader {
|
|||||||
fn read_all_columns_in_stream(
|
fn read_all_columns_in_stream(
|
||||||
mut stream: sstable::Streamer<'_, RangeSSTable>,
|
mut stream: sstable::Streamer<'_, RangeSSTable>,
|
||||||
column_data: &FileSlice,
|
column_data: &FileSlice,
|
||||||
|
format_version: Version,
|
||||||
) -> io::Result<Vec<DynamicColumnHandle>> {
|
) -> io::Result<Vec<DynamicColumnHandle>> {
|
||||||
let mut results = Vec::new();
|
let mut results = Vec::new();
|
||||||
while stream.advance() {
|
while stream.advance() {
|
||||||
@@ -67,6 +69,7 @@ fn read_all_columns_in_stream(
|
|||||||
let dynamic_column_handle = DynamicColumnHandle {
|
let dynamic_column_handle = DynamicColumnHandle {
|
||||||
file_slice,
|
file_slice,
|
||||||
column_type,
|
column_type,
|
||||||
|
format_version,
|
||||||
};
|
};
|
||||||
results.push(dynamic_column_handle);
|
results.push(dynamic_column_handle);
|
||||||
}
|
}
|
||||||
@@ -88,7 +91,7 @@ impl ColumnarReader {
|
|||||||
let num_rows = u32::deserialize(&mut &footer_bytes[8..12])?;
|
let num_rows = u32::deserialize(&mut &footer_bytes[8..12])?;
|
||||||
let version_footer_bytes: [u8; format_version::VERSION_FOOTER_NUM_BYTES] =
|
let version_footer_bytes: [u8; format_version::VERSION_FOOTER_NUM_BYTES] =
|
||||||
footer_bytes[12..].try_into().unwrap();
|
footer_bytes[12..].try_into().unwrap();
|
||||||
let _version = format_version::parse_footer(version_footer_bytes)?;
|
let format_version = format_version::parse_footer(version_footer_bytes)?;
|
||||||
let (column_data, sstable) =
|
let (column_data, sstable) =
|
||||||
file_slice_without_sstable_len.split_from_end(sstable_len as usize);
|
file_slice_without_sstable_len.split_from_end(sstable_len as usize);
|
||||||
let column_dictionary = Dictionary::open(sstable)?;
|
let column_dictionary = Dictionary::open(sstable)?;
|
||||||
@@ -96,6 +99,7 @@ impl ColumnarReader {
|
|||||||
column_dictionary,
|
column_dictionary,
|
||||||
column_data,
|
column_data,
|
||||||
num_rows,
|
num_rows,
|
||||||
|
format_version,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -126,6 +130,7 @@ impl ColumnarReader {
|
|||||||
let column_handle = DynamicColumnHandle {
|
let column_handle = DynamicColumnHandle {
|
||||||
file_slice,
|
file_slice,
|
||||||
column_type,
|
column_type,
|
||||||
|
format_version: self.format_version,
|
||||||
};
|
};
|
||||||
Some((column_name, column_handle))
|
Some((column_name, column_handle))
|
||||||
} else {
|
} else {
|
||||||
@@ -167,7 +172,7 @@ impl ColumnarReader {
|
|||||||
.stream_for_column_range(column_name)
|
.stream_for_column_range(column_name)
|
||||||
.into_stream_async()
|
.into_stream_async()
|
||||||
.await?;
|
.await?;
|
||||||
read_all_columns_in_stream(stream, &self.column_data)
|
read_all_columns_in_stream(stream, &self.column_data, self.format_version)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get all columns for the given column name.
|
/// Get all columns for the given column name.
|
||||||
@@ -176,7 +181,7 @@ impl ColumnarReader {
|
|||||||
/// different types.
|
/// different types.
|
||||||
pub fn read_columns(&self, column_name: &str) -> io::Result<Vec<DynamicColumnHandle>> {
|
pub fn read_columns(&self, column_name: &str) -> io::Result<Vec<DynamicColumnHandle>> {
|
||||||
let stream = self.stream_for_column_range(column_name).into_stream()?;
|
let stream = self.stream_for_column_range(column_name).into_stream()?;
|
||||||
read_all_columns_in_stream(stream, &self.column_data)
|
read_all_columns_in_stream(stream, &self.column_data, self.format_version)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return the number of columns in the columnar.
|
/// Return the number of columns in the columnar.
|
||||||
@@ -195,7 +200,7 @@ mod tests {
|
|||||||
columnar_writer.record_column_type("col1", ColumnType::Str, false);
|
columnar_writer.record_column_type("col1", ColumnType::Str, false);
|
||||||
columnar_writer.record_column_type("col2", ColumnType::U64, false);
|
columnar_writer.record_column_type("col2", ColumnType::U64, false);
|
||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
columnar_writer.serialize(1, None, &mut buffer).unwrap();
|
columnar_writer.serialize(1, &mut buffer).unwrap();
|
||||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||||
let columns = columnar.list_columns().unwrap();
|
let columns = columnar.list_columns().unwrap();
|
||||||
assert_eq!(columns.len(), 2);
|
assert_eq!(columns.len(), 2);
|
||||||
@@ -211,7 +216,7 @@ mod tests {
|
|||||||
columnar_writer.record_column_type("count", ColumnType::U64, false);
|
columnar_writer.record_column_type("count", ColumnType::U64, false);
|
||||||
columnar_writer.record_numerical(1, "count", 1u64);
|
columnar_writer.record_numerical(1, "count", 1u64);
|
||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
columnar_writer.serialize(2, None, &mut buffer).unwrap();
|
columnar_writer.serialize(2, &mut buffer).unwrap();
|
||||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||||
let columns = columnar.list_columns().unwrap();
|
let columns = columnar.list_columns().unwrap();
|
||||||
assert_eq!(columns.len(), 1);
|
assert_eq!(columns.len(), 1);
|
||||||
|
|||||||
@@ -41,31 +41,10 @@ impl ColumnWriter {
|
|||||||
pub(super) fn operation_iterator<'a, V: SymbolValue>(
|
pub(super) fn operation_iterator<'a, V: SymbolValue>(
|
||||||
&self,
|
&self,
|
||||||
arena: &MemoryArena,
|
arena: &MemoryArena,
|
||||||
old_to_new_ids_opt: Option<&[RowId]>,
|
|
||||||
buffer: &'a mut Vec<u8>,
|
buffer: &'a mut Vec<u8>,
|
||||||
) -> impl Iterator<Item = ColumnOperation<V>> + 'a {
|
) -> impl Iterator<Item = ColumnOperation<V>> + 'a {
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
self.values.read_to_end(arena, buffer);
|
self.values.read_to_end(arena, buffer);
|
||||||
if let Some(old_to_new_ids) = old_to_new_ids_opt {
|
|
||||||
// TODO avoid the extra deserialization / serialization.
|
|
||||||
let mut sorted_ops: Vec<(RowId, ColumnOperation<V>)> = Vec::new();
|
|
||||||
let mut new_doc = 0u32;
|
|
||||||
let mut cursor = &buffer[..];
|
|
||||||
for op in std::iter::from_fn(|| ColumnOperation::<V>::deserialize(&mut cursor)) {
|
|
||||||
if let ColumnOperation::NewDoc(doc) = &op {
|
|
||||||
new_doc = old_to_new_ids[*doc as usize];
|
|
||||||
sorted_ops.push((new_doc, ColumnOperation::NewDoc(new_doc)));
|
|
||||||
} else {
|
|
||||||
sorted_ops.push((new_doc, op));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// stable sort is crucial here.
|
|
||||||
sorted_ops.sort_by_key(|(new_doc_id, _)| *new_doc_id);
|
|
||||||
buffer.clear();
|
|
||||||
for (_, op) in sorted_ops {
|
|
||||||
buffer.extend_from_slice(op.serialize().as_ref());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
let mut cursor: &[u8] = &buffer[..];
|
let mut cursor: &[u8] = &buffer[..];
|
||||||
std::iter::from_fn(move || ColumnOperation::deserialize(&mut cursor))
|
std::iter::from_fn(move || ColumnOperation::deserialize(&mut cursor))
|
||||||
}
|
}
|
||||||
@@ -231,11 +210,9 @@ impl NumericalColumnWriter {
|
|||||||
pub(super) fn operation_iterator<'a>(
|
pub(super) fn operation_iterator<'a>(
|
||||||
self,
|
self,
|
||||||
arena: &MemoryArena,
|
arena: &MemoryArena,
|
||||||
old_to_new_ids: Option<&[RowId]>,
|
|
||||||
buffer: &'a mut Vec<u8>,
|
buffer: &'a mut Vec<u8>,
|
||||||
) -> impl Iterator<Item = ColumnOperation<NumericalValue>> + 'a {
|
) -> impl Iterator<Item = ColumnOperation<NumericalValue>> + 'a {
|
||||||
self.column_writer
|
self.column_writer.operation_iterator(arena, buffer)
|
||||||
.operation_iterator(arena, old_to_new_ids, buffer)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -277,11 +254,9 @@ impl StrOrBytesColumnWriter {
|
|||||||
pub(super) fn operation_iterator<'a>(
|
pub(super) fn operation_iterator<'a>(
|
||||||
&self,
|
&self,
|
||||||
arena: &MemoryArena,
|
arena: &MemoryArena,
|
||||||
old_to_new_ids: Option<&[RowId]>,
|
|
||||||
byte_buffer: &'a mut Vec<u8>,
|
byte_buffer: &'a mut Vec<u8>,
|
||||||
) -> impl Iterator<Item = ColumnOperation<UnorderedId>> + 'a {
|
) -> impl Iterator<Item = ColumnOperation<UnorderedId>> + 'a {
|
||||||
self.column_writer
|
self.column_writer.operation_iterator(arena, byte_buffer)
|
||||||
.operation_iterator(arena, old_to_new_ids, byte_buffer)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -8,11 +8,12 @@ use std::net::Ipv6Addr;
|
|||||||
|
|
||||||
use column_operation::ColumnOperation;
|
use column_operation::ColumnOperation;
|
||||||
pub(crate) use column_writers::CompatibleNumericalTypes;
|
pub(crate) use column_writers::CompatibleNumericalTypes;
|
||||||
|
use common::json_path_writer::JSON_END_OF_PATH;
|
||||||
use common::CountingWriter;
|
use common::CountingWriter;
|
||||||
pub(crate) use serializer::ColumnarSerializer;
|
pub(crate) use serializer::ColumnarSerializer;
|
||||||
use stacker::{Addr, ArenaHashMap, MemoryArena};
|
use stacker::{Addr, ArenaHashMap, MemoryArena};
|
||||||
|
|
||||||
use crate::column_index::SerializableColumnIndex;
|
use crate::column_index::{SerializableColumnIndex, SerializableOptionalIndex};
|
||||||
use crate::column_values::{MonotonicallyMappableToU128, MonotonicallyMappableToU64};
|
use crate::column_values::{MonotonicallyMappableToU128, MonotonicallyMappableToU64};
|
||||||
use crate::columnar::column_type::ColumnType;
|
use crate::columnar::column_type::ColumnType;
|
||||||
use crate::columnar::writer::column_writers::{
|
use crate::columnar::writer::column_writers::{
|
||||||
@@ -43,7 +44,7 @@ struct SpareBuffers {
|
|||||||
/// columnar_writer.record_str(1u32 /* doc id */, "product_name", "Apple");
|
/// columnar_writer.record_str(1u32 /* doc id */, "product_name", "Apple");
|
||||||
/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10.5f64); //< uh oh we ended up mixing integer and floats.
|
/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10.5f64); //< uh oh we ended up mixing integer and floats.
|
||||||
/// let mut wrt: Vec<u8> = Vec::new();
|
/// let mut wrt: Vec<u8> = Vec::new();
|
||||||
/// columnar_writer.serialize(2u32, None, &mut wrt).unwrap();
|
/// columnar_writer.serialize(2u32, &mut wrt).unwrap();
|
||||||
/// ```
|
/// ```
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
pub struct ColumnarWriter {
|
pub struct ColumnarWriter {
|
||||||
@@ -75,63 +76,6 @@ impl ColumnarWriter {
|
|||||||
.sum::<usize>()
|
.sum::<usize>()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the list of doc ids from 0..num_docs sorted by the `sort_field`
|
|
||||||
/// column.
|
|
||||||
///
|
|
||||||
/// If the column is multivalued, use the first value for scoring.
|
|
||||||
/// If no value is associated to a specific row, the document is assigned
|
|
||||||
/// the lowest possible score.
|
|
||||||
///
|
|
||||||
/// The sort applied is stable.
|
|
||||||
pub fn sort_order(&self, sort_field: &str, num_docs: RowId, reversed: bool) -> Vec<u32> {
|
|
||||||
let Some(numerical_col_writer) = self
|
|
||||||
.numerical_field_hash_map
|
|
||||||
.get::<NumericalColumnWriter>(sort_field.as_bytes())
|
|
||||||
.or_else(|| {
|
|
||||||
self.datetime_field_hash_map
|
|
||||||
.get::<NumericalColumnWriter>(sort_field.as_bytes())
|
|
||||||
})
|
|
||||||
else {
|
|
||||||
return Vec::new();
|
|
||||||
};
|
|
||||||
let mut symbols_buffer = Vec::new();
|
|
||||||
let mut values = Vec::new();
|
|
||||||
let mut start_doc_check_fill = 0;
|
|
||||||
let mut current_doc_opt: Option<RowId> = None;
|
|
||||||
// Assumption: NewDoc will never call the same doc twice and is strictly increasing between
|
|
||||||
// calls
|
|
||||||
for op in numerical_col_writer.operation_iterator(&self.arena, None, &mut symbols_buffer) {
|
|
||||||
match op {
|
|
||||||
ColumnOperation::NewDoc(doc) => {
|
|
||||||
current_doc_opt = Some(doc);
|
|
||||||
}
|
|
||||||
ColumnOperation::Value(numerical_value) => {
|
|
||||||
if let Some(current_doc) = current_doc_opt {
|
|
||||||
// Fill up with 0.0 since last doc
|
|
||||||
values.extend((start_doc_check_fill..current_doc).map(|doc| (0.0, doc)));
|
|
||||||
start_doc_check_fill = current_doc + 1;
|
|
||||||
// handle multi values
|
|
||||||
current_doc_opt = None;
|
|
||||||
|
|
||||||
let score: f32 = f64::coerce(numerical_value) as f32;
|
|
||||||
values.push((score, current_doc));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for doc in values.len() as u32..num_docs {
|
|
||||||
values.push((0.0f32, doc));
|
|
||||||
}
|
|
||||||
values.sort_by(|(left_score, _), (right_score, _)| {
|
|
||||||
if reversed {
|
|
||||||
right_score.total_cmp(left_score)
|
|
||||||
} else {
|
|
||||||
left_score.total_cmp(right_score)
|
|
||||||
}
|
|
||||||
});
|
|
||||||
values.into_iter().map(|(_score, doc)| doc).collect()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Records a column type. This is useful to bypass the coercion process,
|
/// Records a column type. This is useful to bypass the coercion process,
|
||||||
/// makes sure the empty is present in the resulting columnar, or set
|
/// makes sure the empty is present in the resulting columnar, or set
|
||||||
/// the `sort_values_within_row`.
|
/// the `sort_values_within_row`.
|
||||||
@@ -302,13 +246,9 @@ impl ColumnarWriter {
|
|||||||
},
|
},
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
pub fn serialize(
|
pub fn serialize(&mut self, num_docs: RowId, wrt: &mut dyn io::Write) -> io::Result<()> {
|
||||||
&mut self,
|
|
||||||
num_docs: RowId,
|
|
||||||
old_to_new_row_ids: Option<&[RowId]>,
|
|
||||||
wrt: &mut dyn io::Write,
|
|
||||||
) -> io::Result<()> {
|
|
||||||
let mut serializer = ColumnarSerializer::new(wrt);
|
let mut serializer = ColumnarSerializer::new(wrt);
|
||||||
|
|
||||||
let mut columns: Vec<(&[u8], ColumnType, Addr)> = self
|
let mut columns: Vec<(&[u8], ColumnType, Addr)> = self
|
||||||
.numerical_field_hash_map
|
.numerical_field_hash_map
|
||||||
.iter()
|
.iter()
|
||||||
@@ -322,7 +262,7 @@ impl ColumnarWriter {
|
|||||||
columns.extend(
|
columns.extend(
|
||||||
self.bytes_field_hash_map
|
self.bytes_field_hash_map
|
||||||
.iter()
|
.iter()
|
||||||
.map(|(term, addr)| (term, ColumnType::Bytes, addr)),
|
.map(|(column_name, addr)| (column_name, ColumnType::Bytes, addr)),
|
||||||
);
|
);
|
||||||
columns.extend(
|
columns.extend(
|
||||||
self.str_field_hash_map
|
self.str_field_hash_map
|
||||||
@@ -349,6 +289,12 @@ impl ColumnarWriter {
|
|||||||
let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries);
|
let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries);
|
||||||
let mut symbol_byte_buffer: Vec<u8> = Vec::new();
|
let mut symbol_byte_buffer: Vec<u8> = Vec::new();
|
||||||
for (column_name, column_type, addr) in columns {
|
for (column_name, column_type, addr) in columns {
|
||||||
|
if column_name.contains(&JSON_END_OF_PATH) {
|
||||||
|
// Tantivy uses b'0' as a separator for nested fields in JSON.
|
||||||
|
// Column names with a b'0' are not simply ignored by the columnar (and the inverted
|
||||||
|
// index).
|
||||||
|
continue;
|
||||||
|
}
|
||||||
match column_type {
|
match column_type {
|
||||||
ColumnType::Bool => {
|
ColumnType::Bool => {
|
||||||
let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr);
|
let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr);
|
||||||
@@ -358,11 +304,7 @@ impl ColumnarWriter {
|
|||||||
serialize_bool_column(
|
serialize_bool_column(
|
||||||
cardinality,
|
cardinality,
|
||||||
num_docs,
|
num_docs,
|
||||||
column_writer.operation_iterator(
|
column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
|
||||||
arena,
|
|
||||||
old_to_new_row_ids,
|
|
||||||
&mut symbol_byte_buffer,
|
|
||||||
),
|
|
||||||
buffers,
|
buffers,
|
||||||
&mut column_serializer,
|
&mut column_serializer,
|
||||||
)?;
|
)?;
|
||||||
@@ -376,11 +318,7 @@ impl ColumnarWriter {
|
|||||||
serialize_ip_addr_column(
|
serialize_ip_addr_column(
|
||||||
cardinality,
|
cardinality,
|
||||||
num_docs,
|
num_docs,
|
||||||
column_writer.operation_iterator(
|
column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
|
||||||
arena,
|
|
||||||
old_to_new_row_ids,
|
|
||||||
&mut symbol_byte_buffer,
|
|
||||||
),
|
|
||||||
buffers,
|
buffers,
|
||||||
&mut column_serializer,
|
&mut column_serializer,
|
||||||
)?;
|
)?;
|
||||||
@@ -405,11 +343,8 @@ impl ColumnarWriter {
|
|||||||
num_docs,
|
num_docs,
|
||||||
str_or_bytes_column_writer.sort_values_within_row,
|
str_or_bytes_column_writer.sort_values_within_row,
|
||||||
dictionary_builder,
|
dictionary_builder,
|
||||||
str_or_bytes_column_writer.operation_iterator(
|
str_or_bytes_column_writer
|
||||||
arena,
|
.operation_iterator(arena, &mut symbol_byte_buffer),
|
||||||
old_to_new_row_ids,
|
|
||||||
&mut symbol_byte_buffer,
|
|
||||||
),
|
|
||||||
buffers,
|
buffers,
|
||||||
&self.arena,
|
&self.arena,
|
||||||
&mut column_serializer,
|
&mut column_serializer,
|
||||||
@@ -427,11 +362,7 @@ impl ColumnarWriter {
|
|||||||
cardinality,
|
cardinality,
|
||||||
num_docs,
|
num_docs,
|
||||||
numerical_type,
|
numerical_type,
|
||||||
numerical_column_writer.operation_iterator(
|
numerical_column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
|
||||||
arena,
|
|
||||||
old_to_new_row_ids,
|
|
||||||
&mut symbol_byte_buffer,
|
|
||||||
),
|
|
||||||
buffers,
|
buffers,
|
||||||
&mut column_serializer,
|
&mut column_serializer,
|
||||||
)?;
|
)?;
|
||||||
@@ -446,11 +377,7 @@ impl ColumnarWriter {
|
|||||||
cardinality,
|
cardinality,
|
||||||
num_docs,
|
num_docs,
|
||||||
NumericalType::I64,
|
NumericalType::I64,
|
||||||
column_writer.operation_iterator(
|
column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
|
||||||
arena,
|
|
||||||
old_to_new_row_ids,
|
|
||||||
&mut symbol_byte_buffer,
|
|
||||||
),
|
|
||||||
buffers,
|
buffers,
|
||||||
&mut column_serializer,
|
&mut column_serializer,
|
||||||
)?;
|
)?;
|
||||||
@@ -635,16 +562,16 @@ fn send_to_serialize_column_mappable_to_u128<
|
|||||||
let optional_index_builder = value_index_builders.borrow_optional_index_builder();
|
let optional_index_builder = value_index_builders.borrow_optional_index_builder();
|
||||||
consume_operation_iterator(op_iterator, optional_index_builder, values);
|
consume_operation_iterator(op_iterator, optional_index_builder, values);
|
||||||
let optional_index = optional_index_builder.finish(num_rows);
|
let optional_index = optional_index_builder.finish(num_rows);
|
||||||
SerializableColumnIndex::Optional {
|
SerializableColumnIndex::Optional(SerializableOptionalIndex {
|
||||||
num_rows,
|
num_rows,
|
||||||
non_null_row_ids: Box::new(optional_index),
|
non_null_row_ids: Box::new(optional_index),
|
||||||
}
|
})
|
||||||
}
|
}
|
||||||
Cardinality::Multivalued => {
|
Cardinality::Multivalued => {
|
||||||
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
|
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
|
||||||
consume_operation_iterator(op_iterator, multivalued_index_builder, values);
|
consume_operation_iterator(op_iterator, multivalued_index_builder, values);
|
||||||
let multivalued_index = multivalued_index_builder.finish(num_rows);
|
let serializable_multivalued_index = multivalued_index_builder.finish(num_rows);
|
||||||
SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
|
SerializableColumnIndex::Multivalued(serializable_multivalued_index)
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
crate::column::serialize_column_mappable_to_u128(
|
crate::column::serialize_column_mappable_to_u128(
|
||||||
@@ -655,15 +582,6 @@ fn send_to_serialize_column_mappable_to_u128<
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn sort_values_within_row_in_place(multivalued_index: &[RowId], values: &mut [u64]) {
|
|
||||||
let mut start_index: usize = 0;
|
|
||||||
for end_index in multivalued_index.iter().copied() {
|
|
||||||
let end_index = end_index as usize;
|
|
||||||
values[start_index..end_index].sort_unstable();
|
|
||||||
start_index = end_index;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn send_to_serialize_column_mappable_to_u64(
|
fn send_to_serialize_column_mappable_to_u64(
|
||||||
op_iterator: impl Iterator<Item = ColumnOperation<u64>>,
|
op_iterator: impl Iterator<Item = ColumnOperation<u64>>,
|
||||||
cardinality: Cardinality,
|
cardinality: Cardinality,
|
||||||
@@ -687,19 +605,22 @@ fn send_to_serialize_column_mappable_to_u64(
|
|||||||
let optional_index_builder = value_index_builders.borrow_optional_index_builder();
|
let optional_index_builder = value_index_builders.borrow_optional_index_builder();
|
||||||
consume_operation_iterator(op_iterator, optional_index_builder, values);
|
consume_operation_iterator(op_iterator, optional_index_builder, values);
|
||||||
let optional_index = optional_index_builder.finish(num_rows);
|
let optional_index = optional_index_builder.finish(num_rows);
|
||||||
SerializableColumnIndex::Optional {
|
SerializableColumnIndex::Optional(SerializableOptionalIndex {
|
||||||
non_null_row_ids: Box::new(optional_index),
|
non_null_row_ids: Box::new(optional_index),
|
||||||
num_rows,
|
num_rows,
|
||||||
}
|
})
|
||||||
}
|
}
|
||||||
Cardinality::Multivalued => {
|
Cardinality::Multivalued => {
|
||||||
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
|
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
|
||||||
consume_operation_iterator(op_iterator, multivalued_index_builder, values);
|
consume_operation_iterator(op_iterator, multivalued_index_builder, values);
|
||||||
let multivalued_index = multivalued_index_builder.finish(num_rows);
|
let serializable_multivalued_index = multivalued_index_builder.finish(num_rows);
|
||||||
if sort_values_within_row {
|
if sort_values_within_row {
|
||||||
sort_values_within_row_in_place(multivalued_index, values);
|
sort_values_within_row_in_place(
|
||||||
|
serializable_multivalued_index.start_offsets.boxed_iter(),
|
||||||
|
values,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
|
SerializableColumnIndex::Multivalued(serializable_multivalued_index)
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
crate::column::serialize_column_mappable_to_u64(
|
crate::column::serialize_column_mappable_to_u64(
|
||||||
@@ -710,6 +631,18 @@ fn send_to_serialize_column_mappable_to_u64(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn sort_values_within_row_in_place(
|
||||||
|
multivalued_index: impl Iterator<Item = RowId>,
|
||||||
|
values: &mut [u64],
|
||||||
|
) {
|
||||||
|
let mut start_index: usize = 0;
|
||||||
|
for end_index in multivalued_index {
|
||||||
|
let end_index = end_index as usize;
|
||||||
|
values[start_index..end_index].sort_unstable();
|
||||||
|
start_index = end_index;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn coerce_numerical_symbol<T>(
|
fn coerce_numerical_symbol<T>(
|
||||||
operation_iterator: impl Iterator<Item = ColumnOperation<NumericalValue>>,
|
operation_iterator: impl Iterator<Item = ColumnOperation<NumericalValue>>,
|
||||||
) -> impl Iterator<Item = ColumnOperation<u64>>
|
) -> impl Iterator<Item = ColumnOperation<u64>>
|
||||||
@@ -757,7 +690,7 @@ mod tests {
|
|||||||
assert_eq!(column_writer.get_cardinality(3), Cardinality::Full);
|
assert_eq!(column_writer.get_cardinality(3), Cardinality::Full);
|
||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
|
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
|
||||||
.operation_iterator(&arena, None, &mut buffer)
|
.operation_iterator(&arena, &mut buffer)
|
||||||
.collect();
|
.collect();
|
||||||
assert_eq!(symbols.len(), 6);
|
assert_eq!(symbols.len(), 6);
|
||||||
assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));
|
assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));
|
||||||
@@ -786,7 +719,7 @@ mod tests {
|
|||||||
assert_eq!(column_writer.get_cardinality(3), Cardinality::Optional);
|
assert_eq!(column_writer.get_cardinality(3), Cardinality::Optional);
|
||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
|
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
|
||||||
.operation_iterator(&arena, None, &mut buffer)
|
.operation_iterator(&arena, &mut buffer)
|
||||||
.collect();
|
.collect();
|
||||||
assert_eq!(symbols.len(), 4);
|
assert_eq!(symbols.len(), 4);
|
||||||
assert!(matches!(symbols[0], ColumnOperation::NewDoc(1u32)));
|
assert!(matches!(symbols[0], ColumnOperation::NewDoc(1u32)));
|
||||||
@@ -809,7 +742,7 @@ mod tests {
|
|||||||
assert_eq!(column_writer.get_cardinality(2), Cardinality::Optional);
|
assert_eq!(column_writer.get_cardinality(2), Cardinality::Optional);
|
||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
|
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
|
||||||
.operation_iterator(&arena, None, &mut buffer)
|
.operation_iterator(&arena, &mut buffer)
|
||||||
.collect();
|
.collect();
|
||||||
assert_eq!(symbols.len(), 2);
|
assert_eq!(symbols.len(), 2);
|
||||||
assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));
|
assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));
|
||||||
@@ -828,7 +761,7 @@ mod tests {
|
|||||||
assert_eq!(column_writer.get_cardinality(1), Cardinality::Multivalued);
|
assert_eq!(column_writer.get_cardinality(1), Cardinality::Multivalued);
|
||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
|
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
|
||||||
.operation_iterator(&arena, None, &mut buffer)
|
.operation_iterator(&arena, &mut buffer)
|
||||||
.collect();
|
.collect();
|
||||||
assert_eq!(symbols.len(), 3);
|
assert_eq!(symbols.len(), 3);
|
||||||
assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));
|
assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
use std::io;
|
use std::io;
|
||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
|
|
||||||
|
use common::json_path_writer::JSON_END_OF_PATH;
|
||||||
use common::{BinarySerializable, CountingWriter};
|
use common::{BinarySerializable, CountingWriter};
|
||||||
use sstable::value::RangeValueWriter;
|
use sstable::value::RangeValueWriter;
|
||||||
use sstable::RangeSSTable;
|
use sstable::RangeSSTable;
|
||||||
@@ -18,13 +19,8 @@ pub struct ColumnarSerializer<W: io::Write> {
|
|||||||
/// code.
|
/// code.
|
||||||
fn prepare_key(key: &[u8], column_type: ColumnType, buffer: &mut Vec<u8>) {
|
fn prepare_key(key: &[u8], column_type: ColumnType, buffer: &mut Vec<u8>) {
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
// Convert 0 bytes to '0' string, as 0 bytes are reserved for the end of the path.
|
buffer.extend_from_slice(key);
|
||||||
if key.contains(&0u8) {
|
buffer.push(JSON_END_OF_PATH);
|
||||||
buffer.extend(key.iter().map(|&b| if b == 0 { b'0' } else { b }));
|
|
||||||
} else {
|
|
||||||
buffer.extend_from_slice(key);
|
|
||||||
}
|
|
||||||
buffer.push(0u8);
|
|
||||||
buffer.push(column_type.to_code());
|
buffer.push(column_type.to_code());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -97,18 +93,3 @@ impl<'a, W: io::Write> io::Write for ColumnSerializer<'a, W> {
|
|||||||
self.columnar_serializer.wrt.write_all(buf)
|
self.columnar_serializer.wrt.write_all(buf)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use super::*;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_prepare_key_bytes() {
|
|
||||||
let mut buffer: Vec<u8> = b"somegarbage".to_vec();
|
|
||||||
prepare_key(b"root\0child", ColumnType::Str, &mut buffer);
|
|
||||||
assert_eq!(buffer.len(), 12);
|
|
||||||
assert_eq!(&buffer[..10], b"root0child");
|
|
||||||
assert_eq!(buffer[10], 0u8);
|
|
||||||
assert_eq!(buffer[11], ColumnType::Str.to_code());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
use crate::column_index::{SerializableMultivalueIndex, SerializableOptionalIndex};
|
||||||
use crate::iterable::Iterable;
|
use crate::iterable::Iterable;
|
||||||
use crate::RowId;
|
use crate::RowId;
|
||||||
|
|
||||||
@@ -59,31 +60,47 @@ impl IndexBuilder for OptionalIndexBuilder {
|
|||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
pub struct MultivaluedIndexBuilder {
|
pub struct MultivaluedIndexBuilder {
|
||||||
start_offsets: Vec<RowId>,
|
doc_with_values: Vec<RowId>,
|
||||||
|
start_offsets: Vec<u32>,
|
||||||
total_num_vals_seen: u32,
|
total_num_vals_seen: u32,
|
||||||
|
current_row: RowId,
|
||||||
|
current_row_has_value: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MultivaluedIndexBuilder {
|
impl MultivaluedIndexBuilder {
|
||||||
pub fn finish(&mut self, num_docs: RowId) -> &[u32] {
|
pub fn finish(&mut self, num_docs: RowId) -> SerializableMultivalueIndex<'_> {
|
||||||
self.start_offsets
|
self.start_offsets.push(self.total_num_vals_seen);
|
||||||
.resize(num_docs as usize + 1, self.total_num_vals_seen);
|
let non_null_row_ids: Box<dyn Iterable<RowId>> = Box::new(&self.doc_with_values[..]);
|
||||||
&self.start_offsets[..]
|
SerializableMultivalueIndex {
|
||||||
|
doc_ids_with_values: SerializableOptionalIndex {
|
||||||
|
non_null_row_ids,
|
||||||
|
num_rows: num_docs,
|
||||||
|
},
|
||||||
|
start_offsets: Box::new(&self.start_offsets[..]),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn reset(&mut self) {
|
fn reset(&mut self) {
|
||||||
|
self.doc_with_values.clear();
|
||||||
self.start_offsets.clear();
|
self.start_offsets.clear();
|
||||||
self.start_offsets.push(0u32);
|
|
||||||
self.total_num_vals_seen = 0;
|
self.total_num_vals_seen = 0;
|
||||||
|
self.current_row = 0;
|
||||||
|
self.current_row_has_value = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl IndexBuilder for MultivaluedIndexBuilder {
|
impl IndexBuilder for MultivaluedIndexBuilder {
|
||||||
fn record_row(&mut self, row_id: RowId) {
|
fn record_row(&mut self, row_id: RowId) {
|
||||||
self.start_offsets
|
self.current_row = row_id;
|
||||||
.resize(row_id as usize + 1, self.total_num_vals_seen);
|
self.current_row_has_value = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
fn record_value(&mut self) {
|
fn record_value(&mut self) {
|
||||||
|
if !self.current_row_has_value {
|
||||||
|
self.current_row_has_value = true;
|
||||||
|
self.doc_with_values.push(self.current_row);
|
||||||
|
self.start_offsets.push(self.total_num_vals_seen);
|
||||||
|
}
|
||||||
self.total_num_vals_seen += 1;
|
self.total_num_vals_seen += 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -141,6 +158,32 @@ mod tests {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_multivalued_value_index_builder_simple() {
|
||||||
|
let mut multivalued_value_index_builder = MultivaluedIndexBuilder::default();
|
||||||
|
{
|
||||||
|
multivalued_value_index_builder.record_row(0u32);
|
||||||
|
multivalued_value_index_builder.record_value();
|
||||||
|
multivalued_value_index_builder.record_value();
|
||||||
|
let serialized_multivalue_index = multivalued_value_index_builder.finish(1u32);
|
||||||
|
let start_offsets: Vec<u32> = serialized_multivalue_index
|
||||||
|
.start_offsets
|
||||||
|
.boxed_iter()
|
||||||
|
.collect();
|
||||||
|
assert_eq!(&start_offsets, &[0, 2]);
|
||||||
|
}
|
||||||
|
multivalued_value_index_builder.reset();
|
||||||
|
multivalued_value_index_builder.record_row(0u32);
|
||||||
|
multivalued_value_index_builder.record_value();
|
||||||
|
multivalued_value_index_builder.record_value();
|
||||||
|
let serialized_multivalue_index = multivalued_value_index_builder.finish(1u32);
|
||||||
|
let start_offsets: Vec<u32> = serialized_multivalue_index
|
||||||
|
.start_offsets
|
||||||
|
.boxed_iter()
|
||||||
|
.collect();
|
||||||
|
assert_eq!(&start_offsets, &[0, 2]);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_multivalued_value_index_builder() {
|
fn test_multivalued_value_index_builder() {
|
||||||
let mut multivalued_value_index_builder = MultivaluedIndexBuilder::default();
|
let mut multivalued_value_index_builder = MultivaluedIndexBuilder::default();
|
||||||
@@ -149,17 +192,15 @@ mod tests {
|
|||||||
multivalued_value_index_builder.record_value();
|
multivalued_value_index_builder.record_value();
|
||||||
multivalued_value_index_builder.record_row(2u32);
|
multivalued_value_index_builder.record_row(2u32);
|
||||||
multivalued_value_index_builder.record_value();
|
multivalued_value_index_builder.record_value();
|
||||||
assert_eq!(
|
let SerializableMultivalueIndex {
|
||||||
multivalued_value_index_builder.finish(4u32).to_vec(),
|
doc_ids_with_values,
|
||||||
vec![0, 0, 2, 3, 3]
|
start_offsets,
|
||||||
);
|
} = multivalued_value_index_builder.finish(4u32);
|
||||||
multivalued_value_index_builder.reset();
|
assert_eq!(doc_ids_with_values.num_rows, 4u32);
|
||||||
multivalued_value_index_builder.record_row(2u32);
|
let doc_ids_with_values: Vec<u32> =
|
||||||
multivalued_value_index_builder.record_value();
|
doc_ids_with_values.non_null_row_ids.boxed_iter().collect();
|
||||||
multivalued_value_index_builder.record_value();
|
assert_eq!(&doc_ids_with_values, &[1u32, 2u32]);
|
||||||
assert_eq!(
|
let start_offsets: Vec<u32> = start_offsets.boxed_iter().collect();
|
||||||
multivalued_value_index_builder.finish(4u32).to_vec(),
|
assert_eq!(&start_offsets[..], &[0, 2, 3]);
|
||||||
vec![0, 0, 0, 2, 2]
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
183
columnar/src/compat_tests.rs
Normal file
183
columnar/src/compat_tests.rs
Normal file
@@ -0,0 +1,183 @@
|
|||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
use itertools::Itertools;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
merge_columnar, Cardinality, Column, ColumnarReader, DynamicColumn, StackMergeOrder,
|
||||||
|
CURRENT_VERSION,
|
||||||
|
};
|
||||||
|
|
||||||
|
const NUM_DOCS: u32 = u16::MAX as u32;
|
||||||
|
|
||||||
|
fn generate_columnar(num_docs: u32, value_offset: u64) -> Vec<u8> {
|
||||||
|
use crate::ColumnarWriter;
|
||||||
|
|
||||||
|
let mut columnar_writer = ColumnarWriter::default();
|
||||||
|
|
||||||
|
for i in 0..num_docs {
|
||||||
|
if i % 100 == 0 {
|
||||||
|
columnar_writer.record_numerical(i, "sparse", value_offset + i as u64);
|
||||||
|
}
|
||||||
|
if i % 5 == 0 {
|
||||||
|
columnar_writer.record_numerical(i, "dense", value_offset + i as u64);
|
||||||
|
}
|
||||||
|
columnar_writer.record_numerical(i, "full", value_offset + i as u64);
|
||||||
|
columnar_writer.record_numerical(i, "multi", value_offset + i as u64);
|
||||||
|
columnar_writer.record_numerical(i, "multi", value_offset + i as u64);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut wrt: Vec<u8> = Vec::new();
|
||||||
|
columnar_writer.serialize(num_docs, &mut wrt).unwrap();
|
||||||
|
|
||||||
|
wrt
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
/// Writes a columnar for the CURRENT_VERSION to disk.
|
||||||
|
fn create_format() {
|
||||||
|
let version = CURRENT_VERSION.to_string();
|
||||||
|
let file_path = path_for_version(&version);
|
||||||
|
if PathBuf::from(file_path.clone()).exists() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
let columnar = generate_columnar(NUM_DOCS, 0);
|
||||||
|
std::fs::write(file_path, columnar).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
fn path_for_version(version: &str) -> String {
|
||||||
|
format!("./compat_tests_data/{}.columnar", version)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_format_v1() {
|
||||||
|
let path = path_for_version("v1");
|
||||||
|
test_format(&path);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_format_v2() {
|
||||||
|
let path = path_for_version("v2");
|
||||||
|
test_format(&path);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_format(path: &str) {
|
||||||
|
let file_content = std::fs::read(path).unwrap();
|
||||||
|
let reader = ColumnarReader::open(file_content).unwrap();
|
||||||
|
|
||||||
|
check_columns(&reader);
|
||||||
|
|
||||||
|
// Test merge
|
||||||
|
let reader2 = ColumnarReader::open(generate_columnar(NUM_DOCS, NUM_DOCS as u64)).unwrap();
|
||||||
|
let columnar_readers = vec![&reader, &reader2];
|
||||||
|
let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
|
||||||
|
let mut out = Vec::new();
|
||||||
|
merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
|
||||||
|
let reader = ColumnarReader::open(out).unwrap();
|
||||||
|
check_columns(&reader);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn check_columns(reader: &ColumnarReader) {
|
||||||
|
let column = open_column(reader, "full");
|
||||||
|
check_column(&column, |doc_id| vec![(doc_id, doc_id as u64).into()]);
|
||||||
|
assert_eq!(column.get_cardinality(), Cardinality::Full);
|
||||||
|
|
||||||
|
let column = open_column(reader, "multi");
|
||||||
|
check_column(&column, |doc_id| {
|
||||||
|
vec![
|
||||||
|
(doc_id * 2, doc_id as u64).into(),
|
||||||
|
(doc_id * 2 + 1, doc_id as u64).into(),
|
||||||
|
]
|
||||||
|
});
|
||||||
|
assert_eq!(column.get_cardinality(), Cardinality::Multivalued);
|
||||||
|
|
||||||
|
let column = open_column(reader, "sparse");
|
||||||
|
check_column(&column, |doc_id| {
|
||||||
|
if doc_id % 100 == 0 {
|
||||||
|
vec![(doc_id / 100, doc_id as u64).into()]
|
||||||
|
} else {
|
||||||
|
vec![]
|
||||||
|
}
|
||||||
|
});
|
||||||
|
assert_eq!(column.get_cardinality(), Cardinality::Optional);
|
||||||
|
|
||||||
|
let column = open_column(reader, "dense");
|
||||||
|
check_column(&column, |doc_id| {
|
||||||
|
if doc_id % 5 == 0 {
|
||||||
|
vec![(doc_id / 5, doc_id as u64).into()]
|
||||||
|
} else {
|
||||||
|
vec![]
|
||||||
|
}
|
||||||
|
});
|
||||||
|
assert_eq!(column.get_cardinality(), Cardinality::Optional);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct RowIdAndValue {
|
||||||
|
row_id: u32,
|
||||||
|
value: u64,
|
||||||
|
}
|
||||||
|
impl From<(u32, u64)> for RowIdAndValue {
|
||||||
|
fn from((row_id, value): (u32, u64)) -> Self {
|
||||||
|
Self { row_id, value }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn check_column<F: Fn(u32) -> Vec<RowIdAndValue>>(column: &Column<u64>, expected: F) {
|
||||||
|
let num_docs = column.num_docs();
|
||||||
|
let test_doc = |doc: u32| {
|
||||||
|
if expected(doc).is_empty() {
|
||||||
|
assert_eq!(column.first(doc), None);
|
||||||
|
} else {
|
||||||
|
assert_eq!(column.first(doc), Some(expected(doc)[0].value));
|
||||||
|
}
|
||||||
|
let values = column.values_for_doc(doc).collect_vec();
|
||||||
|
assert_eq!(values, expected(doc).iter().map(|x| x.value).collect_vec());
|
||||||
|
let mut row_ids = Vec::new();
|
||||||
|
column.row_ids_for_docs(&[doc], &mut vec![], &mut row_ids);
|
||||||
|
assert_eq!(
|
||||||
|
row_ids,
|
||||||
|
expected(doc).iter().map(|x| x.row_id).collect_vec()
|
||||||
|
);
|
||||||
|
let values = column.values_for_doc(doc).collect_vec();
|
||||||
|
assert_eq!(values, expected(doc).iter().map(|x| x.value).collect_vec());
|
||||||
|
|
||||||
|
// Docid rowid conversion
|
||||||
|
let mut row_ids = Vec::new();
|
||||||
|
let safe_next_doc = |doc: u32| (doc + 1).min(num_docs - 1);
|
||||||
|
column
|
||||||
|
.index
|
||||||
|
.docids_to_rowids(&[doc, safe_next_doc(doc)], &mut vec![], &mut row_ids);
|
||||||
|
let expected_rowids = expected(doc)
|
||||||
|
.iter()
|
||||||
|
.map(|x| x.row_id)
|
||||||
|
.chain(expected(safe_next_doc(doc)).iter().map(|x| x.row_id))
|
||||||
|
.collect_vec();
|
||||||
|
assert_eq!(row_ids, expected_rowids);
|
||||||
|
let rowid_range = column
|
||||||
|
.index
|
||||||
|
.docid_range_to_rowids(doc..safe_next_doc(doc) + 1);
|
||||||
|
if expected_rowids.is_empty() {
|
||||||
|
assert!(rowid_range.is_empty());
|
||||||
|
} else {
|
||||||
|
assert_eq!(
|
||||||
|
rowid_range,
|
||||||
|
expected_rowids[0]..expected_rowids.last().unwrap() + 1
|
||||||
|
);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
test_doc(0);
|
||||||
|
test_doc(num_docs - 1);
|
||||||
|
test_doc(num_docs - 2);
|
||||||
|
test_doc(65000);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn open_column(reader: &ColumnarReader, name: &str) -> Column<u64> {
|
||||||
|
let column = reader.read_columns(name).unwrap()[0]
|
||||||
|
.open()
|
||||||
|
.unwrap()
|
||||||
|
.coerce_numerical(crate::NumericalType::U64)
|
||||||
|
.unwrap();
|
||||||
|
let DynamicColumn::U64(column) = column else {
|
||||||
|
panic!();
|
||||||
|
};
|
||||||
|
column
|
||||||
|
}
|
||||||
@@ -8,7 +8,7 @@ use common::{ByteCount, DateTime, HasLen, OwnedBytes};
|
|||||||
use crate::column::{BytesColumn, Column, StrColumn};
|
use crate::column::{BytesColumn, Column, StrColumn};
|
||||||
use crate::column_values::{monotonic_map_column, StrictlyMonotonicFn};
|
use crate::column_values::{monotonic_map_column, StrictlyMonotonicFn};
|
||||||
use crate::columnar::ColumnType;
|
use crate::columnar::ColumnType;
|
||||||
use crate::{Cardinality, ColumnIndex, ColumnValues, NumericalType};
|
use crate::{Cardinality, ColumnIndex, ColumnValues, NumericalType, Version};
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub enum DynamicColumn {
|
pub enum DynamicColumn {
|
||||||
@@ -232,6 +232,7 @@ static_dynamic_conversions!(Column<Ipv6Addr>, IpAddr);
|
|||||||
pub struct DynamicColumnHandle {
|
pub struct DynamicColumnHandle {
|
||||||
pub(crate) file_slice: FileSlice,
|
pub(crate) file_slice: FileSlice,
|
||||||
pub(crate) column_type: ColumnType,
|
pub(crate) column_type: ColumnType,
|
||||||
|
pub(crate) format_version: Version,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DynamicColumnHandle {
|
impl DynamicColumnHandle {
|
||||||
@@ -260,11 +261,15 @@ impl DynamicColumnHandle {
|
|||||||
let column_bytes = self.file_slice.read_bytes()?;
|
let column_bytes = self.file_slice.read_bytes()?;
|
||||||
match self.column_type {
|
match self.column_type {
|
||||||
ColumnType::Str | ColumnType::Bytes => {
|
ColumnType::Str | ColumnType::Bytes => {
|
||||||
let column: BytesColumn = crate::column::open_column_bytes(column_bytes)?;
|
let column: BytesColumn =
|
||||||
|
crate::column::open_column_bytes(column_bytes, self.format_version)?;
|
||||||
Ok(Some(column.term_ord_column))
|
Ok(Some(column.term_ord_column))
|
||||||
}
|
}
|
||||||
ColumnType::IpAddr => {
|
ColumnType::IpAddr => {
|
||||||
let column = crate::column::open_column_u128_as_compact_u64(column_bytes)?;
|
let column = crate::column::open_column_u128_as_compact_u64(
|
||||||
|
column_bytes,
|
||||||
|
self.format_version,
|
||||||
|
)?;
|
||||||
Ok(Some(column))
|
Ok(Some(column))
|
||||||
}
|
}
|
||||||
ColumnType::Bool
|
ColumnType::Bool
|
||||||
@@ -272,7 +277,8 @@ impl DynamicColumnHandle {
|
|||||||
| ColumnType::U64
|
| ColumnType::U64
|
||||||
| ColumnType::F64
|
| ColumnType::F64
|
||||||
| ColumnType::DateTime => {
|
| ColumnType::DateTime => {
|
||||||
let column = crate::column::open_column_u64::<u64>(column_bytes)?;
|
let column =
|
||||||
|
crate::column::open_column_u64::<u64>(column_bytes, self.format_version)?;
|
||||||
Ok(Some(column))
|
Ok(Some(column))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -280,15 +286,31 @@ impl DynamicColumnHandle {
|
|||||||
|
|
||||||
fn open_internal(&self, column_bytes: OwnedBytes) -> io::Result<DynamicColumn> {
|
fn open_internal(&self, column_bytes: OwnedBytes) -> io::Result<DynamicColumn> {
|
||||||
let dynamic_column: DynamicColumn = match self.column_type {
|
let dynamic_column: DynamicColumn = match self.column_type {
|
||||||
ColumnType::Bytes => crate::column::open_column_bytes(column_bytes)?.into(),
|
ColumnType::Bytes => {
|
||||||
ColumnType::Str => crate::column::open_column_str(column_bytes)?.into(),
|
crate::column::open_column_bytes(column_bytes, self.format_version)?.into()
|
||||||
ColumnType::I64 => crate::column::open_column_u64::<i64>(column_bytes)?.into(),
|
}
|
||||||
ColumnType::U64 => crate::column::open_column_u64::<u64>(column_bytes)?.into(),
|
ColumnType::Str => {
|
||||||
ColumnType::F64 => crate::column::open_column_u64::<f64>(column_bytes)?.into(),
|
crate::column::open_column_str(column_bytes, self.format_version)?.into()
|
||||||
ColumnType::Bool => crate::column::open_column_u64::<bool>(column_bytes)?.into(),
|
}
|
||||||
ColumnType::IpAddr => crate::column::open_column_u128::<Ipv6Addr>(column_bytes)?.into(),
|
ColumnType::I64 => {
|
||||||
|
crate::column::open_column_u64::<i64>(column_bytes, self.format_version)?.into()
|
||||||
|
}
|
||||||
|
ColumnType::U64 => {
|
||||||
|
crate::column::open_column_u64::<u64>(column_bytes, self.format_version)?.into()
|
||||||
|
}
|
||||||
|
ColumnType::F64 => {
|
||||||
|
crate::column::open_column_u64::<f64>(column_bytes, self.format_version)?.into()
|
||||||
|
}
|
||||||
|
ColumnType::Bool => {
|
||||||
|
crate::column::open_column_u64::<bool>(column_bytes, self.format_version)?.into()
|
||||||
|
}
|
||||||
|
ColumnType::IpAddr => {
|
||||||
|
crate::column::open_column_u128::<Ipv6Addr>(column_bytes, self.format_version)?
|
||||||
|
.into()
|
||||||
|
}
|
||||||
ColumnType::DateTime => {
|
ColumnType::DateTime => {
|
||||||
crate::column::open_column_u64::<DateTime>(column_bytes)?.into()
|
crate::column::open_column_u64::<DateTime>(column_bytes, self.format_version)?
|
||||||
|
.into()
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
Ok(dynamic_column)
|
Ok(dynamic_column)
|
||||||
|
|||||||
@@ -1,4 +1,7 @@
|
|||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use crate::{ColumnValues, RowId};
|
||||||
|
|
||||||
pub trait Iterable<T = u64> {
|
pub trait Iterable<T = u64> {
|
||||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_>;
|
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_>;
|
||||||
@@ -17,3 +20,9 @@ where Range<T>: Iterator<Item = T>
|
|||||||
Box::new(self.clone())
|
Box::new(self.clone())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Iterable for Arc<dyn crate::ColumnValues<RowId>> {
|
||||||
|
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
|
||||||
|
Box::new(self.iter().map(|row_id| row_id as u64))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ pub use column_values::{
|
|||||||
};
|
};
|
||||||
pub use columnar::{
|
pub use columnar::{
|
||||||
merge_columnar, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType,
|
merge_columnar, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType,
|
||||||
MergeRowOrder, ShuffleMergeOrder, StackMergeOrder,
|
MergeRowOrder, ShuffleMergeOrder, StackMergeOrder, Version, CURRENT_VERSION,
|
||||||
};
|
};
|
||||||
use sstable::VoidSSTable;
|
use sstable::VoidSSTable;
|
||||||
pub use value::{NumericalType, NumericalValue};
|
pub use value::{NumericalType, NumericalValue};
|
||||||
@@ -131,3 +131,6 @@ impl Cardinality {
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests;
|
mod tests;
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod compat_tests;
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ fn test_dataframe_writer_str() {
|
|||||||
dataframe_writer.record_str(1u32, "my_string", "hello");
|
dataframe_writer.record_str(1u32, "my_string", "hello");
|
||||||
dataframe_writer.record_str(3u32, "my_string", "helloeee");
|
dataframe_writer.record_str(3u32, "my_string", "helloeee");
|
||||||
let mut buffer: Vec<u8> = Vec::new();
|
let mut buffer: Vec<u8> = Vec::new();
|
||||||
dataframe_writer.serialize(5, None, &mut buffer).unwrap();
|
dataframe_writer.serialize(5, &mut buffer).unwrap();
|
||||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||||
assert_eq!(columnar.num_columns(), 1);
|
assert_eq!(columnar.num_columns(), 1);
|
||||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
|
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
|
||||||
@@ -35,7 +35,7 @@ fn test_dataframe_writer_bytes() {
|
|||||||
dataframe_writer.record_bytes(1u32, "my_string", b"hello");
|
dataframe_writer.record_bytes(1u32, "my_string", b"hello");
|
||||||
dataframe_writer.record_bytes(3u32, "my_string", b"helloeee");
|
dataframe_writer.record_bytes(3u32, "my_string", b"helloeee");
|
||||||
let mut buffer: Vec<u8> = Vec::new();
|
let mut buffer: Vec<u8> = Vec::new();
|
||||||
dataframe_writer.serialize(5, None, &mut buffer).unwrap();
|
dataframe_writer.serialize(5, &mut buffer).unwrap();
|
||||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||||
assert_eq!(columnar.num_columns(), 1);
|
assert_eq!(columnar.num_columns(), 1);
|
||||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
|
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
|
||||||
@@ -49,7 +49,7 @@ fn test_dataframe_writer_bool() {
|
|||||||
dataframe_writer.record_bool(1u32, "bool.value", false);
|
dataframe_writer.record_bool(1u32, "bool.value", false);
|
||||||
dataframe_writer.record_bool(3u32, "bool.value", true);
|
dataframe_writer.record_bool(3u32, "bool.value", true);
|
||||||
let mut buffer: Vec<u8> = Vec::new();
|
let mut buffer: Vec<u8> = Vec::new();
|
||||||
dataframe_writer.serialize(5, None, &mut buffer).unwrap();
|
dataframe_writer.serialize(5, &mut buffer).unwrap();
|
||||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||||
assert_eq!(columnar.num_columns(), 1);
|
assert_eq!(columnar.num_columns(), 1);
|
||||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("bool.value").unwrap();
|
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("bool.value").unwrap();
|
||||||
@@ -74,12 +74,12 @@ fn test_dataframe_writer_u64_multivalued() {
|
|||||||
dataframe_writer.record_numerical(6u32, "divisor", 2u64);
|
dataframe_writer.record_numerical(6u32, "divisor", 2u64);
|
||||||
dataframe_writer.record_numerical(6u32, "divisor", 3u64);
|
dataframe_writer.record_numerical(6u32, "divisor", 3u64);
|
||||||
let mut buffer: Vec<u8> = Vec::new();
|
let mut buffer: Vec<u8> = Vec::new();
|
||||||
dataframe_writer.serialize(7, None, &mut buffer).unwrap();
|
dataframe_writer.serialize(7, &mut buffer).unwrap();
|
||||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||||
assert_eq!(columnar.num_columns(), 1);
|
assert_eq!(columnar.num_columns(), 1);
|
||||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("divisor").unwrap();
|
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("divisor").unwrap();
|
||||||
assert_eq!(cols.len(), 1);
|
assert_eq!(cols.len(), 1);
|
||||||
assert_eq!(cols[0].num_bytes(), 29);
|
assert_eq!(cols[0].num_bytes(), 50);
|
||||||
let dyn_i64_col = cols[0].open().unwrap();
|
let dyn_i64_col = cols[0].open().unwrap();
|
||||||
let DynamicColumn::I64(divisor_col) = dyn_i64_col else {
|
let DynamicColumn::I64(divisor_col) = dyn_i64_col else {
|
||||||
panic!();
|
panic!();
|
||||||
@@ -97,7 +97,7 @@ fn test_dataframe_writer_ip_addr() {
|
|||||||
dataframe_writer.record_ip_addr(1, "ip_addr", Ipv6Addr::from_u128(1001));
|
dataframe_writer.record_ip_addr(1, "ip_addr", Ipv6Addr::from_u128(1001));
|
||||||
dataframe_writer.record_ip_addr(3, "ip_addr", Ipv6Addr::from_u128(1050));
|
dataframe_writer.record_ip_addr(3, "ip_addr", Ipv6Addr::from_u128(1050));
|
||||||
let mut buffer: Vec<u8> = Vec::new();
|
let mut buffer: Vec<u8> = Vec::new();
|
||||||
dataframe_writer.serialize(5, None, &mut buffer).unwrap();
|
dataframe_writer.serialize(5, &mut buffer).unwrap();
|
||||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||||
assert_eq!(columnar.num_columns(), 1);
|
assert_eq!(columnar.num_columns(), 1);
|
||||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("ip_addr").unwrap();
|
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("ip_addr").unwrap();
|
||||||
@@ -128,7 +128,7 @@ fn test_dataframe_writer_numerical() {
|
|||||||
dataframe_writer.record_numerical(2u32, "srical.value", NumericalValue::U64(13u64));
|
dataframe_writer.record_numerical(2u32, "srical.value", NumericalValue::U64(13u64));
|
||||||
dataframe_writer.record_numerical(4u32, "srical.value", NumericalValue::U64(15u64));
|
dataframe_writer.record_numerical(4u32, "srical.value", NumericalValue::U64(15u64));
|
||||||
let mut buffer: Vec<u8> = Vec::new();
|
let mut buffer: Vec<u8> = Vec::new();
|
||||||
dataframe_writer.serialize(6, None, &mut buffer).unwrap();
|
dataframe_writer.serialize(6, &mut buffer).unwrap();
|
||||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||||
assert_eq!(columnar.num_columns(), 1);
|
assert_eq!(columnar.num_columns(), 1);
|
||||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("srical.value").unwrap();
|
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("srical.value").unwrap();
|
||||||
@@ -153,46 +153,6 @@ fn test_dataframe_writer_numerical() {
|
|||||||
assert_eq!(column_i64.first(6), None); //< we can change the spec for that one.
|
assert_eq!(column_i64.first(6), None); //< we can change the spec for that one.
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_dataframe_sort_by_full() {
|
|
||||||
let mut dataframe_writer = ColumnarWriter::default();
|
|
||||||
dataframe_writer.record_numerical(0u32, "value", NumericalValue::U64(1));
|
|
||||||
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(2));
|
|
||||||
let data = dataframe_writer.sort_order("value", 2, false);
|
|
||||||
assert_eq!(data, vec![0, 1]);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_dataframe_sort_by_opt() {
|
|
||||||
let mut dataframe_writer = ColumnarWriter::default();
|
|
||||||
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(3));
|
|
||||||
dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(2));
|
|
||||||
let data = dataframe_writer.sort_order("value", 5, false);
|
|
||||||
// 0, 2, 4 is 0.0
|
|
||||||
assert_eq!(data, vec![0, 2, 4, 3, 1]);
|
|
||||||
let data = dataframe_writer.sort_order("value", 5, true);
|
|
||||||
assert_eq!(
|
|
||||||
data,
|
|
||||||
vec![4, 2, 0, 3, 1].into_iter().rev().collect::<Vec<_>>()
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_dataframe_sort_by_multi() {
|
|
||||||
let mut dataframe_writer = ColumnarWriter::default();
|
|
||||||
// valid for sort
|
|
||||||
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(2));
|
|
||||||
// those are ignored for sort
|
|
||||||
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(4));
|
|
||||||
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(4));
|
|
||||||
// valid for sort
|
|
||||||
dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(3));
|
|
||||||
// ignored, would change sort order
|
|
||||||
dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(1));
|
|
||||||
let data = dataframe_writer.sort_order("value", 4, false);
|
|
||||||
assert_eq!(data, vec![0, 2, 1, 3]);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_dictionary_encoded_str() {
|
fn test_dictionary_encoded_str() {
|
||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
@@ -201,7 +161,7 @@ fn test_dictionary_encoded_str() {
|
|||||||
columnar_writer.record_str(3, "my.column", "c");
|
columnar_writer.record_str(3, "my.column", "c");
|
||||||
columnar_writer.record_str(3, "my.column2", "different_column!");
|
columnar_writer.record_str(3, "my.column2", "different_column!");
|
||||||
columnar_writer.record_str(4, "my.column", "b");
|
columnar_writer.record_str(4, "my.column", "b");
|
||||||
columnar_writer.serialize(5, None, &mut buffer).unwrap();
|
columnar_writer.serialize(5, &mut buffer).unwrap();
|
||||||
let columnar_reader = ColumnarReader::open(buffer).unwrap();
|
let columnar_reader = ColumnarReader::open(buffer).unwrap();
|
||||||
assert_eq!(columnar_reader.num_columns(), 2);
|
assert_eq!(columnar_reader.num_columns(), 2);
|
||||||
let col_handles = columnar_reader.read_columns("my.column").unwrap();
|
let col_handles = columnar_reader.read_columns("my.column").unwrap();
|
||||||
@@ -235,7 +195,7 @@ fn test_dictionary_encoded_bytes() {
|
|||||||
columnar_writer.record_bytes(3, "my.column", b"c");
|
columnar_writer.record_bytes(3, "my.column", b"c");
|
||||||
columnar_writer.record_bytes(3, "my.column2", b"different_column!");
|
columnar_writer.record_bytes(3, "my.column2", b"different_column!");
|
||||||
columnar_writer.record_bytes(4, "my.column", b"b");
|
columnar_writer.record_bytes(4, "my.column", b"b");
|
||||||
columnar_writer.serialize(5, None, &mut buffer).unwrap();
|
columnar_writer.serialize(5, &mut buffer).unwrap();
|
||||||
let columnar_reader = ColumnarReader::open(buffer).unwrap();
|
let columnar_reader = ColumnarReader::open(buffer).unwrap();
|
||||||
assert_eq!(columnar_reader.num_columns(), 2);
|
assert_eq!(columnar_reader.num_columns(), 2);
|
||||||
let col_handles = columnar_reader.read_columns("my.column").unwrap();
|
let col_handles = columnar_reader.read_columns("my.column").unwrap();
|
||||||
@@ -344,7 +304,7 @@ fn column_value_strategy() -> impl Strategy<Value = ColumnValue> {
|
|||||||
ip_addr_byte
|
ip_addr_byte
|
||||||
))),
|
))),
|
||||||
1 => any::<bool>().prop_map(ColumnValue::Bool),
|
1 => any::<bool>().prop_map(ColumnValue::Bool),
|
||||||
1 => (0_679_723_993i64..1_679_723_995i64)
|
1 => (679_723_993i64..1_679_723_995i64)
|
||||||
.prop_map(|val| { ColumnValue::DateTime(DateTime::from_timestamp_secs(val)) })
|
.prop_map(|val| { ColumnValue::DateTime(DateTime::from_timestamp_secs(val)) })
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@@ -369,26 +329,12 @@ fn columnar_docs_strategy() -> impl Strategy<Value = Vec<Vec<(&'static str, Colu
|
|||||||
.prop_flat_map(|num_docs| proptest::collection::vec(doc_strategy(), num_docs))
|
.prop_flat_map(|num_docs| proptest::collection::vec(doc_strategy(), num_docs))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn columnar_docs_and_mapping_strategy(
|
|
||||||
) -> impl Strategy<Value = (Vec<Vec<(&'static str, ColumnValue)>>, Vec<RowId>)> {
|
|
||||||
columnar_docs_strategy().prop_flat_map(|docs| {
|
|
||||||
permutation_strategy(docs.len()).prop_map(move |permutation| (docs.clone(), permutation))
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn permutation_strategy(n: usize) -> impl Strategy<Value = Vec<RowId>> {
|
|
||||||
Just((0u32..n as RowId).collect()).prop_shuffle()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn permutation_and_subset_strategy(n: usize) -> impl Strategy<Value = Vec<usize>> {
|
fn permutation_and_subset_strategy(n: usize) -> impl Strategy<Value = Vec<usize>> {
|
||||||
let vals: Vec<usize> = (0..n).collect();
|
let vals: Vec<usize> = (0..n).collect();
|
||||||
subsequence(vals, 0..=n).prop_shuffle()
|
subsequence(vals, 0..=n).prop_shuffle()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn build_columnar_with_mapping(
|
fn build_columnar_with_mapping(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader {
|
||||||
docs: &[Vec<(&'static str, ColumnValue)>],
|
|
||||||
old_to_new_row_ids_opt: Option<&[RowId]>,
|
|
||||||
) -> ColumnarReader {
|
|
||||||
let num_docs = docs.len() as u32;
|
let num_docs = docs.len() as u32;
|
||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
let mut columnar_writer = ColumnarWriter::default();
|
let mut columnar_writer = ColumnarWriter::default();
|
||||||
@@ -416,15 +362,13 @@ fn build_columnar_with_mapping(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
columnar_writer
|
columnar_writer.serialize(num_docs, &mut buffer).unwrap();
|
||||||
.serialize(num_docs, old_to_new_row_ids_opt, &mut buffer)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
ColumnarReader::open(buffer).unwrap()
|
ColumnarReader::open(buffer).unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn build_columnar(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader {
|
fn build_columnar(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader {
|
||||||
build_columnar_with_mapping(docs, None)
|
build_columnar_with_mapping(docs)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn assert_columnar_eq_strict(left: &ColumnarReader, right: &ColumnarReader) {
|
fn assert_columnar_eq_strict(left: &ColumnarReader, right: &ColumnarReader) {
|
||||||
@@ -448,6 +392,7 @@ fn assert_columnar_eq(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[track_caller]
|
||||||
fn assert_column_eq<T: Copy + PartialOrd + Debug + Send + Sync + 'static>(
|
fn assert_column_eq<T: Copy + PartialOrd + Debug + Send + Sync + 'static>(
|
||||||
left: &Column<T>,
|
left: &Column<T>,
|
||||||
right: &Column<T>,
|
right: &Column<T>,
|
||||||
@@ -683,54 +628,6 @@ proptest! {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Same as `test_single_columnar_builder_proptest` but with a shuffling mapping.
|
|
||||||
proptest! {
|
|
||||||
#![proptest_config(ProptestConfig::with_cases(500))]
|
|
||||||
#[test]
|
|
||||||
fn test_single_columnar_builder_with_shuffle_proptest((docs, mapping) in columnar_docs_and_mapping_strategy()) {
|
|
||||||
let columnar = build_columnar_with_mapping(&docs[..], Some(&mapping));
|
|
||||||
assert_eq!(columnar.num_rows() as usize, docs.len());
|
|
||||||
let mut expected_columns: HashMap<(&str, ColumnTypeCategory), HashMap<u32, Vec<&ColumnValue>> > = Default::default();
|
|
||||||
for (doc_id, doc_vals) in docs.iter().enumerate() {
|
|
||||||
for (col_name, col_val) in doc_vals {
|
|
||||||
expected_columns
|
|
||||||
.entry((col_name, col_val.column_type_category()))
|
|
||||||
.or_default()
|
|
||||||
.entry(mapping[doc_id])
|
|
||||||
.or_default()
|
|
||||||
.push(col_val);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
let column_list = columnar.list_columns().unwrap();
|
|
||||||
assert_eq!(expected_columns.len(), column_list.len());
|
|
||||||
for (column_name, column) in column_list {
|
|
||||||
let dynamic_column = column.open().unwrap();
|
|
||||||
let col_category: ColumnTypeCategory = dynamic_column.column_type().into();
|
|
||||||
let expected_col_values: &HashMap<u32, Vec<&ColumnValue>> = expected_columns.get(&(column_name.as_str(), col_category)).unwrap();
|
|
||||||
for _doc_id in 0..columnar.num_rows() {
|
|
||||||
match &dynamic_column {
|
|
||||||
DynamicColumn::Bool(col) =>
|
|
||||||
assert_column_values(col, expected_col_values),
|
|
||||||
DynamicColumn::I64(col) =>
|
|
||||||
assert_column_values(col, expected_col_values),
|
|
||||||
DynamicColumn::U64(col) =>
|
|
||||||
assert_column_values(col, expected_col_values),
|
|
||||||
DynamicColumn::F64(col) =>
|
|
||||||
assert_column_values(col, expected_col_values),
|
|
||||||
DynamicColumn::IpAddr(col) =>
|
|
||||||
assert_column_values(col, expected_col_values),
|
|
||||||
DynamicColumn::DateTime(col) =>
|
|
||||||
assert_column_values(col, expected_col_values),
|
|
||||||
DynamicColumn::Bytes(col) =>
|
|
||||||
assert_bytes_column_values(col, expected_col_values, false),
|
|
||||||
DynamicColumn::Str(col) =>
|
|
||||||
assert_bytes_column_values(col, expected_col_values, true),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// This tests create 2 or 3 random small columnar and attempts to merge them.
|
// This tests create 2 or 3 random small columnar and attempts to merge them.
|
||||||
// It compares the resulting merged dataframe with what would have been obtained by building the
|
// It compares the resulting merged dataframe with what would have been obtained by building the
|
||||||
// dataframe from the concatenated rows to begin with.
|
// dataframe from the concatenated rows to begin with.
|
||||||
@@ -844,24 +741,68 @@ fn columnar_docs_and_remap(
|
|||||||
proptest! {
|
proptest! {
|
||||||
#![proptest_config(ProptestConfig::with_cases(1000))]
|
#![proptest_config(ProptestConfig::with_cases(1000))]
|
||||||
#[test]
|
#[test]
|
||||||
fn test_columnar_merge_and_remap_proptest((columnar_docs, shuffle_merge_order) in columnar_docs_and_remap()) {
|
fn test_columnar_merge_and_remap_proptest((columnar_docs, shuffle_merge_order) in
|
||||||
let shuffled_rows: Vec<Vec<(&'static str, ColumnValue)>> = shuffle_merge_order.iter()
|
columnar_docs_and_remap()) {
|
||||||
.map(|row_addr| columnar_docs[row_addr.segment_ord as usize][row_addr.row_id as usize].clone())
|
test_columnar_merge_and_remap(columnar_docs, shuffle_merge_order);
|
||||||
.collect();
|
|
||||||
let expected_merged_columnar = build_columnar(&shuffled_rows[..]);
|
|
||||||
let columnar_readers: Vec<ColumnarReader> = columnar_docs.iter()
|
|
||||||
.map(|docs| build_columnar(&docs[..]))
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
|
|
||||||
let mut output: Vec<u8> = Vec::new();
|
|
||||||
let segment_num_rows: Vec<RowId> = columnar_docs.iter().map(|docs| docs.len() as RowId).collect();
|
|
||||||
let shuffle_merge_order = ShuffleMergeOrder::for_test(&segment_num_rows, shuffle_merge_order);
|
|
||||||
crate::merge_columnar(&columnar_readers_arr[..], &[], shuffle_merge_order.into(), &mut output).unwrap();
|
|
||||||
let merged_columnar = ColumnarReader::open(output).unwrap();
|
|
||||||
assert_columnar_eq(&merged_columnar, &expected_merged_columnar, true);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn test_columnar_merge_and_remap(
|
||||||
|
columnar_docs: Vec<Vec<Vec<(&'static str, ColumnValue)>>>,
|
||||||
|
shuffle_merge_order: Vec<RowAddr>,
|
||||||
|
) {
|
||||||
|
let shuffled_rows: Vec<Vec<(&'static str, ColumnValue)>> = shuffle_merge_order
|
||||||
|
.iter()
|
||||||
|
.map(|row_addr| {
|
||||||
|
columnar_docs[row_addr.segment_ord as usize][row_addr.row_id as usize].clone()
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
let expected_merged_columnar = build_columnar(&shuffled_rows[..]);
|
||||||
|
let columnar_readers: Vec<ColumnarReader> = columnar_docs
|
||||||
|
.iter()
|
||||||
|
.map(|docs| build_columnar(&docs[..]))
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
let columnar_readers_ref: Vec<&ColumnarReader> = columnar_readers.iter().collect();
|
||||||
|
let mut output: Vec<u8> = Vec::new();
|
||||||
|
let segment_num_rows: Vec<RowId> = columnar_docs
|
||||||
|
.iter()
|
||||||
|
.map(|docs| docs.len() as RowId)
|
||||||
|
.collect();
|
||||||
|
let shuffle_merge_order = ShuffleMergeOrder::for_test(&segment_num_rows, shuffle_merge_order);
|
||||||
|
crate::merge_columnar(
|
||||||
|
&columnar_readers_ref[..],
|
||||||
|
&[],
|
||||||
|
shuffle_merge_order.into(),
|
||||||
|
&mut output,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
let merged_columnar = ColumnarReader::open(output).unwrap();
|
||||||
|
assert_columnar_eq(&merged_columnar, &expected_merged_columnar, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_columnar_merge_and_remap_bug_1() {
|
||||||
|
let columnar_docs = vec![vec![
|
||||||
|
vec![
|
||||||
|
("c1", ColumnValue::Numerical(NumericalValue::U64(0))),
|
||||||
|
("c1", ColumnValue::Numerical(NumericalValue::U64(0))),
|
||||||
|
],
|
||||||
|
vec![],
|
||||||
|
]];
|
||||||
|
let shuffle_merge_order: Vec<RowAddr> = vec![
|
||||||
|
RowAddr {
|
||||||
|
segment_ord: 0,
|
||||||
|
row_id: 1,
|
||||||
|
},
|
||||||
|
RowAddr {
|
||||||
|
segment_ord: 0,
|
||||||
|
row_id: 0,
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
test_columnar_merge_and_remap(columnar_docs, shuffle_merge_order);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_columnar_merge_empty() {
|
fn test_columnar_merge_empty() {
|
||||||
let columnar_reader_1 = build_columnar(&[]);
|
let columnar_reader_1 = build_columnar(&[]);
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ documentation = "https://docs.rs/tantivy_common/"
|
|||||||
homepage = "https://github.com/quickwit-oss/tantivy"
|
homepage = "https://github.com/quickwit-oss/tantivy"
|
||||||
repository = "https://github.com/quickwit-oss/tantivy"
|
repository = "https://github.com/quickwit-oss/tantivy"
|
||||||
|
|
||||||
|
|
||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
@@ -20,8 +19,7 @@ time = { version = "0.3.10", features = ["serde-well-known"] }
|
|||||||
serde = { version = "1.0.136", features = ["derive"] }
|
serde = { version = "1.0.136", features = ["derive"] }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
|
binggan = "0.8.1"
|
||||||
proptest = "1.0.0"
|
proptest = "1.0.0"
|
||||||
rand = "0.8.4"
|
rand = "0.8.4"
|
||||||
|
|
||||||
[features]
|
|
||||||
unstable = [] # useful for benches.
|
|
||||||
|
|||||||
@@ -1,39 +1,64 @@
|
|||||||
#![feature(test)]
|
use binggan::{black_box, BenchRunner};
|
||||||
|
use rand::seq::IteratorRandom;
|
||||||
|
use rand::thread_rng;
|
||||||
|
use tantivy_common::{serialize_vint_u32, BitSet, TinySet};
|
||||||
|
|
||||||
extern crate test;
|
fn bench_vint() {
|
||||||
|
let mut runner = BenchRunner::new();
|
||||||
|
|
||||||
#[cfg(test)]
|
let vals: Vec<u32> = (0..20_000).collect();
|
||||||
mod tests {
|
runner.bench_function("bench_vint", move |_| {
|
||||||
use rand::seq::IteratorRandom;
|
let mut out = 0u64;
|
||||||
use rand::thread_rng;
|
for val in vals.iter().cloned() {
|
||||||
use tantivy_common::serialize_vint_u32;
|
let mut buf = [0u8; 8];
|
||||||
use test::Bencher;
|
serialize_vint_u32(val, &mut buf);
|
||||||
|
out += u64::from(buf[0]);
|
||||||
|
}
|
||||||
|
black_box(out);
|
||||||
|
});
|
||||||
|
|
||||||
#[bench]
|
let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);
|
||||||
fn bench_vint(b: &mut Bencher) {
|
runner.bench_function("bench_vint_rand", move |_| {
|
||||||
let vals: Vec<u32> = (0..20_000).collect();
|
let mut out = 0u64;
|
||||||
b.iter(|| {
|
for val in vals.iter().cloned() {
|
||||||
let mut out = 0u64;
|
let mut buf = [0u8; 8];
|
||||||
for val in vals.iter().cloned() {
|
serialize_vint_u32(val, &mut buf);
|
||||||
let mut buf = [0u8; 8];
|
out += u64::from(buf[0]);
|
||||||
serialize_vint_u32(val, &mut buf);
|
}
|
||||||
out += u64::from(buf[0]);
|
black_box(out);
|
||||||
}
|
});
|
||||||
out
|
}
|
||||||
});
|
|
||||||
}
|
fn bench_bitset() {
|
||||||
|
let mut runner = BenchRunner::new();
|
||||||
#[bench]
|
|
||||||
fn bench_vint_rand(b: &mut Bencher) {
|
runner.bench_function("bench_tinyset_pop", move |_| {
|
||||||
let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);
|
let mut tinyset = TinySet::singleton(black_box(31u32));
|
||||||
b.iter(|| {
|
tinyset.pop_lowest();
|
||||||
let mut out = 0u64;
|
tinyset.pop_lowest();
|
||||||
for val in vals.iter().cloned() {
|
tinyset.pop_lowest();
|
||||||
let mut buf = [0u8; 8];
|
tinyset.pop_lowest();
|
||||||
serialize_vint_u32(val, &mut buf);
|
tinyset.pop_lowest();
|
||||||
out += u64::from(buf[0]);
|
tinyset.pop_lowest();
|
||||||
}
|
black_box(tinyset);
|
||||||
out
|
});
|
||||||
});
|
|
||||||
}
|
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
|
||||||
|
runner.bench_function("bench_tinyset_sum", move |_| {
|
||||||
|
assert_eq!(black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
|
||||||
|
});
|
||||||
|
|
||||||
|
let v = [10u32, 14u32, 21u32];
|
||||||
|
runner.bench_function("bench_tinyarr_sum", move |_| {
|
||||||
|
black_box(v.iter().cloned().sum::<u32>());
|
||||||
|
});
|
||||||
|
|
||||||
|
runner.bench_function("bench_bitset_initialize", move |_| {
|
||||||
|
black_box(BitSet::with_max_value(1_000_000));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
bench_vint();
|
||||||
|
bench_bitset();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -696,43 +696,3 @@ mod tests {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(all(test, feature = "unstable"))]
|
|
||||||
mod bench {
|
|
||||||
|
|
||||||
use test;
|
|
||||||
|
|
||||||
use super::{BitSet, TinySet};
|
|
||||||
|
|
||||||
#[bench]
|
|
||||||
fn bench_tinyset_pop(b: &mut test::Bencher) {
|
|
||||||
b.iter(|| {
|
|
||||||
let mut tinyset = TinySet::singleton(test::black_box(31u32));
|
|
||||||
tinyset.pop_lowest();
|
|
||||||
tinyset.pop_lowest();
|
|
||||||
tinyset.pop_lowest();
|
|
||||||
tinyset.pop_lowest();
|
|
||||||
tinyset.pop_lowest();
|
|
||||||
tinyset.pop_lowest();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
#[bench]
|
|
||||||
fn bench_tinyset_sum(b: &mut test::Bencher) {
|
|
||||||
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
|
|
||||||
b.iter(|| {
|
|
||||||
assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
#[bench]
|
|
||||||
fn bench_tinyarr_sum(b: &mut test::Bencher) {
|
|
||||||
let v = [10u32, 14u32, 21u32];
|
|
||||||
b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
|
|
||||||
}
|
|
||||||
|
|
||||||
#[bench]
|
|
||||||
fn bench_bitset_initialize(b: &mut test::Bencher) {
|
|
||||||
b.iter(|| BitSet::with_max_value(1_000_000));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
BIN
doc/assets/images/paradedb.png
Normal file
BIN
doc/assets/images/paradedb.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 30 KiB |
@@ -7,6 +7,11 @@
|
|||||||
- [Other](#other)
|
- [Other](#other)
|
||||||
- [Usage](#usage)
|
- [Usage](#usage)
|
||||||
|
|
||||||
|
# Index Sorting has been removed!
|
||||||
|
More infos here:
|
||||||
|
|
||||||
|
https://github.com/quickwit-oss/tantivy/issues/2352
|
||||||
|
|
||||||
# Index Sorting
|
# Index Sorting
|
||||||
|
|
||||||
Tantivy allows you to sort the index according to a property.
|
Tantivy allows you to sort the index according to a property.
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
use std::ops::Bound;
|
||||||
|
|
||||||
// # Searching a range on an indexed int field.
|
// # Searching a range on an indexed int field.
|
||||||
//
|
//
|
||||||
// Below is an example of creating an indexed integer field in your schema
|
// Below is an example of creating an indexed integer field in your schema
|
||||||
@@ -5,7 +7,7 @@
|
|||||||
use tantivy::collector::Count;
|
use tantivy::collector::Count;
|
||||||
use tantivy::query::RangeQuery;
|
use tantivy::query::RangeQuery;
|
||||||
use tantivy::schema::{Schema, INDEXED};
|
use tantivy::schema::{Schema, INDEXED};
|
||||||
use tantivy::{doc, Index, IndexWriter, Result};
|
use tantivy::{doc, Index, IndexWriter, Result, Term};
|
||||||
|
|
||||||
fn main() -> Result<()> {
|
fn main() -> Result<()> {
|
||||||
// For the sake of simplicity, this schema will only have 1 field
|
// For the sake of simplicity, this schema will only have 1 field
|
||||||
@@ -27,7 +29,10 @@ fn main() -> Result<()> {
|
|||||||
reader.reload()?;
|
reader.reload()?;
|
||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
// The end is excluded i.e. here we are searching up to 1969
|
// The end is excluded i.e. here we are searching up to 1969
|
||||||
let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960..1970);
|
let docs_in_the_sixties = RangeQuery::new(
|
||||||
|
Bound::Included(Term::from_field_u64(year_field, 1960)),
|
||||||
|
Bound::Excluded(Term::from_field_u64(year_field, 1970)),
|
||||||
|
);
|
||||||
// Uses a Count collector to sum the total number of docs in the range
|
// Uses a Count collector to sum the total number of docs in the range
|
||||||
let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
|
let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
|
||||||
assert_eq!(num_60s_books, 10);
|
assert_eq!(num_60s_books, 10);
|
||||||
|
|||||||
@@ -34,8 +34,9 @@ use super::bucket::{
|
|||||||
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
|
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
|
||||||
};
|
};
|
||||||
use super::metric::{
|
use super::metric::{
|
||||||
AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
|
AverageAggregation, CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation,
|
||||||
PercentilesAggregationReq, StatsAggregation, SumAggregation, TopHitsAggregation,
|
MaxAggregation, MinAggregation, PercentilesAggregationReq, StatsAggregation, SumAggregation,
|
||||||
|
TopHitsAggregationReq,
|
||||||
};
|
};
|
||||||
|
|
||||||
/// The top-level aggregation request structure, which contains [`Aggregation`] and their user
|
/// The top-level aggregation request structure, which contains [`Aggregation`] and their user
|
||||||
@@ -159,7 +160,10 @@ pub enum AggregationVariants {
|
|||||||
Percentiles(PercentilesAggregationReq),
|
Percentiles(PercentilesAggregationReq),
|
||||||
/// Finds the top k values matching some order
|
/// Finds the top k values matching some order
|
||||||
#[serde(rename = "top_hits")]
|
#[serde(rename = "top_hits")]
|
||||||
TopHits(TopHitsAggregation),
|
TopHits(TopHitsAggregationReq),
|
||||||
|
/// Computes an estimate of the number of unique values
|
||||||
|
#[serde(rename = "cardinality")]
|
||||||
|
Cardinality(CardinalityAggregationReq),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AggregationVariants {
|
impl AggregationVariants {
|
||||||
@@ -179,6 +183,7 @@ impl AggregationVariants {
|
|||||||
AggregationVariants::Sum(sum) => vec![sum.field_name()],
|
AggregationVariants::Sum(sum) => vec![sum.field_name()],
|
||||||
AggregationVariants::Percentiles(per) => vec![per.field_name()],
|
AggregationVariants::Percentiles(per) => vec![per.field_name()],
|
||||||
AggregationVariants::TopHits(top_hits) => top_hits.field_names(),
|
AggregationVariants::TopHits(top_hits) => top_hits.field_names(),
|
||||||
|
AggregationVariants::Cardinality(per) => vec![per.field_name()],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -203,7 +208,7 @@ impl AggregationVariants {
|
|||||||
_ => None,
|
_ => None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pub(crate) fn as_top_hits(&self) -> Option<&TopHitsAggregation> {
|
pub(crate) fn as_top_hits(&self) -> Option<&TopHitsAggregationReq> {
|
||||||
match &self {
|
match &self {
|
||||||
AggregationVariants::TopHits(top_hits) => Some(top_hits),
|
AggregationVariants::TopHits(top_hits) => Some(top_hits),
|
||||||
_ => None,
|
_ => None,
|
||||||
|
|||||||
@@ -11,8 +11,8 @@ use super::bucket::{
|
|||||||
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
|
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
|
||||||
};
|
};
|
||||||
use super::metric::{
|
use super::metric::{
|
||||||
AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
|
AverageAggregation, CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation,
|
||||||
StatsAggregation, SumAggregation,
|
MaxAggregation, MinAggregation, StatsAggregation, SumAggregation,
|
||||||
};
|
};
|
||||||
use super::segment_agg_result::AggregationLimits;
|
use super::segment_agg_result::AggregationLimits;
|
||||||
use super::VecWithNames;
|
use super::VecWithNames;
|
||||||
@@ -162,6 +162,11 @@ impl AggregationWithAccessor {
|
|||||||
field: ref field_name,
|
field: ref field_name,
|
||||||
ref missing,
|
ref missing,
|
||||||
..
|
..
|
||||||
|
})
|
||||||
|
| Cardinality(CardinalityAggregationReq {
|
||||||
|
field: ref field_name,
|
||||||
|
ref missing,
|
||||||
|
..
|
||||||
}) => {
|
}) => {
|
||||||
let str_dict_column = reader.fast_fields().str(field_name)?;
|
let str_dict_column = reader.fast_fields().str(field_name)?;
|
||||||
let allowed_column_types = [
|
let allowed_column_types = [
|
||||||
|
|||||||
@@ -98,6 +98,8 @@ pub enum MetricResult {
|
|||||||
Percentiles(PercentilesMetricResult),
|
Percentiles(PercentilesMetricResult),
|
||||||
/// Top hits metric result
|
/// Top hits metric result
|
||||||
TopHits(TopHitsMetricResult),
|
TopHits(TopHitsMetricResult),
|
||||||
|
/// Cardinality metric result
|
||||||
|
Cardinality(SingleMetricResult),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MetricResult {
|
impl MetricResult {
|
||||||
@@ -116,6 +118,7 @@ impl MetricResult {
|
|||||||
MetricResult::TopHits(_) => Err(TantivyError::AggregationError(
|
MetricResult::TopHits(_) => Err(TantivyError::AggregationError(
|
||||||
AggregationError::InvalidRequest("top_hits can't be used to order".to_string()),
|
AggregationError::InvalidRequest("top_hits can't be used to order".to_string()),
|
||||||
)),
|
)),
|
||||||
|
MetricResult::Cardinality(card) => Ok(card.value),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -110,6 +110,16 @@ fn test_aggregation_flushing(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
"cardinality_string_id":{
|
||||||
|
"cardinality": {
|
||||||
|
"field": "string_id"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cardinality_score":{
|
||||||
|
"cardinality": {
|
||||||
|
"field": "score"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -212,6 +222,9 @@ fn test_aggregation_flushing(
|
|||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
assert_eq!(res["cardinality_string_id"]["value"], 2.0);
|
||||||
|
assert_eq!(res["cardinality_score"]["value"], 80.0);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -926,10 +939,10 @@ fn test_aggregation_on_json_object_mixed_types() {
|
|||||||
},
|
},
|
||||||
"termagg": {
|
"termagg": {
|
||||||
"buckets": [
|
"buckets": [
|
||||||
{ "doc_count": 1, "key": 10.0, "min_price": { "value": 10.0 } },
|
{ "doc_count": 1, "key": 10.0, "key_as_string": "10", "min_price": { "value": 10.0 } },
|
||||||
{ "doc_count": 3, "key": "blue", "min_price": { "value": 5.0 } },
|
{ "doc_count": 3, "key": "blue", "min_price": { "value": 5.0 } },
|
||||||
{ "doc_count": 2, "key": "red", "min_price": { "value": 1.0 } },
|
{ "doc_count": 2, "key": "red", "min_price": { "value": 1.0 } },
|
||||||
{ "doc_count": 1, "key": -20.5, "min_price": { "value": -20.5 } },
|
{ "doc_count": 1, "key": -20.5, "key_as_string": "-20.5", "min_price": { "value": -20.5 } },
|
||||||
{ "doc_count": 2, "key": 1.0, "key_as_string": "true", "min_price": { "value": null } },
|
{ "doc_count": 2, "key": 1.0, "key_as_string": "true", "min_price": { "value": null } },
|
||||||
],
|
],
|
||||||
"sum_other_doc_count": 0
|
"sum_other_doc_count": 0
|
||||||
|
|||||||
@@ -1,10 +1,9 @@
|
|||||||
use std::fmt::Debug;
|
use std::fmt::Debug;
|
||||||
|
use std::io;
|
||||||
use std::net::Ipv6Addr;
|
use std::net::Ipv6Addr;
|
||||||
|
|
||||||
use columnar::column_values::CompactSpaceU64Accessor;
|
use columnar::column_values::CompactSpaceU64Accessor;
|
||||||
use columnar::{
|
use columnar::{ColumnType, Dictionary, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
|
||||||
BytesColumn, ColumnType, MonotonicallyMappableToU128, MonotonicallyMappableToU64, StrColumn,
|
|
||||||
};
|
|
||||||
use rustc_hash::FxHashMap;
|
use rustc_hash::FxHashMap;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
@@ -466,49 +465,66 @@ impl SegmentTermCollector {
|
|||||||
};
|
};
|
||||||
|
|
||||||
if self.column_type == ColumnType::Str {
|
if self.column_type == ColumnType::Str {
|
||||||
|
let fallback_dict = Dictionary::empty();
|
||||||
let term_dict = agg_with_accessor
|
let term_dict = agg_with_accessor
|
||||||
.str_dict_column
|
.str_dict_column
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.cloned()
|
.map(|el| el.dictionary())
|
||||||
.unwrap_or_else(|| {
|
.unwrap_or_else(|| &fallback_dict);
|
||||||
StrColumn::wrap(BytesColumn::empty(agg_with_accessor.accessor.num_docs()))
|
let mut buffer = Vec::new();
|
||||||
});
|
|
||||||
let mut buffer = String::new();
|
// special case for missing key
|
||||||
for (term_id, doc_count) in entries {
|
if let Some(index) = entries.iter().position(|value| value.0 == u64::MAX) {
|
||||||
let intermediate_entry = into_intermediate_bucket_entry(term_id, doc_count)?;
|
let entry = entries[index];
|
||||||
// Special case for missing key
|
let intermediate_entry = into_intermediate_bucket_entry(entry.0, entry.1)?;
|
||||||
if term_id == u64::MAX {
|
let missing_key = self
|
||||||
let missing_key = self
|
.req
|
||||||
.req
|
.missing
|
||||||
.missing
|
.as_ref()
|
||||||
.as_ref()
|
.expect("Found placeholder term_id but `missing` is None");
|
||||||
.expect("Found placeholder term_id but `missing` is None");
|
match missing_key {
|
||||||
match missing_key {
|
Key::Str(missing) => {
|
||||||
Key::Str(missing) => {
|
buffer.clear();
|
||||||
buffer.clear();
|
buffer.extend_from_slice(missing.as_bytes());
|
||||||
buffer.push_str(missing);
|
dict.insert(
|
||||||
dict.insert(
|
IntermediateKey::Str(
|
||||||
IntermediateKey::Str(buffer.to_string()),
|
String::from_utf8(buffer.to_vec())
|
||||||
intermediate_entry,
|
.expect("could not convert to String"),
|
||||||
);
|
),
|
||||||
}
|
intermediate_entry,
|
||||||
Key::F64(val) => {
|
);
|
||||||
buffer.push_str(&val.to_string());
|
|
||||||
dict.insert(IntermediateKey::F64(*val), intermediate_entry);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} else {
|
Key::F64(val) => {
|
||||||
if !term_dict.ord_to_str(term_id, &mut buffer)? {
|
dict.insert(IntermediateKey::F64(*val), intermediate_entry);
|
||||||
return Err(TantivyError::InternalError(format!(
|
|
||||||
"Couldn't find term_id {term_id} in dict"
|
|
||||||
)));
|
|
||||||
}
|
}
|
||||||
dict.insert(IntermediateKey::Str(buffer.to_string()), intermediate_entry);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
entries.swap_remove(index);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Sort by term ord
|
||||||
|
entries.sort_unstable_by_key(|bucket| bucket.0);
|
||||||
|
let mut idx = 0;
|
||||||
|
term_dict.sorted_ords_to_term_cb(
|
||||||
|
entries.iter().map(|(term_id, _)| *term_id),
|
||||||
|
|term| {
|
||||||
|
let entry = entries[idx];
|
||||||
|
let intermediate_entry = into_intermediate_bucket_entry(entry.0, entry.1)
|
||||||
|
.map_err(|err| io::Error::new(io::ErrorKind::Other, err))?;
|
||||||
|
dict.insert(
|
||||||
|
IntermediateKey::Str(
|
||||||
|
String::from_utf8(term.to_vec()).expect("could not convert to String"),
|
||||||
|
),
|
||||||
|
intermediate_entry,
|
||||||
|
);
|
||||||
|
idx += 1;
|
||||||
|
Ok(())
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
|
||||||
if self.req.min_doc_count == 0 {
|
if self.req.min_doc_count == 0 {
|
||||||
// TODO: Handle rev streaming for descending sorting by keys
|
// TODO: Handle rev streaming for descending sorting by keys
|
||||||
let mut stream = term_dict.dictionary().stream()?;
|
let mut stream = term_dict.stream()?;
|
||||||
let empty_sub_aggregation = IntermediateAggregationResults::empty_from_req(
|
let empty_sub_aggregation = IntermediateAggregationResults::empty_from_req(
|
||||||
agg_with_accessor.agg.sub_aggregation(),
|
agg_with_accessor.agg.sub_aggregation(),
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ use super::segment_agg_result::AggregationLimits;
|
|||||||
use super::{format_date, AggregationError, Key, SerializedKey};
|
use super::{format_date, AggregationError, Key, SerializedKey};
|
||||||
use crate::aggregation::agg_result::{AggregationResults, BucketEntries, BucketEntry};
|
use crate::aggregation::agg_result::{AggregationResults, BucketEntries, BucketEntry};
|
||||||
use crate::aggregation::bucket::TermsAggregationInternal;
|
use crate::aggregation::bucket::TermsAggregationInternal;
|
||||||
|
use crate::aggregation::metric::CardinalityCollector;
|
||||||
use crate::TantivyError;
|
use crate::TantivyError;
|
||||||
|
|
||||||
/// Contains the intermediate aggregation result, which is optimized to be merged with other
|
/// Contains the intermediate aggregation result, which is optimized to be merged with other
|
||||||
@@ -227,6 +228,9 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
|
|||||||
TopHits(ref req) => IntermediateAggregationResult::Metric(
|
TopHits(ref req) => IntermediateAggregationResult::Metric(
|
||||||
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req)),
|
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req)),
|
||||||
),
|
),
|
||||||
|
Cardinality(_) => IntermediateAggregationResult::Metric(
|
||||||
|
IntermediateMetricResult::Cardinality(CardinalityCollector::default()),
|
||||||
|
),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -291,6 +295,8 @@ pub enum IntermediateMetricResult {
|
|||||||
Sum(IntermediateSum),
|
Sum(IntermediateSum),
|
||||||
/// Intermediate top_hits result
|
/// Intermediate top_hits result
|
||||||
TopHits(TopHitsTopNComputer),
|
TopHits(TopHitsTopNComputer),
|
||||||
|
/// Intermediate cardinality result
|
||||||
|
Cardinality(CardinalityCollector),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl IntermediateMetricResult {
|
impl IntermediateMetricResult {
|
||||||
@@ -324,6 +330,9 @@ impl IntermediateMetricResult {
|
|||||||
IntermediateMetricResult::TopHits(top_hits) => {
|
IntermediateMetricResult::TopHits(top_hits) => {
|
||||||
MetricResult::TopHits(top_hits.into_final_result())
|
MetricResult::TopHits(top_hits.into_final_result())
|
||||||
}
|
}
|
||||||
|
IntermediateMetricResult::Cardinality(cardinality) => {
|
||||||
|
MetricResult::Cardinality(cardinality.finalize().into())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -372,6 +381,12 @@ impl IntermediateMetricResult {
|
|||||||
(IntermediateMetricResult::TopHits(left), IntermediateMetricResult::TopHits(right)) => {
|
(IntermediateMetricResult::TopHits(left), IntermediateMetricResult::TopHits(right)) => {
|
||||||
left.merge_fruits(right)?;
|
left.merge_fruits(right)?;
|
||||||
}
|
}
|
||||||
|
(
|
||||||
|
IntermediateMetricResult::Cardinality(left),
|
||||||
|
IntermediateMetricResult::Cardinality(right),
|
||||||
|
) => {
|
||||||
|
left.merge_fruits(right)?;
|
||||||
|
}
|
||||||
_ => {
|
_ => {
|
||||||
panic!("incompatible fruit types in tree or missing merge_fruits handler");
|
panic!("incompatible fruit types in tree or missing merge_fruits handler");
|
||||||
}
|
}
|
||||||
@@ -584,6 +599,7 @@ impl IntermediateTermBucketResult {
|
|||||||
let val = if key { "true" } else { "false" };
|
let val = if key { "true" } else { "false" };
|
||||||
Some(val.to_string())
|
Some(val.to_string())
|
||||||
}
|
}
|
||||||
|
IntermediateKey::F64(val) => Some(val.to_string()),
|
||||||
_ => None,
|
_ => None,
|
||||||
};
|
};
|
||||||
Ok(BucketEntry {
|
Ok(BucketEntry {
|
||||||
|
|||||||
466
src/aggregation/metric/cardinality.rs
Normal file
466
src/aggregation/metric/cardinality.rs
Normal file
@@ -0,0 +1,466 @@
|
|||||||
|
use std::collections::hash_map::DefaultHasher;
|
||||||
|
use std::hash::{BuildHasher, Hasher};
|
||||||
|
|
||||||
|
use columnar::column_values::CompactSpaceU64Accessor;
|
||||||
|
use columnar::Dictionary;
|
||||||
|
use common::f64_to_u64;
|
||||||
|
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
|
||||||
|
use rustc_hash::FxHashSet;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use crate::aggregation::agg_req_with_accessor::{
|
||||||
|
AggregationWithAccessor, AggregationsWithAccessor,
|
||||||
|
};
|
||||||
|
use crate::aggregation::intermediate_agg_result::{
|
||||||
|
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
|
||||||
|
};
|
||||||
|
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
|
||||||
|
use crate::aggregation::*;
|
||||||
|
use crate::TantivyError;
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||||
|
struct BuildSaltedHasher {
|
||||||
|
salt: u8,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BuildHasher for BuildSaltedHasher {
|
||||||
|
type Hasher = DefaultHasher;
|
||||||
|
|
||||||
|
fn build_hasher(&self) -> Self::Hasher {
|
||||||
|
let mut hasher = DefaultHasher::new();
|
||||||
|
hasher.write_u8(self.salt);
|
||||||
|
|
||||||
|
hasher
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// # Cardinality
|
||||||
|
///
|
||||||
|
/// The cardinality aggregation allows for computing an estimate
|
||||||
|
/// of the number of different values in a data set based on the
|
||||||
|
/// HyperLogLog++ algorithm. This is particularly useful for understanding the
|
||||||
|
/// uniqueness of values in a large dataset where counting each unique value
|
||||||
|
/// individually would be computationally expensive.
|
||||||
|
///
|
||||||
|
/// For example, you might use a cardinality aggregation to estimate the number
|
||||||
|
/// of unique visitors to a website by aggregating on a field that contains
|
||||||
|
/// user IDs or session IDs.
|
||||||
|
///
|
||||||
|
/// To use the cardinality aggregation, you'll need to provide a field to
|
||||||
|
/// aggregate on. The following example demonstrates a request for the cardinality
|
||||||
|
/// of the "user_id" field:
|
||||||
|
///
|
||||||
|
/// ```JSON
|
||||||
|
/// {
|
||||||
|
/// "cardinality": {
|
||||||
|
/// "field": "user_id"
|
||||||
|
/// }
|
||||||
|
/// }
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// This request will return an estimate of the number of unique values in the
|
||||||
|
/// "user_id" field.
|
||||||
|
///
|
||||||
|
/// ## Missing Values
|
||||||
|
///
|
||||||
|
/// The `missing` parameter defines how documents that are missing a value should be treated.
|
||||||
|
/// By default, documents without a value for the specified field are ignored. However, you can
|
||||||
|
/// specify a default value for these documents using the `missing` parameter. This can be useful
|
||||||
|
/// when you want to include documents with missing values in the aggregation.
|
||||||
|
///
|
||||||
|
/// For example, the following request treats documents with missing values in the "user_id"
|
||||||
|
/// field as if they had a value of "unknown":
|
||||||
|
///
|
||||||
|
/// ```JSON
|
||||||
|
/// {
|
||||||
|
/// "cardinality": {
|
||||||
|
/// "field": "user_id",
|
||||||
|
/// "missing": "unknown"
|
||||||
|
/// }
|
||||||
|
/// }
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// ## Estimation Accuracy
|
||||||
|
///
|
||||||
|
/// The cardinality aggregation provides an approximate count, which is usually
|
||||||
|
/// accurate within a small error range. This trade-off allows for efficient
|
||||||
|
/// computation even on very large datasets.
|
||||||
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||||
|
pub struct CardinalityAggregationReq {
|
||||||
|
/// The field name to compute the cardinality on.
|
||||||
|
pub field: String,
|
||||||
|
/// The missing parameter defines how documents that are missing a value should be treated.
|
||||||
|
/// By default they will be ignored but it is also possible to treat them as if they had a
|
||||||
|
/// value. Examples in JSON format:
|
||||||
|
/// { "field": "my_numbers", "missing": "10.0" }
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||||
|
pub missing: Option<Key>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CardinalityAggregationReq {
|
||||||
|
/// Creates a new [`CardinalityAggregationReq`] instance from a field name.
|
||||||
|
pub fn from_field_name(field_name: String) -> Self {
|
||||||
|
Self {
|
||||||
|
field: field_name,
|
||||||
|
missing: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/// Returns the field name the aggregation is computed on.
|
||||||
|
pub fn field_name(&self) -> &str {
|
||||||
|
&self.field
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, PartialEq)]
|
||||||
|
pub(crate) struct SegmentCardinalityCollector {
|
||||||
|
cardinality: CardinalityCollector,
|
||||||
|
entries: FxHashSet<u64>,
|
||||||
|
column_type: ColumnType,
|
||||||
|
accessor_idx: usize,
|
||||||
|
missing: Option<Key>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SegmentCardinalityCollector {
|
||||||
|
pub fn from_req(column_type: ColumnType, accessor_idx: usize, missing: &Option<Key>) -> Self {
|
||||||
|
Self {
|
||||||
|
cardinality: CardinalityCollector::new(column_type as u8),
|
||||||
|
entries: Default::default(),
|
||||||
|
column_type,
|
||||||
|
accessor_idx,
|
||||||
|
missing: missing.clone(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn fetch_block_with_field(
|
||||||
|
&mut self,
|
||||||
|
docs: &[crate::DocId],
|
||||||
|
agg_accessor: &mut AggregationWithAccessor,
|
||||||
|
) {
|
||||||
|
if let Some(missing) = agg_accessor.missing_value_for_accessor {
|
||||||
|
agg_accessor.column_block_accessor.fetch_block_with_missing(
|
||||||
|
docs,
|
||||||
|
&agg_accessor.accessor,
|
||||||
|
missing,
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
agg_accessor
|
||||||
|
.column_block_accessor
|
||||||
|
.fetch_block(docs, &agg_accessor.accessor);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn into_intermediate_metric_result(
|
||||||
|
mut self,
|
||||||
|
agg_with_accessor: &AggregationWithAccessor,
|
||||||
|
) -> crate::Result<IntermediateMetricResult> {
|
||||||
|
if self.column_type == ColumnType::Str {
|
||||||
|
let fallback_dict = Dictionary::empty();
|
||||||
|
let dict = agg_with_accessor
|
||||||
|
.str_dict_column
|
||||||
|
.as_ref()
|
||||||
|
.map(|el| el.dictionary())
|
||||||
|
.unwrap_or_else(|| &fallback_dict);
|
||||||
|
let mut has_missing = false;
|
||||||
|
|
||||||
|
// TODO: replace FxHashSet with something that allows iterating in order
|
||||||
|
// (e.g. sparse bitvec)
|
||||||
|
let mut term_ids = Vec::new();
|
||||||
|
for term_ord in self.entries.into_iter() {
|
||||||
|
if term_ord == u64::MAX {
|
||||||
|
has_missing = true;
|
||||||
|
} else {
|
||||||
|
// we can reasonably exclude values above u32::MAX
|
||||||
|
term_ids.push(term_ord as u32);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
term_ids.sort_unstable();
|
||||||
|
dict.sorted_ords_to_term_cb(term_ids.iter().map(|term| *term as u64), |term| {
|
||||||
|
self.cardinality.sketch.insert_any(&term);
|
||||||
|
Ok(())
|
||||||
|
})?;
|
||||||
|
if has_missing {
|
||||||
|
let missing_key = self
|
||||||
|
.missing
|
||||||
|
.as_ref()
|
||||||
|
.expect("Found placeholder term_ord but `missing` is None");
|
||||||
|
match missing_key {
|
||||||
|
Key::Str(missing) => {
|
||||||
|
self.cardinality.sketch.insert_any(&missing);
|
||||||
|
}
|
||||||
|
Key::F64(val) => {
|
||||||
|
let val = f64_to_u64(*val);
|
||||||
|
self.cardinality.sketch.insert_any(&val);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(IntermediateMetricResult::Cardinality(self.cardinality))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SegmentAggregationCollector for SegmentCardinalityCollector {
|
||||||
|
fn add_intermediate_aggregation_result(
|
||||||
|
self: Box<Self>,
|
||||||
|
agg_with_accessor: &AggregationsWithAccessor,
|
||||||
|
results: &mut IntermediateAggregationResults,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
|
||||||
|
let agg_with_accessor = &agg_with_accessor.aggs.values[self.accessor_idx];
|
||||||
|
|
||||||
|
let intermediate_result = self.into_intermediate_metric_result(agg_with_accessor)?;
|
||||||
|
results.push(
|
||||||
|
name,
|
||||||
|
IntermediateAggregationResult::Metric(intermediate_result),
|
||||||
|
)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn collect(
|
||||||
|
&mut self,
|
||||||
|
doc: crate::DocId,
|
||||||
|
agg_with_accessor: &mut AggregationsWithAccessor,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
self.collect_block(&[doc], agg_with_accessor)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn collect_block(
|
||||||
|
&mut self,
|
||||||
|
docs: &[crate::DocId],
|
||||||
|
agg_with_accessor: &mut AggregationsWithAccessor,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
let bucket_agg_accessor = &mut agg_with_accessor.aggs.values[self.accessor_idx];
|
||||||
|
self.fetch_block_with_field(docs, bucket_agg_accessor);
|
||||||
|
|
||||||
|
let col_block_accessor = &bucket_agg_accessor.column_block_accessor;
|
||||||
|
if self.column_type == ColumnType::Str {
|
||||||
|
for term_ord in col_block_accessor.iter_vals() {
|
||||||
|
self.entries.insert(term_ord);
|
||||||
|
}
|
||||||
|
} else if self.column_type == ColumnType::IpAddr {
|
||||||
|
let compact_space_accessor = bucket_agg_accessor
|
||||||
|
.accessor
|
||||||
|
.values
|
||||||
|
.clone()
|
||||||
|
.downcast_arc::<CompactSpaceU64Accessor>()
|
||||||
|
.map_err(|_| {
|
||||||
|
TantivyError::AggregationError(
|
||||||
|
crate::aggregation::AggregationError::InternalError(
|
||||||
|
"Type mismatch: Could not downcast to CompactSpaceU64Accessor"
|
||||||
|
.to_string(),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
for val in col_block_accessor.iter_vals() {
|
||||||
|
let val: u128 = compact_space_accessor.compact_to_u128(val as u32);
|
||||||
|
self.cardinality.sketch.insert_any(&val);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for val in col_block_accessor.iter_vals() {
|
||||||
|
self.cardinality.sketch.insert_any(&val);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||||
|
/// The cardinality collector used during segment collection and for merging results.
|
||||||
|
pub struct CardinalityCollector {
|
||||||
|
sketch: HyperLogLogPlus<u64, BuildSaltedHasher>,
|
||||||
|
}
|
||||||
|
impl Default for CardinalityCollector {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new(0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartialEq for CardinalityCollector {
|
||||||
|
fn eq(&self, _other: &Self) -> bool {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CardinalityCollector {
|
||||||
|
/// Compute the final cardinality estimate.
|
||||||
|
pub fn finalize(self) -> Option<f64> {
|
||||||
|
Some(self.sketch.clone().count().trunc())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn new(salt: u8) -> Self {
|
||||||
|
Self {
|
||||||
|
sketch: HyperLogLogPlus::new(16, BuildSaltedHasher { salt }).unwrap(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn merge_fruits(&mut self, right: CardinalityCollector) -> crate::Result<()> {
|
||||||
|
self.sketch.merge(&right.sketch).map_err(|err| {
|
||||||
|
TantivyError::AggregationError(AggregationError::InternalError(format!(
|
||||||
|
"Error while merging cardinality {err:?}"
|
||||||
|
)))
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
|
||||||
|
use std::net::IpAddr;
|
||||||
|
use std::str::FromStr;
|
||||||
|
|
||||||
|
use columnar::MonotonicallyMappableToU64;
|
||||||
|
|
||||||
|
use crate::aggregation::agg_req::Aggregations;
|
||||||
|
use crate::aggregation::tests::{exec_request, get_test_index_from_terms};
|
||||||
|
use crate::schema::{IntoIpv6Addr, Schema, FAST};
|
||||||
|
use crate::Index;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn cardinality_aggregation_test_empty_index() -> crate::Result<()> {
|
||||||
|
let values = vec![];
|
||||||
|
let index = get_test_index_from_terms(false, &values)?;
|
||||||
|
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||||
|
"cardinality": {
|
||||||
|
"cardinality": {
|
||||||
|
"field": "string_id",
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let res = exec_request(agg_req, &index)?;
|
||||||
|
assert_eq!(res["cardinality"]["value"], 0.0);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn cardinality_aggregation_test_single_segment() -> crate::Result<()> {
|
||||||
|
cardinality_aggregation_test_merge_segment(true)
|
||||||
|
}
|
||||||
|
#[test]
|
||||||
|
fn cardinality_aggregation_test() -> crate::Result<()> {
|
||||||
|
cardinality_aggregation_test_merge_segment(false)
|
||||||
|
}
|
||||||
|
fn cardinality_aggregation_test_merge_segment(merge_segments: bool) -> crate::Result<()> {
|
||||||
|
let segment_and_terms = vec![
|
||||||
|
vec!["terma"],
|
||||||
|
vec!["termb"],
|
||||||
|
vec!["termc"],
|
||||||
|
vec!["terma"],
|
||||||
|
vec!["terma"],
|
||||||
|
vec!["terma"],
|
||||||
|
vec!["termb"],
|
||||||
|
vec!["terma"],
|
||||||
|
];
|
||||||
|
let index = get_test_index_from_terms(merge_segments, &segment_and_terms)?;
|
||||||
|
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||||
|
"cardinality": {
|
||||||
|
"cardinality": {
|
||||||
|
"field": "string_id",
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let res = exec_request(agg_req, &index)?;
|
||||||
|
assert_eq!(res["cardinality"]["value"], 3.0);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn cardinality_aggregation_u64() -> crate::Result<()> {
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
let id_field = schema_builder.add_u64_field("id", FAST);
|
||||||
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
|
{
|
||||||
|
let mut writer = index.writer_for_tests()?;
|
||||||
|
writer.add_document(doc!(id_field => 1u64))?;
|
||||||
|
writer.add_document(doc!(id_field => 2u64))?;
|
||||||
|
writer.add_document(doc!(id_field => 3u64))?;
|
||||||
|
writer.add_document(doc!())?;
|
||||||
|
writer.commit()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||||
|
"cardinality": {
|
||||||
|
"cardinality": {
|
||||||
|
"field": "id",
|
||||||
|
"missing": 0u64
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let res = exec_request(agg_req, &index)?;
|
||||||
|
assert_eq!(res["cardinality"]["value"], 4.0);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Cardinality over an IP address fast field.
///
/// The IPv6 loopback address is indexed twice and the IPv4 loopback once, so the
/// aggregation is expected to report two distinct values.
#[test]
fn cardinality_aggregation_ip_addr() -> crate::Result<()> {
    let mut builder = Schema::builder();
    let field = builder.add_ip_addr_field("ip_field", FAST);
    let index = Index::create_in_ram(builder.build());
    {
        let mut index_writer = index.writer_for_tests()?;
        // Two IPv6 loopback entries followed by one IPv4 entry; every address goes
        // through `into_ipv6_addr()` before being indexed.
        for raw_addr in ["::1", "::1", "127.0.0.1"] {
            let ip_addr = IpAddr::from_str(raw_addr).unwrap().into_ipv6_addr();
            index_writer.add_document(doc!(field => ip_addr))?;
        }
        index_writer.commit()?;
    }

    let request_json = json!({
        "cardinality": {
            "cardinality": {
                "field": "ip_field"
            },
        }
    });
    let request: Aggregations = serde_json::from_value(request_json).unwrap();

    let response = exec_request(request, &index)?;
    assert_eq!(response["cardinality"]["value"], 2.0);

    Ok(())
}
|
||||||
|
|
||||||
|
/// Cardinality over a path (`json.value`) inside a JSON fast field.
///
/// Four documents are indexed whose `value` is `false`, `true`, and the two integers
/// produced by `i64::from_u64(0)` and `i64::from_u64(1)`; the test expects all four
/// to be counted as distinct.
#[test]
fn cardinality_aggregation_json() -> crate::Result<()> {
    let mut builder = Schema::builder();
    let field = builder.add_json_field("json", FAST);
    let index = Index::create_in_ram(builder.build());
    {
        let mut index_writer = index.writer_for_tests()?;
        // Same documents, in the same order: two booleans, then two integers.
        let json_docs = [
            json!({"value": false}),
            json!({"value": true}),
            json!({"value": i64::from_u64(0u64)}),
            json!({"value": i64::from_u64(1u64)}),
        ];
        for json_doc in json_docs {
            index_writer.add_document(doc!(field => json_doc))?;
        }
        index_writer.commit()?;
    }

    let request_json = json!({
        "cardinality": {
            "cardinality": {
                "field": "json.value"
            },
        }
    });
    let request: Aggregations = serde_json::from_value(request_json).unwrap();

    let response = exec_request(request, &index)?;
    assert_eq!(response["cardinality"]["value"], 4.0);

    Ok(())
}
|
||||||
|
}
|
||||||
@@ -17,6 +17,7 @@
|
|||||||
//! - [Percentiles](PercentilesAggregationReq)
|
//! - [Percentiles](PercentilesAggregationReq)
|
||||||
|
|
||||||
mod average;
|
mod average;
|
||||||
|
mod cardinality;
|
||||||
mod count;
|
mod count;
|
||||||
mod extended_stats;
|
mod extended_stats;
|
||||||
mod max;
|
mod max;
|
||||||
@@ -29,6 +30,7 @@ mod top_hits;
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
|
||||||
pub use average::*;
|
pub use average::*;
|
||||||
|
pub use cardinality::*;
|
||||||
pub use count::*;
|
pub use count::*;
|
||||||
pub use extended_stats::*;
|
pub use extended_stats::*;
|
||||||
pub use max::*;
|
pub use max::*;
|
||||||
|
|||||||
@@ -89,7 +89,7 @@ use crate::{DocAddress, DocId, SegmentOrdinal};
|
|||||||
/// }
|
/// }
|
||||||
/// ```
|
/// ```
|
||||||
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
|
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
|
||||||
pub struct TopHitsAggregation {
|
pub struct TopHitsAggregationReq {
|
||||||
sort: Vec<KeyOrder>,
|
sort: Vec<KeyOrder>,
|
||||||
size: usize,
|
size: usize,
|
||||||
from: Option<usize>,
|
from: Option<usize>,
|
||||||
@@ -164,7 +164,7 @@ fn unsupported_err(parameter: &str) -> crate::Result<()> {
|
|||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TopHitsAggregation {
|
impl TopHitsAggregationReq {
|
||||||
/// Validate and resolve field retrieval parameters
|
/// Validate and resolve field retrieval parameters
|
||||||
pub fn validate_and_resolve_field_names(
|
pub fn validate_and_resolve_field_names(
|
||||||
&mut self,
|
&mut self,
|
||||||
@@ -431,7 +431,7 @@ impl Eq for DocSortValuesAndFields {}
|
|||||||
/// The TopHitsCollector used for collecting over segments and merging results.
|
/// The TopHitsCollector used for collecting over segments and merging results.
|
||||||
#[derive(Clone, Serialize, Deserialize, Debug)]
|
#[derive(Clone, Serialize, Deserialize, Debug)]
|
||||||
pub struct TopHitsTopNComputer {
|
pub struct TopHitsTopNComputer {
|
||||||
req: TopHitsAggregation,
|
req: TopHitsAggregationReq,
|
||||||
top_n: TopNComputer<DocSortValuesAndFields, DocAddress, false>,
|
top_n: TopNComputer<DocSortValuesAndFields, DocAddress, false>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -443,7 +443,7 @@ impl std::cmp::PartialEq for TopHitsTopNComputer {
|
|||||||
|
|
||||||
impl TopHitsTopNComputer {
|
impl TopHitsTopNComputer {
|
||||||
/// Create a new TopHitsCollector
|
/// Create a new TopHitsCollector
|
||||||
pub fn new(req: &TopHitsAggregation) -> Self {
|
pub fn new(req: &TopHitsAggregationReq) -> Self {
|
||||||
Self {
|
Self {
|
||||||
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
|
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
|
||||||
req: req.clone(),
|
req: req.clone(),
|
||||||
@@ -496,7 +496,7 @@ pub(crate) struct TopHitsSegmentCollector {
|
|||||||
|
|
||||||
impl TopHitsSegmentCollector {
|
impl TopHitsSegmentCollector {
|
||||||
pub fn from_req(
|
pub fn from_req(
|
||||||
req: &TopHitsAggregation,
|
req: &TopHitsAggregationReq,
|
||||||
accessor_idx: usize,
|
accessor_idx: usize,
|
||||||
segment_ordinal: SegmentOrdinal,
|
segment_ordinal: SegmentOrdinal,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
@@ -509,7 +509,7 @@ impl TopHitsSegmentCollector {
|
|||||||
fn into_top_hits_collector(
|
fn into_top_hits_collector(
|
||||||
self,
|
self,
|
||||||
value_accessors: &HashMap<String, Vec<DynamicColumn>>,
|
value_accessors: &HashMap<String, Vec<DynamicColumn>>,
|
||||||
req: &TopHitsAggregation,
|
req: &TopHitsAggregationReq,
|
||||||
) -> TopHitsTopNComputer {
|
) -> TopHitsTopNComputer {
|
||||||
let mut top_hits_computer = TopHitsTopNComputer::new(req);
|
let mut top_hits_computer = TopHitsTopNComputer::new(req);
|
||||||
let top_results = self.top_n.into_vec();
|
let top_results = self.top_n.into_vec();
|
||||||
@@ -532,7 +532,7 @@ impl TopHitsSegmentCollector {
|
|||||||
fn collect_with(
|
fn collect_with(
|
||||||
&mut self,
|
&mut self,
|
||||||
doc_id: crate::DocId,
|
doc_id: crate::DocId,
|
||||||
req: &TopHitsAggregation,
|
req: &TopHitsAggregationReq,
|
||||||
accessors: &[(Column<u64>, ColumnType)],
|
accessors: &[(Column<u64>, ColumnType)],
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
let sorts: Vec<DocValueAndOrder> = req
|
let sorts: Vec<DocValueAndOrder> = req
|
||||||
|
|||||||
@@ -44,11 +44,14 @@
|
|||||||
//! - [Metric](metric)
|
//! - [Metric](metric)
|
||||||
//! - [Average](metric::AverageAggregation)
|
//! - [Average](metric::AverageAggregation)
|
||||||
//! - [Stats](metric::StatsAggregation)
|
//! - [Stats](metric::StatsAggregation)
|
||||||
|
//! - [ExtendedStats](metric::ExtendedStatsAggregation)
|
||||||
//! - [Min](metric::MinAggregation)
|
//! - [Min](metric::MinAggregation)
|
||||||
//! - [Max](metric::MaxAggregation)
|
//! - [Max](metric::MaxAggregation)
|
||||||
//! - [Sum](metric::SumAggregation)
|
//! - [Sum](metric::SumAggregation)
|
||||||
//! - [Count](metric::CountAggregation)
|
//! - [Count](metric::CountAggregation)
|
||||||
//! - [Percentiles](metric::PercentilesAggregationReq)
|
//! - [Percentiles](metric::PercentilesAggregationReq)
|
||||||
|
//! - [Cardinality](metric::CardinalityAggregationReq)
|
||||||
|
//! - [TopHits](metric::TopHitsAggregationReq)
|
||||||
//!
|
//!
|
||||||
//! # Example
|
//! # Example
|
||||||
//! Compute the average metric, by building [`agg_req::Aggregations`], which is built from an
|
//! Compute the average metric, by building [`agg_req::Aggregations`], which is built from an
|
||||||
|
|||||||
@@ -16,7 +16,10 @@ use super::metric::{
|
|||||||
SumAggregation,
|
SumAggregation,
|
||||||
};
|
};
|
||||||
use crate::aggregation::bucket::TermMissingAgg;
|
use crate::aggregation::bucket::TermMissingAgg;
|
||||||
use crate::aggregation::metric::{SegmentExtendedStatsCollector, TopHitsSegmentCollector};
|
use crate::aggregation::metric::{
|
||||||
|
CardinalityAggregationReq, SegmentCardinalityCollector, SegmentExtendedStatsCollector,
|
||||||
|
TopHitsSegmentCollector,
|
||||||
|
};
|
||||||
|
|
||||||
pub(crate) trait SegmentAggregationCollector: CollectorClone + Debug {
|
pub(crate) trait SegmentAggregationCollector: CollectorClone + Debug {
|
||||||
fn add_intermediate_aggregation_result(
|
fn add_intermediate_aggregation_result(
|
||||||
@@ -169,6 +172,9 @@ pub(crate) fn build_single_agg_segment_collector(
|
|||||||
accessor_idx,
|
accessor_idx,
|
||||||
req.segment_ordinal,
|
req.segment_ordinal,
|
||||||
))),
|
))),
|
||||||
|
Cardinality(CardinalityAggregationReq { missing, .. }) => Ok(Box::new(
|
||||||
|
SegmentCardinalityCollector::from_req(req.field_type, accessor_idx, missing),
|
||||||
|
)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
use common::json_path_writer::JSON_PATH_SEGMENT_SEP;
|
use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
|
||||||
use common::{replace_in_place, JsonPathWriter};
|
use common::{replace_in_place, JsonPathWriter};
|
||||||
use rustc_hash::FxHashMap;
|
use rustc_hash::FxHashMap;
|
||||||
|
|
||||||
@@ -83,6 +83,9 @@ fn index_json_object<'a, V: Value<'a>>(
|
|||||||
positions_per_path: &mut IndexingPositionsPerPath,
|
positions_per_path: &mut IndexingPositionsPerPath,
|
||||||
) {
|
) {
|
||||||
for (json_path_segment, json_value_visitor) in json_visitor {
|
for (json_path_segment, json_value_visitor) in json_visitor {
|
||||||
|
if json_path_segment.as_bytes().contains(&JSON_END_OF_PATH) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
json_path_writer.push(json_path_segment);
|
json_path_writer.push(json_path_segment);
|
||||||
index_json_value(
|
index_json_value(
|
||||||
doc,
|
doc,
|
||||||
|
|||||||
@@ -127,7 +127,7 @@ mod tests {
|
|||||||
fast_field_writers
|
fast_field_writers
|
||||||
.add_document(&doc!(*FIELD=>2u64))
|
.add_document(&doc!(*FIELD=>2u64))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
fast_field_writers.serialize(&mut write).unwrap();
|
||||||
write.terminate().unwrap();
|
write.terminate().unwrap();
|
||||||
}
|
}
|
||||||
let file = directory.open_read(path).unwrap();
|
let file = directory.open_read(path).unwrap();
|
||||||
@@ -178,7 +178,7 @@ mod tests {
|
|||||||
fast_field_writers
|
fast_field_writers
|
||||||
.add_document(&doc!(*FIELD=>215u64))
|
.add_document(&doc!(*FIELD=>215u64))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
fast_field_writers.serialize(&mut write).unwrap();
|
||||||
write.terminate().unwrap();
|
write.terminate().unwrap();
|
||||||
}
|
}
|
||||||
let file = directory.open_read(path).unwrap();
|
let file = directory.open_read(path).unwrap();
|
||||||
@@ -211,7 +211,7 @@ mod tests {
|
|||||||
.add_document(&doc!(*FIELD=>100_000u64))
|
.add_document(&doc!(*FIELD=>100_000u64))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
fast_field_writers.serialize(&mut write).unwrap();
|
||||||
write.terminate().unwrap();
|
write.terminate().unwrap();
|
||||||
}
|
}
|
||||||
let file = directory.open_read(path).unwrap();
|
let file = directory.open_read(path).unwrap();
|
||||||
@@ -243,7 +243,7 @@ mod tests {
|
|||||||
.add_document(&doc!(*FIELD=>5_000_000_000_000_000_000u64 + doc_id))
|
.add_document(&doc!(*FIELD=>5_000_000_000_000_000_000u64 + doc_id))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
fast_field_writers.serialize(&mut write).unwrap();
|
||||||
write.terminate().unwrap();
|
write.terminate().unwrap();
|
||||||
}
|
}
|
||||||
let file = directory.open_read(path).unwrap();
|
let file = directory.open_read(path).unwrap();
|
||||||
@@ -276,7 +276,7 @@ mod tests {
|
|||||||
doc.add_i64(i64_field, i);
|
doc.add_i64(i64_field, i);
|
||||||
fast_field_writers.add_document(&doc).unwrap();
|
fast_field_writers.add_document(&doc).unwrap();
|
||||||
}
|
}
|
||||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
fast_field_writers.serialize(&mut write).unwrap();
|
||||||
write.terminate().unwrap();
|
write.terminate().unwrap();
|
||||||
}
|
}
|
||||||
let file = directory.open_read(path).unwrap();
|
let file = directory.open_read(path).unwrap();
|
||||||
@@ -315,7 +315,7 @@ mod tests {
|
|||||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
|
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
|
||||||
let doc = TantivyDocument::default();
|
let doc = TantivyDocument::default();
|
||||||
fast_field_writers.add_document(&doc).unwrap();
|
fast_field_writers.add_document(&doc).unwrap();
|
||||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
fast_field_writers.serialize(&mut write).unwrap();
|
||||||
write.terminate().unwrap();
|
write.terminate().unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -348,7 +348,7 @@ mod tests {
|
|||||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
|
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
|
||||||
let doc = TantivyDocument::default();
|
let doc = TantivyDocument::default();
|
||||||
fast_field_writers.add_document(&doc).unwrap();
|
fast_field_writers.add_document(&doc).unwrap();
|
||||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
fast_field_writers.serialize(&mut write).unwrap();
|
||||||
write.terminate().unwrap();
|
write.terminate().unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -385,7 +385,7 @@ mod tests {
|
|||||||
for &x in &permutation {
|
for &x in &permutation {
|
||||||
fast_field_writers.add_document(&doc!(*FIELD=>x)).unwrap();
|
fast_field_writers.add_document(&doc!(*FIELD=>x)).unwrap();
|
||||||
}
|
}
|
||||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
fast_field_writers.serialize(&mut write).unwrap();
|
||||||
write.terminate().unwrap();
|
write.terminate().unwrap();
|
||||||
}
|
}
|
||||||
let file = directory.open_read(path).unwrap();
|
let file = directory.open_read(path).unwrap();
|
||||||
@@ -770,7 +770,7 @@ mod tests {
|
|||||||
fast_field_writers
|
fast_field_writers
|
||||||
.add_document(&doc!(field=>false))
|
.add_document(&doc!(field=>false))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
fast_field_writers.serialize(&mut write).unwrap();
|
||||||
write.terminate().unwrap();
|
write.terminate().unwrap();
|
||||||
}
|
}
|
||||||
let file = directory.open_read(path).unwrap();
|
let file = directory.open_read(path).unwrap();
|
||||||
@@ -802,7 +802,7 @@ mod tests {
|
|||||||
.add_document(&doc!(field=>false))
|
.add_document(&doc!(field=>false))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
fast_field_writers.serialize(&mut write).unwrap();
|
||||||
write.terminate().unwrap();
|
write.terminate().unwrap();
|
||||||
}
|
}
|
||||||
let file = directory.open_read(path).unwrap();
|
let file = directory.open_read(path).unwrap();
|
||||||
@@ -827,7 +827,7 @@ mod tests {
|
|||||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
|
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
|
||||||
let doc = TantivyDocument::default();
|
let doc = TantivyDocument::default();
|
||||||
fast_field_writers.add_document(&doc).unwrap();
|
fast_field_writers.add_document(&doc).unwrap();
|
||||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
fast_field_writers.serialize(&mut write).unwrap();
|
||||||
write.terminate().unwrap();
|
write.terminate().unwrap();
|
||||||
}
|
}
|
||||||
let file = directory.open_read(path).unwrap();
|
let file = directory.open_read(path).unwrap();
|
||||||
@@ -855,7 +855,7 @@ mod tests {
|
|||||||
for doc in docs {
|
for doc in docs {
|
||||||
fast_field_writers.add_document(doc).unwrap();
|
fast_field_writers.add_document(doc).unwrap();
|
||||||
}
|
}
|
||||||
fast_field_writers.serialize(&mut write, None).unwrap();
|
fast_field_writers.serialize(&mut write).unwrap();
|
||||||
write.terminate().unwrap();
|
write.terminate().unwrap();
|
||||||
}
|
}
|
||||||
Ok(directory)
|
Ok(directory)
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ use columnar::{ColumnarWriter, NumericalValue};
|
|||||||
use common::{DateTimePrecision, JsonPathWriter};
|
use common::{DateTimePrecision, JsonPathWriter};
|
||||||
use tokenizer_api::Token;
|
use tokenizer_api::Token;
|
||||||
|
|
||||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
|
||||||
use crate::schema::document::{Document, ReferenceValue, ReferenceValueLeaf, Value};
|
use crate::schema::document::{Document, ReferenceValue, ReferenceValueLeaf, Value};
|
||||||
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema, Type};
|
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema, Type};
|
||||||
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
|
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
|
||||||
@@ -106,16 +105,6 @@ impl FastFieldsWriter {
|
|||||||
self.columnar_writer.mem_usage()
|
self.columnar_writer.mem_usage()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn sort_order(
|
|
||||||
&self,
|
|
||||||
sort_field: &str,
|
|
||||||
num_docs: DocId,
|
|
||||||
reversed: bool,
|
|
||||||
) -> Vec<DocId> {
|
|
||||||
self.columnar_writer
|
|
||||||
.sort_order(sort_field, num_docs, reversed)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Indexes all of the fastfields of a new document.
|
/// Indexes all of the fastfields of a new document.
|
||||||
pub fn add_document<D: Document>(&mut self, doc: &D) -> crate::Result<()> {
|
pub fn add_document<D: Document>(&mut self, doc: &D) -> crate::Result<()> {
|
||||||
let doc_id = self.num_docs;
|
let doc_id = self.num_docs;
|
||||||
@@ -233,16 +222,9 @@ impl FastFieldsWriter {
|
|||||||
|
|
||||||
/// Serializes all of the `FastFieldWriter`s by pushing them in
|
/// Serializes all of the `FastFieldWriter`s by pushing them in
|
||||||
/// order to the fast field serializer.
|
/// order to the fast field serializer.
|
||||||
pub fn serialize(
|
pub fn serialize(mut self, wrt: &mut dyn io::Write) -> io::Result<()> {
|
||||||
mut self,
|
|
||||||
wrt: &mut dyn io::Write,
|
|
||||||
doc_id_map_opt: Option<&DocIdMapping>,
|
|
||||||
) -> io::Result<()> {
|
|
||||||
let num_docs = self.num_docs;
|
let num_docs = self.num_docs;
|
||||||
let old_to_new_row_ids =
|
self.columnar_writer.serialize(num_docs, wrt)?;
|
||||||
doc_id_map_opt.map(|doc_id_mapping| doc_id_mapping.old_to_new_ids());
|
|
||||||
self.columnar_writer
|
|
||||||
.serialize(num_docs, old_to_new_row_ids, wrt)?;
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -392,7 +374,7 @@ mod tests {
|
|||||||
}
|
}
|
||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
columnar_writer
|
columnar_writer
|
||||||
.serialize(json_docs.len() as DocId, None, &mut buffer)
|
.serialize(json_docs.len() as DocId, &mut buffer)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
ColumnarReader::open(buffer).unwrap()
|
ColumnarReader::open(buffer).unwrap()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ mod tests {
|
|||||||
let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
|
let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
|
||||||
fieldnorm_writers.record(2u32, *TXT_FIELD, 5);
|
fieldnorm_writers.record(2u32, *TXT_FIELD, 5);
|
||||||
fieldnorm_writers.record(3u32, *TXT_FIELD, 3);
|
fieldnorm_writers.record(3u32, *TXT_FIELD, 3);
|
||||||
fieldnorm_writers.serialize(serializer, None)?;
|
fieldnorm_writers.serialize(serializer)?;
|
||||||
}
|
}
|
||||||
let file = directory.open_read(path)?;
|
let file = directory.open_read(path)?;
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -2,7 +2,6 @@ use std::cmp::Ordering;
|
|||||||
use std::{io, iter};
|
use std::{io, iter};
|
||||||
|
|
||||||
use super::{fieldnorm_to_id, FieldNormsSerializer};
|
use super::{fieldnorm_to_id, FieldNormsSerializer};
|
||||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
|
||||||
use crate::schema::{Field, Schema};
|
use crate::schema::{Field, Schema};
|
||||||
use crate::DocId;
|
use crate::DocId;
|
||||||
|
|
||||||
@@ -92,11 +91,7 @@ impl FieldNormsWriter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Serialize the seen fieldnorm values to the serializer for all fields.
|
/// Serialize the seen fieldnorm values to the serializer for all fields.
|
||||||
pub fn serialize(
|
pub fn serialize(&self, mut fieldnorms_serializer: FieldNormsSerializer) -> io::Result<()> {
|
||||||
&self,
|
|
||||||
mut fieldnorms_serializer: FieldNormsSerializer,
|
|
||||||
doc_id_map: Option<&DocIdMapping>,
|
|
||||||
) -> io::Result<()> {
|
|
||||||
for (field, fieldnorms_buffer) in self.fieldnorms_buffers.iter().enumerate().filter_map(
|
for (field, fieldnorms_buffer) in self.fieldnorms_buffers.iter().enumerate().filter_map(
|
||||||
|(field_id, fieldnorms_buffer_opt)| {
|
|(field_id, fieldnorms_buffer_opt)| {
|
||||||
fieldnorms_buffer_opt.as_ref().map(|fieldnorms_buffer| {
|
fieldnorms_buffer_opt.as_ref().map(|fieldnorms_buffer| {
|
||||||
@@ -104,12 +99,7 @@ impl FieldNormsWriter {
|
|||||||
})
|
})
|
||||||
},
|
},
|
||||||
) {
|
) {
|
||||||
if let Some(doc_id_map) = doc_id_map {
|
fieldnorms_serializer.serialize_field(field, fieldnorms_buffer)?;
|
||||||
let remapped_fieldnorm_buffer = doc_id_map.remap(fieldnorms_buffer);
|
|
||||||
fieldnorms_serializer.serialize_field(field, &remapped_fieldnorm_buffer)?;
|
|
||||||
} else {
|
|
||||||
fieldnorms_serializer.serialize_field(field, fieldnorms_buffer)?;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
fieldnorms_serializer.close()?;
|
fieldnorms_serializer.close()?;
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ use rand::{thread_rng, Rng};
|
|||||||
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
|
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
|
||||||
use crate::schema::*;
|
use crate::schema::*;
|
||||||
#[allow(deprecated)]
|
#[allow(deprecated)]
|
||||||
use crate::{doc, schema, Index, IndexSettings, IndexSortByField, IndexWriter, Order, Searcher};
|
use crate::{doc, schema, Index, IndexWriter, Searcher};
|
||||||
|
|
||||||
fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
|
fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
|
||||||
assert!(searcher.segment_readers().len() < 20);
|
assert!(searcher.segment_readers().len() < 20);
|
||||||
@@ -65,71 +65,6 @@ fn get_num_iterations() -> usize {
|
|||||||
.map(|str| str.parse().unwrap())
|
.map(|str| str.parse().unwrap())
|
||||||
.unwrap_or(2000)
|
.unwrap_or(2000)
|
||||||
}
|
}
|
||||||
#[test]
|
|
||||||
#[ignore]
|
|
||||||
fn test_functional_indexing_sorted() -> crate::Result<()> {
|
|
||||||
let mut schema_builder = Schema::builder();
|
|
||||||
|
|
||||||
let id_field = schema_builder.add_u64_field("id", INDEXED | FAST);
|
|
||||||
let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
|
|
||||||
let text_field_options = TextOptions::default()
|
|
||||||
.set_indexing_options(
|
|
||||||
TextFieldIndexing::default()
|
|
||||||
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
|
|
||||||
)
|
|
||||||
.set_stored();
|
|
||||||
let text_field = schema_builder.add_text_field("text_field", text_field_options);
|
|
||||||
let schema = schema_builder.build();
|
|
||||||
|
|
||||||
let mut index_builder = Index::builder().schema(schema);
|
|
||||||
index_builder = index_builder.settings(IndexSettings {
|
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "id".to_string(),
|
|
||||||
order: Order::Desc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
});
|
|
||||||
let index = index_builder.create_from_tempdir().unwrap();
|
|
||||||
|
|
||||||
let reader = index.reader()?;
|
|
||||||
|
|
||||||
let mut rng = thread_rng();
|
|
||||||
|
|
||||||
let mut index_writer: IndexWriter =
|
|
||||||
index.writer_with_num_threads(3, 3 * MEMORY_BUDGET_NUM_BYTES_MIN)?;
|
|
||||||
|
|
||||||
let mut committed_docs: HashSet<u64> = HashSet::new();
|
|
||||||
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
|
|
||||||
|
|
||||||
for _ in 0..get_num_iterations() {
|
|
||||||
let random_val = rng.gen_range(0..20);
|
|
||||||
if random_val == 0 {
|
|
||||||
index_writer.commit()?;
|
|
||||||
committed_docs.extend(&uncommitted_docs);
|
|
||||||
uncommitted_docs.clear();
|
|
||||||
reader.reload()?;
|
|
||||||
let searcher = reader.searcher();
|
|
||||||
// check that everything is correct.
|
|
||||||
check_index_content(
|
|
||||||
&searcher,
|
|
||||||
&committed_docs.iter().cloned().collect::<Vec<u64>>(),
|
|
||||||
)?;
|
|
||||||
} else if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
|
|
||||||
let doc_id_term = Term::from_field_u64(id_field, random_val);
|
|
||||||
index_writer.delete_term(doc_id_term);
|
|
||||||
} else {
|
|
||||||
uncommitted_docs.insert(random_val);
|
|
||||||
let mut doc = TantivyDocument::new();
|
|
||||||
doc.add_u64(id_field, random_val);
|
|
||||||
for i in 1u64..10u64 {
|
|
||||||
doc.add_u64(multiples_field, random_val * i);
|
|
||||||
}
|
|
||||||
doc.add_text(text_field, get_text());
|
|
||||||
index_writer.add_document(doc)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod \
|
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod \
|
||||||
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, \
|
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, \
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ use crate::indexer::segment_updater::save_metas;
|
|||||||
use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
|
use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
|
||||||
use crate::reader::{IndexReader, IndexReaderBuilder};
|
use crate::reader::{IndexReader, IndexReaderBuilder};
|
||||||
use crate::schema::document::Document;
|
use crate::schema::document::Document;
|
||||||
use crate::schema::{Field, FieldType, Schema, Type};
|
use crate::schema::{Field, FieldType, Schema};
|
||||||
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
|
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
|
||||||
use crate::SegmentReader;
|
use crate::SegmentReader;
|
||||||
|
|
||||||
@@ -232,31 +232,7 @@ impl IndexBuilder {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn validate(&self) -> crate::Result<()> {
|
fn validate(&self) -> crate::Result<()> {
|
||||||
if let Some(schema) = self.schema.as_ref() {
|
if let Some(_schema) = self.schema.as_ref() {
|
||||||
if let Some(sort_by_field) = self.index_settings.sort_by_field.as_ref() {
|
|
||||||
let schema_field = schema.get_field(&sort_by_field.field).map_err(|_| {
|
|
||||||
TantivyError::InvalidArgument(format!(
|
|
||||||
"Field to sort index {} not found in schema",
|
|
||||||
sort_by_field.field
|
|
||||||
))
|
|
||||||
})?;
|
|
||||||
let entry = schema.get_field_entry(schema_field);
|
|
||||||
if !entry.is_fast() {
|
|
||||||
return Err(TantivyError::InvalidArgument(format!(
|
|
||||||
"Field {} is no fast field. Field needs to be a single value fast field \
|
|
||||||
to be used to sort an index",
|
|
||||||
sort_by_field.field
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
let supported_field_types = [Type::I64, Type::U64, Type::F64, Type::Date];
|
|
||||||
let field_type = entry.field_type().value_type();
|
|
||||||
if !supported_field_types.contains(&field_type) {
|
|
||||||
return Err(TantivyError::InvalidArgument(format!(
|
|
||||||
"Unsupported field type in sort_by_field: {field_type:?}. Supported field \
|
|
||||||
types: {supported_field_types:?} ",
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
Ok(())
|
||||||
} else {
|
} else {
|
||||||
Err(TantivyError::InvalidArgument(
|
Err(TantivyError::InvalidArgument(
|
||||||
|
|||||||
@@ -249,10 +249,6 @@ fn is_true(val: &bool) -> bool {
|
|||||||
/// index, like presort documents.
|
/// index, like presort documents.
|
||||||
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
|
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
|
||||||
pub struct IndexSettings {
|
pub struct IndexSettings {
|
||||||
/// Sorts the documents by information
|
|
||||||
/// provided in `IndexSortByField`
|
|
||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
|
||||||
pub sort_by_field: Option<IndexSortByField>,
|
|
||||||
/// The `Compressor` used to compress the doc store.
|
/// The `Compressor` used to compress the doc store.
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub docstore_compression: Compressor,
|
pub docstore_compression: Compressor,
|
||||||
@@ -275,7 +271,6 @@ fn default_docstore_blocksize() -> usize {
|
|||||||
impl Default for IndexSettings {
|
impl Default for IndexSettings {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
Self {
|
Self {
|
||||||
sort_by_field: None,
|
|
||||||
docstore_compression: Compressor::default(),
|
docstore_compression: Compressor::default(),
|
||||||
docstore_blocksize: default_docstore_blocksize(),
|
docstore_blocksize: default_docstore_blocksize(),
|
||||||
docstore_compress_dedicated_thread: true,
|
docstore_compress_dedicated_thread: true,
|
||||||
@@ -283,22 +278,6 @@ impl Default for IndexSettings {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Settings to presort the documents in an index
|
|
||||||
///
|
|
||||||
/// Presorting documents can greatly improve performance
|
|
||||||
/// in some scenarios, by applying top n
|
|
||||||
/// optimizations.
|
|
||||||
#[deprecated(
|
|
||||||
since = "0.22.0",
|
|
||||||
note = "We plan to remove index sorting in `0.23`. If you need index sorting, please comment on the related issue https://github.com/quickwit-oss/tantivy/issues/2352 and explain your use case."
|
|
||||||
)]
|
|
||||||
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
|
|
||||||
pub struct IndexSortByField {
|
|
||||||
/// The field to sort the documents by
|
|
||||||
pub field: String,
|
|
||||||
/// The order to sort the documents by
|
|
||||||
pub order: Order,
|
|
||||||
}
|
|
||||||
/// The order to sort by
|
/// The order to sort by
|
||||||
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
|
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
|
||||||
pub enum Order {
|
pub enum Order {
|
||||||
@@ -417,7 +396,7 @@ mod tests {
|
|||||||
use crate::store::Compressor;
|
use crate::store::Compressor;
|
||||||
#[cfg(feature = "zstd-compression")]
|
#[cfg(feature = "zstd-compression")]
|
||||||
use crate::store::ZstdCompressor;
|
use crate::store::ZstdCompressor;
|
||||||
use crate::{IndexSettings, IndexSortByField, Order};
|
use crate::IndexSettings;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_serialize_metas() {
|
fn test_serialize_metas() {
|
||||||
@@ -427,13 +406,7 @@ mod tests {
|
|||||||
schema_builder.build()
|
schema_builder.build()
|
||||||
};
|
};
|
||||||
let index_metas = IndexMeta {
|
let index_metas = IndexMeta {
|
||||||
index_settings: IndexSettings {
|
index_settings: IndexSettings::default(),
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "text".to_string(),
|
|
||||||
order: Order::Asc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
},
|
|
||||||
segments: Vec::new(),
|
segments: Vec::new(),
|
||||||
schema,
|
schema,
|
||||||
opstamp: 0u64,
|
opstamp: 0u64,
|
||||||
@@ -442,7 +415,7 @@ mod tests {
|
|||||||
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
|
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
json,
|
json,
|
||||||
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
|
r#"{"index_settings":{"docstore_compression":"lz4","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
|
||||||
);
|
);
|
||||||
|
|
||||||
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
|
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
|
||||||
@@ -461,10 +434,6 @@ mod tests {
|
|||||||
};
|
};
|
||||||
let index_metas = IndexMeta {
|
let index_metas = IndexMeta {
|
||||||
index_settings: IndexSettings {
|
index_settings: IndexSettings {
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "text".to_string(),
|
|
||||||
order: Order::Asc,
|
|
||||||
}),
|
|
||||||
docstore_compression: crate::store::Compressor::Zstd(ZstdCompressor {
|
docstore_compression: crate::store::Compressor::Zstd(ZstdCompressor {
|
||||||
compression_level: Some(4),
|
compression_level: Some(4),
|
||||||
}),
|
}),
|
||||||
@@ -479,7 +448,7 @@ mod tests {
|
|||||||
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
|
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
json,
|
json,
|
||||||
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd(compression_level=4)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
|
r#"{"index_settings":{"docstore_compression":"zstd(compression_level=4)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
|
||||||
);
|
);
|
||||||
|
|
||||||
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
|
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
|
||||||
@@ -491,35 +460,35 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
#[cfg(all(feature = "lz4-compression", feature = "zstd-compression"))]
|
#[cfg(all(feature = "lz4-compression", feature = "zstd-compression"))]
|
||||||
fn test_serialize_metas_invalid_comp() {
|
fn test_serialize_metas_invalid_comp() {
|
||||||
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zsstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
|
let json = r#"{"index_settings":{"docstore_compression":"zsstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
|
||||||
|
|
||||||
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
|
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
err.to_string(),
|
err.to_string(),
|
||||||
"unknown variant `zsstd`, expected one of `none`, `lz4`, `zstd`, \
|
"unknown variant `zsstd`, expected one of `none`, `lz4`, `zstd`, \
|
||||||
`zstd(compression_level=5)` at line 1 column 96"
|
`zstd(compression_level=5)` at line 1 column 49"
|
||||||
.to_string()
|
.to_string()
|
||||||
);
|
);
|
||||||
|
|
||||||
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd(bla=10)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
|
let json = r#"{"index_settings":{"docstore_compression":"zstd(bla=10)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
|
||||||
|
|
||||||
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
|
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
err.to_string(),
|
err.to_string(),
|
||||||
"unknown zstd option \"bla\" at line 1 column 103".to_string()
|
"unknown zstd option \"bla\" at line 1 column 56".to_string()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
#[cfg(not(feature = "zstd-compression"))]
|
#[cfg(not(feature = "zstd-compression"))]
|
||||||
fn test_serialize_metas_unsupported_comp() {
|
fn test_serialize_metas_unsupported_comp() {
|
||||||
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
|
let json = r#"{"index_settings":{"docstore_compression":"zstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
|
||||||
|
|
||||||
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
|
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
err.to_string(),
|
err.to_string(),
|
||||||
"unsupported variant `zstd`, please enable Tantivy's `zstd-compression` feature at \
|
"unsupported variant `zstd`, please enable Tantivy's `zstd-compression` feature at \
|
||||||
line 1 column 95"
|
line 1 column 48"
|
||||||
.to_string()
|
.to_string()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -531,7 +500,6 @@ mod tests {
|
|||||||
assert_eq!(
|
assert_eq!(
|
||||||
index_settings,
|
index_settings,
|
||||||
IndexSettings {
|
IndexSettings {
|
||||||
sort_by_field: None,
|
|
||||||
docstore_compression: Compressor::default(),
|
docstore_compression: Compressor::default(),
|
||||||
docstore_compress_dedicated_thread: true,
|
docstore_compress_dedicated_thread: true,
|
||||||
docstore_blocksize: 16_384
|
docstore_blocksize: 16_384
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ mod segment_reader;
|
|||||||
|
|
||||||
pub use self::index::{Index, IndexBuilder};
|
pub use self::index::{Index, IndexBuilder};
|
||||||
pub(crate) use self::index_meta::SegmentMetaInventory;
|
pub(crate) use self::index_meta::SegmentMetaInventory;
|
||||||
pub use self::index_meta::{IndexMeta, IndexSettings, IndexSortByField, Order, SegmentMeta};
|
pub use self::index_meta::{IndexMeta, IndexSettings, Order, SegmentMeta};
|
||||||
pub use self::inverted_index_reader::InvertedIndexReader;
|
pub use self::inverted_index_reader::InvertedIndexReader;
|
||||||
pub use self::segment::Segment;
|
pub use self::segment::Segment;
|
||||||
pub use self::segment_component::SegmentComponent;
|
pub use self::segment_component::SegmentComponent;
|
||||||
|
|||||||
@@ -3,15 +3,12 @@
|
|||||||
|
|
||||||
use common::ReadOnlyBitSet;
|
use common::ReadOnlyBitSet;
|
||||||
|
|
||||||
use super::SegmentWriter;
|
use crate::DocAddress;
|
||||||
use crate::schema::{Field, Schema};
|
|
||||||
use crate::{DocAddress, DocId, IndexSortByField, TantivyError};
|
|
||||||
|
|
||||||
#[derive(Copy, Clone, Eq, PartialEq)]
|
#[derive(Copy, Clone, Eq, PartialEq)]
|
||||||
pub enum MappingType {
|
pub enum MappingType {
|
||||||
Stacked,
|
Stacked,
|
||||||
StackedWithDeletes,
|
StackedWithDeletes,
|
||||||
Shuffled,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Struct to provide mapping from new doc_id to old doc_id and segment.
|
/// Struct to provide mapping from new doc_id to old doc_id and segment.
|
||||||
@@ -46,537 +43,4 @@ impl SegmentDocIdMapping {
|
|||||||
pub(crate) fn iter_old_doc_addrs(&self) -> impl Iterator<Item = DocAddress> + '_ {
|
pub(crate) fn iter_old_doc_addrs(&self) -> impl Iterator<Item = DocAddress> + '_ {
|
||||||
self.new_doc_id_to_old_doc_addr.iter().copied()
|
self.new_doc_id_to_old_doc_addr.iter().copied()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This flags means the segments are simply stacked in the order of their ordinal.
|
|
||||||
/// e.g. [(0, 1), .. (n, 1), (0, 2)..., (m, 2)]
|
|
||||||
///
|
|
||||||
/// The different segment may present some deletes, in which case it is expressed by skipping a
|
|
||||||
/// `DocId`. [(0, 1), (0, 3)] <--- here doc_id=0 and doc_id=1 have been deleted
|
|
||||||
///
|
|
||||||
/// Being trivial is equivalent to having the `new_doc_id_to_old_doc_addr` array sorted.
|
|
||||||
///
|
|
||||||
/// This allows for some optimization.
|
|
||||||
pub(crate) fn is_trivial(&self) -> bool {
|
|
||||||
match self.mapping_type {
|
|
||||||
MappingType::Stacked | MappingType::StackedWithDeletes => true,
|
|
||||||
MappingType::Shuffled => false,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Struct to provide mapping from old doc_id to new doc_id and vice versa within a segment.
|
|
||||||
pub struct DocIdMapping {
|
|
||||||
new_doc_id_to_old: Vec<DocId>,
|
|
||||||
old_doc_id_to_new: Vec<DocId>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl DocIdMapping {
|
|
||||||
pub fn from_new_id_to_old_id(new_doc_id_to_old: Vec<DocId>) -> Self {
|
|
||||||
let max_doc = new_doc_id_to_old.len();
|
|
||||||
let old_max_doc = new_doc_id_to_old
|
|
||||||
.iter()
|
|
||||||
.cloned()
|
|
||||||
.max()
|
|
||||||
.map(|n| n + 1)
|
|
||||||
.unwrap_or(0);
|
|
||||||
let mut old_doc_id_to_new = vec![0; old_max_doc as usize];
|
|
||||||
for i in 0..max_doc {
|
|
||||||
old_doc_id_to_new[new_doc_id_to_old[i] as usize] = i as DocId;
|
|
||||||
}
|
|
||||||
DocIdMapping {
|
|
||||||
new_doc_id_to_old,
|
|
||||||
old_doc_id_to_new,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// returns the new doc_id for the old doc_id
|
|
||||||
pub fn get_new_doc_id(&self, doc_id: DocId) -> DocId {
|
|
||||||
self.old_doc_id_to_new[doc_id as usize]
|
|
||||||
}
|
|
||||||
/// returns the old doc_id for the new doc_id
|
|
||||||
pub fn get_old_doc_id(&self, doc_id: DocId) -> DocId {
|
|
||||||
self.new_doc_id_to_old[doc_id as usize]
|
|
||||||
}
|
|
||||||
/// iterate over old doc_ids in order of the new doc_ids
|
|
||||||
pub fn iter_old_doc_ids(&self) -> impl Iterator<Item = DocId> + Clone + '_ {
|
|
||||||
self.new_doc_id_to_old.iter().cloned()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn old_to_new_ids(&self) -> &[DocId] {
|
|
||||||
&self.old_doc_id_to_new[..]
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Remaps a given array to the new doc ids.
|
|
||||||
pub fn remap<T: Copy>(&self, els: &[T]) -> Vec<T> {
|
|
||||||
self.new_doc_id_to_old
|
|
||||||
.iter()
|
|
||||||
.map(|old_doc| els[*old_doc as usize])
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
pub fn num_new_doc_ids(&self) -> usize {
|
|
||||||
self.new_doc_id_to_old.len()
|
|
||||||
}
|
|
||||||
pub fn num_old_doc_ids(&self) -> usize {
|
|
||||||
self.old_doc_id_to_new.len()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn expect_field_id_for_sort_field(
|
|
||||||
schema: &Schema,
|
|
||||||
sort_by_field: &IndexSortByField,
|
|
||||||
) -> crate::Result<Field> {
|
|
||||||
schema.get_field(&sort_by_field.field).map_err(|_| {
|
|
||||||
TantivyError::InvalidArgument(format!(
|
|
||||||
"field to sort index by not found: {:?}",
|
|
||||||
sort_by_field.field
|
|
||||||
))
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
// Generates a document mapping in the form of [index new doc_id] -> old doc_id
|
|
||||||
// TODO detect if field is already sorted and discard mapping
|
|
||||||
pub(crate) fn get_doc_id_mapping_from_field(
|
|
||||||
sort_by_field: IndexSortByField,
|
|
||||||
segment_writer: &SegmentWriter,
|
|
||||||
) -> crate::Result<DocIdMapping> {
|
|
||||||
let schema = segment_writer.segment_serializer.segment().schema();
|
|
||||||
expect_field_id_for_sort_field(&schema, &sort_by_field)?; // for now expect
|
|
||||||
let new_doc_id_to_old = segment_writer.fast_field_writers.sort_order(
|
|
||||||
sort_by_field.field.as_str(),
|
|
||||||
segment_writer.max_doc(),
|
|
||||||
sort_by_field.order.is_desc(),
|
|
||||||
);
|
|
||||||
// create new doc_id to old doc_id index (used in fast_field_writers)
|
|
||||||
Ok(DocIdMapping::from_new_id_to_old_id(new_doc_id_to_old))
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests_indexsorting {
|
|
||||||
use common::DateTime;
|
|
||||||
|
|
||||||
use crate::collector::TopDocs;
|
|
||||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
|
||||||
use crate::indexer::NoMergePolicy;
|
|
||||||
use crate::query::QueryParser;
|
|
||||||
use crate::schema::*;
|
|
||||||
use crate::{DocAddress, Index, IndexBuilder, IndexSettings, IndexSortByField, Order};
|
|
||||||
|
|
||||||
fn create_test_index(
|
|
||||||
index_settings: Option<IndexSettings>,
|
|
||||||
text_field_options: TextOptions,
|
|
||||||
) -> crate::Result<Index> {
|
|
||||||
let mut schema_builder = Schema::builder();
|
|
||||||
|
|
||||||
let my_text_field = schema_builder.add_text_field("text_field", text_field_options);
|
|
||||||
let my_string_field = schema_builder.add_text_field("string_field", STRING | STORED);
|
|
||||||
let my_number =
|
|
||||||
schema_builder.add_u64_field("my_number", NumericOptions::default().set_fast());
|
|
||||||
|
|
||||||
let multi_numbers =
|
|
||||||
schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast());
|
|
||||||
|
|
||||||
let schema = schema_builder.build();
|
|
||||||
let mut index_builder = Index::builder().schema(schema);
|
|
||||||
if let Some(settings) = index_settings {
|
|
||||||
index_builder = index_builder.settings(settings);
|
|
||||||
}
|
|
||||||
let index = index_builder.create_in_ram()?;
|
|
||||||
|
|
||||||
let mut index_writer = index.writer_for_tests()?;
|
|
||||||
index_writer.add_document(doc!(my_number=>40_u64))?;
|
|
||||||
index_writer.add_document(
|
|
||||||
doc!(my_number=>20_u64, multi_numbers => 5_u64, multi_numbers => 6_u64),
|
|
||||||
)?;
|
|
||||||
index_writer.add_document(doc!(my_number=>100_u64))?;
|
|
||||||
index_writer.add_document(
|
|
||||||
doc!(my_number=>10_u64, my_string_field=> "blublub", my_text_field => "some text"),
|
|
||||||
)?;
|
|
||||||
index_writer.add_document(doc!(my_number=>30_u64, multi_numbers => 3_u64 ))?;
|
|
||||||
index_writer.commit()?;
|
|
||||||
Ok(index)
|
|
||||||
}
|
|
||||||
fn get_text_options() -> TextOptions {
|
|
||||||
TextOptions::default().set_indexing_options(
|
|
||||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::Basic),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
#[test]
|
|
||||||
fn test_sort_index_test_text_field() -> crate::Result<()> {
|
|
||||||
// there are different serializers for different settings in postings/recorder.rs
|
|
||||||
// test remapping for all of them
|
|
||||||
let options = vec![
|
|
||||||
get_text_options(),
|
|
||||||
get_text_options().set_indexing_options(
|
|
||||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
|
|
||||||
),
|
|
||||||
get_text_options().set_indexing_options(
|
|
||||||
TextFieldIndexing::default()
|
|
||||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions),
|
|
||||||
),
|
|
||||||
];
|
|
||||||
|
|
||||||
for option in options {
|
|
||||||
// let options = get_text_options();
|
|
||||||
// no index_sort
|
|
||||||
let index = create_test_index(None, option.clone())?;
|
|
||||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
|
||||||
let searcher = index.reader()?.searcher();
|
|
||||||
|
|
||||||
let query = QueryParser::for_index(&index, vec![my_text_field]).parse_query("text")?;
|
|
||||||
let top_docs: Vec<(f32, DocAddress)> =
|
|
||||||
searcher.search(&query, &TopDocs::with_limit(3))?;
|
|
||||||
assert_eq!(
|
|
||||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
|
|
||||||
vec![3]
|
|
||||||
);
|
|
||||||
|
|
||||||
// sort by field asc
|
|
||||||
let index = create_test_index(
|
|
||||||
Some(IndexSettings {
|
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "my_number".to_string(),
|
|
||||||
order: Order::Asc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
}),
|
|
||||||
option.clone(),
|
|
||||||
)?;
|
|
||||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
|
||||||
let reader = index.reader()?;
|
|
||||||
let searcher = reader.searcher();
|
|
||||||
|
|
||||||
let query = QueryParser::for_index(&index, vec![my_text_field]).parse_query("text")?;
|
|
||||||
let top_docs: Vec<(f32, DocAddress)> =
|
|
||||||
searcher.search(&query, &TopDocs::with_limit(3))?;
|
|
||||||
assert_eq!(
|
|
||||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
|
|
||||||
vec![0]
|
|
||||||
);
|
|
||||||
|
|
||||||
// test new field norm mapping
|
|
||||||
{
|
|
||||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
|
||||||
let fieldnorm_reader = searcher
|
|
||||||
.segment_reader(0)
|
|
||||||
.get_fieldnorms_reader(my_text_field)?;
|
|
||||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 2); // some text
|
|
||||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
|
|
||||||
}
|
|
||||||
// sort by field desc
|
|
||||||
let index = create_test_index(
|
|
||||||
Some(IndexSettings {
|
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "my_number".to_string(),
|
|
||||||
order: Order::Desc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
}),
|
|
||||||
option.clone(),
|
|
||||||
)?;
|
|
||||||
let my_string_field = index.schema().get_field("text_field").unwrap();
|
|
||||||
let searcher = index.reader()?.searcher();
|
|
||||||
|
|
||||||
let query =
|
|
||||||
QueryParser::for_index(&index, vec![my_string_field]).parse_query("text")?;
|
|
||||||
let top_docs: Vec<(f32, DocAddress)> =
|
|
||||||
searcher.search(&query, &TopDocs::with_limit(3))?;
|
|
||||||
assert_eq!(
|
|
||||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
|
|
||||||
vec![4]
|
|
||||||
);
|
|
||||||
// test new field norm mapping
|
|
||||||
{
|
|
||||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
|
||||||
let fieldnorm_reader = searcher
|
|
||||||
.segment_reader(0)
|
|
||||||
.get_fieldnorms_reader(my_text_field)?;
|
|
||||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
|
|
||||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
|
|
||||||
assert_eq!(fieldnorm_reader.fieldnorm(2), 0);
|
|
||||||
assert_eq!(fieldnorm_reader.fieldnorm(3), 0);
|
|
||||||
assert_eq!(fieldnorm_reader.fieldnorm(4), 2); // some text
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
#[test]
|
|
||||||
fn test_sort_index_get_documents() -> crate::Result<()> {
|
|
||||||
// default baseline
|
|
||||||
let index = create_test_index(None, get_text_options())?;
|
|
||||||
let my_string_field = index.schema().get_field("string_field").unwrap();
|
|
||||||
let searcher = index.reader()?.searcher();
|
|
||||||
{
|
|
||||||
assert!(searcher
|
|
||||||
.doc::<TantivyDocument>(DocAddress::new(0, 0))?
|
|
||||||
.get_first(my_string_field)
|
|
||||||
.is_none());
|
|
||||||
assert_eq!(
|
|
||||||
searcher
|
|
||||||
.doc::<TantivyDocument>(DocAddress::new(0, 3))?
|
|
||||||
.get_first(my_string_field)
|
|
||||||
.unwrap()
|
|
||||||
.as_str(),
|
|
||||||
Some("blublub")
|
|
||||||
);
|
|
||||||
}
|
|
||||||
// sort by field asc
|
|
||||||
let index = create_test_index(
|
|
||||||
Some(IndexSettings {
|
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "my_number".to_string(),
|
|
||||||
order: Order::Asc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
}),
|
|
||||||
get_text_options(),
|
|
||||||
)?;
|
|
||||||
let my_string_field = index.schema().get_field("string_field").unwrap();
|
|
||||||
let searcher = index.reader()?.searcher();
|
|
||||||
{
|
|
||||||
assert_eq!(
|
|
||||||
searcher
|
|
||||||
.doc::<TantivyDocument>(DocAddress::new(0, 0))?
|
|
||||||
.get_first(my_string_field)
|
|
||||||
.unwrap()
|
|
||||||
.as_str(),
|
|
||||||
Some("blublub")
|
|
||||||
);
|
|
||||||
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
|
|
||||||
assert!(doc.get_first(my_string_field).is_none());
|
|
||||||
}
|
|
||||||
// sort by field desc
|
|
||||||
let index = create_test_index(
|
|
||||||
Some(IndexSettings {
|
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "my_number".to_string(),
|
|
||||||
order: Order::Desc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
}),
|
|
||||||
get_text_options(),
|
|
||||||
)?;
|
|
||||||
let my_string_field = index.schema().get_field("string_field").unwrap();
|
|
||||||
let searcher = index.reader()?.searcher();
|
|
||||||
{
|
|
||||||
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
|
|
||||||
assert_eq!(
|
|
||||||
doc.get_first(my_string_field).unwrap().as_str(),
|
|
||||||
Some("blublub")
|
|
||||||
);
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_sort_index_test_string_field() -> crate::Result<()> {
|
|
||||||
let index = create_test_index(None, get_text_options())?;
|
|
||||||
let my_string_field = index.schema().get_field("string_field").unwrap();
|
|
||||||
let searcher = index.reader()?.searcher();
|
|
||||||
|
|
||||||
let query = QueryParser::for_index(&index, vec![my_string_field]).parse_query("blublub")?;
|
|
||||||
let top_docs: Vec<(f32, DocAddress)> = searcher.search(&query, &TopDocs::with_limit(3))?;
|
|
||||||
assert_eq!(
|
|
||||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
|
|
||||||
vec![3]
|
|
||||||
);
|
|
||||||
|
|
||||||
let index = create_test_index(
|
|
||||||
Some(IndexSettings {
|
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "my_number".to_string(),
|
|
||||||
order: Order::Asc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
}),
|
|
||||||
get_text_options(),
|
|
||||||
)?;
|
|
||||||
let my_string_field = index.schema().get_field("string_field").unwrap();
|
|
||||||
let reader = index.reader()?;
|
|
||||||
let searcher = reader.searcher();
|
|
||||||
|
|
||||||
let query = QueryParser::for_index(&index, vec![my_string_field]).parse_query("blublub")?;
|
|
||||||
let top_docs: Vec<(f32, DocAddress)> = searcher.search(&query, &TopDocs::with_limit(3))?;
|
|
||||||
assert_eq!(
|
|
||||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
|
|
||||||
vec![0]
|
|
||||||
);
|
|
||||||
|
|
||||||
// test new field norm mapping
|
|
||||||
{
|
|
||||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
|
||||||
let fieldnorm_reader = searcher
|
|
||||||
.segment_reader(0)
|
|
||||||
.get_fieldnorms_reader(my_text_field)?;
|
|
||||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 2); // some text
|
|
||||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
|
|
||||||
}
|
|
||||||
// sort by field desc
|
|
||||||
let index = create_test_index(
|
|
||||||
Some(IndexSettings {
|
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "my_number".to_string(),
|
|
||||||
order: Order::Desc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
}),
|
|
||||||
get_text_options(),
|
|
||||||
)?;
|
|
||||||
let my_string_field = index.schema().get_field("string_field").unwrap();
|
|
||||||
let searcher = index.reader()?.searcher();
|
|
||||||
|
|
||||||
let query = QueryParser::for_index(&index, vec![my_string_field]).parse_query("blublub")?;
|
|
||||||
let top_docs: Vec<(f32, DocAddress)> = searcher.search(&query, &TopDocs::with_limit(3))?;
|
|
||||||
assert_eq!(
|
|
||||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
|
|
||||||
vec![4]
|
|
||||||
);
|
|
||||||
// test new field norm mapping
|
|
||||||
{
|
|
||||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
|
||||||
let fieldnorm_reader = searcher
|
|
||||||
.segment_reader(0)
|
|
||||||
.get_fieldnorms_reader(my_text_field)?;
|
|
||||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
|
|
||||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
|
|
||||||
assert_eq!(fieldnorm_reader.fieldnorm(2), 0);
|
|
||||||
assert_eq!(fieldnorm_reader.fieldnorm(3), 0);
|
|
||||||
assert_eq!(fieldnorm_reader.fieldnorm(4), 2); // some text
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_sort_index_fast_field() -> crate::Result<()> {
|
|
||||||
let index = create_test_index(
|
|
||||||
Some(IndexSettings {
|
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "my_number".to_string(),
|
|
||||||
order: Order::Asc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
}),
|
|
||||||
get_text_options(),
|
|
||||||
)?;
|
|
||||||
assert_eq!(
|
|
||||||
index.settings().sort_by_field.as_ref().unwrap().field,
|
|
||||||
"my_number".to_string()
|
|
||||||
);
|
|
||||||
|
|
||||||
let searcher = index.reader()?.searcher();
|
|
||||||
assert_eq!(searcher.segment_readers().len(), 1);
|
|
||||||
let segment_reader = searcher.segment_reader(0);
|
|
||||||
let fast_fields = segment_reader.fast_fields();
|
|
||||||
|
|
||||||
let fast_field = fast_fields
|
|
||||||
.u64("my_number")
|
|
||||||
.unwrap()
|
|
||||||
.first_or_default_col(999);
|
|
||||||
assert_eq!(fast_field.get_val(0), 10u64);
|
|
||||||
assert_eq!(fast_field.get_val(1), 20u64);
|
|
||||||
assert_eq!(fast_field.get_val(2), 30u64);
|
|
||||||
|
|
||||||
let multifield = fast_fields.u64("multi_numbers").unwrap();
|
|
||||||
let vals: Vec<u64> = multifield.values_for_doc(0u32).collect();
|
|
||||||
assert_eq!(vals, &[] as &[u64]);
|
|
||||||
let vals: Vec<_> = multifield.values_for_doc(1u32).collect();
|
|
||||||
assert_eq!(vals, &[5, 6]);
|
|
||||||
|
|
||||||
let vals: Vec<_> = multifield.values_for_doc(2u32).collect();
|
|
||||||
assert_eq!(vals, &[3]);
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_with_sort_by_date_field() -> crate::Result<()> {
|
|
||||||
let mut schema_builder = Schema::builder();
|
|
||||||
let date_field = schema_builder.add_date_field("date", INDEXED | STORED | FAST);
|
|
||||||
let schema = schema_builder.build();
|
|
||||||
|
|
||||||
let settings = IndexSettings {
|
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "date".to_string(),
|
|
||||||
order: Order::Desc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
};
|
|
||||||
|
|
||||||
let index = Index::builder()
|
|
||||||
.schema(schema)
|
|
||||||
.settings(settings)
|
|
||||||
.create_in_ram()?;
|
|
||||||
let mut index_writer = index.writer_for_tests()?;
|
|
||||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
|
||||||
|
|
||||||
index_writer.add_document(doc!(
|
|
||||||
date_field => DateTime::from_timestamp_secs(1000),
|
|
||||||
))?;
|
|
||||||
index_writer.add_document(doc!(
|
|
||||||
date_field => DateTime::from_timestamp_secs(999),
|
|
||||||
))?;
|
|
||||||
index_writer.add_document(doc!(
|
|
||||||
date_field => DateTime::from_timestamp_secs(1001),
|
|
||||||
))?;
|
|
||||||
index_writer.commit()?;
|
|
||||||
|
|
||||||
let searcher = index.reader()?.searcher();
|
|
||||||
assert_eq!(searcher.segment_readers().len(), 1);
|
|
||||||
let segment_reader = searcher.segment_reader(0);
|
|
||||||
let fast_fields = segment_reader.fast_fields();
|
|
||||||
|
|
||||||
let fast_field = fast_fields
|
|
||||||
.date("date")
|
|
||||||
.unwrap()
|
|
||||||
.first_or_default_col(DateTime::from_timestamp_secs(0));
|
|
||||||
assert_eq!(fast_field.get_val(0), DateTime::from_timestamp_secs(1001));
|
|
||||||
assert_eq!(fast_field.get_val(1), DateTime::from_timestamp_secs(1000));
|
|
||||||
assert_eq!(fast_field.get_val(2), DateTime::from_timestamp_secs(999));
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_doc_mapping() {
|
|
||||||
let doc_mapping = DocIdMapping::from_new_id_to_old_id(vec![3, 2, 5]);
|
|
||||||
assert_eq!(doc_mapping.get_old_doc_id(0), 3);
|
|
||||||
assert_eq!(doc_mapping.get_old_doc_id(1), 2);
|
|
||||||
assert_eq!(doc_mapping.get_old_doc_id(2), 5);
|
|
||||||
assert_eq!(doc_mapping.get_new_doc_id(0), 0);
|
|
||||||
assert_eq!(doc_mapping.get_new_doc_id(1), 0);
|
|
||||||
assert_eq!(doc_mapping.get_new_doc_id(2), 1);
|
|
||||||
assert_eq!(doc_mapping.get_new_doc_id(3), 0);
|
|
||||||
assert_eq!(doc_mapping.get_new_doc_id(4), 0);
|
|
||||||
assert_eq!(doc_mapping.get_new_doc_id(5), 2);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_doc_mapping_remap() {
|
|
||||||
let doc_mapping = DocIdMapping::from_new_id_to_old_id(vec![2, 8, 3]);
|
|
||||||
assert_eq!(
|
|
||||||
&doc_mapping.remap(&[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000]),
|
|
||||||
&[2000, 8000, 3000]
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_text_sort() -> crate::Result<()> {
|
|
||||||
let mut schema_builder = SchemaBuilder::new();
|
|
||||||
schema_builder.add_text_field("id", STRING | FAST | STORED);
|
|
||||||
schema_builder.add_text_field("name", TEXT | STORED);
|
|
||||||
|
|
||||||
let resp = IndexBuilder::new()
|
|
||||||
.schema(schema_builder.build())
|
|
||||||
.settings(IndexSettings {
|
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "id".to_string(),
|
|
||||||
order: Order::Asc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
})
|
|
||||||
.create_in_ram();
|
|
||||||
assert!(resp
|
|
||||||
.unwrap_err()
|
|
||||||
.to_string()
|
|
||||||
.contains("Unsupported field type"));
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -803,7 +803,7 @@ mod tests {
|
|||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{HashMap, HashSet};
|
||||||
use std::net::Ipv6Addr;
|
use std::net::Ipv6Addr;
|
||||||
|
|
||||||
use columnar::{Cardinality, Column, MonotonicallyMappableToU128};
|
use columnar::{Column, MonotonicallyMappableToU128};
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use proptest::prop_oneof;
|
use proptest::prop_oneof;
|
||||||
|
|
||||||
@@ -813,15 +813,16 @@ mod tests {
|
|||||||
use crate::error::*;
|
use crate::error::*;
|
||||||
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
|
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
|
||||||
use crate::indexer::NoMergePolicy;
|
use crate::indexer::NoMergePolicy;
|
||||||
use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery};
|
use crate::query::{QueryParser, TermQuery};
|
||||||
use crate::schema::{
|
use crate::schema::{
|
||||||
self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, Schema,
|
self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, JsonObjectOptions,
|
||||||
TextFieldIndexing, TextOptions, Value, FAST, INDEXED, STORED, STRING, TEXT,
|
NumericOptions, Schema, TextFieldIndexing, TextOptions, Value, FAST, INDEXED, STORED,
|
||||||
|
STRING, TEXT,
|
||||||
};
|
};
|
||||||
use crate::store::DOCSTORE_CACHE_CAPACITY;
|
use crate::store::DOCSTORE_CACHE_CAPACITY;
|
||||||
use crate::{
|
use crate::{
|
||||||
DateTime, DocAddress, Index, IndexSettings, IndexSortByField, IndexWriter, Order,
|
DateTime, DocAddress, Index, IndexSettings, IndexWriter, ReloadPolicy, TantivyDocument,
|
||||||
ReloadPolicy, TantivyDocument, Term,
|
Term,
|
||||||
};
|
};
|
||||||
|
|
||||||
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \
|
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \
|
||||||
@@ -1462,116 +1463,6 @@ mod tests {
|
|||||||
assert!(text_fast_field.term_ords(1).eq([1].into_iter()));
|
assert!(text_fast_field.term_ords(1).eq([1].into_iter()));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_delete_with_sort_by_field() -> crate::Result<()> {
|
|
||||||
let mut schema_builder = schema::Schema::builder();
|
|
||||||
let id_field = schema_builder.add_u64_field("id", INDEXED | schema::STORED | FAST);
|
|
||||||
let schema = schema_builder.build();
|
|
||||||
|
|
||||||
let settings = IndexSettings {
|
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "id".to_string(),
|
|
||||||
order: Order::Desc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
};
|
|
||||||
|
|
||||||
let index = Index::builder()
|
|
||||||
.schema(schema)
|
|
||||||
.settings(settings)
|
|
||||||
.create_in_ram()?;
|
|
||||||
let index_reader = index.reader()?;
|
|
||||||
let mut index_writer = index.writer_for_tests()?;
|
|
||||||
|
|
||||||
// create and delete docs in same commit
|
|
||||||
for id in 0u64..5u64 {
|
|
||||||
index_writer.add_document(doc!(id_field => id))?;
|
|
||||||
}
|
|
||||||
for id in 2u64..4u64 {
|
|
||||||
index_writer.delete_term(Term::from_field_u64(id_field, id));
|
|
||||||
}
|
|
||||||
for id in 5u64..10u64 {
|
|
||||||
index_writer.add_document(doc!(id_field => id))?;
|
|
||||||
}
|
|
||||||
index_writer.commit()?;
|
|
||||||
index_reader.reload()?;
|
|
||||||
|
|
||||||
let searcher = index_reader.searcher();
|
|
||||||
assert_eq!(searcher.segment_readers().len(), 1);
|
|
||||||
|
|
||||||
let segment_reader = searcher.segment_reader(0);
|
|
||||||
assert_eq!(segment_reader.num_docs(), 8);
|
|
||||||
assert_eq!(segment_reader.max_doc(), 10);
|
|
||||||
let fast_field_reader = segment_reader.fast_fields().u64("id")?;
|
|
||||||
|
|
||||||
let in_order_alive_ids: Vec<u64> = segment_reader
|
|
||||||
.doc_ids_alive()
|
|
||||||
.flat_map(|doc| fast_field_reader.values_for_doc(doc))
|
|
||||||
.collect();
|
|
||||||
assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 1, 0]);
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_delete_query_with_sort_by_field() -> crate::Result<()> {
|
|
||||||
let mut schema_builder = schema::Schema::builder();
|
|
||||||
let id_field = schema_builder.add_u64_field("id", INDEXED | schema::STORED | FAST);
|
|
||||||
let schema = schema_builder.build();
|
|
||||||
|
|
||||||
let settings = IndexSettings {
|
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "id".to_string(),
|
|
||||||
order: Order::Desc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
};
|
|
||||||
|
|
||||||
let index = Index::builder()
|
|
||||||
.schema(schema)
|
|
||||||
.settings(settings)
|
|
||||||
.create_in_ram()?;
|
|
||||||
let index_reader = index.reader()?;
|
|
||||||
let mut index_writer = index.writer_for_tests()?;
|
|
||||||
|
|
||||||
// create and delete docs in same commit
|
|
||||||
for id in 0u64..5u64 {
|
|
||||||
index_writer.add_document(doc!(id_field => id))?;
|
|
||||||
}
|
|
||||||
for id in 1u64..4u64 {
|
|
||||||
let term = Term::from_field_u64(id_field, id);
|
|
||||||
let not_term = Term::from_field_u64(id_field, 2);
|
|
||||||
let term = Box::new(TermQuery::new(term, Default::default()));
|
|
||||||
let not_term = Box::new(TermQuery::new(not_term, Default::default()));
|
|
||||||
|
|
||||||
let query: BooleanQuery = vec![
|
|
||||||
(Occur::Must, term as Box<dyn Query>),
|
|
||||||
(Occur::MustNot, not_term as Box<dyn Query>),
|
|
||||||
]
|
|
||||||
.into();
|
|
||||||
|
|
||||||
index_writer.delete_query(Box::new(query))?;
|
|
||||||
}
|
|
||||||
for id in 5u64..10u64 {
|
|
||||||
index_writer.add_document(doc!(id_field => id))?;
|
|
||||||
}
|
|
||||||
index_writer.commit()?;
|
|
||||||
index_reader.reload()?;
|
|
||||||
|
|
||||||
let searcher = index_reader.searcher();
|
|
||||||
assert_eq!(searcher.segment_readers().len(), 1);
|
|
||||||
|
|
||||||
let segment_reader = searcher.segment_reader(0);
|
|
||||||
assert_eq!(segment_reader.num_docs(), 8);
|
|
||||||
assert_eq!(segment_reader.max_doc(), 10);
|
|
||||||
let fast_field_reader = segment_reader.fast_fields().u64("id")?;
|
|
||||||
let in_order_alive_ids: Vec<u64> = segment_reader
|
|
||||||
.doc_ids_alive()
|
|
||||||
.flat_map(|doc| fast_field_reader.values_for_doc(doc))
|
|
||||||
.collect();
|
|
||||||
assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 2, 0]);
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
enum IndexingOp {
|
enum IndexingOp {
|
||||||
AddMultipleDoc {
|
AddMultipleDoc {
|
||||||
@@ -1683,11 +1574,11 @@ mod tests {
|
|||||||
deleted_ids.remove(id);
|
deleted_ids.remove(id);
|
||||||
}
|
}
|
||||||
IndexingOp::DeleteDoc { id } => {
|
IndexingOp::DeleteDoc { id } => {
|
||||||
existing_ids.remove(&id);
|
existing_ids.remove(id);
|
||||||
deleted_ids.insert(*id);
|
deleted_ids.insert(*id);
|
||||||
}
|
}
|
||||||
IndexingOp::DeleteDocQuery { id } => {
|
IndexingOp::DeleteDocQuery { id } => {
|
||||||
existing_ids.remove(&id);
|
existing_ids.remove(id);
|
||||||
deleted_ids.insert(*id);
|
deleted_ids.insert(*id);
|
||||||
}
|
}
|
||||||
_ => {}
|
_ => {}
|
||||||
@@ -1718,11 +1609,7 @@ mod tests {
|
|||||||
id_list
|
id_list
|
||||||
}
|
}
|
||||||
|
|
||||||
fn test_operation_strategy(
|
fn test_operation_strategy(ops: &[IndexingOp], force_end_merge: bool) -> crate::Result<Index> {
|
||||||
ops: &[IndexingOp],
|
|
||||||
sort_index: bool,
|
|
||||||
force_end_merge: bool,
|
|
||||||
) -> crate::Result<Index> {
|
|
||||||
let mut schema_builder = schema::Schema::builder();
|
let mut schema_builder = schema::Schema::builder();
|
||||||
let json_field = schema_builder.add_json_field("json", FAST | TEXT | STORED);
|
let json_field = schema_builder.add_json_field("json", FAST | TEXT | STORED);
|
||||||
let ip_field = schema_builder.add_ip_addr_field("ip", FAST | INDEXED | STORED);
|
let ip_field = schema_builder.add_ip_addr_field("ip", FAST | INDEXED | STORED);
|
||||||
@@ -1758,15 +1645,7 @@ mod tests {
|
|||||||
);
|
);
|
||||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let settings = if sort_index {
|
let settings = {
|
||||||
IndexSettings {
|
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "id_opt".to_string(),
|
|
||||||
order: Order::Asc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
IndexSettings {
|
IndexSettings {
|
||||||
..Default::default()
|
..Default::default()
|
||||||
}
|
}
|
||||||
@@ -2329,78 +2208,13 @@ mod tests {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Test if index property is in sort order
|
|
||||||
if sort_index {
|
|
||||||
// load all id_opt in each segment and check they are in order
|
|
||||||
|
|
||||||
for reader in searcher.segment_readers() {
|
|
||||||
let (ff_reader, _) = reader.fast_fields().u64_lenient("id_opt").unwrap().unwrap();
|
|
||||||
let mut ids_in_segment: Vec<u64> = Vec::new();
|
|
||||||
|
|
||||||
for doc in 0..reader.num_docs() {
|
|
||||||
ids_in_segment.extend(ff_reader.values_for_doc(doc));
|
|
||||||
}
|
|
||||||
|
|
||||||
assert!(is_sorted(&ids_in_segment));
|
|
||||||
|
|
||||||
fn is_sorted<T>(data: &[T]) -> bool
|
|
||||||
where T: Ord {
|
|
||||||
data.windows(2).all(|w| w[0] <= w[1])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(index)
|
Ok(index)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_fast_field_range() {
|
fn test_fast_field_range() {
|
||||||
let ops: Vec<_> = (0..1000).map(|id| IndexingOp::add(id)).collect();
|
let ops: Vec<_> = (0..1000).map(IndexingOp::add).collect();
|
||||||
assert!(test_operation_strategy(&ops, false, true).is_ok());
|
assert!(test_operation_strategy(&ops, true).is_ok());
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_sort_index_on_opt_field_regression() {
|
|
||||||
assert!(test_operation_strategy(
|
|
||||||
&[
|
|
||||||
IndexingOp::add(81),
|
|
||||||
IndexingOp::add(70),
|
|
||||||
IndexingOp::DeleteDoc { id: 70 }
|
|
||||||
],
|
|
||||||
true,
|
|
||||||
false
|
|
||||||
)
|
|
||||||
.is_ok());
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_simple_multiple_doc() {
|
|
||||||
assert!(test_operation_strategy(
|
|
||||||
&[
|
|
||||||
IndexingOp::AddMultipleDoc {
|
|
||||||
id: 7,
|
|
||||||
num_docs: 800,
|
|
||||||
value: IndexValue::U64(0),
|
|
||||||
},
|
|
||||||
IndexingOp::AddMultipleDoc {
|
|
||||||
id: 92,
|
|
||||||
num_docs: 800,
|
|
||||||
value: IndexValue::U64(0),
|
|
||||||
},
|
|
||||||
IndexingOp::AddMultipleDoc {
|
|
||||||
id: 30,
|
|
||||||
num_docs: 800,
|
|
||||||
value: IndexValue::U64(0),
|
|
||||||
},
|
|
||||||
IndexingOp::AddMultipleDoc {
|
|
||||||
id: 33,
|
|
||||||
num_docs: 800,
|
|
||||||
value: IndexValue::U64(0),
|
|
||||||
},
|
|
||||||
],
|
|
||||||
true,
|
|
||||||
false
|
|
||||||
)
|
|
||||||
.is_ok());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -2414,7 +2228,6 @@ mod tests {
|
|||||||
IndexingOp::Commit,
|
IndexingOp::Commit,
|
||||||
IndexingOp::Merge
|
IndexingOp::Merge
|
||||||
],
|
],
|
||||||
true,
|
|
||||||
false
|
false
|
||||||
)
|
)
|
||||||
.is_ok());
|
.is_ok());
|
||||||
@@ -2431,7 +2244,6 @@ mod tests {
|
|||||||
IndexingOp::add(1),
|
IndexingOp::add(1),
|
||||||
IndexingOp::Commit,
|
IndexingOp::Commit,
|
||||||
],
|
],
|
||||||
false,
|
|
||||||
true
|
true
|
||||||
)
|
)
|
||||||
.is_ok());
|
.is_ok());
|
||||||
@@ -2439,184 +2251,48 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_minimal_sort_force_end_merge() {
|
fn test_minimal_sort_force_end_merge() {
|
||||||
assert!(test_operation_strategy(
|
assert!(
|
||||||
&[IndexingOp::add(23), IndexingOp::add(13),],
|
test_operation_strategy(&[IndexingOp::add(23), IndexingOp::add(13),], false).is_ok()
|
||||||
false,
|
);
|
||||||
false
|
|
||||||
)
|
|
||||||
.is_ok());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_minimal_sort() {
|
fn test_minimal_no_force_end_merge() {
|
||||||
let mut schema_builder = Schema::builder();
|
|
||||||
let val = schema_builder.add_u64_field("val", FAST);
|
|
||||||
let id = schema_builder.add_u64_field("id", FAST);
|
|
||||||
let schema = schema_builder.build();
|
|
||||||
let settings = IndexSettings {
|
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "id".to_string(),
|
|
||||||
order: Order::Asc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
};
|
|
||||||
let index = Index::builder()
|
|
||||||
.schema(schema)
|
|
||||||
.settings(settings)
|
|
||||||
.create_in_ram()
|
|
||||||
.unwrap();
|
|
||||||
let mut writer = index.writer_for_tests().unwrap();
|
|
||||||
writer
|
|
||||||
.add_document(doc!(id=> 3u64, val=>4u64, val=>4u64))
|
|
||||||
.unwrap();
|
|
||||||
writer
|
|
||||||
.add_document(doc!(id=> 2u64, val=>2u64, val=>2u64))
|
|
||||||
.unwrap();
|
|
||||||
writer
|
|
||||||
.add_document(doc!(id=> 1u64, val=>1u64, val=>1u64))
|
|
||||||
.unwrap();
|
|
||||||
writer.commit().unwrap();
|
|
||||||
let reader = index.reader().unwrap();
|
|
||||||
let searcher = reader.searcher();
|
|
||||||
let segment_reader = searcher.segment_reader(0);
|
|
||||||
let id_col: Column = segment_reader
|
|
||||||
.fast_fields()
|
|
||||||
.column_opt("id")
|
|
||||||
.unwrap()
|
|
||||||
.unwrap();
|
|
||||||
let val_col: Column = segment_reader
|
|
||||||
.fast_fields()
|
|
||||||
.column_opt("val")
|
|
||||||
.unwrap()
|
|
||||||
.unwrap();
|
|
||||||
assert_eq!(id_col.get_cardinality(), Cardinality::Full);
|
|
||||||
assert_eq!(val_col.get_cardinality(), Cardinality::Multivalued);
|
|
||||||
assert_eq!(id_col.first(0u32), Some(1u64));
|
|
||||||
assert_eq!(id_col.first(1u32), Some(2u64));
|
|
||||||
assert!(val_col.values_for_doc(0u32).eq([1u64, 1u64].into_iter()));
|
|
||||||
assert!(val_col.values_for_doc(1u32).eq([2u64, 2u64].into_iter()));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_minimal_sort_force_end_merge_with_delete() {
|
|
||||||
assert!(test_operation_strategy(
|
assert!(test_operation_strategy(
|
||||||
&[
|
&[
|
||||||
IndexingOp::add(23),
|
IndexingOp::add(23),
|
||||||
IndexingOp::add(13),
|
IndexingOp::add(13),
|
||||||
IndexingOp::DeleteDoc { id: 13 }
|
IndexingOp::DeleteDoc { id: 13 }
|
||||||
],
|
],
|
||||||
true,
|
|
||||||
true
|
|
||||||
)
|
|
||||||
.is_ok());
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_minimal_no_sort_no_force_end_merge() {
|
|
||||||
assert!(test_operation_strategy(
|
|
||||||
&[
|
|
||||||
IndexingOp::add(23),
|
|
||||||
IndexingOp::add(13),
|
|
||||||
IndexingOp::DeleteDoc { id: 13 }
|
|
||||||
],
|
|
||||||
false,
|
|
||||||
false
|
false
|
||||||
)
|
)
|
||||||
.is_ok());
|
.is_ok());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_minimal_sort_merge() {
|
|
||||||
assert!(test_operation_strategy(&[IndexingOp::add(3),], true, true).is_ok());
|
|
||||||
}
|
|
||||||
|
|
||||||
use proptest::prelude::*;
|
use proptest::prelude::*;
|
||||||
|
|
||||||
proptest! {
|
proptest! {
|
||||||
|
|
||||||
#![proptest_config(ProptestConfig::with_cases(20))]
|
#![proptest_config(ProptestConfig::with_cases(20))]
|
||||||
#[test]
|
#[test]
|
||||||
fn test_delete_with_sort_proptest_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
|
fn test_delete_proptest_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
|
||||||
assert!(test_operation_strategy(&ops[..], true, false).is_ok());
|
assert!(test_operation_strategy(&ops[..], false).is_ok());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_delete_without_sort_proptest_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
|
fn test_delete_proptest_with_merge_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
|
||||||
assert!(test_operation_strategy(&ops[..], false, false).is_ok());
|
assert!(test_operation_strategy(&ops[..], true).is_ok());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_delete_with_sort_proptest_with_merge_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
|
fn test_delete_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
|
||||||
assert!(test_operation_strategy(&ops[..], true, true).is_ok());
|
assert!(test_operation_strategy(&ops[..], false).is_ok());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_delete_without_sort_proptest_with_merge_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
|
fn test_delete_proptest_with_merge(ops in proptest::collection::vec(balanced_operation_strategy(), 1..100)) {
|
||||||
assert!(test_operation_strategy(&ops[..], false, true).is_ok());}
|
assert!(test_operation_strategy(&ops[..], true).is_ok());
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_delete_with_sort_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
|
|
||||||
assert!(test_operation_strategy(&ops[..], true, false).is_ok());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_delete_without_sort_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
|
|
||||||
assert!(test_operation_strategy(&ops[..], false, false).is_ok());
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_delete_with_sort_proptest_with_merge(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
|
|
||||||
assert!(test_operation_strategy(&ops[..], true, true).is_ok());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_delete_without_sort_proptest_with_merge(ops in proptest::collection::vec(balanced_operation_strategy(), 1..100)) {
|
|
||||||
assert!(test_operation_strategy(&ops[..], false, true).is_ok());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_delete_with_sort_by_field_last_opstamp_is_not_max() -> crate::Result<()> {
|
|
||||||
let mut schema_builder = schema::Schema::builder();
|
|
||||||
let sort_by_field = schema_builder.add_u64_field("sort_by", FAST);
|
|
||||||
let id_field = schema_builder.add_u64_field("id", INDEXED);
|
|
||||||
let schema = schema_builder.build();
|
|
||||||
|
|
||||||
let settings = IndexSettings {
|
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "sort_by".to_string(),
|
|
||||||
order: Order::Asc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
};
|
|
||||||
|
|
||||||
let index = Index::builder()
|
|
||||||
.schema(schema)
|
|
||||||
.settings(settings)
|
|
||||||
.create_in_ram()?;
|
|
||||||
let mut index_writer = index.writer_for_tests()?;
|
|
||||||
|
|
||||||
// We add a doc...
|
|
||||||
index_writer.add_document(doc!(sort_by_field => 2u64, id_field => 0u64))?;
|
|
||||||
// And remove it.
|
|
||||||
index_writer.delete_term(Term::from_field_u64(id_field, 0u64));
|
|
||||||
// We add another doc.
|
|
||||||
index_writer.add_document(doc!(sort_by_field=>1u64, id_field => 0u64))?;
|
|
||||||
|
|
||||||
// The expected result is a segment with
|
|
||||||
// maxdoc = 2
|
|
||||||
// numdoc = 1.
|
|
||||||
index_writer.commit()?;
|
|
||||||
|
|
||||||
let searcher = index.reader()?.searcher();
|
|
||||||
assert_eq!(searcher.segment_readers().len(), 1);
|
|
||||||
|
|
||||||
let segment_reader = searcher.segment_reader(0);
|
|
||||||
assert_eq!(segment_reader.max_doc(), 2);
|
|
||||||
assert_eq!(segment_reader.num_docs(), 1);
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -2633,7 +2309,7 @@ mod tests {
|
|||||||
IndexingOp::add(4),
|
IndexingOp::add(4),
|
||||||
Commit,
|
Commit,
|
||||||
];
|
];
|
||||||
test_operation_strategy(&ops[..], false, true).unwrap();
|
test_operation_strategy(&ops[..], true).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -2646,7 +2322,7 @@ mod tests {
|
|||||||
Commit,
|
Commit,
|
||||||
Merge,
|
Merge,
|
||||||
];
|
];
|
||||||
test_operation_strategy(&ops[..], false, true).unwrap();
|
test_operation_strategy(&ops[..], true).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -2658,7 +2334,7 @@ mod tests {
|
|||||||
IndexingOp::add(13),
|
IndexingOp::add(13),
|
||||||
Commit,
|
Commit,
|
||||||
];
|
];
|
||||||
test_operation_strategy(&ops[..], false, true).unwrap();
|
test_operation_strategy(&ops[..], true).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -2669,7 +2345,7 @@ mod tests {
|
|||||||
IndexingOp::add(9),
|
IndexingOp::add(9),
|
||||||
IndexingOp::add(10),
|
IndexingOp::add(10),
|
||||||
];
|
];
|
||||||
test_operation_strategy(&ops[..], false, false).unwrap();
|
test_operation_strategy(&ops[..], false).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -2696,7 +2372,6 @@ mod tests {
|
|||||||
IndexingOp::Commit,
|
IndexingOp::Commit,
|
||||||
IndexingOp::Commit
|
IndexingOp::Commit
|
||||||
],
|
],
|
||||||
false,
|
|
||||||
false
|
false
|
||||||
)
|
)
|
||||||
.is_ok());
|
.is_ok());
|
||||||
@@ -2704,11 +2379,11 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_bug_1617_2() {
|
fn test_bug_1617_2() {
|
||||||
assert!(test_operation_strategy(
|
test_operation_strategy(
|
||||||
&[
|
&[
|
||||||
IndexingOp::AddDoc {
|
IndexingOp::AddDoc {
|
||||||
id: 13,
|
id: 13,
|
||||||
value: Default::default()
|
value: Default::default(),
|
||||||
},
|
},
|
||||||
IndexingOp::DeleteDoc { id: 13 },
|
IndexingOp::DeleteDoc { id: 13 },
|
||||||
IndexingOp::Commit,
|
IndexingOp::Commit,
|
||||||
@@ -2716,10 +2391,9 @@ mod tests {
|
|||||||
IndexingOp::Commit,
|
IndexingOp::Commit,
|
||||||
IndexingOp::Merge,
|
IndexingOp::Merge,
|
||||||
],
|
],
|
||||||
false,
|
true,
|
||||||
true
|
|
||||||
)
|
)
|
||||||
.is_ok());
|
.unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -2817,4 +2491,46 @@ mod tests {
|
|||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_bug_2442_reserved_character_fast_field() -> crate::Result<()> {
|
||||||
|
let mut schema_builder = schema::Schema::builder();
|
||||||
|
let json_field = schema_builder.add_json_field("json", FAST | TEXT);
|
||||||
|
|
||||||
|
let schema = schema_builder.build();
|
||||||
|
let index = Index::builder().schema(schema).create_in_ram()?;
|
||||||
|
let mut index_writer = index.writer_for_tests()?;
|
||||||
|
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||||
|
|
||||||
|
index_writer
|
||||||
|
.add_document(doc!(
|
||||||
|
json_field=>json!({"\u{0000}B":"1"})
|
||||||
|
))
|
||||||
|
.unwrap();
|
||||||
|
index_writer
|
||||||
|
.add_document(doc!(
|
||||||
|
json_field=>json!({" A":"1"})
|
||||||
|
))
|
||||||
|
.unwrap();
|
||||||
|
index_writer.commit()?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_bug_2442_reserved_character_columnar() -> crate::Result<()> {
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
let options = JsonObjectOptions::from(FAST).set_expand_dots_enabled();
|
||||||
|
let field = schema_builder.add_json_field("json", options);
|
||||||
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
|
let mut index_writer = index.writer_for_tests().unwrap();
|
||||||
|
index_writer
|
||||||
|
.add_document(doc!(field=>json!({"\u{0000}": "A"})))
|
||||||
|
.unwrap();
|
||||||
|
index_writer
|
||||||
|
.add_document(doc!(field=>json!({format!("\u{0000}\u{0000}"): "A"})))
|
||||||
|
.unwrap();
|
||||||
|
index_writer.commit().unwrap();
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
147
src/indexer/merge_index_test.rs
Normal file
147
src/indexer/merge_index_test.rs
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use crate::collector::TopDocs;
|
||||||
|
use crate::fastfield::AliveBitSet;
|
||||||
|
use crate::index::Index;
|
||||||
|
use crate::postings::Postings;
|
||||||
|
use crate::query::QueryParser;
|
||||||
|
use crate::schema::{
|
||||||
|
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
|
||||||
|
TextFieldIndexing, TextOptions,
|
||||||
|
};
|
||||||
|
use crate::{DocAddress, DocSet, IndexSettings, IndexWriter, Term};
|
||||||
|
|
||||||
|
fn create_test_index(index_settings: Option<IndexSettings>) -> crate::Result<Index> {
|
||||||
|
let mut schema_builder = schema::Schema::builder();
|
||||||
|
let int_options = NumericOptions::default()
|
||||||
|
.set_fast()
|
||||||
|
.set_stored()
|
||||||
|
.set_indexed();
|
||||||
|
let int_field = schema_builder.add_u64_field("intval", int_options);
|
||||||
|
|
||||||
|
let bytes_options = BytesOptions::default().set_fast().set_indexed();
|
||||||
|
let bytes_field = schema_builder.add_bytes_field("bytes", bytes_options);
|
||||||
|
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||||
|
|
||||||
|
let multi_numbers =
|
||||||
|
schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast());
|
||||||
|
let text_field_options = TextOptions::default()
|
||||||
|
.set_indexing_options(
|
||||||
|
TextFieldIndexing::default()
|
||||||
|
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
|
||||||
|
)
|
||||||
|
.set_stored();
|
||||||
|
let text_field = schema_builder.add_text_field("text_field", text_field_options);
|
||||||
|
let schema = schema_builder.build();
|
||||||
|
|
||||||
|
let mut index_builder = Index::builder().schema(schema);
|
||||||
|
if let Some(settings) = index_settings {
|
||||||
|
index_builder = index_builder.settings(settings);
|
||||||
|
}
|
||||||
|
let index = index_builder.create_in_ram()?;
|
||||||
|
|
||||||
|
{
|
||||||
|
let mut index_writer = index.writer_for_tests()?;
|
||||||
|
|
||||||
|
// segment 1 - range 1-3
|
||||||
|
index_writer.add_document(doc!(int_field=>1_u64))?;
|
||||||
|
index_writer.add_document(
|
||||||
|
doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")),
|
||||||
|
)?;
|
||||||
|
index_writer.add_document(
|
||||||
|
doc!(int_field=>1_u64, text_field=> "deleteme", text_field => "ok text more text"),
|
||||||
|
)?;
|
||||||
|
index_writer.add_document(
|
||||||
|
doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64, text_field => "ok text more text"),
|
||||||
|
)?;
|
||||||
|
|
||||||
|
index_writer.commit()?;
|
||||||
|
index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64))?;
|
||||||
|
|
||||||
|
let in_val = 1u64;
|
||||||
|
index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme" , text_field => "ok text more text", facet_field=> Facet::from("/book/crime")))?;
|
||||||
|
index_writer.commit()?;
|
||||||
|
let int_vals = [10u64, 5];
|
||||||
|
index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1]
|
||||||
|
doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
|
||||||
|
)?;
|
||||||
|
index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"))?;
|
||||||
|
index_writer.add_document(
|
||||||
|
doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5],text_field => "the biggest num")
|
||||||
|
)?;
|
||||||
|
|
||||||
|
index_writer.delete_term(Term::from_field_text(text_field, "deleteme"));
|
||||||
|
index_writer.commit()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Merging the segments
|
||||||
|
{
|
||||||
|
let segment_ids = index.searchable_segment_ids()?;
|
||||||
|
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||||
|
index_writer.merge(&segment_ids).wait()?;
|
||||||
|
index_writer.wait_merging_threads()?;
|
||||||
|
}
|
||||||
|
Ok(index)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_merge_index() {
|
||||||
|
let index = create_test_index(Some(IndexSettings {
|
||||||
|
..Default::default()
|
||||||
|
}))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let reader = index.reader().unwrap();
|
||||||
|
let searcher = reader.searcher();
|
||||||
|
assert_eq!(searcher.segment_readers().len(), 1);
|
||||||
|
let segment_reader = searcher.segment_readers().last().unwrap();
|
||||||
|
|
||||||
|
let searcher = index.reader().unwrap().searcher();
|
||||||
|
{
|
||||||
|
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||||
|
|
||||||
|
let do_search = |term: &str| {
|
||||||
|
let query = QueryParser::for_index(&index, vec![my_text_field])
|
||||||
|
.parse_query(term)
|
||||||
|
.unwrap();
|
||||||
|
let top_docs: Vec<(f32, DocAddress)> =
|
||||||
|
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
|
||||||
|
|
||||||
|
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
|
||||||
|
};
|
||||||
|
|
||||||
|
assert_eq!(do_search("some"), vec![1]);
|
||||||
|
assert_eq!(do_search("blubber"), vec![3]);
|
||||||
|
assert_eq!(do_search("biggest"), vec![4]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// postings file
|
||||||
|
{
|
||||||
|
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||||
|
let term_a = Term::from_field_text(my_text_field, "text");
|
||||||
|
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
|
||||||
|
let mut postings = inverted_index
|
||||||
|
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(postings.doc_freq(), 2);
|
||||||
|
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
|
||||||
|
assert_eq!(
|
||||||
|
postings.doc_freq_given_deletes(
|
||||||
|
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
|
||||||
|
),
|
||||||
|
2
|
||||||
|
);
|
||||||
|
|
||||||
|
assert_eq!(postings.term_freq(), 1);
|
||||||
|
let mut output = vec![];
|
||||||
|
postings.positions(&mut output);
|
||||||
|
assert_eq!(output, vec![1]);
|
||||||
|
postings.advance();
|
||||||
|
|
||||||
|
assert_eq!(postings.term_freq(), 2);
|
||||||
|
postings.positions(&mut output);
|
||||||
|
assert_eq!(output, vec![1, 3]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,8 +1,7 @@
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use columnar::{
|
use columnar::{
|
||||||
ColumnType, ColumnValues, ColumnarReader, MergeRowOrder, RowAddr, ShuffleMergeOrder,
|
ColumnType, ColumnarReader, MergeRowOrder, RowAddr, ShuffleMergeOrder, StackMergeOrder,
|
||||||
StackMergeOrder,
|
|
||||||
};
|
};
|
||||||
use common::ReadOnlyBitSet;
|
use common::ReadOnlyBitSet;
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
@@ -11,7 +10,7 @@ use measure_time::debug_time;
|
|||||||
use crate::directory::WritePtr;
|
use crate::directory::WritePtr;
|
||||||
use crate::docset::{DocSet, TERMINATED};
|
use crate::docset::{DocSet, TERMINATED};
|
||||||
use crate::error::DataCorruption;
|
use crate::error::DataCorruption;
|
||||||
use crate::fastfield::{AliveBitSet, FastFieldNotAvailableError};
|
use crate::fastfield::AliveBitSet;
|
||||||
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
|
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
|
||||||
use crate::index::{Segment, SegmentComponent, SegmentReader};
|
use crate::index::{Segment, SegmentComponent, SegmentReader};
|
||||||
use crate::indexer::doc_id_mapping::{MappingType, SegmentDocIdMapping};
|
use crate::indexer::doc_id_mapping::{MappingType, SegmentDocIdMapping};
|
||||||
@@ -20,9 +19,7 @@ use crate::postings::{InvertedIndexSerializer, Postings, SegmentPostings};
|
|||||||
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema};
|
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema};
|
||||||
use crate::store::StoreWriter;
|
use crate::store::StoreWriter;
|
||||||
use crate::termdict::{TermMerger, TermOrdinal};
|
use crate::termdict::{TermMerger, TermOrdinal};
|
||||||
use crate::{
|
use crate::{DocAddress, DocId, InvertedIndexReader};
|
||||||
DocAddress, DocId, IndexSettings, IndexSortByField, InvertedIndexReader, Order, SegmentOrdinal,
|
|
||||||
};
|
|
||||||
|
|
||||||
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
|
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
|
||||||
///
|
///
|
||||||
@@ -80,7 +77,6 @@ fn estimate_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub struct IndexMerger {
|
pub struct IndexMerger {
|
||||||
index_settings: IndexSettings,
|
|
||||||
schema: Schema,
|
schema: Schema,
|
||||||
pub(crate) readers: Vec<SegmentReader>,
|
pub(crate) readers: Vec<SegmentReader>,
|
||||||
max_doc: u32,
|
max_doc: u32,
|
||||||
@@ -116,7 +112,7 @@ fn convert_to_merge_order(
|
|||||||
) -> MergeRowOrder {
|
) -> MergeRowOrder {
|
||||||
match doc_id_mapping.mapping_type() {
|
match doc_id_mapping.mapping_type() {
|
||||||
MappingType::Stacked => MergeRowOrder::Stack(StackMergeOrder::stack(columnars)),
|
MappingType::Stacked => MergeRowOrder::Stack(StackMergeOrder::stack(columnars)),
|
||||||
MappingType::StackedWithDeletes | MappingType::Shuffled => {
|
MappingType::StackedWithDeletes => {
|
||||||
// RUST/LLVM is amazing. The following conversion is actually a no-op:
|
// RUST/LLVM is amazing. The following conversion is actually a no-op:
|
||||||
// no allocation, no copy.
|
// no allocation, no copy.
|
||||||
let new_row_id_to_old_row_id: Vec<RowAddr> = doc_id_mapping
|
let new_row_id_to_old_row_id: Vec<RowAddr> = doc_id_mapping
|
||||||
@@ -149,13 +145,9 @@ fn extract_fast_field_required_columns(schema: &Schema) -> Vec<(String, ColumnTy
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl IndexMerger {
|
impl IndexMerger {
|
||||||
pub fn open(
|
pub fn open(schema: Schema, segments: &[Segment]) -> crate::Result<IndexMerger> {
|
||||||
schema: Schema,
|
|
||||||
index_settings: IndexSettings,
|
|
||||||
segments: &[Segment],
|
|
||||||
) -> crate::Result<IndexMerger> {
|
|
||||||
let alive_bitset = segments.iter().map(|_| None).collect_vec();
|
let alive_bitset = segments.iter().map(|_| None).collect_vec();
|
||||||
Self::open_with_custom_alive_set(schema, index_settings, segments, alive_bitset)
|
Self::open_with_custom_alive_set(schema, segments, alive_bitset)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create merge with a custom delete set.
|
// Create merge with a custom delete set.
|
||||||
@@ -172,7 +164,6 @@ impl IndexMerger {
|
|||||||
// segments and partitions them e.g. by a value in a field.
|
// segments and partitions them e.g. by a value in a field.
|
||||||
pub fn open_with_custom_alive_set(
|
pub fn open_with_custom_alive_set(
|
||||||
schema: Schema,
|
schema: Schema,
|
||||||
index_settings: IndexSettings,
|
|
||||||
segments: &[Segment],
|
segments: &[Segment],
|
||||||
alive_bitset_opt: Vec<Option<AliveBitSet>>,
|
alive_bitset_opt: Vec<Option<AliveBitSet>>,
|
||||||
) -> crate::Result<IndexMerger> {
|
) -> crate::Result<IndexMerger> {
|
||||||
@@ -186,9 +177,6 @@ impl IndexMerger {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let max_doc = readers.iter().map(|reader| reader.num_docs()).sum();
|
let max_doc = readers.iter().map(|reader| reader.num_docs()).sum();
|
||||||
if let Some(sort_by_field) = index_settings.sort_by_field.as_ref() {
|
|
||||||
readers = Self::sort_readers_by_min_sort_field(readers, sort_by_field)?;
|
|
||||||
}
|
|
||||||
// sort segments by their natural sort setting
|
// sort segments by their natural sort setting
|
||||||
if max_doc >= MAX_DOC_LIMIT {
|
if max_doc >= MAX_DOC_LIMIT {
|
||||||
let err_msg = format!(
|
let err_msg = format!(
|
||||||
@@ -198,37 +186,12 @@ impl IndexMerger {
|
|||||||
return Err(crate::TantivyError::InvalidArgument(err_msg));
|
return Err(crate::TantivyError::InvalidArgument(err_msg));
|
||||||
}
|
}
|
||||||
Ok(IndexMerger {
|
Ok(IndexMerger {
|
||||||
index_settings,
|
|
||||||
schema,
|
schema,
|
||||||
readers,
|
readers,
|
||||||
max_doc,
|
max_doc,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn sort_readers_by_min_sort_field(
|
|
||||||
readers: Vec<SegmentReader>,
|
|
||||||
sort_by_field: &IndexSortByField,
|
|
||||||
) -> crate::Result<Vec<SegmentReader>> {
|
|
||||||
// presort the readers by their min_values, so that when they are disjunct, we can use
|
|
||||||
// the regular merge logic (implicitly sorted)
|
|
||||||
let mut readers_with_min_sort_values = readers
|
|
||||||
.into_iter()
|
|
||||||
.map(|reader| {
|
|
||||||
let accessor = Self::get_sort_field_accessor(&reader, sort_by_field)?;
|
|
||||||
Ok((reader, accessor.min_value()))
|
|
||||||
})
|
|
||||||
.collect::<crate::Result<Vec<_>>>()?;
|
|
||||||
if sort_by_field.order.is_asc() {
|
|
||||||
readers_with_min_sort_values.sort_by_key(|(_, min_val)| *min_val);
|
|
||||||
} else {
|
|
||||||
readers_with_min_sort_values.sort_by_key(|(_, min_val)| std::cmp::Reverse(*min_val));
|
|
||||||
}
|
|
||||||
Ok(readers_with_min_sort_values
|
|
||||||
.into_iter()
|
|
||||||
.map(|(reader, _)| reader)
|
|
||||||
.collect())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn write_fieldnorms(
|
fn write_fieldnorms(
|
||||||
&self,
|
&self,
|
||||||
mut fieldnorms_serializer: FieldNormsSerializer,
|
mut fieldnorms_serializer: FieldNormsSerializer,
|
||||||
@@ -276,128 +239,6 @@ impl IndexMerger {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Checks if the readers are disjunct for their sort property and in the correct order to be
|
|
||||||
/// able to just stack them.
|
|
||||||
pub(crate) fn is_disjunct_and_sorted_on_sort_property(
|
|
||||||
&self,
|
|
||||||
sort_by_field: &IndexSortByField,
|
|
||||||
) -> crate::Result<bool> {
|
|
||||||
let reader_ordinal_and_field_accessors =
|
|
||||||
self.get_reader_with_sort_field_accessor(sort_by_field)?;
|
|
||||||
|
|
||||||
let everything_is_in_order = reader_ordinal_and_field_accessors
|
|
||||||
.into_iter()
|
|
||||||
.map(|(_, col)| Arc::new(col))
|
|
||||||
.tuple_windows()
|
|
||||||
.all(|(field_accessor1, field_accessor2)| {
|
|
||||||
if sort_by_field.order.is_asc() {
|
|
||||||
field_accessor1.max_value() <= field_accessor2.min_value()
|
|
||||||
} else {
|
|
||||||
field_accessor1.min_value() >= field_accessor2.max_value()
|
|
||||||
}
|
|
||||||
});
|
|
||||||
Ok(everything_is_in_order)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn get_sort_field_accessor(
|
|
||||||
reader: &SegmentReader,
|
|
||||||
sort_by_field: &IndexSortByField,
|
|
||||||
) -> crate::Result<Arc<dyn ColumnValues>> {
|
|
||||||
reader.schema().get_field(&sort_by_field.field)?;
|
|
||||||
let (value_accessor, _column_type) = reader
|
|
||||||
.fast_fields()
|
|
||||||
.u64_lenient(&sort_by_field.field)?
|
|
||||||
.ok_or_else(|| FastFieldNotAvailableError {
|
|
||||||
field_name: sort_by_field.field.to_string(),
|
|
||||||
})?;
|
|
||||||
Ok(value_accessor.first_or_default_col(0u64))
|
|
||||||
}
|
|
||||||
/// Collecting value_accessors into a vec to bind the lifetime.
|
|
||||||
pub(crate) fn get_reader_with_sort_field_accessor(
|
|
||||||
&self,
|
|
||||||
sort_by_field: &IndexSortByField,
|
|
||||||
) -> crate::Result<Vec<(SegmentOrdinal, Arc<dyn ColumnValues>)>> {
|
|
||||||
let reader_ordinal_and_field_accessors = self
|
|
||||||
.readers
|
|
||||||
.iter()
|
|
||||||
.enumerate()
|
|
||||||
.map(|(reader_ordinal, _)| reader_ordinal as SegmentOrdinal)
|
|
||||||
.map(|reader_ordinal: SegmentOrdinal| {
|
|
||||||
let value_accessor = Self::get_sort_field_accessor(
|
|
||||||
&self.readers[reader_ordinal as usize],
|
|
||||||
sort_by_field,
|
|
||||||
)?;
|
|
||||||
Ok((reader_ordinal, value_accessor))
|
|
||||||
})
|
|
||||||
.collect::<crate::Result<Vec<_>>>()?;
|
|
||||||
Ok(reader_ordinal_and_field_accessors)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Generates the doc_id mapping where position in the vec=new
|
|
||||||
/// doc_id.
|
|
||||||
/// ReaderWithOrdinal will include the ordinal position of the
|
|
||||||
/// reader in self.readers.
|
|
||||||
pub(crate) fn generate_doc_id_mapping_with_sort_by_field(
|
|
||||||
&self,
|
|
||||||
sort_by_field: &IndexSortByField,
|
|
||||||
) -> crate::Result<SegmentDocIdMapping> {
|
|
||||||
let reader_ordinal_and_field_accessors =
|
|
||||||
self.get_reader_with_sort_field_accessor(sort_by_field)?;
|
|
||||||
// Loading the field accessor on demand causes a 15x regression
|
|
||||||
|
|
||||||
// create iterators over segment/sort_accessor/doc_id tuple
|
|
||||||
let doc_id_reader_pair =
|
|
||||||
reader_ordinal_and_field_accessors
|
|
||||||
.iter()
|
|
||||||
.map(|(reader_ord, ff_reader)| {
|
|
||||||
let reader = &self.readers[*reader_ord as usize];
|
|
||||||
reader
|
|
||||||
.doc_ids_alive()
|
|
||||||
.map(move |doc_id| (doc_id, reader_ord, ff_reader))
|
|
||||||
});
|
|
||||||
|
|
||||||
let total_num_new_docs = self
|
|
||||||
.readers
|
|
||||||
.iter()
|
|
||||||
.map(|reader| reader.num_docs() as usize)
|
|
||||||
.sum();
|
|
||||||
|
|
||||||
let mut sorted_doc_ids: Vec<DocAddress> = Vec::with_capacity(total_num_new_docs);
|
|
||||||
|
|
||||||
// create iterator tuple of (old doc_id, reader) in order of the new doc_ids
|
|
||||||
sorted_doc_ids.extend(
|
|
||||||
doc_id_reader_pair
|
|
||||||
.into_iter()
|
|
||||||
.kmerge_by(|a, b| {
|
|
||||||
let val1 = a.2.get_val(a.0);
|
|
||||||
let val2 = b.2.get_val(b.0);
|
|
||||||
if sort_by_field.order == Order::Asc {
|
|
||||||
val1 < val2
|
|
||||||
} else {
|
|
||||||
val1 > val2
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.map(|(doc_id, &segment_ord, _)| DocAddress {
|
|
||||||
doc_id,
|
|
||||||
segment_ord,
|
|
||||||
}),
|
|
||||||
);
|
|
||||||
|
|
||||||
let alive_bitsets: Vec<Option<ReadOnlyBitSet>> = self
|
|
||||||
.readers
|
|
||||||
.iter()
|
|
||||||
.map(|segment_reader| {
|
|
||||||
let alive_bitset = segment_reader.alive_bitset()?;
|
|
||||||
Some(alive_bitset.bitset().clone())
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
Ok(SegmentDocIdMapping::new(
|
|
||||||
sorted_doc_ids,
|
|
||||||
MappingType::Shuffled,
|
|
||||||
alive_bitsets,
|
|
||||||
))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Creates a mapping if the segments are stacked. this is helpful to merge codelines between
|
/// Creates a mapping if the segments are stacked. this is helpful to merge codelines between
|
||||||
/// index sorting and the others
|
/// index sorting and the others
|
||||||
pub(crate) fn get_doc_id_from_concatenated_data(&self) -> crate::Result<SegmentDocIdMapping> {
|
pub(crate) fn get_doc_id_from_concatenated_data(&self) -> crate::Result<SegmentDocIdMapping> {
|
||||||
@@ -515,7 +356,6 @@ impl IndexMerger {
|
|||||||
);
|
);
|
||||||
|
|
||||||
let mut segment_postings_containing_the_term: Vec<(usize, SegmentPostings)> = vec![];
|
let mut segment_postings_containing_the_term: Vec<(usize, SegmentPostings)> = vec![];
|
||||||
let mut doc_id_and_positions = vec![];
|
|
||||||
|
|
||||||
while merged_terms.advance() {
|
while merged_terms.advance() {
|
||||||
segment_postings_containing_the_term.clear();
|
segment_postings_containing_the_term.clear();
|
||||||
@@ -611,37 +451,13 @@ impl IndexMerger {
|
|||||||
0u32
|
0u32
|
||||||
};
|
};
|
||||||
|
|
||||||
// if doc_id_mapping exists, the doc_ids are reordered, they are
|
let delta_positions = delta_computer.compute_delta(&positions_buffer);
|
||||||
// not just stacked. The field serializer expects monotonically increasing
|
field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions);
|
||||||
// doc_ids, so we collect and sort them first, before writing.
|
|
||||||
//
|
|
||||||
// I think this is not strictly necessary, it would be possible to
|
|
||||||
// avoid the loading into a vec via some form of kmerge, but then the merge
|
|
||||||
// logic would deviate much more from the stacking case (unsorted index)
|
|
||||||
if !doc_id_mapping.is_trivial() {
|
|
||||||
doc_id_and_positions.push((
|
|
||||||
remapped_doc_id,
|
|
||||||
term_freq,
|
|
||||||
positions_buffer.to_vec(),
|
|
||||||
));
|
|
||||||
} else {
|
|
||||||
let delta_positions = delta_computer.compute_delta(&positions_buffer);
|
|
||||||
field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
doc = segment_postings.advance();
|
doc = segment_postings.advance();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !doc_id_mapping.is_trivial() {
|
|
||||||
doc_id_and_positions.sort_unstable_by_key(|&(doc_id, _, _)| doc_id);
|
|
||||||
|
|
||||||
for (doc_id, term_freq, positions) in &doc_id_and_positions {
|
|
||||||
let delta_positions = delta_computer.compute_delta(positions);
|
|
||||||
field_serializer.write_doc(*doc_id, *term_freq, delta_positions);
|
|
||||||
}
|
|
||||||
doc_id_and_positions.clear();
|
|
||||||
}
|
|
||||||
// closing the term.
|
// closing the term.
|
||||||
field_serializer.close_term()?;
|
field_serializer.close_term()?;
|
||||||
}
|
}
|
||||||
@@ -670,47 +486,13 @@ impl IndexMerger {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn write_storable_fields(
|
fn write_storable_fields(&self, store_writer: &mut StoreWriter) -> crate::Result<()> {
|
||||||
&self,
|
|
||||||
store_writer: &mut StoreWriter,
|
|
||||||
doc_id_mapping: &SegmentDocIdMapping,
|
|
||||||
) -> crate::Result<()> {
|
|
||||||
debug_time!("write-storable-fields");
|
debug_time!("write-storable-fields");
|
||||||
debug!("write-storable-field");
|
debug!("write-storable-field");
|
||||||
|
|
||||||
if !doc_id_mapping.is_trivial() {
|
for reader in &self.readers {
|
||||||
debug!("non-trivial-doc-id-mapping");
|
let store_reader = reader.get_store_reader(1)?;
|
||||||
|
if reader.has_deletes()
|
||||||
let store_readers: Vec<_> = self
|
|
||||||
.readers
|
|
||||||
.iter()
|
|
||||||
.map(|reader| reader.get_store_reader(50))
|
|
||||||
.collect::<Result<_, _>>()?;
|
|
||||||
|
|
||||||
let mut document_iterators: Vec<_> = store_readers
|
|
||||||
.iter()
|
|
||||||
.enumerate()
|
|
||||||
.map(|(i, store)| store.iter_raw(self.readers[i].alive_bitset()))
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
|
|
||||||
let doc_bytes_it = &mut document_iterators[old_doc_addr.segment_ord as usize];
|
|
||||||
if let Some(doc_bytes_res) = doc_bytes_it.next() {
|
|
||||||
let doc_bytes = doc_bytes_res?;
|
|
||||||
store_writer.store_bytes(&doc_bytes)?;
|
|
||||||
} else {
|
|
||||||
return Err(DataCorruption::comment_only(format!(
|
|
||||||
"unexpected missing document in docstore on merge, doc address \
|
|
||||||
{old_doc_addr:?}",
|
|
||||||
))
|
|
||||||
.into());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
debug!("trivial-doc-id-mapping");
|
|
||||||
for reader in &self.readers {
|
|
||||||
let store_reader = reader.get_store_reader(1)?;
|
|
||||||
if reader.has_deletes()
|
|
||||||
// If there is not enough data in the store, we avoid stacking in order to
|
// If there is not enough data in the store, we avoid stacking in order to
|
||||||
// avoid creating many small blocks in the doc store. Once we have 5 full blocks,
|
// avoid creating many small blocks in the doc store. Once we have 5 full blocks,
|
||||||
// we start stacking. In the worst case 2/7 of the blocks would be very small.
|
// we start stacking. In the worst case 2/7 of the blocks would be very small.
|
||||||
@@ -726,14 +508,13 @@ impl IndexMerger {
|
|||||||
// take 7 in order to not walk over all checkpoints.
|
// take 7 in order to not walk over all checkpoints.
|
||||||
|| store_reader.block_checkpoints().take(7).count() < 6
|
|| store_reader.block_checkpoints().take(7).count() < 6
|
||||||
|| store_reader.decompressor() != store_writer.compressor().into()
|
|| store_reader.decompressor() != store_writer.compressor().into()
|
||||||
{
|
{
|
||||||
for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) {
|
for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) {
|
||||||
let doc_bytes = doc_bytes_res?;
|
let doc_bytes = doc_bytes_res?;
|
||||||
store_writer.store_bytes(&doc_bytes)?;
|
store_writer.store_bytes(&doc_bytes)?;
|
||||||
}
|
|
||||||
} else {
|
|
||||||
store_writer.stack(store_reader)?;
|
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
store_writer.stack(store_reader)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -745,18 +526,7 @@ impl IndexMerger {
|
|||||||
/// # Returns
|
/// # Returns
|
||||||
/// The number of documents in the resulting segment.
|
/// The number of documents in the resulting segment.
|
||||||
pub fn write(&self, mut serializer: SegmentSerializer) -> crate::Result<u32> {
|
pub fn write(&self, mut serializer: SegmentSerializer) -> crate::Result<u32> {
|
||||||
let doc_id_mapping = if let Some(sort_by_field) = self.index_settings.sort_by_field.as_ref()
|
let doc_id_mapping = self.get_doc_id_from_concatenated_data()?;
|
||||||
{
|
|
||||||
// If the documents are already sorted and stackable, we ignore the mapping and execute
|
|
||||||
// it as if there was no sorting
|
|
||||||
if self.is_disjunct_and_sorted_on_sort_property(sort_by_field)? {
|
|
||||||
self.get_doc_id_from_concatenated_data()?
|
|
||||||
} else {
|
|
||||||
self.generate_doc_id_mapping_with_sort_by_field(sort_by_field)?
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
self.get_doc_id_from_concatenated_data()?
|
|
||||||
};
|
|
||||||
debug!("write-fieldnorms");
|
debug!("write-fieldnorms");
|
||||||
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
|
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
|
||||||
self.write_fieldnorms(fieldnorms_serializer, &doc_id_mapping)?;
|
self.write_fieldnorms(fieldnorms_serializer, &doc_id_mapping)?;
|
||||||
@@ -773,7 +543,7 @@ impl IndexMerger {
|
|||||||
)?;
|
)?;
|
||||||
|
|
||||||
debug!("write-storagefields");
|
debug!("write-storagefields");
|
||||||
self.write_storable_fields(serializer.get_store_writer(), &doc_id_mapping)?;
|
self.write_storable_fields(serializer.get_store_writer())?;
|
||||||
debug!("write-fastfields");
|
debug!("write-fastfields");
|
||||||
self.write_fast_fields(serializer.get_fast_field_write(), doc_id_mapping)?;
|
self.write_fast_fields(serializer.get_fast_field_write(), doc_id_mapping)?;
|
||||||
|
|
||||||
@@ -805,7 +575,7 @@ mod tests {
|
|||||||
use crate::time::OffsetDateTime;
|
use crate::time::OffsetDateTime;
|
||||||
use crate::{
|
use crate::{
|
||||||
assert_nearly_equals, schema, DateTime, DocAddress, DocId, DocSet, IndexSettings,
|
assert_nearly_equals, schema, DateTime, DocAddress, DocId, DocSet, IndexSettings,
|
||||||
IndexSortByField, IndexWriter, Order, Searcher,
|
IndexWriter, Searcher,
|
||||||
};
|
};
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -1278,60 +1048,6 @@ mod tests {
|
|||||||
test_merge_facets(None, true)
|
test_merge_facets(None, true)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_merge_facets_sort_asc() {
|
|
||||||
// In the merge case this will go through the doc_id mapping code
|
|
||||||
test_merge_facets(
|
|
||||||
Some(IndexSettings {
|
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "intval".to_string(),
|
|
||||||
order: Order::Desc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
}),
|
|
||||||
true,
|
|
||||||
);
|
|
||||||
// In the merge case this will not go through the doc_id mapping code, because the data
|
|
||||||
// sorted and disjunct
|
|
||||||
test_merge_facets(
|
|
||||||
Some(IndexSettings {
|
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "intval".to_string(),
|
|
||||||
order: Order::Desc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
}),
|
|
||||||
false,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_merge_facets_sort_desc() {
|
|
||||||
// In the merge case this will go through the doc_id mapping code
|
|
||||||
test_merge_facets(
|
|
||||||
Some(IndexSettings {
|
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "intval".to_string(),
|
|
||||||
order: Order::Desc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
}),
|
|
||||||
true,
|
|
||||||
);
|
|
||||||
// In the merge case this will not go through the doc_id mapping code, because the data
|
|
||||||
// sorted and disjunct
|
|
||||||
test_merge_facets(
|
|
||||||
Some(IndexSettings {
|
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "intval".to_string(),
|
|
||||||
order: Order::Desc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
}),
|
|
||||||
false,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
// force_segment_value_overlap forces the int value for sorting to have overlapping min and max
|
// force_segment_value_overlap forces the int value for sorting to have overlapping min and max
|
||||||
// ranges between segments so that merge algorithm can't apply certain optimizations
|
// ranges between segments so that merge algorithm can't apply certain optimizations
|
||||||
fn test_merge_facets(index_settings: Option<IndexSettings>, force_segment_value_overlap: bool) {
|
fn test_merge_facets(index_settings: Option<IndexSettings>, force_segment_value_overlap: bool) {
|
||||||
|
|||||||
@@ -1,579 +0,0 @@
|
|||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use crate::collector::TopDocs;
|
|
||||||
use crate::fastfield::AliveBitSet;
|
|
||||||
use crate::index::Index;
|
|
||||||
use crate::postings::Postings;
|
|
||||||
use crate::query::QueryParser;
|
|
||||||
use crate::schema::{
|
|
||||||
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
|
|
||||||
TextFieldIndexing, TextOptions, Value,
|
|
||||||
};
|
|
||||||
use crate::{
|
|
||||||
DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, TantivyDocument,
|
|
||||||
Term,
|
|
||||||
};
|
|
||||||
|
|
||||||
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
|
|
||||||
let mut schema_builder = schema::Schema::builder();
|
|
||||||
let int_options = NumericOptions::default().set_fast().set_indexed();
|
|
||||||
let int_field = schema_builder.add_u64_field("intval", int_options);
|
|
||||||
|
|
||||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
|
||||||
|
|
||||||
let schema = schema_builder.build();
|
|
||||||
|
|
||||||
let mut index_builder = Index::builder().schema(schema);
|
|
||||||
if let Some(settings) = index_settings {
|
|
||||||
index_builder = index_builder.settings(settings);
|
|
||||||
}
|
|
||||||
let index = index_builder.create_in_ram().unwrap();
|
|
||||||
|
|
||||||
{
|
|
||||||
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
|
|
||||||
index_writer
|
|
||||||
.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")))
|
|
||||||
.unwrap();
|
|
||||||
index_writer
|
|
||||||
.add_document(doc!(int_field=>6_u64, facet_field=> Facet::from("/crime")))
|
|
||||||
.unwrap();
|
|
||||||
index_writer.commit().unwrap();
|
|
||||||
index_writer
|
|
||||||
.add_document(doc!(int_field=>5_u64, facet_field=> Facet::from("/fanta")))
|
|
||||||
.unwrap();
|
|
||||||
index_writer.commit().unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Merging the segments
|
|
||||||
{
|
|
||||||
let segment_ids = index
|
|
||||||
.searchable_segment_ids()
|
|
||||||
.expect("Searchable segments failed.");
|
|
||||||
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
|
|
||||||
assert!(index_writer.merge(&segment_ids).wait().is_ok());
|
|
||||||
assert!(index_writer.wait_merging_threads().is_ok());
|
|
||||||
}
|
|
||||||
index
|
|
||||||
}
|
|
||||||
|
|
||||||
// force_disjunct_segment_sort_values forces the field, by which the index is sorted have
|
|
||||||
// disjunct ranges between segments, e.g. values in segment [1-3] [10 - 20] [50 - 500]
|
|
||||||
fn create_test_index(
|
|
||||||
index_settings: Option<IndexSettings>,
|
|
||||||
force_disjunct_segment_sort_values: bool,
|
|
||||||
) -> crate::Result<Index> {
|
|
||||||
let mut schema_builder = schema::Schema::builder();
|
|
||||||
let int_options = NumericOptions::default()
|
|
||||||
.set_fast()
|
|
||||||
.set_stored()
|
|
||||||
.set_indexed();
|
|
||||||
let int_field = schema_builder.add_u64_field("intval", int_options);
|
|
||||||
|
|
||||||
let bytes_options = BytesOptions::default().set_fast().set_indexed();
|
|
||||||
let bytes_field = schema_builder.add_bytes_field("bytes", bytes_options);
|
|
||||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
|
||||||
|
|
||||||
let multi_numbers =
|
|
||||||
schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast());
|
|
||||||
let text_field_options = TextOptions::default()
|
|
||||||
.set_indexing_options(
|
|
||||||
TextFieldIndexing::default()
|
|
||||||
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
|
|
||||||
)
|
|
||||||
.set_stored();
|
|
||||||
let text_field = schema_builder.add_text_field("text_field", text_field_options);
|
|
||||||
let schema = schema_builder.build();
|
|
||||||
|
|
||||||
let mut index_builder = Index::builder().schema(schema);
|
|
||||||
if let Some(settings) = index_settings {
|
|
||||||
index_builder = index_builder.settings(settings);
|
|
||||||
}
|
|
||||||
let index = index_builder.create_in_ram()?;
|
|
||||||
|
|
||||||
{
|
|
||||||
let mut index_writer = index.writer_for_tests()?;
|
|
||||||
|
|
||||||
// segment 1 - range 1-3
|
|
||||||
index_writer.add_document(doc!(int_field=>1_u64))?;
|
|
||||||
index_writer.add_document(
|
|
||||||
doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")),
|
|
||||||
)?;
|
|
||||||
index_writer.add_document(
|
|
||||||
doc!(int_field=>1_u64, text_field=> "deleteme", text_field => "ok text more text"),
|
|
||||||
)?;
|
|
||||||
index_writer.add_document(
|
|
||||||
doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64, text_field => "ok text more text"),
|
|
||||||
)?;
|
|
||||||
|
|
||||||
index_writer.commit()?;
|
|
||||||
// segment 2 - range 1-20 , with force_disjunct_segment_sort_values 10-20
|
|
||||||
index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64))?;
|
|
||||||
|
|
||||||
let in_val = if force_disjunct_segment_sort_values {
|
|
||||||
10_u64
|
|
||||||
} else {
|
|
||||||
1
|
|
||||||
};
|
|
||||||
index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme" , text_field => "ok text more text", facet_field=> Facet::from("/book/crime")))?;
|
|
||||||
index_writer.commit()?;
|
|
||||||
// segment 3 - range 5-1000, with force_disjunct_segment_sort_values 50-1000
|
|
||||||
let int_vals = if force_disjunct_segment_sort_values {
|
|
||||||
[100_u64, 50]
|
|
||||||
} else {
|
|
||||||
[10, 5]
|
|
||||||
};
|
|
||||||
index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1]
|
|
||||||
doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
|
|
||||||
)?;
|
|
||||||
index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"))?;
|
|
||||||
index_writer.add_document(
|
|
||||||
doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5],text_field => "the biggest num")
|
|
||||||
)?;
|
|
||||||
|
|
||||||
index_writer.delete_term(Term::from_field_text(text_field, "deleteme"));
|
|
||||||
index_writer.commit()?;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Merging the segments
|
|
||||||
{
|
|
||||||
let segment_ids = index.searchable_segment_ids()?;
|
|
||||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
|
||||||
index_writer.merge(&segment_ids).wait()?;
|
|
||||||
index_writer.wait_merging_threads()?;
|
|
||||||
}
|
|
||||||
Ok(index)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_merge_sorted_postinglist_sort_issue() {
|
|
||||||
create_test_index_posting_list_issue(Some(IndexSettings {
|
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "intval".to_string(),
|
|
||||||
order: Order::Desc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
}));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_merge_sorted_index_desc_not_disjunct() {
|
|
||||||
test_merge_sorted_index_desc_(false);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_merge_sorted_index_desc_disjunct() {
|
|
||||||
test_merge_sorted_index_desc_(true);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn test_merge_sorted_index_desc_(force_disjunct_segment_sort_values: bool) {
|
|
||||||
let index = create_test_index(
|
|
||||||
Some(IndexSettings {
|
|
||||||
sort_by_field: Some(IndexSortByField {
|
|
||||||
field: "intval".to_string(),
|
|
||||||
order: Order::Desc,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
}),
|
|
||||||
force_disjunct_segment_sort_values,
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let int_field = index.schema().get_field("intval").unwrap();
|
|
||||||
let reader = index.reader().unwrap();
|
|
||||||
|
|
||||||
let searcher = reader.searcher();
|
|
||||||
assert_eq!(searcher.segment_readers().len(), 1);
|
|
||||||
let segment_reader = searcher.segment_readers().last().unwrap();
|
|
||||||
|
|
||||||
let fast_fields = segment_reader.fast_fields();
|
|
||||||
let fast_field = fast_fields.u64("intval").unwrap();
|
|
||||||
assert_eq!(fast_field.first(5), Some(1u64));
|
|
||||||
assert_eq!(fast_field.first(4), Some(2u64));
|
|
||||||
assert_eq!(fast_field.first(3), Some(3u64));
|
|
||||||
if force_disjunct_segment_sort_values {
|
|
||||||
assert_eq!(fast_field.first(2), Some(20u64));
|
|
||||||
assert_eq!(fast_field.first(1), Some(100u64));
|
|
||||||
} else {
|
|
||||||
assert_eq!(fast_field.first(2), Some(10u64));
|
|
||||||
assert_eq!(fast_field.first(1), Some(20u64));
|
|
||||||
}
|
|
||||||
assert_eq!(fast_field.first(0), Some(1_000u64));
|
|
||||||
|
|
||||||
// test new field norm mapping
|
|
||||||
{
|
|
||||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
|
||||||
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
|
|
||||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 3); // the biggest num
|
|
||||||
if force_disjunct_segment_sort_values {
|
|
||||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 1); // blubber
|
|
||||||
assert_eq!(fieldnorm_reader.fieldnorm(2), 0);
|
|
||||||
} else {
|
|
||||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
|
|
||||||
assert_eq!(fieldnorm_reader.fieldnorm(2), 1); // blubber
|
|
||||||
}
|
|
||||||
assert_eq!(fieldnorm_reader.fieldnorm(3), 2); // some text
|
|
||||||
assert_eq!(fieldnorm_reader.fieldnorm(5), 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
|
||||||
let searcher = index.reader().unwrap().searcher();
|
|
||||||
{
|
|
||||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
|
||||||
|
|
||||||
let do_search = |term: &str| {
|
|
||||||
let query = QueryParser::for_index(&index, vec![my_text_field])
|
|
||||||
.parse_query(term)
|
|
||||||
.unwrap();
|
|
||||||
let top_docs: Vec<(f32, DocAddress)> =
|
|
||||||
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
|
|
||||||
|
|
||||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
|
|
||||||
};
|
|
||||||
|
|
||||||
assert_eq!(do_search("some"), vec![3]);
|
|
||||||
if force_disjunct_segment_sort_values {
|
|
||||||
assert_eq!(do_search("blubber"), vec![1]);
|
|
||||||
} else {
|
|
||||||
assert_eq!(do_search("blubber"), vec![2]);
|
|
||||||
}
|
|
||||||
assert_eq!(do_search("biggest"), vec![0]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// postings file
|
|
||||||
{
|
|
||||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
|
||||||
let term_a = Term::from_field_text(my_text_field, "text");
|
|
||||||
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
|
|
||||||
let mut postings = inverted_index
|
|
||||||
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
|
||||||
.unwrap()
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
assert_eq!(postings.doc_freq(), 2);
|
|
||||||
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
|
|
||||||
assert_eq!(
|
|
||||||
postings.doc_freq_given_deletes(
|
|
||||||
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
|
|
||||||
),
|
|
||||||
2
|
|
||||||
);
|
|
||||||
|
|
||||||
assert_eq!(postings.term_freq(), 1);
|
|
||||||
let mut output = vec![];
|
|
||||||
postings.positions(&mut output);
|
|
||||||
assert_eq!(output, vec![1]);
|
|
||||||
postings.advance();
|
|
||||||
|
|
||||||
assert_eq!(postings.term_freq(), 2);
|
|
||||||
postings.positions(&mut output);
|
|
||||||
assert_eq!(output, vec![1, 3]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// access doc store
|
|
||||||
{
|
|
||||||
let blubber_pos = if force_disjunct_segment_sort_values {
|
|
||||||
1
|
|
||||||
} else {
|
|
||||||
2
|
|
||||||
};
|
|
||||||
let doc = searcher
|
|
||||||
.doc::<TantivyDocument>(DocAddress::new(0, blubber_pos))
|
|
||||||
.unwrap();
|
|
||||||
assert_eq!(
|
|
||||||
doc.get_first(my_text_field).unwrap().as_value().as_str(),
|
|
||||||
Some("blubber")
|
|
||||||
);
|
|
||||||
let doc = searcher
|
|
||||||
.doc::<TantivyDocument>(DocAddress::new(0, 0))
|
|
||||||
.unwrap();
|
|
||||||
assert_eq!(
|
|
||||||
doc.get_first(int_field).unwrap().as_value().as_u64(),
|
|
||||||
Some(1000)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_merge_unsorted_index() {
|
|
||||||
let index = create_test_index(
|
|
||||||
Some(IndexSettings {
|
|
||||||
..Default::default()
|
|
||||||
}),
|
|
||||||
false,
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let reader = index.reader().unwrap();
|
|
||||||
let searcher = reader.searcher();
|
|
||||||
assert_eq!(searcher.segment_readers().len(), 1);
|
|
||||||
let segment_reader = searcher.segment_readers().last().unwrap();
|
|
||||||
|
|
||||||
let searcher = index.reader().unwrap().searcher();
|
|
||||||
{
|
|
||||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
|
||||||
|
|
||||||
let do_search = |term: &str| {
|
|
||||||
let query = QueryParser::for_index(&index, vec![my_text_field])
|
|
||||||
.parse_query(term)
|
|
||||||
.unwrap();
|
|
||||||
let top_docs: Vec<(f32, DocAddress)> =
|
|
||||||
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
|
|
||||||
|
|
||||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
|
|
||||||
};
|
|
||||||
|
|
||||||
assert_eq!(do_search("some"), vec![1]);
|
|
||||||
assert_eq!(do_search("blubber"), vec![3]);
|
|
||||||
assert_eq!(do_search("biggest"), vec![4]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// postings file
|
|
||||||
{
|
|
||||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
|
||||||
let term_a = Term::from_field_text(my_text_field, "text");
|
|
||||||
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
|
|
||||||
let mut postings = inverted_index
|
|
||||||
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
|
||||||
.unwrap()
|
|
||||||
.unwrap();
|
|
||||||
assert_eq!(postings.doc_freq(), 2);
|
|
||||||
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
|
|
||||||
assert_eq!(
|
|
||||||
postings.doc_freq_given_deletes(
|
|
||||||
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
|
|
||||||
),
|
|
||||||
2
|
|
||||||
);
|
|
||||||
|
|
||||||
assert_eq!(postings.term_freq(), 1);
|
|
||||||
let mut output = vec![];
|
|
||||||
postings.positions(&mut output);
|
|
||||||
assert_eq!(output, vec![1]);
|
|
||||||
postings.advance();
|
|
||||||
|
|
||||||
assert_eq!(postings.term_freq(), 2);
|
|
||||||
postings.positions(&mut output);
|
|
||||||
assert_eq!(output, vec![1, 3]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// #[test]
|
|
||||||
// fn test_merge_sorted_index_asc() {
|
|
||||||
// let index = create_test_index(
|
|
||||||
// Some(IndexSettings {
|
|
||||||
// sort_by_field: Some(IndexSortByField {
|
|
||||||
// field: "intval".to_string(),
|
|
||||||
// order: Order::Asc,
|
|
||||||
// }),
|
|
||||||
// ..Default::default()
|
|
||||||
// }),
|
|
||||||
// false,
|
|
||||||
// )
|
|
||||||
// .unwrap();
|
|
||||||
|
|
||||||
// let int_field = index.schema().get_field("intval").unwrap();
|
|
||||||
// let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
|
|
||||||
// let bytes_field = index.schema().get_field("bytes").unwrap();
|
|
||||||
// let reader = index.reader().unwrap();
|
|
||||||
// let searcher = reader.searcher();
|
|
||||||
// assert_eq!(searcher.segment_readers().len(), 1);
|
|
||||||
// let segment_reader = searcher.segment_readers().last().unwrap();
|
|
||||||
|
|
||||||
// let fast_fields = segment_reader.fast_fields();
|
|
||||||
// let fast_field = fast_fields.u64(int_field).unwrap();
|
|
||||||
// assert_eq!(fast_field.get_val(0), 1u64);
|
|
||||||
// assert_eq!(fast_field.get_val(1), 2u64);
|
|
||||||
// assert_eq!(fast_field.get_val(2), 3u64);
|
|
||||||
// assert_eq!(fast_field.get_val(3), 10u64);
|
|
||||||
// assert_eq!(fast_field.get_val(4), 20u64);
|
|
||||||
// assert_eq!(fast_field.get_val(5), 1_000u64);
|
|
||||||
|
|
||||||
// let get_vals = |fast_field: &MultiValuedFastFieldReader<u64>, doc_id: u32| -> Vec<u64> {
|
|
||||||
// let mut vals = vec![];
|
|
||||||
// fast_field.get_vals(doc_id, &mut vals);
|
|
||||||
// vals
|
|
||||||
// };
|
|
||||||
// let fast_fields = segment_reader.fast_fields();
|
|
||||||
// let fast_field = fast_fields.u64s(multi_numbers).unwrap();
|
|
||||||
// assert_eq!(&get_vals(&fast_field, 0), &[] as &[u64]);
|
|
||||||
// assert_eq!(&get_vals(&fast_field, 1), &[2, 3]);
|
|
||||||
// assert_eq!(&get_vals(&fast_field, 2), &[3, 4]);
|
|
||||||
// assert_eq!(&get_vals(&fast_field, 3), &[10, 11]);
|
|
||||||
// assert_eq!(&get_vals(&fast_field, 4), &[20]);
|
|
||||||
// assert_eq!(&get_vals(&fast_field, 5), &[1001, 1002]);
|
|
||||||
|
|
||||||
// let fast_field = fast_fields.bytes(bytes_field).unwrap();
|
|
||||||
// assert_eq!(fast_field.get_bytes(0), &[] as &[u8]);
|
|
||||||
// assert_eq!(fast_field.get_bytes(2), &[1, 2, 3]);
|
|
||||||
// assert_eq!(fast_field.get_bytes(5), &[5, 5]);
|
|
||||||
|
|
||||||
// // test new field norm mapping
|
|
||||||
// {
|
|
||||||
// let my_text_field = index.schema().get_field("text_field").unwrap();
|
|
||||||
// let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
|
|
||||||
// assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
|
|
||||||
// assert_eq!(fieldnorm_reader.fieldnorm(1), 4);
|
|
||||||
// assert_eq!(fieldnorm_reader.fieldnorm(2), 2); // some text
|
|
||||||
// assert_eq!(fieldnorm_reader.fieldnorm(3), 1);
|
|
||||||
// assert_eq!(fieldnorm_reader.fieldnorm(5), 3); // the biggest num
|
|
||||||
// }
|
|
||||||
|
|
||||||
// let searcher = index.reader().unwrap().searcher();
|
|
||||||
// {
|
|
||||||
// let my_text_field = index.schema().get_field("text_field").unwrap();
|
|
||||||
|
|
||||||
// let do_search = |term: &str| {
|
|
||||||
// let query = QueryParser::for_index(&index, vec![my_text_field])
|
|
||||||
// .parse_query(term)
|
|
||||||
// .unwrap();
|
|
||||||
// let top_docs: Vec<(f32, DocAddress)> =
|
|
||||||
// searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
|
|
||||||
|
|
||||||
// top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
|
|
||||||
// };
|
|
||||||
|
|
||||||
// assert_eq!(do_search("some"), vec![2]);
|
|
||||||
// assert_eq!(do_search("blubber"), vec![3]);
|
|
||||||
// assert_eq!(do_search("biggest"), vec![5]);
|
|
||||||
// }
|
|
||||||
|
|
||||||
// // postings file
|
|
||||||
// {
|
|
||||||
// let my_text_field = index.schema().get_field("text_field").unwrap();
|
|
||||||
// let term_a = Term::from_field_text(my_text_field, "text");
|
|
||||||
// let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
|
|
||||||
// let mut postings = inverted_index
|
|
||||||
// .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
|
||||||
// .unwrap()
|
|
||||||
// .unwrap();
|
|
||||||
|
|
||||||
// assert_eq!(postings.doc_freq(), 2);
|
|
||||||
// let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
|
|
||||||
// assert_eq!(
|
|
||||||
// postings.doc_freq_given_deletes(
|
|
||||||
// segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
|
|
||||||
// ),
|
|
||||||
// 2
|
|
||||||
// );
|
|
||||||
|
|
||||||
// let mut output = vec![];
|
|
||||||
// postings.positions(&mut output);
|
|
||||||
// assert_eq!(output, vec![1, 3]);
|
|
||||||
// postings.advance();
|
|
||||||
|
|
||||||
// postings.positions(&mut output);
|
|
||||||
// assert_eq!(output, vec![1]);
|
|
||||||
// }
|
|
||||||
|
|
||||||
// // access doc store
|
|
||||||
// {
|
|
||||||
// let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
|
|
||||||
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1));
|
|
||||||
// let doc = searcher.doc(DocAddress::new(0, 1)).unwrap();
|
|
||||||
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(2));
|
|
||||||
// let doc = searcher.doc(DocAddress::new(0, 2)).unwrap();
|
|
||||||
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(3));
|
|
||||||
// let doc = searcher.doc(DocAddress::new(0, 3)).unwrap();
|
|
||||||
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(10));
|
|
||||||
// let doc = searcher.doc(DocAddress::new(0, 4)).unwrap();
|
|
||||||
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(20));
|
|
||||||
// let doc = searcher.doc(DocAddress::new(0, 5)).unwrap();
|
|
||||||
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1_000));
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(all(test, feature = "unstable"))]
|
|
||||||
mod bench_sorted_index_merge {
|
|
||||||
|
|
||||||
use test::{self, Bencher};
|
|
||||||
|
|
||||||
use crate::index::Index;
|
|
||||||
use crate::indexer::merger::IndexMerger;
|
|
||||||
use crate::schema::{NumericOptions, Schema};
|
|
||||||
use crate::{IndexSettings, IndexSortByField, IndexWriter, Order};
|
|
||||||
fn create_index(sort_by_field: Option<IndexSortByField>) -> Index {
|
|
||||||
let mut schema_builder = Schema::builder();
|
|
||||||
let int_options = NumericOptions::default().set_fast().set_indexed();
|
|
||||||
let int_field = schema_builder.add_u64_field("intval", int_options);
|
|
||||||
let schema = schema_builder.build();
|
|
||||||
|
|
||||||
let index_builder = Index::builder().schema(schema).settings(IndexSettings {
|
|
||||||
sort_by_field,
|
|
||||||
..Default::default()
|
|
||||||
});
|
|
||||||
let index = index_builder.create_in_ram().unwrap();
|
|
||||||
|
|
||||||
{
|
|
||||||
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
|
|
||||||
let index_doc = |index_writer: &mut IndexWriter, val: u64| {
|
|
||||||
index_writer.add_document(doc!(int_field=>val)).unwrap();
|
|
||||||
};
|
|
||||||
// 3 segments with 10_000 values in the fast fields
|
|
||||||
for _ in 0..3 {
|
|
||||||
index_doc(&mut index_writer, 5000); // fix to make it unordered
|
|
||||||
for i in 0..10_000 {
|
|
||||||
index_doc(&mut index_writer, i);
|
|
||||||
}
|
|
||||||
index_writer.commit().unwrap();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
index
|
|
||||||
}
|
|
||||||
|
|
||||||
//#[bench]
|
|
||||||
// fn create_sorted_index_walk_overkmerge_on_merge_fastfield(
|
|
||||||
// b: &mut Bencher,
|
|
||||||
//) -> crate::Result<()> {
|
|
||||||
// let sort_by_field = IndexSortByField {
|
|
||||||
// field: "intval".to_string(),
|
|
||||||
// order: Order::Desc,
|
|
||||||
//};
|
|
||||||
// let index = create_index(Some(sort_by_field.clone()));
|
|
||||||
// let segments = index.searchable_segments().unwrap();
|
|
||||||
// let merger: IndexMerger =
|
|
||||||
// IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?;
|
|
||||||
// let doc_id_mapping = merger.generate_doc_id_mapping(&sort_by_field).unwrap();
|
|
||||||
// b.iter(|| {
|
|
||||||
// let sorted_doc_ids = doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| {
|
|
||||||
// let reader = &merger.readers[doc_addr.segment_ord as usize];
|
|
||||||
// let u64_reader: Arc<dyn Column<u64>> = reader
|
|
||||||
//.fast_fields()
|
|
||||||
//.typed_fast_field_reader("intval")
|
|
||||||
//.expect(
|
|
||||||
//"Failed to find a reader for single fast field. This is a tantivy bug and \
|
|
||||||
// it should never happen.",
|
|
||||||
//);
|
|
||||||
//(doc_addr.doc_id, reader, u64_reader)
|
|
||||||
//});
|
|
||||||
/// add values in order of the new doc_ids
|
|
||||||
// let mut val = 0;
|
|
||||||
// for (doc_id, _reader, field_reader) in sorted_doc_ids {
|
|
||||||
// val = field_reader.get_val(doc_id);
|
|
||||||
//}
|
|
||||||
|
|
||||||
// val
|
|
||||||
//});
|
|
||||||
|
|
||||||
// Ok(())
|
|
||||||
//}
|
|
||||||
#[bench]
|
|
||||||
fn create_sorted_index_create_doc_id_mapping(b: &mut Bencher) -> crate::Result<()> {
|
|
||||||
let sort_by_field = IndexSortByField {
|
|
||||||
field: "intval".to_string(),
|
|
||||||
order: Order::Desc,
|
|
||||||
};
|
|
||||||
let index = create_index(Some(sort_by_field.clone()));
|
|
||||||
// let field = index.schema().get_field("intval").unwrap();
|
|
||||||
let segments = index.searchable_segments().unwrap();
|
|
||||||
let merger: IndexMerger =
|
|
||||||
IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?;
|
|
||||||
b.iter(|| {
|
|
||||||
merger
|
|
||||||
.generate_doc_id_mapping_with_sort_by_field(&sort_by_field)
|
|
||||||
.unwrap();
|
|
||||||
});
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -13,10 +13,10 @@ mod flat_map_with_buffer;
|
|||||||
pub(crate) mod index_writer;
|
pub(crate) mod index_writer;
|
||||||
pub(crate) mod index_writer_status;
|
pub(crate) mod index_writer_status;
|
||||||
mod log_merge_policy;
|
mod log_merge_policy;
|
||||||
|
mod merge_index_test;
|
||||||
mod merge_operation;
|
mod merge_operation;
|
||||||
pub(crate) mod merge_policy;
|
pub(crate) mod merge_policy;
|
||||||
pub(crate) mod merger;
|
pub(crate) mod merger;
|
||||||
mod merger_sorted_index_test;
|
|
||||||
pub(crate) mod operation;
|
pub(crate) mod operation;
|
||||||
pub(crate) mod prepared_commit;
|
pub(crate) mod prepared_commit;
|
||||||
mod segment_entry;
|
mod segment_entry;
|
||||||
@@ -145,15 +145,27 @@ mod tests_mmap {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#[test]
|
#[test]
|
||||||
fn test_json_field_null_byte() {
|
fn test_json_field_null_byte_is_ignored() {
|
||||||
// Test when field name contains a zero byte, which has special meaning in tantivy.
|
let mut schema_builder = Schema::builder();
|
||||||
// As a workaround, we convert the zero byte to the ASCII character '0'.
|
let options = JsonObjectOptions::from(TEXT | FAST).set_expand_dots_enabled();
|
||||||
// https://github.com/quickwit-oss/tantivy/issues/2340
|
let field = schema_builder.add_json_field("json", options);
|
||||||
// https://github.com/quickwit-oss/tantivy/issues/2193
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
let field_name_in = "\u{0000}";
|
let mut index_writer = index.writer_for_tests().unwrap();
|
||||||
let field_name_out = "0";
|
index_writer
|
||||||
test_json_field_name(field_name_in, field_name_out);
|
.add_document(doc!(field=>json!({"key": "test1", "invalidkey\u{0000}": "test2"})))
|
||||||
|
.unwrap();
|
||||||
|
index_writer.commit().unwrap();
|
||||||
|
let reader = index.reader().unwrap();
|
||||||
|
let searcher = reader.searcher();
|
||||||
|
let segment_reader = searcher.segment_reader(0);
|
||||||
|
let inv_indexer = segment_reader.inverted_index(field).unwrap();
|
||||||
|
let term_dict = inv_indexer.terms();
|
||||||
|
assert_eq!(term_dict.num_terms(), 1);
|
||||||
|
let mut term_bytes = Vec::new();
|
||||||
|
term_dict.ord_to_term(0, &mut term_bytes).unwrap();
|
||||||
|
assert_eq!(term_bytes, b"key\0stest1");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_json_field_1byte() {
|
fn test_json_field_1byte() {
|
||||||
// Test when field name contains a '1' byte, which has special meaning in tantivy.
|
// Test when field name contains a '1' byte, which has special meaning in tantivy.
|
||||||
@@ -291,7 +303,7 @@ mod tests_mmap {
|
|||||||
Type::Str,
|
Type::Str,
|
||||||
),
|
),
|
||||||
(format!("{field_name_out_internal}a"), Type::Str),
|
(format!("{field_name_out_internal}a"), Type::Str),
|
||||||
(format!("{field_name_out_internal}"), Type::Str),
|
(field_name_out_internal.to_string(), Type::Str),
|
||||||
(format!("num{field_name_out_internal}"), Type::I64),
|
(format!("num{field_name_out_internal}"), Type::I64),
|
||||||
];
|
];
|
||||||
expected_fields.sort();
|
expected_fields.sort();
|
||||||
|
|||||||
@@ -38,7 +38,8 @@ impl PathToUnorderedId {
|
|||||||
#[cold]
|
#[cold]
|
||||||
fn insert_new_path(&mut self, path: &str) -> u32 {
|
fn insert_new_path(&mut self, path: &str) -> u32 {
|
||||||
let next_id = self.map.len() as u32;
|
let next_id = self.map.len() as u32;
|
||||||
self.map.insert(path.to_string(), next_id);
|
let new_path = path.to_string();
|
||||||
|
self.map.insert(new_path, next_id);
|
||||||
next_id
|
next_id
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -18,27 +18,9 @@ pub struct SegmentSerializer {
|
|||||||
|
|
||||||
impl SegmentSerializer {
|
impl SegmentSerializer {
|
||||||
/// Creates a new `SegmentSerializer`.
|
/// Creates a new `SegmentSerializer`.
|
||||||
pub fn for_segment(
|
pub fn for_segment(mut segment: Segment) -> crate::Result<SegmentSerializer> {
|
||||||
mut segment: Segment,
|
|
||||||
is_in_merge: bool,
|
|
||||||
) -> crate::Result<SegmentSerializer> {
|
|
||||||
// If the segment is going to be sorted, we stream the docs first to a temporary file.
|
|
||||||
// In the merge case this is not necessary because we can kmerge the already sorted
|
|
||||||
// segments
|
|
||||||
let remapping_required = segment.index().settings().sort_by_field.is_some() && !is_in_merge;
|
|
||||||
let settings = segment.index().settings().clone();
|
let settings = segment.index().settings().clone();
|
||||||
let store_writer = if remapping_required {
|
let store_writer = {
|
||||||
let store_write = segment.open_write(SegmentComponent::TempStore)?;
|
|
||||||
StoreWriter::new(
|
|
||||||
store_write,
|
|
||||||
crate::store::Compressor::None,
|
|
||||||
// We want fast random access on the docs, so we choose a small block size.
|
|
||||||
// If this is zero, the skip index will contain too many checkpoints and
|
|
||||||
// therefore will be relatively slow.
|
|
||||||
16000,
|
|
||||||
settings.docstore_compress_dedicated_thread,
|
|
||||||
)?
|
|
||||||
} else {
|
|
||||||
let store_write = segment.open_write(SegmentComponent::Store)?;
|
let store_write = segment.open_write(SegmentComponent::Store)?;
|
||||||
StoreWriter::new(
|
StoreWriter::new(
|
||||||
store_write,
|
store_write,
|
||||||
@@ -72,10 +54,6 @@ impl SegmentSerializer {
|
|||||||
&self.segment
|
&self.segment
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn segment_mut(&mut self) -> &mut Segment {
|
|
||||||
&mut self.segment
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Accessor to the `PostingsSerializer`.
|
/// Accessor to the `PostingsSerializer`.
|
||||||
pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer {
|
pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer {
|
||||||
&mut self.postings_serializer
|
&mut self.postings_serializer
|
||||||
|
|||||||
@@ -115,11 +115,10 @@ fn merge(
|
|||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
// An IndexMerger is like a "view" of our merged segments.
|
// An IndexMerger is like a "view" of our merged segments.
|
||||||
let merger: IndexMerger =
|
let merger: IndexMerger = IndexMerger::open(index.schema(), &segments[..])?;
|
||||||
IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?;
|
|
||||||
|
|
||||||
// ... we just serialize this index merger in our new segment to merge the segments.
|
// ... we just serialize this index merger in our new segment to merge the segments.
|
||||||
let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone(), true)?;
|
let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone())?;
|
||||||
|
|
||||||
let num_docs = merger.write(segment_serializer)?;
|
let num_docs = merger.write(segment_serializer)?;
|
||||||
|
|
||||||
@@ -220,13 +219,9 @@ pub fn merge_filtered_segments<T: Into<Box<dyn Directory>>>(
|
|||||||
)?;
|
)?;
|
||||||
let merged_segment = merged_index.new_segment();
|
let merged_segment = merged_index.new_segment();
|
||||||
let merged_segment_id = merged_segment.id();
|
let merged_segment_id = merged_segment.id();
|
||||||
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
|
let merger: IndexMerger =
|
||||||
merged_index.schema(),
|
IndexMerger::open_with_custom_alive_set(merged_index.schema(), segments, filter_doc_ids)?;
|
||||||
merged_index.settings().clone(),
|
let segment_serializer = SegmentSerializer::for_segment(merged_segment)?;
|
||||||
segments,
|
|
||||||
filter_doc_ids,
|
|
||||||
)?;
|
|
||||||
let segment_serializer = SegmentSerializer::for_segment(merged_segment, true)?;
|
|
||||||
let num_docs = merger.write(segment_serializer)?;
|
let num_docs = merger.write(segment_serializer)?;
|
||||||
|
|
||||||
let segment_meta = merged_index.new_segment_meta(merged_segment_id, num_docs);
|
let segment_meta = merged_index.new_segment_meta(merged_segment_id, num_docs);
|
||||||
@@ -1067,7 +1062,6 @@ mod tests {
|
|||||||
)?;
|
)?;
|
||||||
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
|
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
|
||||||
merged_index.schema(),
|
merged_index.schema(),
|
||||||
merged_index.settings().clone(),
|
|
||||||
&segments[..],
|
&segments[..],
|
||||||
filter_segments,
|
filter_segments,
|
||||||
)?;
|
)?;
|
||||||
@@ -1083,7 +1077,6 @@ mod tests {
|
|||||||
Index::create(RamDirectory::default(), target_schema, target_settings)?;
|
Index::create(RamDirectory::default(), target_schema, target_settings)?;
|
||||||
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
|
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
|
||||||
merged_index.schema(),
|
merged_index.schema(),
|
||||||
merged_index.settings().clone(),
|
|
||||||
&segments[..],
|
&segments[..],
|
||||||
filter_segments,
|
filter_segments,
|
||||||
)?;
|
)?;
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ use common::JsonPathWriter;
|
|||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use tokenizer_api::BoxTokenStream;
|
use tokenizer_api::BoxTokenStream;
|
||||||
|
|
||||||
use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};
|
|
||||||
use super::operation::AddOperation;
|
use super::operation::AddOperation;
|
||||||
use crate::fastfield::FastFieldsWriter;
|
use crate::fastfield::FastFieldsWriter;
|
||||||
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
|
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
|
||||||
@@ -16,7 +15,6 @@ use crate::postings::{
|
|||||||
};
|
};
|
||||||
use crate::schema::document::{Document, Value};
|
use crate::schema::document::{Document, Value};
|
||||||
use crate::schema::{FieldEntry, FieldType, Schema, Term, DATE_TIME_PRECISION_INDEXED};
|
use crate::schema::{FieldEntry, FieldType, Schema, Term, DATE_TIME_PRECISION_INDEXED};
|
||||||
use crate::store::{StoreReader, StoreWriter};
|
|
||||||
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
|
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
|
||||||
use crate::{DocId, Opstamp, TantivyError};
|
use crate::{DocId, Opstamp, TantivyError};
|
||||||
|
|
||||||
@@ -41,20 +39,6 @@ fn compute_initial_table_size(per_thread_memory_budget: usize) -> crate::Result<
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn remap_doc_opstamps(
|
|
||||||
opstamps: Vec<Opstamp>,
|
|
||||||
doc_id_mapping_opt: Option<&DocIdMapping>,
|
|
||||||
) -> Vec<Opstamp> {
|
|
||||||
if let Some(doc_id_mapping_opt) = doc_id_mapping_opt {
|
|
||||||
doc_id_mapping_opt
|
|
||||||
.iter_old_doc_ids()
|
|
||||||
.map(|doc| opstamps[doc as usize])
|
|
||||||
.collect()
|
|
||||||
} else {
|
|
||||||
opstamps
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// A `SegmentWriter` is in charge of creating segment index from a
|
/// A `SegmentWriter` is in charge of creating segment index from a
|
||||||
/// set of documents.
|
/// set of documents.
|
||||||
///
|
///
|
||||||
@@ -90,7 +74,7 @@ impl SegmentWriter {
|
|||||||
let tokenizer_manager = segment.index().tokenizers().clone();
|
let tokenizer_manager = segment.index().tokenizers().clone();
|
||||||
let tokenizer_manager_fast_field = segment.index().fast_field_tokenizer().clone();
|
let tokenizer_manager_fast_field = segment.index().fast_field_tokenizer().clone();
|
||||||
let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
|
let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
|
||||||
let segment_serializer = SegmentSerializer::for_segment(segment, false)?;
|
let segment_serializer = SegmentSerializer::for_segment(segment)?;
|
||||||
let per_field_postings_writers = PerFieldPostingsWriter::for_schema(&schema);
|
let per_field_postings_writers = PerFieldPostingsWriter::for_schema(&schema);
|
||||||
let per_field_text_analyzers = schema
|
let per_field_text_analyzers = schema
|
||||||
.fields()
|
.fields()
|
||||||
@@ -139,15 +123,6 @@ impl SegmentWriter {
|
|||||||
/// be used afterwards.
|
/// be used afterwards.
|
||||||
pub fn finalize(mut self) -> crate::Result<Vec<u64>> {
|
pub fn finalize(mut self) -> crate::Result<Vec<u64>> {
|
||||||
self.fieldnorms_writer.fill_up_to_max_doc(self.max_doc);
|
self.fieldnorms_writer.fill_up_to_max_doc(self.max_doc);
|
||||||
let mapping: Option<DocIdMapping> = self
|
|
||||||
.segment_serializer
|
|
||||||
.segment()
|
|
||||||
.index()
|
|
||||||
.settings()
|
|
||||||
.sort_by_field
|
|
||||||
.clone()
|
|
||||||
.map(|sort_by_field| get_doc_id_mapping_from_field(sort_by_field, &self))
|
|
||||||
.transpose()?;
|
|
||||||
remap_and_write(
|
remap_and_write(
|
||||||
self.schema,
|
self.schema,
|
||||||
&self.per_field_postings_writers,
|
&self.per_field_postings_writers,
|
||||||
@@ -155,10 +130,8 @@ impl SegmentWriter {
|
|||||||
self.fast_field_writers,
|
self.fast_field_writers,
|
||||||
&self.fieldnorms_writer,
|
&self.fieldnorms_writer,
|
||||||
self.segment_serializer,
|
self.segment_serializer,
|
||||||
mapping.as_ref(),
|
|
||||||
)?;
|
)?;
|
||||||
let doc_opstamps = remap_doc_opstamps(self.doc_opstamps, mapping.as_ref());
|
Ok(self.doc_opstamps)
|
||||||
Ok(doc_opstamps)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns an estimation of the current memory usage of the segment writer.
|
/// Returns an estimation of the current memory usage of the segment writer.
|
||||||
@@ -419,11 +392,10 @@ fn remap_and_write(
|
|||||||
fast_field_writers: FastFieldsWriter,
|
fast_field_writers: FastFieldsWriter,
|
||||||
fieldnorms_writer: &FieldNormsWriter,
|
fieldnorms_writer: &FieldNormsWriter,
|
||||||
mut serializer: SegmentSerializer,
|
mut serializer: SegmentSerializer,
|
||||||
doc_id_map: Option<&DocIdMapping>,
|
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
debug!("remap-and-write");
|
debug!("remap-and-write");
|
||||||
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
|
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
|
||||||
fieldnorms_writer.serialize(fieldnorms_serializer, doc_id_map)?;
|
fieldnorms_writer.serialize(fieldnorms_serializer)?;
|
||||||
}
|
}
|
||||||
let fieldnorm_data = serializer
|
let fieldnorm_data = serializer
|
||||||
.segment()
|
.segment()
|
||||||
@@ -434,39 +406,10 @@ fn remap_and_write(
|
|||||||
schema,
|
schema,
|
||||||
per_field_postings_writers,
|
per_field_postings_writers,
|
||||||
fieldnorm_readers,
|
fieldnorm_readers,
|
||||||
doc_id_map,
|
|
||||||
serializer.get_postings_serializer(),
|
serializer.get_postings_serializer(),
|
||||||
)?;
|
)?;
|
||||||
debug!("fastfield-serialize");
|
debug!("fastfield-serialize");
|
||||||
fast_field_writers.serialize(serializer.get_fast_field_write(), doc_id_map)?;
|
fast_field_writers.serialize(serializer.get_fast_field_write())?;
|
||||||
|
|
||||||
// finalize temp docstore and create version, which reflects the doc_id_map
|
|
||||||
if let Some(doc_id_map) = doc_id_map {
|
|
||||||
debug!("resort-docstore");
|
|
||||||
let store_write = serializer
|
|
||||||
.segment_mut()
|
|
||||||
.open_write(SegmentComponent::Store)?;
|
|
||||||
let settings = serializer.segment().index().settings();
|
|
||||||
let store_writer = StoreWriter::new(
|
|
||||||
store_write,
|
|
||||||
settings.docstore_compression,
|
|
||||||
settings.docstore_blocksize,
|
|
||||||
settings.docstore_compress_dedicated_thread,
|
|
||||||
)?;
|
|
||||||
let old_store_writer = std::mem::replace(&mut serializer.store_writer, store_writer);
|
|
||||||
old_store_writer.close()?;
|
|
||||||
let store_read = StoreReader::open(
|
|
||||||
serializer
|
|
||||||
.segment()
|
|
||||||
.open_read(SegmentComponent::TempStore)?,
|
|
||||||
1, /* The docstore is configured to have one doc per block, and each doc is accessed
|
|
||||||
* only once: we don't need caching. */
|
|
||||||
)?;
|
|
||||||
for old_doc_id in doc_id_map.iter_old_doc_ids() {
|
|
||||||
let doc_bytes = store_read.get_document_bytes(old_doc_id)?;
|
|
||||||
serializer.get_store_writer().store_bytes(&doc_bytes)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
debug!("serializer-close");
|
debug!("serializer-close");
|
||||||
serializer.close()?;
|
serializer.close()?;
|
||||||
|
|||||||
@@ -222,8 +222,8 @@ pub use crate::core::{Executor, Searcher, SearcherGeneration};
|
|||||||
pub use crate::directory::Directory;
|
pub use crate::directory::Directory;
|
||||||
#[allow(deprecated)] // Remove with index sorting
|
#[allow(deprecated)] // Remove with index sorting
|
||||||
pub use crate::index::{
|
pub use crate::index::{
|
||||||
Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader, Order,
|
Index, IndexBuilder, IndexMeta, IndexSettings, InvertedIndexReader, Order, Segment,
|
||||||
Segment, SegmentMeta, SegmentReader,
|
SegmentMeta, SegmentReader,
|
||||||
};
|
};
|
||||||
pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
|
pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
|
||||||
pub use crate::schema::{Document, TantivyDocument, Term};
|
pub use crate::schema::{Document, TantivyDocument, Term};
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
//! In "The beauty and the beast", the term "the" appears in position 0 and position 3.
|
//! In "The beauty and the beast", the term "the" appears in position 0 and position 3.
|
||||||
//! This information is useful to run phrase queries.
|
//! This information is useful to run phrase queries.
|
||||||
//!
|
//!
|
||||||
//! The [position](crate::SegmentComponent::Positions) file contains all of the
|
//! The [position](crate::index::SegmentComponent::Positions) file contains all of the
|
||||||
//! bitpacked positions delta, for all terms of a given field, one term after the other.
|
//! bitpacked positions delta, for all terms of a given field, one term after the other.
|
||||||
//!
|
//!
|
||||||
//! Each term is encoded independently.
|
//! Each term is encoded independently.
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ use std::io;
|
|||||||
use common::json_path_writer::JSON_END_OF_PATH;
|
use common::json_path_writer::JSON_END_OF_PATH;
|
||||||
use stacker::Addr;
|
use stacker::Addr;
|
||||||
|
|
||||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
|
||||||
use crate::indexer::path_to_unordered_id::OrderedPathId;
|
use crate::indexer::path_to_unordered_id::OrderedPathId;
|
||||||
use crate::postings::postings_writer::SpecializedPostingsWriter;
|
use crate::postings::postings_writer::SpecializedPostingsWriter;
|
||||||
use crate::postings::recorder::{BufferLender, DocIdRecorder, Recorder};
|
use crate::postings::recorder::{BufferLender, DocIdRecorder, Recorder};
|
||||||
@@ -60,9 +59,8 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
|||||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||||
fn serialize(
|
fn serialize(
|
||||||
&self,
|
&self,
|
||||||
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
|
ordered_term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
|
||||||
ordered_id_to_path: &[&str],
|
ordered_id_to_path: &[&str],
|
||||||
doc_id_map: Option<&DocIdMapping>,
|
|
||||||
ctx: &IndexingContext,
|
ctx: &IndexingContext,
|
||||||
serializer: &mut FieldSerializer,
|
serializer: &mut FieldSerializer,
|
||||||
) -> io::Result<()> {
|
) -> io::Result<()> {
|
||||||
@@ -71,7 +69,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
|||||||
term_buffer.clear_with_field_and_type(Type::Json, Field::from_field_id(0));
|
term_buffer.clear_with_field_and_type(Type::Json, Field::from_field_id(0));
|
||||||
let mut prev_term_id = u32::MAX;
|
let mut prev_term_id = u32::MAX;
|
||||||
let mut term_path_len = 0; // this will be set in the first iteration
|
let mut term_path_len = 0; // this will be set in the first iteration
|
||||||
for (_field, path_id, term, addr) in term_addrs {
|
for (_field, path_id, term, addr) in ordered_term_addrs {
|
||||||
if prev_term_id != path_id.path_id() {
|
if prev_term_id != path_id.path_id() {
|
||||||
term_buffer.truncate_value_bytes(0);
|
term_buffer.truncate_value_bytes(0);
|
||||||
term_buffer.append_path(ordered_id_to_path[path_id.path_id() as usize].as_bytes());
|
term_buffer.append_path(ordered_id_to_path[path_id.path_id() as usize].as_bytes());
|
||||||
@@ -87,7 +85,6 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
|||||||
SpecializedPostingsWriter::<Rec>::serialize_one_term(
|
SpecializedPostingsWriter::<Rec>::serialize_one_term(
|
||||||
term_buffer.serialized_value_bytes(),
|
term_buffer.serialized_value_bytes(),
|
||||||
*addr,
|
*addr,
|
||||||
doc_id_map,
|
|
||||||
&mut buffer_lender,
|
&mut buffer_lender,
|
||||||
ctx,
|
ctx,
|
||||||
serializer,
|
serializer,
|
||||||
@@ -96,7 +93,6 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
|||||||
SpecializedPostingsWriter::<DocIdRecorder>::serialize_one_term(
|
SpecializedPostingsWriter::<DocIdRecorder>::serialize_one_term(
|
||||||
term_buffer.serialized_value_bytes(),
|
term_buffer.serialized_value_bytes(),
|
||||||
*addr,
|
*addr,
|
||||||
doc_id_map,
|
|
||||||
&mut buffer_lender,
|
&mut buffer_lender,
|
||||||
ctx,
|
ctx,
|
||||||
serializer,
|
serializer,
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ pub trait Postings: DocSet + 'static {
|
|||||||
fn term_freq(&self) -> u32;
|
fn term_freq(&self) -> u32;
|
||||||
|
|
||||||
/// Returns the positions offsetted with a given value.
|
/// Returns the positions offsetted with a given value.
|
||||||
|
/// It is not necessary to clear the `output` before calling this method.
|
||||||
/// The output vector will be resized to the `term_freq`.
|
/// The output vector will be resized to the `term_freq`.
|
||||||
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>);
|
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>);
|
||||||
|
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ use std::ops::Range;
|
|||||||
use stacker::Addr;
|
use stacker::Addr;
|
||||||
|
|
||||||
use crate::fieldnorm::FieldNormReaders;
|
use crate::fieldnorm::FieldNormReaders;
|
||||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
|
||||||
use crate::indexer::path_to_unordered_id::OrderedPathId;
|
use crate::indexer::path_to_unordered_id::OrderedPathId;
|
||||||
use crate::postings::recorder::{BufferLender, Recorder};
|
use crate::postings::recorder::{BufferLender, Recorder};
|
||||||
use crate::postings::{
|
use crate::postings::{
|
||||||
@@ -50,7 +49,6 @@ pub(crate) fn serialize_postings(
|
|||||||
schema: Schema,
|
schema: Schema,
|
||||||
per_field_postings_writers: &PerFieldPostingsWriter,
|
per_field_postings_writers: &PerFieldPostingsWriter,
|
||||||
fieldnorm_readers: FieldNormReaders,
|
fieldnorm_readers: FieldNormReaders,
|
||||||
doc_id_map: Option<&DocIdMapping>,
|
|
||||||
serializer: &mut InvertedIndexSerializer,
|
serializer: &mut InvertedIndexSerializer,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
// Replace unordered ids by ordered ids to be able to sort
|
// Replace unordered ids by ordered ids to be able to sort
|
||||||
@@ -86,7 +84,6 @@ pub(crate) fn serialize_postings(
|
|||||||
postings_writer.serialize(
|
postings_writer.serialize(
|
||||||
&term_offsets[byte_offsets],
|
&term_offsets[byte_offsets],
|
||||||
&ordered_id_to_path,
|
&ordered_id_to_path,
|
||||||
doc_id_map,
|
|
||||||
&ctx,
|
&ctx,
|
||||||
&mut field_serializer,
|
&mut field_serializer,
|
||||||
)?;
|
)?;
|
||||||
@@ -122,7 +119,6 @@ pub(crate) trait PostingsWriter: Send + Sync {
|
|||||||
&self,
|
&self,
|
||||||
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
|
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
|
||||||
ordered_id_to_path: &[&str],
|
ordered_id_to_path: &[&str],
|
||||||
doc_id_map: Option<&DocIdMapping>,
|
|
||||||
ctx: &IndexingContext,
|
ctx: &IndexingContext,
|
||||||
serializer: &mut FieldSerializer,
|
serializer: &mut FieldSerializer,
|
||||||
) -> io::Result<()>;
|
) -> io::Result<()>;
|
||||||
@@ -187,7 +183,6 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
|
|||||||
pub(crate) fn serialize_one_term(
|
pub(crate) fn serialize_one_term(
|
||||||
term: &[u8],
|
term: &[u8],
|
||||||
addr: Addr,
|
addr: Addr,
|
||||||
doc_id_map: Option<&DocIdMapping>,
|
|
||||||
buffer_lender: &mut BufferLender,
|
buffer_lender: &mut BufferLender,
|
||||||
ctx: &IndexingContext,
|
ctx: &IndexingContext,
|
||||||
serializer: &mut FieldSerializer,
|
serializer: &mut FieldSerializer,
|
||||||
@@ -195,7 +190,7 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
|
|||||||
let recorder: Rec = ctx.term_index.read(addr);
|
let recorder: Rec = ctx.term_index.read(addr);
|
||||||
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
|
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
|
||||||
serializer.new_term(term, term_doc_freq, recorder.has_term_freq())?;
|
serializer.new_term(term, term_doc_freq, recorder.has_term_freq())?;
|
||||||
recorder.serialize(&ctx.arena, doc_id_map, serializer, buffer_lender);
|
recorder.serialize(&ctx.arena, serializer, buffer_lender);
|
||||||
serializer.close_term()?;
|
serializer.close_term()?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -229,13 +224,12 @@ impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
|
|||||||
&self,
|
&self,
|
||||||
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
|
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
|
||||||
_ordered_id_to_path: &[&str],
|
_ordered_id_to_path: &[&str],
|
||||||
doc_id_map: Option<&DocIdMapping>,
|
|
||||||
ctx: &IndexingContext,
|
ctx: &IndexingContext,
|
||||||
serializer: &mut FieldSerializer,
|
serializer: &mut FieldSerializer,
|
||||||
) -> io::Result<()> {
|
) -> io::Result<()> {
|
||||||
let mut buffer_lender = BufferLender::default();
|
let mut buffer_lender = BufferLender::default();
|
||||||
for (_field, _path_id, term, addr) in term_addrs {
|
for (_field, _path_id, term, addr) in term_addrs {
|
||||||
Self::serialize_one_term(term, *addr, doc_id_map, &mut buffer_lender, ctx, serializer)?;
|
Self::serialize_one_term(term, *addr, &mut buffer_lender, ctx, serializer)?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
use common::read_u32_vint;
|
use common::read_u32_vint;
|
||||||
use stacker::{ExpUnrolledLinkedList, MemoryArena};
|
use stacker::{ExpUnrolledLinkedList, MemoryArena};
|
||||||
|
|
||||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
|
||||||
use crate::postings::FieldSerializer;
|
use crate::postings::FieldSerializer;
|
||||||
use crate::DocId;
|
use crate::DocId;
|
||||||
|
|
||||||
@@ -71,7 +70,6 @@ pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static {
|
|||||||
fn serialize(
|
fn serialize(
|
||||||
&self,
|
&self,
|
||||||
arena: &MemoryArena,
|
arena: &MemoryArena,
|
||||||
doc_id_map: Option<&DocIdMapping>,
|
|
||||||
serializer: &mut FieldSerializer<'_>,
|
serializer: &mut FieldSerializer<'_>,
|
||||||
buffer_lender: &mut BufferLender,
|
buffer_lender: &mut BufferLender,
|
||||||
);
|
);
|
||||||
@@ -115,26 +113,15 @@ impl Recorder for DocIdRecorder {
|
|||||||
fn serialize(
|
fn serialize(
|
||||||
&self,
|
&self,
|
||||||
arena: &MemoryArena,
|
arena: &MemoryArena,
|
||||||
doc_id_map: Option<&DocIdMapping>,
|
|
||||||
serializer: &mut FieldSerializer<'_>,
|
serializer: &mut FieldSerializer<'_>,
|
||||||
buffer_lender: &mut BufferLender,
|
buffer_lender: &mut BufferLender,
|
||||||
) {
|
) {
|
||||||
let (buffer, doc_ids) = buffer_lender.lend_all();
|
let buffer = buffer_lender.lend_u8();
|
||||||
// TODO avoid reading twice.
|
// TODO avoid reading twice.
|
||||||
self.stack.read_to_end(arena, buffer);
|
self.stack.read_to_end(arena, buffer);
|
||||||
if let Some(doc_id_map) = doc_id_map {
|
let iter = get_sum_reader(VInt32Reader::new(&buffer[..]));
|
||||||
let iter = get_sum_reader(VInt32Reader::new(&buffer[..]));
|
for doc_id in iter {
|
||||||
doc_ids.extend(iter.map(|old_doc_id| doc_id_map.get_new_doc_id(old_doc_id)));
|
serializer.write_doc(doc_id, 0u32, &[][..]);
|
||||||
doc_ids.sort_unstable();
|
|
||||||
|
|
||||||
for doc in doc_ids {
|
|
||||||
serializer.write_doc(*doc, 0u32, &[][..]);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
let iter = get_sum_reader(VInt32Reader::new(&buffer[..]));
|
|
||||||
for doc_id in iter {
|
|
||||||
serializer.write_doc(doc_id, 0u32, &[][..]);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -194,35 +181,18 @@ impl Recorder for TermFrequencyRecorder {
|
|||||||
fn serialize(
|
fn serialize(
|
||||||
&self,
|
&self,
|
||||||
arena: &MemoryArena,
|
arena: &MemoryArena,
|
||||||
doc_id_map: Option<&DocIdMapping>,
|
|
||||||
serializer: &mut FieldSerializer<'_>,
|
serializer: &mut FieldSerializer<'_>,
|
||||||
buffer_lender: &mut BufferLender,
|
buffer_lender: &mut BufferLender,
|
||||||
) {
|
) {
|
||||||
let buffer = buffer_lender.lend_u8();
|
let buffer = buffer_lender.lend_u8();
|
||||||
self.stack.read_to_end(arena, buffer);
|
self.stack.read_to_end(arena, buffer);
|
||||||
let mut u32_it = VInt32Reader::new(&buffer[..]);
|
let mut u32_it = VInt32Reader::new(&buffer[..]);
|
||||||
if let Some(doc_id_map) = doc_id_map {
|
let mut prev_doc = 0;
|
||||||
let mut doc_id_and_tf = vec![];
|
while let Some(delta_doc_id) = u32_it.next() {
|
||||||
let mut prev_doc = 0;
|
let doc_id = prev_doc + delta_doc_id;
|
||||||
while let Some(delta_doc_id) = u32_it.next() {
|
prev_doc = doc_id;
|
||||||
let doc_id = prev_doc + delta_doc_id;
|
let term_freq = u32_it.next().unwrap_or(self.current_tf);
|
||||||
prev_doc = doc_id;
|
serializer.write_doc(doc_id, term_freq, &[][..]);
|
||||||
let term_freq = u32_it.next().unwrap_or(self.current_tf);
|
|
||||||
doc_id_and_tf.push((doc_id_map.get_new_doc_id(doc_id), term_freq));
|
|
||||||
}
|
|
||||||
doc_id_and_tf.sort_unstable_by_key(|&(doc_id, _)| doc_id);
|
|
||||||
|
|
||||||
for (doc_id, tf) in doc_id_and_tf {
|
|
||||||
serializer.write_doc(doc_id, tf, &[][..]);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
let mut prev_doc = 0;
|
|
||||||
while let Some(delta_doc_id) = u32_it.next() {
|
|
||||||
let doc_id = prev_doc + delta_doc_id;
|
|
||||||
prev_doc = doc_id;
|
|
||||||
let term_freq = u32_it.next().unwrap_or(self.current_tf);
|
|
||||||
serializer.write_doc(doc_id, term_freq, &[][..]);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -268,14 +238,12 @@ impl Recorder for TfAndPositionRecorder {
|
|||||||
fn serialize(
|
fn serialize(
|
||||||
&self,
|
&self,
|
||||||
arena: &MemoryArena,
|
arena: &MemoryArena,
|
||||||
doc_id_map: Option<&DocIdMapping>,
|
|
||||||
serializer: &mut FieldSerializer<'_>,
|
serializer: &mut FieldSerializer<'_>,
|
||||||
buffer_lender: &mut BufferLender,
|
buffer_lender: &mut BufferLender,
|
||||||
) {
|
) {
|
||||||
let (buffer_u8, buffer_positions) = buffer_lender.lend_all();
|
let (buffer_u8, buffer_positions) = buffer_lender.lend_all();
|
||||||
self.stack.read_to_end(arena, buffer_u8);
|
self.stack.read_to_end(arena, buffer_u8);
|
||||||
let mut u32_it = VInt32Reader::new(&buffer_u8[..]);
|
let mut u32_it = VInt32Reader::new(&buffer_u8[..]);
|
||||||
let mut doc_id_and_positions = vec![];
|
|
||||||
let mut prev_doc = 0;
|
let mut prev_doc = 0;
|
||||||
while let Some(delta_doc_id) = u32_it.next() {
|
while let Some(delta_doc_id) = u32_it.next() {
|
||||||
let doc_id = prev_doc + delta_doc_id;
|
let doc_id = prev_doc + delta_doc_id;
|
||||||
@@ -294,19 +262,7 @@ impl Recorder for TfAndPositionRecorder {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if let Some(doc_id_map) = doc_id_map {
|
serializer.write_doc(doc_id, buffer_positions.len() as u32, buffer_positions);
|
||||||
// this simple variant to remap may consume to much memory
|
|
||||||
doc_id_and_positions
|
|
||||||
.push((doc_id_map.get_new_doc_id(doc_id), buffer_positions.to_vec()));
|
|
||||||
} else {
|
|
||||||
serializer.write_doc(doc_id, buffer_positions.len() as u32, buffer_positions);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if doc_id_map.is_some() {
|
|
||||||
doc_id_and_positions.sort_unstable_by_key(|&(doc_id, _)| doc_id);
|
|
||||||
for (doc_id, positions) in doc_id_and_positions {
|
|
||||||
serializer.write_doc(doc_id, positions.len() as u32, &positions);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -22,10 +22,7 @@ pub struct AllWeight;
|
|||||||
|
|
||||||
impl Weight for AllWeight {
|
impl Weight for AllWeight {
|
||||||
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
||||||
let all_scorer = AllScorer {
|
let all_scorer = AllScorer::new(reader.max_doc());
|
||||||
doc: 0u32,
|
|
||||||
max_doc: reader.max_doc(),
|
|
||||||
};
|
|
||||||
Ok(Box::new(BoostScorer::new(all_scorer, boost)))
|
Ok(Box::new(BoostScorer::new(all_scorer, boost)))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -43,6 +40,13 @@ pub struct AllScorer {
|
|||||||
max_doc: DocId,
|
max_doc: DocId,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl AllScorer {
|
||||||
|
/// Creates a new AllScorer with `max_doc` docs.
|
||||||
|
pub fn new(max_doc: DocId) -> AllScorer {
|
||||||
|
AllScorer { doc: 0u32, max_doc }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl DocSet for AllScorer {
|
impl DocSet for AllScorer {
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
fn advance(&mut self) -> DocId {
|
fn advance(&mut self) -> DocId {
|
||||||
|
|||||||
@@ -66,6 +66,10 @@ use crate::schema::{IndexRecordOption, Term};
|
|||||||
/// Term::from_field_text(title, "diary"),
|
/// Term::from_field_text(title, "diary"),
|
||||||
/// IndexRecordOption::Basic,
|
/// IndexRecordOption::Basic,
|
||||||
/// ));
|
/// ));
|
||||||
|
/// let cow_term_query: Box<dyn Query> = Box::new(TermQuery::new(
|
||||||
|
/// Term::from_field_text(title, "cow"),
|
||||||
|
/// IndexRecordOption::Basic
|
||||||
|
/// ));
|
||||||
/// // A TermQuery with "found" in the body
|
/// // A TermQuery with "found" in the body
|
||||||
/// let body_term_query: Box<dyn Query> = Box::new(TermQuery::new(
|
/// let body_term_query: Box<dyn Query> = Box::new(TermQuery::new(
|
||||||
/// Term::from_field_text(body, "found"),
|
/// Term::from_field_text(body, "found"),
|
||||||
@@ -74,7 +78,7 @@ use crate::schema::{IndexRecordOption, Term};
|
|||||||
/// // TermQuery "diary" must and "girl" must not be present
|
/// // TermQuery "diary" must and "girl" must not be present
|
||||||
/// let queries_with_occurs1 = vec![
|
/// let queries_with_occurs1 = vec![
|
||||||
/// (Occur::Must, diary_term_query.box_clone()),
|
/// (Occur::Must, diary_term_query.box_clone()),
|
||||||
/// (Occur::MustNot, girl_term_query),
|
/// (Occur::MustNot, girl_term_query.box_clone()),
|
||||||
/// ];
|
/// ];
|
||||||
/// // Make a BooleanQuery equivalent to
|
/// // Make a BooleanQuery equivalent to
|
||||||
/// // title:+diary title:-girl
|
/// // title:+diary title:-girl
|
||||||
@@ -82,15 +86,10 @@ use crate::schema::{IndexRecordOption, Term};
|
|||||||
/// let count1 = searcher.search(&diary_must_and_girl_mustnot, &Count)?;
|
/// let count1 = searcher.search(&diary_must_and_girl_mustnot, &Count)?;
|
||||||
/// assert_eq!(count1, 1);
|
/// assert_eq!(count1, 1);
|
||||||
///
|
///
|
||||||
/// // TermQuery for "cow" in the title
|
|
||||||
/// let cow_term_query: Box<dyn Query> = Box::new(TermQuery::new(
|
|
||||||
/// Term::from_field_text(title, "cow"),
|
|
||||||
/// IndexRecordOption::Basic,
|
|
||||||
/// ));
|
|
||||||
/// // "title:diary OR title:cow"
|
/// // "title:diary OR title:cow"
|
||||||
/// let title_diary_or_cow = BooleanQuery::new(vec![
|
/// let title_diary_or_cow = BooleanQuery::new(vec![
|
||||||
/// (Occur::Should, diary_term_query.box_clone()),
|
/// (Occur::Should, diary_term_query.box_clone()),
|
||||||
/// (Occur::Should, cow_term_query),
|
/// (Occur::Should, cow_term_query.box_clone()),
|
||||||
/// ]);
|
/// ]);
|
||||||
/// let count2 = searcher.search(&title_diary_or_cow, &Count)?;
|
/// let count2 = searcher.search(&title_diary_or_cow, &Count)?;
|
||||||
/// assert_eq!(count2, 4);
|
/// assert_eq!(count2, 4);
|
||||||
@@ -118,21 +117,38 @@ use crate::schema::{IndexRecordOption, Term};
|
|||||||
/// ]);
|
/// ]);
|
||||||
/// let count4 = searcher.search(&nested_query, &Count)?;
|
/// let count4 = searcher.search(&nested_query, &Count)?;
|
||||||
/// assert_eq!(count4, 1);
|
/// assert_eq!(count4, 1);
|
||||||
|
///
|
||||||
|
/// // You may call `with_minimum_required_clauses` to
|
||||||
|
/// // specify the number of should clauses the returned documents must match.
|
||||||
|
/// let minimum_required_query = BooleanQuery::with_minimum_required_clauses(vec![
|
||||||
|
/// (Occur::Should, cow_term_query.box_clone()),
|
||||||
|
/// (Occur::Should, girl_term_query.box_clone()),
|
||||||
|
/// (Occur::Should, diary_term_query.box_clone()),
|
||||||
|
/// ], 2);
|
||||||
|
/// // Returns documents containing "Diary Cow", "Diary Girl" or "Cow Girl"
|
||||||
|
/// // Notice: "Diary" isn't "Dairy". ;-)
|
||||||
|
/// let count5 = searcher.search(&minimum_required_query, &Count)?;
|
||||||
|
/// assert_eq!(count5, 1);
|
||||||
/// Ok(())
|
/// Ok(())
|
||||||
/// }
|
/// }
|
||||||
/// ```
|
/// ```
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct BooleanQuery {
|
pub struct BooleanQuery {
|
||||||
subqueries: Vec<(Occur, Box<dyn Query>)>,
|
subqueries: Vec<(Occur, Box<dyn Query>)>,
|
||||||
|
minimum_number_should_match: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Clone for BooleanQuery {
|
impl Clone for BooleanQuery {
|
||||||
fn clone(&self) -> Self {
|
fn clone(&self) -> Self {
|
||||||
self.subqueries
|
let subqueries = self
|
||||||
|
.subqueries
|
||||||
.iter()
|
.iter()
|
||||||
.map(|(occur, subquery)| (*occur, subquery.box_clone()))
|
.map(|(occur, subquery)| (*occur, subquery.box_clone()))
|
||||||
.collect::<Vec<_>>()
|
.collect::<Vec<_>>();
|
||||||
.into()
|
Self {
|
||||||
|
subqueries,
|
||||||
|
minimum_number_should_match: self.minimum_number_should_match,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -149,8 +165,9 @@ impl Query for BooleanQuery {
|
|||||||
.iter()
|
.iter()
|
||||||
.map(|(occur, subquery)| Ok((*occur, subquery.weight(enable_scoring)?)))
|
.map(|(occur, subquery)| Ok((*occur, subquery.weight(enable_scoring)?)))
|
||||||
.collect::<crate::Result<_>>()?;
|
.collect::<crate::Result<_>>()?;
|
||||||
Ok(Box::new(BooleanWeight::new(
|
Ok(Box::new(BooleanWeight::with_minimum_number_should_match(
|
||||||
sub_weights,
|
sub_weights,
|
||||||
|
self.minimum_number_should_match,
|
||||||
enable_scoring.is_scoring_enabled(),
|
enable_scoring.is_scoring_enabled(),
|
||||||
Box::new(SumWithCoordsCombiner::default),
|
Box::new(SumWithCoordsCombiner::default),
|
||||||
)))
|
)))
|
||||||
@@ -166,7 +183,41 @@ impl Query for BooleanQuery {
|
|||||||
impl BooleanQuery {
|
impl BooleanQuery {
|
||||||
/// Creates a new boolean query.
|
/// Creates a new boolean query.
|
||||||
pub fn new(subqueries: Vec<(Occur, Box<dyn Query>)>) -> BooleanQuery {
|
pub fn new(subqueries: Vec<(Occur, Box<dyn Query>)>) -> BooleanQuery {
|
||||||
BooleanQuery { subqueries }
|
// If the bool query includes at least one should clause
|
||||||
|
// and no Must or MustNot clauses, the default value is 1. Otherwise, the default value is
|
||||||
|
// 0. Keep pace with Elasticsearch.
|
||||||
|
let mut minimum_required = 0;
|
||||||
|
for (occur, _) in &subqueries {
|
||||||
|
match occur {
|
||||||
|
Occur::Should => minimum_required = 1,
|
||||||
|
Occur::Must | Occur::MustNot => {
|
||||||
|
minimum_required = 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Self::with_minimum_required_clauses(subqueries, minimum_required)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a new boolean query with minimum number of required should clauses specified.
|
||||||
|
pub fn with_minimum_required_clauses(
|
||||||
|
subqueries: Vec<(Occur, Box<dyn Query>)>,
|
||||||
|
minimum_number_should_match: usize,
|
||||||
|
) -> BooleanQuery {
|
||||||
|
BooleanQuery {
|
||||||
|
subqueries,
|
||||||
|
minimum_number_should_match,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Getter for `minimum_number_should_match`
|
||||||
|
pub fn get_minimum_number_should_match(&self) -> usize {
|
||||||
|
self.minimum_number_should_match
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Setter for `minimum_number_should_match`
|
||||||
|
pub fn set_minimum_number_should_match(&mut self, minimum_number_should_match: usize) {
|
||||||
|
self.minimum_number_should_match = minimum_number_should_match;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the intersection of the queries.
|
/// Returns the intersection of the queries.
|
||||||
@@ -181,6 +232,18 @@ impl BooleanQuery {
|
|||||||
BooleanQuery::new(subqueries)
|
BooleanQuery::new(subqueries)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the union of the queries, requiring at least `minimum_required_clauses` of them to match.
|
||||||
|
pub fn union_with_minimum_required_clauses(
|
||||||
|
queries: Vec<Box<dyn Query>>,
|
||||||
|
minimum_required_clauses: usize,
|
||||||
|
) -> BooleanQuery {
|
||||||
|
let subqueries = queries
|
||||||
|
.into_iter()
|
||||||
|
.map(|sub_query| (Occur::Should, sub_query))
|
||||||
|
.collect();
|
||||||
|
BooleanQuery::with_minimum_required_clauses(subqueries, minimum_required_clauses)
|
||||||
|
}
|
||||||
|
|
||||||
/// Helper method to create a boolean query matching a given list of terms.
|
/// Helper method to create a boolean query matching a given list of terms.
|
||||||
/// The resulting query is a disjunction of the terms.
|
/// The resulting query is a disjunction of the terms.
|
||||||
pub fn new_multiterms_query(terms: Vec<Term>) -> BooleanQuery {
|
pub fn new_multiterms_query(terms: Vec<Term>) -> BooleanQuery {
|
||||||
@@ -203,11 +266,13 @@ impl BooleanQuery {
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
use std::collections::HashSet;
|
||||||
|
|
||||||
use super::BooleanQuery;
|
use super::BooleanQuery;
|
||||||
use crate::collector::{Count, DocSetCollector};
|
use crate::collector::{Count, DocSetCollector};
|
||||||
use crate::query::{QueryClone, QueryParser, TermQuery};
|
use crate::query::{Query, QueryClone, QueryParser, TermQuery};
|
||||||
use crate::schema::{IndexRecordOption, Schema, TEXT};
|
use crate::schema::{Field, IndexRecordOption, Schema, TEXT};
|
||||||
use crate::{DocAddress, Index, Term};
|
use crate::{DocAddress, DocId, Index, Term};
|
||||||
|
|
||||||
fn create_test_index() -> crate::Result<Index> {
|
fn create_test_index() -> crate::Result<Index> {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
@@ -223,6 +288,73 @@ mod tests {
|
|||||||
Ok(index)
|
Ok(index)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_minimum_required() -> crate::Result<()> {
|
||||||
|
fn create_test_index_with<T: IntoIterator<Item = &'static str>>(
|
||||||
|
docs: T,
|
||||||
|
) -> crate::Result<Index> {
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
let text = schema_builder.add_text_field("text", TEXT);
|
||||||
|
let schema = schema_builder.build();
|
||||||
|
let index = Index::create_in_ram(schema);
|
||||||
|
let mut writer = index.writer_for_tests()?;
|
||||||
|
for doc in docs {
|
||||||
|
writer.add_document(doc!(text => doc))?;
|
||||||
|
}
|
||||||
|
writer.commit()?;
|
||||||
|
Ok(index)
|
||||||
|
}
|
||||||
|
fn create_boolean_query_with_mr<T: IntoIterator<Item = &'static str>>(
|
||||||
|
queries: T,
|
||||||
|
field: Field,
|
||||||
|
mr: usize,
|
||||||
|
) -> BooleanQuery {
|
||||||
|
let terms = queries
|
||||||
|
.into_iter()
|
||||||
|
.map(|t| Term::from_field_text(field, t))
|
||||||
|
.map(|t| TermQuery::new(t, IndexRecordOption::Basic))
|
||||||
|
.map(|q| -> Box<dyn Query> { Box::new(q) })
|
||||||
|
.collect();
|
||||||
|
BooleanQuery::union_with_minimum_required_clauses(terms, mr)
|
||||||
|
}
|
||||||
|
fn check_doc_id<T: IntoIterator<Item = DocId>>(
|
||||||
|
expected: T,
|
||||||
|
actually: HashSet<DocAddress>,
|
||||||
|
seg: u32,
|
||||||
|
) {
|
||||||
|
assert_eq!(
|
||||||
|
actually,
|
||||||
|
expected
|
||||||
|
.into_iter()
|
||||||
|
.map(|id| DocAddress::new(seg, id))
|
||||||
|
.collect()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
let index = create_test_index_with(["a b c", "a c e", "d f g", "z z z", "c i b"])?;
|
||||||
|
let searcher = index.reader()?.searcher();
|
||||||
|
let text = index.schema().get_field("text").unwrap();
|
||||||
|
// Documents containing 'a c', 'a z', 'a i', 'c z', 'c i' or 'z i' shall be returned.
|
||||||
|
let q1 = create_boolean_query_with_mr(["a", "c", "z", "i"], text, 2);
|
||||||
|
let docs = searcher.search(&q1, &DocSetCollector)?;
|
||||||
|
check_doc_id([0, 1, 4], docs, 0);
|
||||||
|
// Documents containing 'a b c', 'a b e', 'a c e' or 'b c e' shall be returned.
|
||||||
|
let q2 = create_boolean_query_with_mr(["a", "b", "c", "e"], text, 3);
|
||||||
|
let docs = searcher.search(&q2, &DocSetCollector)?;
|
||||||
|
check_doc_id([0, 1], docs, 0);
|
||||||
|
// Nothing queried since minimum_required is too large.
|
||||||
|
let q3 = create_boolean_query_with_mr(["a", "b"], text, 3);
|
||||||
|
let docs = searcher.search(&q3, &DocSetCollector)?;
|
||||||
|
assert!(docs.is_empty());
|
||||||
|
// When mr is set to zero or one, there is no difference from `Boolean::Union`.
|
||||||
|
let q4 = create_boolean_query_with_mr(["a", "z"], text, 1);
|
||||||
|
let docs = searcher.search(&q4, &DocSetCollector)?;
|
||||||
|
check_doc_id([0, 1, 3], docs, 0);
|
||||||
|
let q5 = create_boolean_query_with_mr(["a", "b"], text, 0);
|
||||||
|
let docs = searcher.search(&q5, &DocSetCollector)?;
|
||||||
|
check_doc_id([0, 1, 4], docs, 0);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_union() -> crate::Result<()> {
|
fn test_union() -> crate::Result<()> {
|
||||||
let index = create_test_index()?;
|
let index = create_test_index()?;
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ use std::collections::HashMap;
|
|||||||
use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
|
use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
|
||||||
use crate::index::SegmentReader;
|
use crate::index::SegmentReader;
|
||||||
use crate::postings::FreqReadingOption;
|
use crate::postings::FreqReadingOption;
|
||||||
|
use crate::query::disjunction::Disjunction;
|
||||||
use crate::query::explanation::does_not_match;
|
use crate::query::explanation::does_not_match;
|
||||||
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
|
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
|
||||||
use crate::query::term_query::TermScorer;
|
use crate::query::term_query::TermScorer;
|
||||||
@@ -18,6 +19,26 @@ enum SpecializedScorer {
|
|||||||
Other(Box<dyn Scorer>),
|
Other(Box<dyn Scorer>),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn scorer_disjunction<TScoreCombiner>(
|
||||||
|
scorers: Vec<Box<dyn Scorer>>,
|
||||||
|
score_combiner: TScoreCombiner,
|
||||||
|
minimum_match_required: usize,
|
||||||
|
) -> Box<dyn Scorer>
|
||||||
|
where
|
||||||
|
TScoreCombiner: ScoreCombiner,
|
||||||
|
{
|
||||||
|
debug_assert!(!scorers.is_empty());
|
||||||
|
debug_assert!(minimum_match_required > 1);
|
||||||
|
if scorers.len() == 1 {
|
||||||
|
return scorers.into_iter().next().unwrap(); // Safe unwrap.
|
||||||
|
}
|
||||||
|
Box::new(Disjunction::new(
|
||||||
|
scorers,
|
||||||
|
score_combiner,
|
||||||
|
minimum_match_required,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
fn scorer_union<TScoreCombiner>(
|
fn scorer_union<TScoreCombiner>(
|
||||||
scorers: Vec<Box<dyn Scorer>>,
|
scorers: Vec<Box<dyn Scorer>>,
|
||||||
score_combiner_fn: impl Fn() -> TScoreCombiner,
|
score_combiner_fn: impl Fn() -> TScoreCombiner,
|
||||||
@@ -70,6 +91,7 @@ fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
|
|||||||
/// Weight associated to the `BoolQuery`.
|
/// Weight associated to the `BoolQuery`.
|
||||||
pub struct BooleanWeight<TScoreCombiner: ScoreCombiner> {
|
pub struct BooleanWeight<TScoreCombiner: ScoreCombiner> {
|
||||||
weights: Vec<(Occur, Box<dyn Weight>)>,
|
weights: Vec<(Occur, Box<dyn Weight>)>,
|
||||||
|
minimum_number_should_match: usize,
|
||||||
scoring_enabled: bool,
|
scoring_enabled: bool,
|
||||||
score_combiner_fn: Box<dyn Fn() -> TScoreCombiner + Sync + Send>,
|
score_combiner_fn: Box<dyn Fn() -> TScoreCombiner + Sync + Send>,
|
||||||
}
|
}
|
||||||
@@ -85,6 +107,22 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
|
|||||||
weights,
|
weights,
|
||||||
scoring_enabled,
|
scoring_enabled,
|
||||||
score_combiner_fn,
|
score_combiner_fn,
|
||||||
|
minimum_number_should_match: 1,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a new boolean weight with minimum number of required should clauses specified.
|
||||||
|
pub fn with_minimum_number_should_match(
|
||||||
|
weights: Vec<(Occur, Box<dyn Weight>)>,
|
||||||
|
minimum_number_should_match: usize,
|
||||||
|
scoring_enabled: bool,
|
||||||
|
score_combiner_fn: Box<dyn Fn() -> TScoreCombiner + Sync + Send + 'static>,
|
||||||
|
) -> BooleanWeight<TScoreCombiner> {
|
||||||
|
BooleanWeight {
|
||||||
|
weights,
|
||||||
|
minimum_number_should_match,
|
||||||
|
scoring_enabled,
|
||||||
|
score_combiner_fn,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -111,43 +149,89 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
|
|||||||
score_combiner_fn: impl Fn() -> TComplexScoreCombiner,
|
score_combiner_fn: impl Fn() -> TComplexScoreCombiner,
|
||||||
) -> crate::Result<SpecializedScorer> {
|
) -> crate::Result<SpecializedScorer> {
|
||||||
let mut per_occur_scorers = self.per_occur_scorers(reader, boost)?;
|
let mut per_occur_scorers = self.per_occur_scorers(reader, boost)?;
|
||||||
|
// Indicate how should clauses are combined with other clauses.
|
||||||
let should_scorer_opt: Option<SpecializedScorer> = per_occur_scorers
|
enum CombinationMethod {
|
||||||
.remove(&Occur::Should)
|
Ignored,
|
||||||
.map(|scorers| scorer_union(scorers, &score_combiner_fn));
|
// Only contributes to final score.
|
||||||
|
Optional(SpecializedScorer),
|
||||||
|
// Must be fitted.
|
||||||
|
Required(Box<dyn Scorer>),
|
||||||
|
}
|
||||||
|
let mut must_scorers = per_occur_scorers.remove(&Occur::Must);
|
||||||
|
let should_opt = if let Some(mut should_scorers) = per_occur_scorers.remove(&Occur::Should)
|
||||||
|
{
|
||||||
|
let num_of_should_scorers = should_scorers.len();
|
||||||
|
if self.minimum_number_should_match > num_of_should_scorers {
|
||||||
|
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
|
||||||
|
}
|
||||||
|
match self.minimum_number_should_match {
|
||||||
|
0 => CombinationMethod::Optional(scorer_union(should_scorers, &score_combiner_fn)),
|
||||||
|
1 => CombinationMethod::Required(into_box_scorer(
|
||||||
|
scorer_union(should_scorers, &score_combiner_fn),
|
||||||
|
&score_combiner_fn,
|
||||||
|
)),
|
||||||
|
n if num_of_should_scorers == n => {
|
||||||
|
// When minimum_number_should_match equals the number of should clauses,
|
||||||
|
// they are no different from must clauses.
|
||||||
|
must_scorers = match must_scorers.take() {
|
||||||
|
Some(mut must_scorers) => {
|
||||||
|
must_scorers.append(&mut should_scorers);
|
||||||
|
Some(must_scorers)
|
||||||
|
}
|
||||||
|
None => Some(should_scorers),
|
||||||
|
};
|
||||||
|
CombinationMethod::Ignored
|
||||||
|
}
|
||||||
|
_ => CombinationMethod::Required(scorer_disjunction(
|
||||||
|
should_scorers,
|
||||||
|
score_combiner_fn(),
|
||||||
|
self.minimum_number_should_match,
|
||||||
|
)),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// No should clauses are provided.
|
||||||
|
if self.minimum_number_should_match > 0 {
|
||||||
|
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
|
||||||
|
} else {
|
||||||
|
CombinationMethod::Ignored
|
||||||
|
}
|
||||||
|
};
|
||||||
let exclude_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
|
let exclude_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
|
||||||
.remove(&Occur::MustNot)
|
.remove(&Occur::MustNot)
|
||||||
.map(|scorers| scorer_union(scorers, DoNothingCombiner::default))
|
.map(|scorers| scorer_union(scorers, DoNothingCombiner::default))
|
||||||
.map(|specialized_scorer| {
|
.map(|specialized_scorer: SpecializedScorer| {
|
||||||
into_box_scorer(specialized_scorer, DoNothingCombiner::default)
|
into_box_scorer(specialized_scorer, DoNothingCombiner::default)
|
||||||
});
|
});
|
||||||
|
let positive_scorer = match (should_opt, must_scorers) {
|
||||||
let must_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
|
(CombinationMethod::Ignored, Some(must_scorers)) => {
|
||||||
.remove(&Occur::Must)
|
SpecializedScorer::Other(intersect_scorers(must_scorers))
|
||||||
.map(intersect_scorers);
|
}
|
||||||
|
(CombinationMethod::Optional(should_scorer), Some(must_scorers)) => {
|
||||||
let positive_scorer: SpecializedScorer = match (should_scorer_opt, must_scorer_opt) {
|
let must_scorer = intersect_scorers(must_scorers);
|
||||||
(Some(should_scorer), Some(must_scorer)) => {
|
|
||||||
if self.scoring_enabled {
|
if self.scoring_enabled {
|
||||||
SpecializedScorer::Other(Box::new(RequiredOptionalScorer::<
|
SpecializedScorer::Other(Box::new(
|
||||||
Box<dyn Scorer>,
|
RequiredOptionalScorer::<_, _, TScoreCombiner>::new(
|
||||||
Box<dyn Scorer>,
|
must_scorer,
|
||||||
TComplexScoreCombiner,
|
into_box_scorer(should_scorer, &score_combiner_fn),
|
||||||
>::new(
|
),
|
||||||
must_scorer,
|
))
|
||||||
into_box_scorer(should_scorer, &score_combiner_fn),
|
|
||||||
)))
|
|
||||||
} else {
|
} else {
|
||||||
SpecializedScorer::Other(must_scorer)
|
SpecializedScorer::Other(must_scorer)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
(None, Some(must_scorer)) => SpecializedScorer::Other(must_scorer),
|
(CombinationMethod::Required(should_scorer), Some(mut must_scorers)) => {
|
||||||
(Some(should_scorer), None) => should_scorer,
|
must_scorers.push(should_scorer);
|
||||||
(None, None) => {
|
SpecializedScorer::Other(intersect_scorers(must_scorers))
|
||||||
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
|
|
||||||
}
|
}
|
||||||
|
(CombinationMethod::Ignored, None) => {
|
||||||
|
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)))
|
||||||
|
}
|
||||||
|
(CombinationMethod::Required(should_scorer), None) => {
|
||||||
|
SpecializedScorer::Other(should_scorer)
|
||||||
|
}
|
||||||
|
// Optional clauses are promoted to required if no must scorers exist.
|
||||||
|
(CombinationMethod::Optional(should_scorer), None) => should_scorer,
|
||||||
};
|
};
|
||||||
|
|
||||||
if let Some(exclude_scorer) = exclude_scorer_opt {
|
if let Some(exclude_scorer) = exclude_scorer_opt {
|
||||||
let positive_scorer_boxed = into_box_scorer(positive_scorer, &score_combiner_fn);
|
let positive_scorer_boxed = into_box_scorer(positive_scorer, &score_combiner_fn);
|
||||||
Ok(SpecializedScorer::Other(Box::new(Exclude::new(
|
Ok(SpecializedScorer::Other(Box::new(Exclude::new(
|
||||||
|
|||||||
327
src/query/disjunction.rs
Normal file
327
src/query/disjunction.rs
Normal file
@@ -0,0 +1,327 @@
|
|||||||
|
use std::cmp::Ordering;
|
||||||
|
use std::collections::BinaryHeap;
|
||||||
|
|
||||||
|
use crate::query::score_combiner::DoNothingCombiner;
|
||||||
|
use crate::query::{ScoreCombiner, Scorer};
|
||||||
|
use crate::{DocId, DocSet, Score, TERMINATED};
|
||||||
|
|
||||||
|
/// `Disjunction` is responsible for merging `DocSet`s from multiple
|
||||||
|
/// sources. Specifically, it takes the union of two or more `DocSet`s
|
||||||
|
/// then filters out elements that appear fewer times than a
|
||||||
|
/// specified threshold.
|
||||||
|
pub struct Disjunction<TScorer, TScoreCombiner = DoNothingCombiner> {
|
||||||
|
chains: BinaryHeap<ScorerWrapper<TScorer>>,
|
||||||
|
minimum_matches_required: usize,
|
||||||
|
score_combiner: TScoreCombiner,
|
||||||
|
|
||||||
|
current_doc: DocId,
|
||||||
|
current_score: Score,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A wrapper around a `Scorer` that caches the current `doc_id` and implements the `DocSet` trait.
|
||||||
|
/// Also, the `Ord` trait and its family are implemented in reverse, so that we can use
|
||||||
|
/// `std::collections::BinaryHeap<ScorerWrapper<T>>` to gain a min-heap with current doc id as key.
|
||||||
|
struct ScorerWrapper<T> {
|
||||||
|
scorer: T,
|
||||||
|
current_doc: DocId,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Scorer> ScorerWrapper<T> {
|
||||||
|
fn new(scorer: T) -> Self {
|
||||||
|
let current_doc = scorer.doc();
|
||||||
|
Self {
|
||||||
|
scorer,
|
||||||
|
current_doc,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Scorer> PartialEq for ScorerWrapper<T> {
|
||||||
|
fn eq(&self, other: &Self) -> bool {
|
||||||
|
self.doc() == other.doc()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Scorer> Eq for ScorerWrapper<T> {}
|
||||||
|
|
||||||
|
impl<T: Scorer> PartialOrd for ScorerWrapper<T> {
|
||||||
|
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||||
|
Some(self.cmp(other))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Scorer> Ord for ScorerWrapper<T> {
|
||||||
|
fn cmp(&self, other: &Self) -> Ordering {
|
||||||
|
self.doc().cmp(&other.doc()).reverse()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Scorer> DocSet for ScorerWrapper<T> {
|
||||||
|
fn advance(&mut self) -> DocId {
|
||||||
|
let doc_id = self.scorer.advance();
|
||||||
|
self.current_doc = doc_id;
|
||||||
|
doc_id
|
||||||
|
}
|
||||||
|
|
||||||
|
fn doc(&self) -> DocId {
|
||||||
|
self.current_doc
|
||||||
|
}
|
||||||
|
|
||||||
|
fn size_hint(&self) -> u32 {
|
||||||
|
self.scorer.size_hint()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Disjunction<TScorer, TScoreCombiner> {
|
||||||
|
pub fn new<T: IntoIterator<Item = TScorer>>(
|
||||||
|
docsets: T,
|
||||||
|
score_combiner: TScoreCombiner,
|
||||||
|
minimum_matches_required: usize,
|
||||||
|
) -> Self {
|
||||||
|
debug_assert!(
|
||||||
|
minimum_matches_required > 1,
|
||||||
|
"union scorer works better if just one matches required"
|
||||||
|
);
|
||||||
|
let chains = docsets
|
||||||
|
.into_iter()
|
||||||
|
.map(|doc| ScorerWrapper::new(doc))
|
||||||
|
.collect();
|
||||||
|
let mut disjunction = Self {
|
||||||
|
chains,
|
||||||
|
score_combiner,
|
||||||
|
current_doc: TERMINATED,
|
||||||
|
minimum_matches_required,
|
||||||
|
current_score: 0.0,
|
||||||
|
};
|
||||||
|
if minimum_matches_required > disjunction.chains.len() {
|
||||||
|
return disjunction;
|
||||||
|
}
|
||||||
|
disjunction.advance();
|
||||||
|
disjunction
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> DocSet
|
||||||
|
for Disjunction<TScorer, TScoreCombiner>
|
||||||
|
{
|
||||||
|
fn advance(&mut self) -> DocId {
|
||||||
|
let mut current_num_matches = 0;
|
||||||
|
while let Some(mut candidate) = self.chains.pop() {
|
||||||
|
let next = candidate.doc();
|
||||||
|
if next != TERMINATED {
|
||||||
|
// Peek next doc.
|
||||||
|
if self.current_doc != next {
|
||||||
|
if current_num_matches >= self.minimum_matches_required {
|
||||||
|
self.chains.push(candidate);
|
||||||
|
self.current_score = self.score_combiner.score();
|
||||||
|
return self.current_doc;
|
||||||
|
}
|
||||||
|
// Reset current_num_matches and scores.
|
||||||
|
current_num_matches = 0;
|
||||||
|
self.current_doc = next;
|
||||||
|
self.score_combiner.clear();
|
||||||
|
}
|
||||||
|
current_num_matches += 1;
|
||||||
|
self.score_combiner.update(&mut candidate.scorer);
|
||||||
|
candidate.advance();
|
||||||
|
self.chains.push(candidate);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if current_num_matches < self.minimum_matches_required {
|
||||||
|
self.current_doc = TERMINATED;
|
||||||
|
}
|
||||||
|
self.current_score = self.score_combiner.score();
|
||||||
|
self.current_doc
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn doc(&self) -> DocId {
|
||||||
|
self.current_doc
|
||||||
|
}
|
||||||
|
|
||||||
|
fn size_hint(&self) -> u32 {
|
||||||
|
self.chains
|
||||||
|
.iter()
|
||||||
|
.map(|docset| docset.size_hint())
|
||||||
|
.max()
|
||||||
|
.unwrap_or(0u32)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Scorer
|
||||||
|
for Disjunction<TScorer, TScoreCombiner>
|
||||||
|
{
|
||||||
|
fn score(&mut self) -> Score {
|
||||||
|
self.current_score
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use std::collections::BTreeMap;
|
||||||
|
|
||||||
|
use super::Disjunction;
|
||||||
|
use crate::query::score_combiner::DoNothingCombiner;
|
||||||
|
use crate::query::{ConstScorer, Scorer, SumCombiner, VecDocSet};
|
||||||
|
use crate::{DocId, DocSet, Score, TERMINATED};
|
||||||
|
|
||||||
|
fn conjunct<T: Ord + Copy>(arrays: &[Vec<T>], pass_line: usize) -> Vec<T> {
|
||||||
|
let mut counts = BTreeMap::new();
|
||||||
|
for array in arrays {
|
||||||
|
for &element in array {
|
||||||
|
*counts.entry(element).or_insert(0) += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
counts
|
||||||
|
.iter()
|
||||||
|
.filter_map(|(&element, &count)| {
|
||||||
|
if count >= pass_line {
|
||||||
|
Some(element)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn aux_test_conjunction(vals: Vec<Vec<u32>>, min_match: usize) {
|
||||||
|
let mut union_expected = VecDocSet::from(conjunct(&vals, min_match));
|
||||||
|
let make_scorer = || {
|
||||||
|
Disjunction::new(
|
||||||
|
vals.iter()
|
||||||
|
.cloned()
|
||||||
|
.map(VecDocSet::from)
|
||||||
|
.map(|d| ConstScorer::new(d, 1.0)),
|
||||||
|
DoNothingCombiner,
|
||||||
|
min_match,
|
||||||
|
)
|
||||||
|
};
|
||||||
|
let mut scorer: Disjunction<_, DoNothingCombiner> = make_scorer();
|
||||||
|
let mut count = 0;
|
||||||
|
while scorer.doc() != TERMINATED {
|
||||||
|
assert_eq!(union_expected.doc(), scorer.doc());
|
||||||
|
assert_eq!(union_expected.advance(), scorer.advance());
|
||||||
|
count += 1;
|
||||||
|
}
|
||||||
|
assert_eq!(union_expected.advance(), TERMINATED);
|
||||||
|
assert_eq!(count, make_scorer().count_including_deleted());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[should_panic]
|
||||||
|
#[test]
|
||||||
|
fn test_arg_check1() {
|
||||||
|
aux_test_conjunction(vec![], 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[should_panic]
|
||||||
|
#[test]
|
||||||
|
fn test_arg_check2() {
|
||||||
|
aux_test_conjunction(vec![], 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_corner_case() {
|
||||||
|
aux_test_conjunction(vec![], 2);
|
||||||
|
aux_test_conjunction(vec![vec![]; 1000], 2);
|
||||||
|
aux_test_conjunction(vec![vec![]; 100], usize::MAX);
|
||||||
|
aux_test_conjunction(vec![vec![0xC0FFEE]; 10000], usize::MAX);
|
||||||
|
aux_test_conjunction((1..10000u32).map(|i| vec![i]).collect::<Vec<_>>(), 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_conjunction() {
|
||||||
|
aux_test_conjunction(
|
||||||
|
vec![
|
||||||
|
vec![1, 3333, 100000000u32],
|
||||||
|
vec![1, 2, 100000000u32],
|
||||||
|
vec![1, 2, 100000000u32],
|
||||||
|
],
|
||||||
|
2,
|
||||||
|
);
|
||||||
|
aux_test_conjunction(
|
||||||
|
vec![vec![8], vec![3, 4, 0xC0FFEEu32], vec![1, 2, 100000000u32]],
|
||||||
|
2,
|
||||||
|
);
|
||||||
|
aux_test_conjunction(
|
||||||
|
vec![
|
||||||
|
vec![1, 3333, 100000000u32],
|
||||||
|
vec![1, 2, 100000000u32],
|
||||||
|
vec![1, 2, 100000000u32],
|
||||||
|
],
|
||||||
|
3,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
// This dummy scorer does nothing but yield doc ids in increasing order,
|
||||||
|
// with constant score 1.0
|
||||||
|
#[derive(Clone)]
|
||||||
|
struct DummyScorer {
|
||||||
|
cursor: usize,
|
||||||
|
foo: Vec<(DocId, f32)>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DummyScorer {
|
||||||
|
fn new(doc_score: Vec<(DocId, f32)>) -> Self {
|
||||||
|
Self {
|
||||||
|
cursor: 0,
|
||||||
|
foo: doc_score,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DocSet for DummyScorer {
|
||||||
|
fn advance(&mut self) -> DocId {
|
||||||
|
self.cursor += 1;
|
||||||
|
self.doc()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn doc(&self) -> DocId {
|
||||||
|
self.foo.get(self.cursor).map(|x| x.0).unwrap_or(TERMINATED)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn size_hint(&self) -> u32 {
|
||||||
|
self.foo.len() as u32
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Scorer for DummyScorer {
|
||||||
|
fn score(&mut self) -> Score {
|
||||||
|
self.foo.get(self.cursor).map(|x| x.1).unwrap_or(0.0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_score_calculate() {
|
||||||
|
let mut scorer = Disjunction::new(
|
||||||
|
vec![
|
||||||
|
DummyScorer::new(vec![(1, 1f32), (2, 1f32)]),
|
||||||
|
DummyScorer::new(vec![(1, 1f32), (3, 1f32)]),
|
||||||
|
DummyScorer::new(vec![(1, 1f32), (4, 1f32)]),
|
||||||
|
DummyScorer::new(vec![(1, 1f32), (2, 1f32)]),
|
||||||
|
DummyScorer::new(vec![(1, 1f32), (2, 1f32)]),
|
||||||
|
],
|
||||||
|
SumCombiner::default(),
|
||||||
|
3,
|
||||||
|
);
|
||||||
|
assert_eq!(scorer.score(), 5.0);
|
||||||
|
assert_eq!(scorer.advance(), 2);
|
||||||
|
assert_eq!(scorer.score(), 3.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_score_calculate_corner_case() {
|
||||||
|
let mut scorer = Disjunction::new(
|
||||||
|
vec![
|
||||||
|
DummyScorer::new(vec![(1, 1f32), (2, 1f32)]),
|
||||||
|
DummyScorer::new(vec![(1, 1f32), (3, 1f32)]),
|
||||||
|
DummyScorer::new(vec![(1, 1f32), (3, 1f32)]),
|
||||||
|
],
|
||||||
|
SumCombiner::default(),
|
||||||
|
2,
|
||||||
|
);
|
||||||
|
assert_eq!(scorer.doc(), 1);
|
||||||
|
assert_eq!(scorer.score(), 3.0);
|
||||||
|
assert_eq!(scorer.advance(), 3);
|
||||||
|
assert_eq!(scorer.score(), 2.0);
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -149,7 +149,7 @@ mod tests {
|
|||||||
use crate::query::exist_query::ExistsQuery;
|
use crate::query::exist_query::ExistsQuery;
|
||||||
use crate::query::{BooleanQuery, RangeQuery};
|
use crate::query::{BooleanQuery, RangeQuery};
|
||||||
use crate::schema::{Facet, FacetOptions, Schema, FAST, INDEXED, STRING, TEXT};
|
use crate::schema::{Facet, FacetOptions, Schema, FAST, INDEXED, STRING, TEXT};
|
||||||
use crate::{Index, Searcher};
|
use crate::{Index, Searcher, Term};
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_exists_query_simple() -> crate::Result<()> {
|
fn test_exists_query_simple() -> crate::Result<()> {
|
||||||
@@ -188,9 +188,8 @@ mod tests {
|
|||||||
|
|
||||||
// exercise seek
|
// exercise seek
|
||||||
let query = BooleanQuery::intersection(vec![
|
let query = BooleanQuery::intersection(vec![
|
||||||
Box::new(RangeQuery::new_u64_bounds(
|
Box::new(RangeQuery::new(
|
||||||
"all".to_string(),
|
Bound::Included(Term::from_field_u64(all_field, 50)),
|
||||||
Bound::Included(50),
|
|
||||||
Bound::Unbounded,
|
Bound::Unbounded,
|
||||||
)),
|
)),
|
||||||
Box::new(ExistsQuery::new_exists_query("even".to_string())),
|
Box::new(ExistsQuery::new_exists_query("even".to_string())),
|
||||||
@@ -198,10 +197,9 @@ mod tests {
|
|||||||
assert_eq!(searcher.search(&query, &Count)?, 25);
|
assert_eq!(searcher.search(&query, &Count)?, 25);
|
||||||
|
|
||||||
let query = BooleanQuery::intersection(vec![
|
let query = BooleanQuery::intersection(vec![
|
||||||
Box::new(RangeQuery::new_u64_bounds(
|
Box::new(RangeQuery::new(
|
||||||
"all".to_string(),
|
Bound::Included(Term::from_field_u64(all_field, 0)),
|
||||||
Bound::Included(0),
|
Bound::Included(Term::from_field_u64(all_field, 50)),
|
||||||
Bound::Excluded(50),
|
|
||||||
)),
|
)),
|
||||||
Box::new(ExistsQuery::new_exists_query("odd".to_string())),
|
Box::new(ExistsQuery::new_exists_query("odd".to_string())),
|
||||||
]);
|
]);
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ mod bm25;
|
|||||||
mod boolean_query;
|
mod boolean_query;
|
||||||
mod boost_query;
|
mod boost_query;
|
||||||
mod const_score_query;
|
mod const_score_query;
|
||||||
|
mod disjunction;
|
||||||
mod disjunction_max_query;
|
mod disjunction_max_query;
|
||||||
mod empty_query;
|
mod empty_query;
|
||||||
mod exclude;
|
mod exclude;
|
||||||
@@ -53,7 +54,7 @@ pub use self::phrase_prefix_query::PhrasePrefixQuery;
|
|||||||
pub use self::phrase_query::PhraseQuery;
|
pub use self::phrase_query::PhraseQuery;
|
||||||
pub use self::query::{EnableScoring, Query, QueryClone};
|
pub use self::query::{EnableScoring, Query, QueryClone};
|
||||||
pub use self::query_parser::{QueryParser, QueryParserError};
|
pub use self::query_parser::{QueryParser, QueryParserError};
|
||||||
pub use self::range_query::{FastFieldRangeWeight, IPFastFieldRangeWeight, RangeQuery};
|
pub use self::range_query::{FastFieldRangeWeight, RangeQuery};
|
||||||
pub use self::regex_query::RegexQuery;
|
pub use self::regex_query::RegexQuery;
|
||||||
pub use self::reqopt_scorer::RequiredOptionalScorer;
|
pub use self::reqopt_scorer::RequiredOptionalScorer;
|
||||||
pub use self::score_combiner::{
|
pub use self::score_combiner::{
|
||||||
|
|||||||
@@ -145,15 +145,7 @@ impl Query for PhrasePrefixQuery {
|
|||||||
Bound::Unbounded
|
Bound::Unbounded
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut range_query = RangeQuery::new_term_bounds(
|
let mut range_query = RangeQuery::new(Bound::Included(self.prefix.1.clone()), end_term);
|
||||||
enable_scoring
|
|
||||||
.schema()
|
|
||||||
.get_field_name(self.field)
|
|
||||||
.to_owned(),
|
|
||||||
self.prefix.1.typ(),
|
|
||||||
&Bound::Included(self.prefix.1.clone()),
|
|
||||||
&end_term,
|
|
||||||
);
|
|
||||||
range_query.limit(self.max_expansions as u64);
|
range_query.limit(self.max_expansions as u64);
|
||||||
range_query.weight(enable_scoring)
|
range_query.weight(enable_scoring)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -97,6 +97,7 @@ pub struct PhrasePrefixScorer<TPostings: Postings> {
|
|||||||
suffixes: Vec<TPostings>,
|
suffixes: Vec<TPostings>,
|
||||||
suffix_offset: u32,
|
suffix_offset: u32,
|
||||||
phrase_count: u32,
|
phrase_count: u32,
|
||||||
|
suffix_position_buffer: Vec<u32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
|
impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
|
||||||
@@ -140,6 +141,7 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
|
|||||||
suffixes,
|
suffixes,
|
||||||
suffix_offset: (max_offset - suffix_pos) as u32,
|
suffix_offset: (max_offset - suffix_pos) as u32,
|
||||||
phrase_count: 0,
|
phrase_count: 0,
|
||||||
|
suffix_position_buffer: Vec::with_capacity(100),
|
||||||
};
|
};
|
||||||
if phrase_prefix_scorer.doc() != TERMINATED && !phrase_prefix_scorer.matches_prefix() {
|
if phrase_prefix_scorer.doc() != TERMINATED && !phrase_prefix_scorer.matches_prefix() {
|
||||||
phrase_prefix_scorer.advance();
|
phrase_prefix_scorer.advance();
|
||||||
@@ -153,7 +155,6 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
|
|||||||
|
|
||||||
fn matches_prefix(&mut self) -> bool {
|
fn matches_prefix(&mut self) -> bool {
|
||||||
let mut count = 0;
|
let mut count = 0;
|
||||||
let mut positions = Vec::new();
|
|
||||||
let current_doc = self.doc();
|
let current_doc = self.doc();
|
||||||
let pos_matching = self.phrase_scorer.get_intersection();
|
let pos_matching = self.phrase_scorer.get_intersection();
|
||||||
for suffix in &mut self.suffixes {
|
for suffix in &mut self.suffixes {
|
||||||
@@ -162,8 +163,8 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
|
|||||||
}
|
}
|
||||||
let doc = suffix.seek(current_doc);
|
let doc = suffix.seek(current_doc);
|
||||||
if doc == current_doc {
|
if doc == current_doc {
|
||||||
suffix.positions_with_offset(self.suffix_offset, &mut positions);
|
suffix.positions_with_offset(self.suffix_offset, &mut self.suffix_position_buffer);
|
||||||
count += intersection_count(pos_matching, &positions);
|
count += intersection_count(pos_matching, &self.suffix_position_buffer);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
self.phrase_count = count as u32;
|
self.phrase_count = count as u32;
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ use std::fmt;
|
|||||||
use std::ops::Bound;
|
use std::ops::Bound;
|
||||||
|
|
||||||
use crate::query::Occur;
|
use crate::query::Occur;
|
||||||
use crate::schema::{Term, Type};
|
use crate::schema::Term;
|
||||||
use crate::Score;
|
use crate::Score;
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
@@ -14,8 +14,6 @@ pub enum LogicalLiteral {
|
|||||||
prefix: bool,
|
prefix: bool,
|
||||||
},
|
},
|
||||||
Range {
|
Range {
|
||||||
field: String,
|
|
||||||
value_type: Type,
|
|
||||||
lower: Bound<Term>,
|
lower: Bound<Term>,
|
||||||
upper: Bound<Term>,
|
upper: Bound<Term>,
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -790,8 +790,6 @@ impl QueryParser {
|
|||||||
let (field, json_path) = try_tuple!(self
|
let (field, json_path) = try_tuple!(self
|
||||||
.split_full_path(&full_path)
|
.split_full_path(&full_path)
|
||||||
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
|
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
|
||||||
let field_entry = self.schema.get_field_entry(field);
|
|
||||||
let value_type = field_entry.field_type().value_type();
|
|
||||||
let mut errors = Vec::new();
|
let mut errors = Vec::new();
|
||||||
let lower = match self.resolve_bound(field, json_path, &lower) {
|
let lower = match self.resolve_bound(field, json_path, &lower) {
|
||||||
Ok(bound) => bound,
|
Ok(bound) => bound,
|
||||||
@@ -812,12 +810,8 @@ impl QueryParser {
|
|||||||
// we failed to parse something. Either way, there is no point emitting it
|
// we failed to parse something. Either way, there is no point emitting it
|
||||||
return (None, errors);
|
return (None, errors);
|
||||||
}
|
}
|
||||||
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Range {
|
let logical_ast =
|
||||||
field: self.schema.get_field_name(field).to_string(),
|
LogicalAst::Leaf(Box::new(LogicalLiteral::Range { lower, upper }));
|
||||||
value_type,
|
|
||||||
lower,
|
|
||||||
upper,
|
|
||||||
}));
|
|
||||||
(Some(logical_ast), errors)
|
(Some(logical_ast), errors)
|
||||||
}
|
}
|
||||||
UserInputLeaf::Set {
|
UserInputLeaf::Set {
|
||||||
@@ -884,14 +878,7 @@ fn convert_literal_to_query(
|
|||||||
Box::new(PhraseQuery::new_with_offset_and_slop(terms, slop))
|
Box::new(PhraseQuery::new_with_offset_and_slop(terms, slop))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LogicalLiteral::Range {
|
LogicalLiteral::Range { lower, upper } => Box::new(RangeQuery::new(lower, upper)),
|
||||||
field,
|
|
||||||
value_type,
|
|
||||||
lower,
|
|
||||||
upper,
|
|
||||||
} => Box::new(RangeQuery::new_term_bounds(
|
|
||||||
field, value_type, &lower, &upper,
|
|
||||||
)),
|
|
||||||
LogicalLiteral::Set { elements, .. } => Box::new(TermSetQuery::new(elements)),
|
LogicalLiteral::Set { elements, .. } => Box::new(TermSetQuery::new(elements)),
|
||||||
LogicalLiteral::All => Box::new(AllQuery),
|
LogicalLiteral::All => Box::new(AllQuery),
|
||||||
}
|
}
|
||||||
@@ -1136,8 +1123,8 @@ mod test {
|
|||||||
let query = make_query_parser().parse_query("title:[A TO B]").unwrap();
|
let query = make_query_parser().parse_query("title:[A TO B]").unwrap();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
format!("{query:?}"),
|
format!("{query:?}"),
|
||||||
"RangeQuery { field: \"title\", value_type: Str, lower_bound: Included([97]), \
|
"RangeQuery { lower_bound: Included(Term(field=0, type=Str, \"a\")), upper_bound: \
|
||||||
upper_bound: Included([98]), limit: None }"
|
Included(Term(field=0, type=Str, \"b\")), limit: None }"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1815,7 +1802,8 @@ mod test {
|
|||||||
\"bad\"))], prefix: (2, Term(field=0, type=Str, \"wo\")), max_expansions: 50 }), \
|
\"bad\"))], prefix: (2, Term(field=0, type=Str, \"wo\")), max_expansions: 50 }), \
|
||||||
(Should, PhrasePrefixQuery { field: Field(1), phrase_terms: [(0, Term(field=1, \
|
(Should, PhrasePrefixQuery { field: Field(1), phrase_terms: [(0, Term(field=1, \
|
||||||
type=Str, \"big\")), (1, Term(field=1, type=Str, \"bad\"))], prefix: (2, \
|
type=Str, \"big\")), (1, Term(field=1, type=Str, \"bad\"))], prefix: (2, \
|
||||||
Term(field=1, type=Str, \"wo\")), max_expansions: 50 })] }"
|
Term(field=1, type=Str, \"wo\")), max_expansions: 50 })], \
|
||||||
|
minimum_number_should_match: 1 }"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1880,7 +1868,8 @@ mod test {
|
|||||||
format!("{query:?}"),
|
format!("{query:?}"),
|
||||||
"BooleanQuery { subqueries: [(Should, FuzzyTermQuery { term: Term(field=0, \
|
"BooleanQuery { subqueries: [(Should, FuzzyTermQuery { term: Term(field=0, \
|
||||||
type=Str, \"abc\"), distance: 1, transposition_cost_one: true, prefix: false }), \
|
type=Str, \"abc\"), distance: 1, transposition_cost_one: true, prefix: false }), \
|
||||||
(Should, TermQuery(Term(field=1, type=Str, \"abc\")))] }"
|
(Should, TermQuery(Term(field=1, type=Str, \"abc\")))], \
|
||||||
|
minimum_number_should_match: 1 }"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1897,7 +1886,8 @@ mod test {
|
|||||||
format!("{query:?}"),
|
format!("{query:?}"),
|
||||||
"BooleanQuery { subqueries: [(Should, TermQuery(Term(field=0, type=Str, \
|
"BooleanQuery { subqueries: [(Should, TermQuery(Term(field=0, type=Str, \
|
||||||
\"abc\"))), (Should, FuzzyTermQuery { term: Term(field=1, type=Str, \"abc\"), \
|
\"abc\"))), (Should, FuzzyTermQuery { term: Term(field=1, type=Str, \"abc\"), \
|
||||||
distance: 2, transposition_cost_one: false, prefix: true })] }"
|
distance: 2, transposition_cost_one: false, prefix: true })], \
|
||||||
|
minimum_number_should_match: 1 }"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -180,10 +180,12 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
use std::ops::Bound;
|
||||||
|
|
||||||
use crate::collector::Count;
|
use crate::collector::Count;
|
||||||
use crate::directory::RamDirectory;
|
use crate::directory::RamDirectory;
|
||||||
use crate::query::RangeQuery;
|
use crate::query::RangeQuery;
|
||||||
use crate::{schema, IndexBuilder, TantivyDocument};
|
use crate::{schema, IndexBuilder, TantivyDocument, Term};
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn range_query_fast_optional_field_minimum() {
|
fn range_query_fast_optional_field_minimum() {
|
||||||
@@ -218,10 +220,9 @@ mod tests {
|
|||||||
let reader = index.reader().unwrap();
|
let reader = index.reader().unwrap();
|
||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
|
|
||||||
let query = RangeQuery::new_u64_bounds(
|
let query = RangeQuery::new(
|
||||||
"score".to_string(),
|
Bound::Included(Term::from_field_u64(score_field, 70)),
|
||||||
std::ops::Bound::Included(70),
|
Bound::Unbounded,
|
||||||
std::ops::Bound::Unbounded,
|
|
||||||
);
|
);
|
||||||
|
|
||||||
let count = searcher.search(&query, &Count).unwrap();
|
let count = searcher.search(&query, &Count).unwrap();
|
||||||
@@ -2,21 +2,19 @@ use std::ops::Bound;
|
|||||||
|
|
||||||
use crate::schema::Type;
|
use crate::schema::Type;
|
||||||
|
|
||||||
mod fast_field_range_query;
|
mod fast_field_range_doc_set;
|
||||||
mod range_query;
|
mod range_query;
|
||||||
mod range_query_ip_fastfield;
|
|
||||||
mod range_query_u64_fastfield;
|
mod range_query_u64_fastfield;
|
||||||
|
|
||||||
pub use self::range_query::RangeQuery;
|
pub use self::range_query::RangeQuery;
|
||||||
pub use self::range_query_ip_fastfield::IPFastFieldRangeWeight;
|
|
||||||
pub use self::range_query_u64_fastfield::FastFieldRangeWeight;
|
pub use self::range_query_u64_fastfield::FastFieldRangeWeight;
|
||||||
|
|
||||||
// TODO is this correct?
|
// TODO is this correct?
|
||||||
pub(crate) fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool {
|
pub(crate) fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool {
|
||||||
match typ {
|
match typ {
|
||||||
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
|
Type::Str | Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
|
||||||
Type::IpAddr => true,
|
Type::IpAddr => true,
|
||||||
Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
|
Type::Facet | Type::Bytes | Type::Json => false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,21 +1,17 @@
|
|||||||
use std::io;
|
use std::io;
|
||||||
use std::net::Ipv6Addr;
|
use std::ops::Bound;
|
||||||
use std::ops::{Bound, Range};
|
|
||||||
|
|
||||||
use columnar::MonotonicallyMappableToU128;
|
use common::BitSet;
|
||||||
use common::{BinarySerializable, BitSet};
|
|
||||||
|
|
||||||
use super::map_bound;
|
use super::map_bound;
|
||||||
use super::range_query_u64_fastfield::FastFieldRangeWeight;
|
use super::range_query_u64_fastfield::FastFieldRangeWeight;
|
||||||
use crate::error::TantivyError;
|
|
||||||
use crate::index::SegmentReader;
|
use crate::index::SegmentReader;
|
||||||
use crate::query::explanation::does_not_match;
|
use crate::query::explanation::does_not_match;
|
||||||
use crate::query::range_query::range_query_ip_fastfield::IPFastFieldRangeWeight;
|
use crate::query::range_query::is_type_valid_for_fastfield_range_query;
|
||||||
use crate::query::range_query::{is_type_valid_for_fastfield_range_query, map_bound_res};
|
|
||||||
use crate::query::{BitSetDocSet, ConstScorer, EnableScoring, Explanation, Query, Scorer, Weight};
|
use crate::query::{BitSetDocSet, ConstScorer, EnableScoring, Explanation, Query, Scorer, Weight};
|
||||||
use crate::schema::{Field, IndexRecordOption, Term, Type};
|
use crate::schema::{Field, IndexRecordOption, Term, Type};
|
||||||
use crate::termdict::{TermDictionary, TermStreamer};
|
use crate::termdict::{TermDictionary, TermStreamer};
|
||||||
use crate::{DateTime, DocId, Score};
|
use crate::{DocId, Score};
|
||||||
|
|
||||||
/// `RangeQuery` matches all documents that have at least one term within a defined range.
|
/// `RangeQuery` matches all documents that have at least one term within a defined range.
|
||||||
///
|
///
|
||||||
@@ -40,8 +36,10 @@ use crate::{DateTime, DocId, Score};
|
|||||||
/// ```rust
|
/// ```rust
|
||||||
/// use tantivy::collector::Count;
|
/// use tantivy::collector::Count;
|
||||||
/// use tantivy::query::RangeQuery;
|
/// use tantivy::query::RangeQuery;
|
||||||
|
/// use tantivy::Term;
|
||||||
/// use tantivy::schema::{Schema, INDEXED};
|
/// use tantivy::schema::{Schema, INDEXED};
|
||||||
/// use tantivy::{doc, Index, IndexWriter};
|
/// use tantivy::{doc, Index, IndexWriter};
|
||||||
|
/// use std::ops::Bound;
|
||||||
/// # fn test() -> tantivy::Result<()> {
|
/// # fn test() -> tantivy::Result<()> {
|
||||||
/// let mut schema_builder = Schema::builder();
|
/// let mut schema_builder = Schema::builder();
|
||||||
/// let year_field = schema_builder.add_u64_field("year", INDEXED);
|
/// let year_field = schema_builder.add_u64_field("year", INDEXED);
|
||||||
@@ -59,7 +57,10 @@ use crate::{DateTime, DocId, Score};
|
|||||||
///
|
///
|
||||||
/// let reader = index.reader()?;
|
/// let reader = index.reader()?;
|
||||||
/// let searcher = reader.searcher();
|
/// let searcher = reader.searcher();
|
||||||
/// let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960..1970);
|
/// let docs_in_the_sixties = RangeQuery::new(
|
||||||
|
/// Bound::Included(Term::from_field_u64(year_field, 1960)),
|
||||||
|
/// Bound::Excluded(Term::from_field_u64(year_field, 1970)),
|
||||||
|
/// );
|
||||||
/// let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
|
/// let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
|
||||||
/// assert_eq!(num_60s_books, 2285);
|
/// assert_eq!(num_60s_books, 2285);
|
||||||
/// Ok(())
|
/// Ok(())
|
||||||
@@ -68,246 +69,46 @@ use crate::{DateTime, DocId, Score};
|
|||||||
/// ```
|
/// ```
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub struct RangeQuery {
|
pub struct RangeQuery {
|
||||||
field: String,
|
lower_bound: Bound<Term>,
|
||||||
value_type: Type,
|
upper_bound: Bound<Term>,
|
||||||
lower_bound: Bound<Vec<u8>>,
|
|
||||||
upper_bound: Bound<Vec<u8>>,
|
|
||||||
limit: Option<u64>,
|
limit: Option<u64>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the inner value of a `Bound`
|
||||||
|
pub(crate) fn inner_bound(val: &Bound<Term>) -> Option<&Term> {
|
||||||
|
match val {
|
||||||
|
Bound::Included(term) | Bound::Excluded(term) => Some(term),
|
||||||
|
Bound::Unbounded => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl RangeQuery {
|
impl RangeQuery {
|
||||||
/// Creates a new `RangeQuery` from bounded start and end terms.
|
/// Creates a new `RangeQuery` from bounded start and end terms.
|
||||||
///
|
///
|
||||||
/// If the value type is not correct, something may go terribly wrong when
|
/// If the value type is not correct, something may go terribly wrong when
|
||||||
/// the `Weight` object is created.
|
/// the `Weight` object is created.
|
||||||
pub fn new_term_bounds(
|
pub fn new(lower_bound: Bound<Term>, upper_bound: Bound<Term>) -> RangeQuery {
|
||||||
field: String,
|
|
||||||
value_type: Type,
|
|
||||||
lower_bound: &Bound<Term>,
|
|
||||||
upper_bound: &Bound<Term>,
|
|
||||||
) -> RangeQuery {
|
|
||||||
let verify_and_unwrap_term = |val: &Term| val.serialized_value_bytes().to_owned();
|
|
||||||
RangeQuery {
|
RangeQuery {
|
||||||
field,
|
lower_bound,
|
||||||
value_type,
|
upper_bound,
|
||||||
lower_bound: map_bound(lower_bound, verify_and_unwrap_term),
|
|
||||||
upper_bound: map_bound(upper_bound, verify_and_unwrap_term),
|
|
||||||
limit: None,
|
limit: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Creates a new `RangeQuery` over a `i64` field.
|
|
||||||
///
|
|
||||||
/// If the field is not of the type `i64`, tantivy
|
|
||||||
/// will panic when the `Weight` object is created.
|
|
||||||
pub fn new_i64(field: String, range: Range<i64>) -> RangeQuery {
|
|
||||||
RangeQuery::new_i64_bounds(
|
|
||||||
field,
|
|
||||||
Bound::Included(range.start),
|
|
||||||
Bound::Excluded(range.end),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a new `RangeQuery` over a `i64` field.
|
|
||||||
///
|
|
||||||
/// The two `Bound` arguments make it possible to create more complex
|
|
||||||
/// ranges than semi-inclusive range.
|
|
||||||
///
|
|
||||||
/// If the field is not of the type `i64`, tantivy
|
|
||||||
/// will panic when the `Weight` object is created.
|
|
||||||
pub fn new_i64_bounds(
|
|
||||||
field: String,
|
|
||||||
lower_bound: Bound<i64>,
|
|
||||||
upper_bound: Bound<i64>,
|
|
||||||
) -> RangeQuery {
|
|
||||||
let make_term_val = |val: &i64| {
|
|
||||||
Term::from_field_i64(Field::from_field_id(0), *val)
|
|
||||||
.serialized_value_bytes()
|
|
||||||
.to_owned()
|
|
||||||
};
|
|
||||||
RangeQuery {
|
|
||||||
field,
|
|
||||||
value_type: Type::I64,
|
|
||||||
lower_bound: map_bound(&lower_bound, make_term_val),
|
|
||||||
upper_bound: map_bound(&upper_bound, make_term_val),
|
|
||||||
limit: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Creates a new `RangeQuery` over a `f64` field.
|
|
||||||
///
|
|
||||||
/// If the field is not of the type `f64`, tantivy
|
|
||||||
/// will panic when the `Weight` object is created.
|
|
||||||
pub fn new_f64(field: String, range: Range<f64>) -> RangeQuery {
|
|
||||||
RangeQuery::new_f64_bounds(
|
|
||||||
field,
|
|
||||||
Bound::Included(range.start),
|
|
||||||
Bound::Excluded(range.end),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a new `RangeQuery` over a `f64` field.
|
|
||||||
///
|
|
||||||
/// The two `Bound` arguments make it possible to create more complex
|
|
||||||
/// ranges than semi-inclusive range.
|
|
||||||
///
|
|
||||||
/// If the field is not of the type `f64`, tantivy
|
|
||||||
/// will panic when the `Weight` object is created.
|
|
||||||
pub fn new_f64_bounds(
|
|
||||||
field: String,
|
|
||||||
lower_bound: Bound<f64>,
|
|
||||||
upper_bound: Bound<f64>,
|
|
||||||
) -> RangeQuery {
|
|
||||||
let make_term_val = |val: &f64| {
|
|
||||||
Term::from_field_f64(Field::from_field_id(0), *val)
|
|
||||||
.serialized_value_bytes()
|
|
||||||
.to_owned()
|
|
||||||
};
|
|
||||||
RangeQuery {
|
|
||||||
field,
|
|
||||||
value_type: Type::F64,
|
|
||||||
lower_bound: map_bound(&lower_bound, make_term_val),
|
|
||||||
upper_bound: map_bound(&upper_bound, make_term_val),
|
|
||||||
limit: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a new `RangeQuery` over a `u64` field.
|
|
||||||
///
|
|
||||||
/// The two `Bound` arguments make it possible to create more complex
|
|
||||||
/// ranges than semi-inclusive range.
|
|
||||||
///
|
|
||||||
/// If the field is not of the type `u64`, tantivy
|
|
||||||
/// will panic when the `Weight` object is created.
|
|
||||||
pub fn new_u64_bounds(
|
|
||||||
field: String,
|
|
||||||
lower_bound: Bound<u64>,
|
|
||||||
upper_bound: Bound<u64>,
|
|
||||||
) -> RangeQuery {
|
|
||||||
let make_term_val = |val: &u64| {
|
|
||||||
Term::from_field_u64(Field::from_field_id(0), *val)
|
|
||||||
.serialized_value_bytes()
|
|
||||||
.to_owned()
|
|
||||||
};
|
|
||||||
RangeQuery {
|
|
||||||
field,
|
|
||||||
value_type: Type::U64,
|
|
||||||
lower_bound: map_bound(&lower_bound, make_term_val),
|
|
||||||
upper_bound: map_bound(&upper_bound, make_term_val),
|
|
||||||
limit: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a new `RangeQuery` over a `ip` field.
|
|
||||||
///
|
|
||||||
/// If the field is not of the type `ip`, tantivy
|
|
||||||
/// will panic when the `Weight` object is created.
|
|
||||||
pub fn new_ip_bounds(
|
|
||||||
field: String,
|
|
||||||
lower_bound: Bound<Ipv6Addr>,
|
|
||||||
upper_bound: Bound<Ipv6Addr>,
|
|
||||||
) -> RangeQuery {
|
|
||||||
let make_term_val = |val: &Ipv6Addr| {
|
|
||||||
Term::from_field_ip_addr(Field::from_field_id(0), *val)
|
|
||||||
.serialized_value_bytes()
|
|
||||||
.to_owned()
|
|
||||||
};
|
|
||||||
RangeQuery {
|
|
||||||
field,
|
|
||||||
value_type: Type::IpAddr,
|
|
||||||
lower_bound: map_bound(&lower_bound, make_term_val),
|
|
||||||
upper_bound: map_bound(&upper_bound, make_term_val),
|
|
||||||
limit: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a new `RangeQuery` over a `u64` field.
|
|
||||||
///
|
|
||||||
/// If the field is not of the type `u64`, tantivy
|
|
||||||
/// will panic when the `Weight` object is created.
|
|
||||||
pub fn new_u64(field: String, range: Range<u64>) -> RangeQuery {
|
|
||||||
RangeQuery::new_u64_bounds(
|
|
||||||
field,
|
|
||||||
Bound::Included(range.start),
|
|
||||||
Bound::Excluded(range.end),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a new `RangeQuery` over a `date` field.
|
|
||||||
///
|
|
||||||
/// The two `Bound` arguments make it possible to create more complex
|
|
||||||
/// ranges than semi-inclusive range.
|
|
||||||
///
|
|
||||||
/// If the field is not of the type `date`, tantivy
|
|
||||||
/// will panic when the `Weight` object is created.
|
|
||||||
pub fn new_date_bounds(
|
|
||||||
field: String,
|
|
||||||
lower_bound: Bound<DateTime>,
|
|
||||||
upper_bound: Bound<DateTime>,
|
|
||||||
) -> RangeQuery {
|
|
||||||
let make_term_val = |val: &DateTime| {
|
|
||||||
Term::from_field_date(Field::from_field_id(0), *val)
|
|
||||||
.serialized_value_bytes()
|
|
||||||
.to_owned()
|
|
||||||
};
|
|
||||||
RangeQuery {
|
|
||||||
field,
|
|
||||||
value_type: Type::Date,
|
|
||||||
lower_bound: map_bound(&lower_bound, make_term_val),
|
|
||||||
upper_bound: map_bound(&upper_bound, make_term_val),
|
|
||||||
limit: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a new `RangeQuery` over a `date` field.
|
|
||||||
///
|
|
||||||
/// If the field is not of the type `date`, tantivy
|
|
||||||
/// will panic when the `Weight` object is created.
|
|
||||||
pub fn new_date(field: String, range: Range<DateTime>) -> RangeQuery {
|
|
||||||
RangeQuery::new_date_bounds(
|
|
||||||
field,
|
|
||||||
Bound::Included(range.start),
|
|
||||||
Bound::Excluded(range.end),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a new `RangeQuery` over a `Str` field.
|
|
||||||
///
|
|
||||||
/// The two `Bound` arguments make it possible to create more complex
|
|
||||||
/// ranges than semi-inclusive range.
|
|
||||||
///
|
|
||||||
/// If the field is not of the type `Str`, tantivy
|
|
||||||
/// will panic when the `Weight` object is created.
|
|
||||||
pub fn new_str_bounds(
|
|
||||||
field: String,
|
|
||||||
lower_bound: Bound<&str>,
|
|
||||||
upper_bound: Bound<&str>,
|
|
||||||
) -> RangeQuery {
|
|
||||||
let make_term_val = |val: &&str| val.as_bytes().to_vec();
|
|
||||||
RangeQuery {
|
|
||||||
field,
|
|
||||||
value_type: Type::Str,
|
|
||||||
lower_bound: map_bound(&lower_bound, make_term_val),
|
|
||||||
upper_bound: map_bound(&upper_bound, make_term_val),
|
|
||||||
limit: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a new `RangeQuery` over a `Str` field.
|
|
||||||
///
|
|
||||||
/// If the field is not of the type `Str`, tantivy
|
|
||||||
/// will panic when the `Weight` object is created.
|
|
||||||
pub fn new_str(field: String, range: Range<&str>) -> RangeQuery {
|
|
||||||
RangeQuery::new_str_bounds(
|
|
||||||
field,
|
|
||||||
Bound::Included(range.start),
|
|
||||||
Bound::Excluded(range.end),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Field to search over
|
/// Field to search over
|
||||||
pub fn field(&self) -> &str {
|
pub fn field(&self) -> Field {
|
||||||
&self.field
|
self.get_term().field()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The value type of the field
|
||||||
|
pub fn value_type(&self) -> Type {
|
||||||
|
self.get_term().typ()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn get_term(&self) -> &Term {
|
||||||
|
inner_bound(&self.lower_bound)
|
||||||
|
.or(inner_bound(&self.upper_bound))
|
||||||
|
.expect("At least one bound must be set")
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Limit the number of term the `RangeQuery` will go through.
|
/// Limit the number of term the `RangeQuery` will go through.
|
||||||
@@ -319,70 +120,23 @@ impl RangeQuery {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns true if the type maps to a u64 fast field
|
|
||||||
pub(crate) fn maps_to_u64_fastfield(typ: Type) -> bool {
|
|
||||||
match typ {
|
|
||||||
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
|
|
||||||
Type::IpAddr => false,
|
|
||||||
Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Query for RangeQuery {
|
impl Query for RangeQuery {
|
||||||
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
|
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
|
||||||
let schema = enable_scoring.schema();
|
let schema = enable_scoring.schema();
|
||||||
let field_type = schema
|
let field_type = schema.get_field_entry(self.field()).field_type();
|
||||||
.get_field_entry(schema.get_field(&self.field)?)
|
|
||||||
.field_type();
|
|
||||||
let value_type = field_type.value_type();
|
|
||||||
if value_type != self.value_type {
|
|
||||||
let err_msg = format!(
|
|
||||||
"Create a range query of the type {:?}, when the field given was of type \
|
|
||||||
{value_type:?}",
|
|
||||||
self.value_type
|
|
||||||
);
|
|
||||||
return Err(TantivyError::SchemaError(err_msg));
|
|
||||||
}
|
|
||||||
|
|
||||||
if field_type.is_fast() && is_type_valid_for_fastfield_range_query(self.value_type) {
|
if field_type.is_fast() && is_type_valid_for_fastfield_range_query(self.value_type()) {
|
||||||
if field_type.is_ip_addr() {
|
Ok(Box::new(FastFieldRangeWeight::new(
|
||||||
let parse_ip_from_bytes = |data: &Vec<u8>| {
|
self.field(),
|
||||||
let ip_u128_bytes: [u8; 16] = data.as_slice().try_into().map_err(|_| {
|
self.lower_bound.clone(),
|
||||||
crate::TantivyError::InvalidArgument(
|
self.upper_bound.clone(),
|
||||||
"Expected 8 bytes for ip address".to_string(),
|
)))
|
||||||
)
|
|
||||||
})?;
|
|
||||||
let ip_u128 = u128::from_be_bytes(ip_u128_bytes);
|
|
||||||
crate::Result::<Ipv6Addr>::Ok(Ipv6Addr::from_u128(ip_u128))
|
|
||||||
};
|
|
||||||
let lower_bound = map_bound_res(&self.lower_bound, parse_ip_from_bytes)?;
|
|
||||||
let upper_bound = map_bound_res(&self.upper_bound, parse_ip_from_bytes)?;
|
|
||||||
Ok(Box::new(IPFastFieldRangeWeight::new(
|
|
||||||
self.field.to_string(),
|
|
||||||
lower_bound,
|
|
||||||
upper_bound,
|
|
||||||
)))
|
|
||||||
} else {
|
|
||||||
// We run the range query on u64 value space for performance reasons and simpicity
|
|
||||||
// assert the type maps to u64
|
|
||||||
assert!(maps_to_u64_fastfield(self.value_type));
|
|
||||||
let parse_from_bytes = |data: &Vec<u8>| {
|
|
||||||
u64::from_be(BinarySerializable::deserialize(&mut &data[..]).unwrap())
|
|
||||||
};
|
|
||||||
|
|
||||||
let lower_bound = map_bound(&self.lower_bound, parse_from_bytes);
|
|
||||||
let upper_bound = map_bound(&self.upper_bound, parse_from_bytes);
|
|
||||||
Ok(Box::new(FastFieldRangeWeight::new_u64_lenient(
|
|
||||||
self.field.to_string(),
|
|
||||||
lower_bound,
|
|
||||||
upper_bound,
|
|
||||||
)))
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
|
let verify_and_unwrap_term = |val: &Term| val.serialized_value_bytes().to_owned();
|
||||||
Ok(Box::new(RangeWeight {
|
Ok(Box::new(RangeWeight {
|
||||||
field: self.field.to_string(),
|
field: self.field(),
|
||||||
lower_bound: self.lower_bound.clone(),
|
lower_bound: map_bound(&self.lower_bound, verify_and_unwrap_term),
|
||||||
upper_bound: self.upper_bound.clone(),
|
upper_bound: map_bound(&self.upper_bound, verify_and_unwrap_term),
|
||||||
limit: self.limit,
|
limit: self.limit,
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
@@ -390,7 +144,7 @@ impl Query for RangeQuery {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub struct RangeWeight {
|
pub struct RangeWeight {
|
||||||
field: String,
|
field: Field,
|
||||||
lower_bound: Bound<Vec<u8>>,
|
lower_bound: Bound<Vec<u8>>,
|
||||||
upper_bound: Bound<Vec<u8>>,
|
upper_bound: Bound<Vec<u8>>,
|
||||||
limit: Option<u64>,
|
limit: Option<u64>,
|
||||||
@@ -423,7 +177,7 @@ impl Weight for RangeWeight {
|
|||||||
let max_doc = reader.max_doc();
|
let max_doc = reader.max_doc();
|
||||||
let mut doc_bitset = BitSet::with_max_value(max_doc);
|
let mut doc_bitset = BitSet::with_max_value(max_doc);
|
||||||
|
|
||||||
let inverted_index = reader.inverted_index(reader.schema().get_field(&self.field)?)?;
|
let inverted_index = reader.inverted_index(self.field)?;
|
||||||
let term_dict = inverted_index.terms();
|
let term_dict = inverted_index.terms();
|
||||||
let mut term_range = self.term_range(term_dict)?;
|
let mut term_range = self.term_range(term_dict)?;
|
||||||
let mut processed_count = 0;
|
let mut processed_count = 0;
|
||||||
@@ -477,7 +231,7 @@ mod tests {
|
|||||||
use crate::schema::{
|
use crate::schema::{
|
||||||
Field, IntoIpv6Addr, Schema, TantivyDocument, FAST, INDEXED, STORED, TEXT,
|
Field, IntoIpv6Addr, Schema, TantivyDocument, FAST, INDEXED, STORED, TEXT,
|
||||||
};
|
};
|
||||||
use crate::{Index, IndexWriter};
|
use crate::{Index, IndexWriter, Term};
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_range_query_simple() -> crate::Result<()> {
|
fn test_range_query_simple() -> crate::Result<()> {
|
||||||
@@ -499,7 +253,10 @@ mod tests {
|
|||||||
let reader = index.reader()?;
|
let reader = index.reader()?;
|
||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
|
|
||||||
let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960u64..1970u64);
|
let docs_in_the_sixties = RangeQuery::new(
|
||||||
|
Bound::Included(Term::from_field_u64(year_field, 1960)),
|
||||||
|
Bound::Excluded(Term::from_field_u64(year_field, 1970)),
|
||||||
|
);
|
||||||
|
|
||||||
// ... or `1960..=1969` if inclusive range is enabled.
|
// ... or `1960..=1969` if inclusive range is enabled.
|
||||||
let count = searcher.search(&docs_in_the_sixties, &Count)?;
|
let count = searcher.search(&docs_in_the_sixties, &Count)?;
|
||||||
@@ -530,7 +287,10 @@ mod tests {
|
|||||||
let reader = index.reader()?;
|
let reader = index.reader()?;
|
||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
|
|
||||||
let mut docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960u64..1970u64);
|
let mut docs_in_the_sixties = RangeQuery::new(
|
||||||
|
Bound::Included(Term::from_field_u64(year_field, 1960)),
|
||||||
|
Bound::Excluded(Term::from_field_u64(year_field, 1970)),
|
||||||
|
);
|
||||||
docs_in_the_sixties.limit(5);
|
docs_in_the_sixties.limit(5);
|
||||||
|
|
||||||
// due to the limit and no docs in 1963, it's really only 1960..=1965
|
// due to the limit and no docs in 1963, it's really only 1960..=1965
|
||||||
@@ -575,29 +335,29 @@ mod tests {
|
|||||||
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
|
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
count_multiples(RangeQuery::new_i64("intfield".to_string(), 10..11)),
|
count_multiples(RangeQuery::new(
|
||||||
|
Bound::Included(Term::from_field_i64(int_field, 10)),
|
||||||
|
Bound::Excluded(Term::from_field_i64(int_field, 11)),
|
||||||
|
)),
|
||||||
9
|
9
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
count_multiples(RangeQuery::new_i64_bounds(
|
count_multiples(RangeQuery::new(
|
||||||
"intfield".to_string(),
|
Bound::Included(Term::from_field_i64(int_field, 10)),
|
||||||
Bound::Included(10),
|
Bound::Included(Term::from_field_i64(int_field, 11)),
|
||||||
Bound::Included(11)
|
|
||||||
)),
|
)),
|
||||||
18
|
18
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
count_multiples(RangeQuery::new_i64_bounds(
|
count_multiples(RangeQuery::new(
|
||||||
"intfield".to_string(),
|
Bound::Excluded(Term::from_field_i64(int_field, 9)),
|
||||||
Bound::Excluded(9),
|
Bound::Included(Term::from_field_i64(int_field, 10)),
|
||||||
Bound::Included(10)
|
|
||||||
)),
|
)),
|
||||||
9
|
9
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
count_multiples(RangeQuery::new_i64_bounds(
|
count_multiples(RangeQuery::new(
|
||||||
"intfield".to_string(),
|
Bound::Included(Term::from_field_i64(int_field, 9)),
|
||||||
Bound::Included(9),
|
|
||||||
Bound::Unbounded
|
Bound::Unbounded
|
||||||
)),
|
)),
|
||||||
91
|
91
|
||||||
@@ -646,29 +406,29 @@ mod tests {
|
|||||||
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
|
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
count_multiples(RangeQuery::new_f64("floatfield".to_string(), 10.0..11.0)),
|
count_multiples(RangeQuery::new(
|
||||||
|
Bound::Included(Term::from_field_f64(float_field, 10.0)),
|
||||||
|
Bound::Excluded(Term::from_field_f64(float_field, 11.0)),
|
||||||
|
)),
|
||||||
9
|
9
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
count_multiples(RangeQuery::new_f64_bounds(
|
count_multiples(RangeQuery::new(
|
||||||
"floatfield".to_string(),
|
Bound::Included(Term::from_field_f64(float_field, 10.0)),
|
||||||
Bound::Included(10.0),
|
Bound::Included(Term::from_field_f64(float_field, 11.0)),
|
||||||
Bound::Included(11.0)
|
|
||||||
)),
|
)),
|
||||||
18
|
18
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
count_multiples(RangeQuery::new_f64_bounds(
|
count_multiples(RangeQuery::new(
|
||||||
"floatfield".to_string(),
|
Bound::Excluded(Term::from_field_f64(float_field, 9.0)),
|
||||||
Bound::Excluded(9.0),
|
Bound::Included(Term::from_field_f64(float_field, 10.0)),
|
||||||
Bound::Included(10.0)
|
|
||||||
)),
|
)),
|
||||||
9
|
9
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
count_multiples(RangeQuery::new_f64_bounds(
|
count_multiples(RangeQuery::new(
|
||||||
"floatfield".to_string(),
|
Bound::Included(Term::from_field_f64(float_field, 9.0)),
|
||||||
Bound::Included(9.0),
|
|
||||||
Bound::Unbounded
|
Bound::Unbounded
|
||||||
)),
|
)),
|
||||||
91
|
91
|
||||||
|
|||||||
@@ -1,512 +0,0 @@
|
|||||||
//! IP Fastfields support efficient scanning for range queries.
|
|
||||||
//! We use this variant only if the fastfield exists, otherwise the default in `range_query` is
|
|
||||||
//! used, which uses the term dictionary + postings.
|
|
||||||
|
|
||||||
use std::net::Ipv6Addr;
|
|
||||||
use std::ops::{Bound, RangeInclusive};
|
|
||||||
|
|
||||||
use columnar::{Column, MonotonicallyMappableToU128};
|
|
||||||
|
|
||||||
use crate::query::range_query::fast_field_range_query::RangeDocSet;
|
|
||||||
use crate::query::{ConstScorer, EmptyScorer, Explanation, Scorer, Weight};
|
|
||||||
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
|
|
||||||
|
|
||||||
/// `IPFastFieldRangeWeight` uses the ip address fast field to execute range queries.
|
|
||||||
pub struct IPFastFieldRangeWeight {
|
|
||||||
field: String,
|
|
||||||
lower_bound: Bound<Ipv6Addr>,
|
|
||||||
upper_bound: Bound<Ipv6Addr>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl IPFastFieldRangeWeight {
|
|
||||||
/// Creates a new IPFastFieldRangeWeight.
|
|
||||||
pub fn new(field: String, lower_bound: Bound<Ipv6Addr>, upper_bound: Bound<Ipv6Addr>) -> Self {
|
|
||||||
Self {
|
|
||||||
field,
|
|
||||||
lower_bound,
|
|
||||||
upper_bound,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Weight for IPFastFieldRangeWeight {
|
|
||||||
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
|
||||||
let Some(ip_addr_column): Option<Column<Ipv6Addr>> =
|
|
||||||
reader.fast_fields().column_opt(&self.field)?
|
|
||||||
else {
|
|
||||||
return Ok(Box::new(EmptyScorer));
|
|
||||||
};
|
|
||||||
let value_range = bound_to_value_range(
|
|
||||||
&self.lower_bound,
|
|
||||||
&self.upper_bound,
|
|
||||||
ip_addr_column.min_value(),
|
|
||||||
ip_addr_column.max_value(),
|
|
||||||
);
|
|
||||||
let docset = RangeDocSet::new(value_range, ip_addr_column);
|
|
||||||
Ok(Box::new(ConstScorer::new(docset, boost)))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
|
|
||||||
let mut scorer = self.scorer(reader, 1.0)?;
|
|
||||||
if scorer.seek(doc) != doc {
|
|
||||||
return Err(TantivyError::InvalidArgument(format!(
|
|
||||||
"Document #({doc}) does not match"
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
let explanation = Explanation::new("Const", scorer.score());
|
|
||||||
Ok(explanation)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn bound_to_value_range(
|
|
||||||
lower_bound: &Bound<Ipv6Addr>,
|
|
||||||
upper_bound: &Bound<Ipv6Addr>,
|
|
||||||
min_value: Ipv6Addr,
|
|
||||||
max_value: Ipv6Addr,
|
|
||||||
) -> RangeInclusive<Ipv6Addr> {
|
|
||||||
let start_value = match lower_bound {
|
|
||||||
Bound::Included(ip_addr) => *ip_addr,
|
|
||||||
Bound::Excluded(ip_addr) => Ipv6Addr::from(ip_addr.to_u128() + 1),
|
|
||||||
Bound::Unbounded => min_value,
|
|
||||||
};
|
|
||||||
|
|
||||||
let end_value = match upper_bound {
|
|
||||||
Bound::Included(ip_addr) => *ip_addr,
|
|
||||||
Bound::Excluded(ip_addr) => Ipv6Addr::from(ip_addr.to_u128() - 1),
|
|
||||||
Bound::Unbounded => max_value,
|
|
||||||
};
|
|
||||||
start_value..=end_value
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
pub mod tests {
|
|
||||||
use proptest::prelude::ProptestConfig;
|
|
||||||
use proptest::strategy::Strategy;
|
|
||||||
use proptest::{prop_oneof, proptest};
|
|
||||||
|
|
||||||
use super::*;
|
|
||||||
use crate::collector::Count;
|
|
||||||
use crate::query::QueryParser;
|
|
||||||
use crate::schema::{Schema, FAST, INDEXED, STORED, STRING};
|
|
||||||
use crate::{Index, IndexWriter};
|
|
||||||
|
|
||||||
#[derive(Clone, Debug)]
|
|
||||||
pub struct Doc {
|
|
||||||
pub id: String,
|
|
||||||
pub ip: Ipv6Addr,
|
|
||||||
}
|
|
||||||
|
|
||||||
fn operation_strategy() -> impl Strategy<Value = Doc> {
|
|
||||||
prop_oneof![
|
|
||||||
(0u64..10_000u64).prop_map(doc_from_id_1),
|
|
||||||
(1u64..10_000u64).prop_map(doc_from_id_2),
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn doc_from_id_1(id: u64) -> Doc {
|
|
||||||
let id = id * 1000;
|
|
||||||
Doc {
|
|
||||||
// ip != id
|
|
||||||
id: id.to_string(),
|
|
||||||
ip: Ipv6Addr::from_u128(id as u128),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn doc_from_id_2(id: u64) -> Doc {
|
|
||||||
let id = id * 1000;
|
|
||||||
Doc {
|
|
||||||
// ip != id
|
|
||||||
id: (id - 1).to_string(),
|
|
||||||
ip: Ipv6Addr::from_u128(id as u128),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
proptest! {
|
|
||||||
#![proptest_config(ProptestConfig::with_cases(10))]
|
|
||||||
#[test]
|
|
||||||
fn test_ip_range_for_docs_prop(ops in proptest::collection::vec(operation_strategy(), 1..1000)) {
|
|
||||||
assert!(test_ip_range_for_docs(&ops).is_ok());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_ip_range_regression1() {
|
|
||||||
let ops = &[doc_from_id_1(0)];
|
|
||||||
assert!(test_ip_range_for_docs(ops).is_ok());
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_ip_range_regression2() {
|
|
||||||
let ops = &[
|
|
||||||
doc_from_id_1(52),
|
|
||||||
doc_from_id_1(63),
|
|
||||||
doc_from_id_1(12),
|
|
||||||
doc_from_id_2(91),
|
|
||||||
doc_from_id_2(33),
|
|
||||||
];
|
|
||||||
assert!(test_ip_range_for_docs(ops).is_ok());
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_ip_range_regression3() {
|
|
||||||
let ops = &[doc_from_id_1(1), doc_from_id_1(2), doc_from_id_1(3)];
|
|
||||||
assert!(test_ip_range_for_docs(ops).is_ok());
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_ip_range_regression3_simple() {
|
|
||||||
let mut schema_builder = Schema::builder();
|
|
||||||
let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
|
|
||||||
let schema = schema_builder.build();
|
|
||||||
let index = Index::create_in_ram(schema);
|
|
||||||
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
|
|
||||||
let ip_addrs: Vec<Ipv6Addr> = [1000, 2000, 3000]
|
|
||||||
.into_iter()
|
|
||||||
.map(Ipv6Addr::from_u128)
|
|
||||||
.collect();
|
|
||||||
for &ip_addr in &ip_addrs {
|
|
||||||
writer
|
|
||||||
.add_document(doc!(ips_field=>ip_addr, ips_field=>ip_addr))
|
|
||||||
.unwrap();
|
|
||||||
}
|
|
||||||
writer.commit().unwrap();
|
|
||||||
let searcher = index.reader().unwrap().searcher();
|
|
||||||
let range_weight = IPFastFieldRangeWeight {
|
|
||||||
field: "ips".to_string(),
|
|
||||||
lower_bound: Bound::Included(ip_addrs[1]),
|
|
||||||
upper_bound: Bound::Included(ip_addrs[2]),
|
|
||||||
};
|
|
||||||
let count = range_weight.count(searcher.segment_reader(0)).unwrap();
|
|
||||||
assert_eq!(count, 2);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn create_index_from_docs(docs: &[Doc]) -> Index {
|
|
||||||
let mut schema_builder = Schema::builder();
|
|
||||||
let ip_field = schema_builder.add_ip_addr_field("ip", STORED | FAST);
|
|
||||||
let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
|
|
||||||
let text_field = schema_builder.add_text_field("id", STRING | STORED);
|
|
||||||
let schema = schema_builder.build();
|
|
||||||
let index = Index::create_in_ram(schema);
|
|
||||||
|
|
||||||
{
|
|
||||||
let mut index_writer = index.writer_with_num_threads(2, 60_000_000).unwrap();
|
|
||||||
for doc in docs.iter() {
|
|
||||||
index_writer
|
|
||||||
.add_document(doc!(
|
|
||||||
ips_field => doc.ip,
|
|
||||||
ips_field => doc.ip,
|
|
||||||
ip_field => doc.ip,
|
|
||||||
text_field => doc.id.to_string(),
|
|
||||||
))
|
|
||||||
.unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
index_writer.commit().unwrap();
|
|
||||||
}
|
|
||||||
index
|
|
||||||
}
|
|
||||||
|
|
||||||
fn test_ip_range_for_docs(docs: &[Doc]) -> crate::Result<()> {
|
|
||||||
let index = create_index_from_docs(docs);
|
|
||||||
let reader = index.reader().unwrap();
|
|
||||||
let searcher = reader.searcher();
|
|
||||||
|
|
||||||
let get_num_hits = |query| searcher.search(&query, &Count).unwrap();
|
|
||||||
let query_from_text = |text: &str| {
|
|
||||||
QueryParser::for_index(&index, vec![])
|
|
||||||
.parse_query(text)
|
|
||||||
.unwrap()
|
|
||||||
};
|
|
||||||
|
|
||||||
let gen_query_inclusive = |field: &str, ip_range: &RangeInclusive<Ipv6Addr>| {
|
|
||||||
format!("{field}:[{} TO {}]", ip_range.start(), ip_range.end())
|
|
||||||
};
|
|
||||||
|
|
||||||
let test_sample = |sample_docs: &[Doc]| {
|
|
||||||
let mut ips: Vec<Ipv6Addr> = sample_docs.iter().map(|doc| doc.ip).collect();
|
|
||||||
ips.sort();
|
|
||||||
let ip_range = ips[0]..=ips[1];
|
|
||||||
let expected_num_hits = docs
|
|
||||||
.iter()
|
|
||||||
.filter(|doc| (ips[0]..=ips[1]).contains(&doc.ip))
|
|
||||||
.count();
|
|
||||||
|
|
||||||
let query = gen_query_inclusive("ip", &ip_range);
|
|
||||||
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
|
|
||||||
|
|
||||||
let query = gen_query_inclusive("ips", &ip_range);
|
|
||||||
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
|
|
||||||
|
|
||||||
// Intersection search
|
|
||||||
let id_filter = sample_docs[0].id.to_string();
|
|
||||||
let expected_num_hits = docs
|
|
||||||
.iter()
|
|
||||||
.filter(|doc| ip_range.contains(&doc.ip) && doc.id == id_filter)
|
|
||||||
.count();
|
|
||||||
let query = format!(
|
|
||||||
"{} AND id:{}",
|
|
||||||
gen_query_inclusive("ip", &ip_range),
|
|
||||||
&id_filter
|
|
||||||
);
|
|
||||||
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
|
|
||||||
|
|
||||||
// Intersection search on multivalue ip field
|
|
||||||
let id_filter = sample_docs[0].id.to_string();
|
|
||||||
let query = format!(
|
|
||||||
"{} AND id:{}",
|
|
||||||
gen_query_inclusive("ips", &ip_range),
|
|
||||||
&id_filter
|
|
||||||
);
|
|
||||||
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
|
|
||||||
};
|
|
||||||
|
|
||||||
test_sample(&[docs[0].clone(), docs[0].clone()]);
|
|
||||||
if docs.len() > 1 {
|
|
||||||
test_sample(&[docs[0].clone(), docs[1].clone()]);
|
|
||||||
test_sample(&[docs[1].clone(), docs[1].clone()]);
|
|
||||||
}
|
|
||||||
if docs.len() > 2 {
|
|
||||||
test_sample(&[docs[1].clone(), docs[2].clone()]);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(all(test, feature = "unstable"))]
mod bench {

    use rand::rngs::StdRng;
    use rand::{Rng, SeedableRng};
    use test::Bencher;

    use super::tests::*;
    use super::*;
    use crate::collector::Count;
    use crate::query::QueryParser;
    use crate::Index;

    /// Builds the benchmark index of 100_000 documents from a fixed RNG seed.
    ///
    /// Each document gets an `id` of `"veryfew"`, `"few"` or `"many"` and an ip drawn from the
    /// 100 distinct values `0, 1000, 2000, ..., 99_000`.
    fn get_index_0_to_100() -> Index {
        let mut rng = StdRng::from_seed([1u8; 32]);
        let num_vals = 100_000;
        let docs: Vec<_> = (0..num_vals)
            .map(|_i| {
                let id = if rng.gen_bool(0.01) {
                    "veryfew".to_string() // 1%
                } else if rng.gen_bool(0.1) {
                    "few".to_string() // 9%
                } else {
                    "many".to_string() // 90%
                };
                Doc {
                    id,
                    // Multiply by 1000, so that we create many buckets in the compact space
                    // The benches depend on this range to select n-percent of elements with the
                    // methods below.
                    ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
                }
            })
            .collect();

        create_index_from_docs(&docs)
    }

    /// Range `0..=90_000`. The end is inclusive, so it covers 91 of the 100 distinct ip values.
    fn get_90_percent() -> RangeInclusive<Ipv6Addr> {
        let start = Ipv6Addr::from_u128(0);
        let end = Ipv6Addr::from_u128(90 * 1000);
        start..=end
    }

    /// Range `0..=10_000`. The end is inclusive, so it covers 11 of the 100 distinct ip values.
    fn get_10_percent() -> RangeInclusive<Ipv6Addr> {
        let start = Ipv6Addr::from_u128(0);
        let end = Ipv6Addr::from_u128(10 * 1000);
        start..=end
    }

    /// Single-value range `10_000..=10_000`, i.e. one of the 100 distinct ip values.
    fn get_1_percent() -> RangeInclusive<Ipv6Addr> {
        let start = Ipv6Addr::from_u128(10 * 1000);
        let end = Ipv6Addr::from_u128(10 * 1000);
        start..=end
    }

    /// Parses `"<field>:[<start> TO <end>] <suffix>"` against `index` and returns the hit count.
    ///
    /// `suffix` is appended verbatim, e.g. `"AND id:few"` to intersect with a term query, or
    /// `""` for a plain range query. A fresh reader and searcher are created on every call.
    fn execute_query(
        field: &str,
        ip_range: RangeInclusive<Ipv6Addr>,
        suffix: &str,
        index: &Index,
    ) -> usize {
        let query_text = format!(
            "{}:[{} TO {}] {}",
            field,
            ip_range.start(),
            ip_range.end(),
            suffix
        );
        let query = QueryParser::for_index(index, vec![])
            .parse_query(&query_text)
            .unwrap();
        let reader = index.reader().unwrap();
        let searcher = reader.searcher();
        searcher.search(&query, &(Count)).unwrap()
    }

    // Single-valued field `ip`.

    #[bench]
    fn bench_ip_range_hit_90_percent(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ip", get_90_percent(), "", &index));
    }

    #[bench]
    fn bench_ip_range_hit_10_percent(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ip", get_10_percent(), "", &index));
    }

    #[bench]
    fn bench_ip_range_hit_1_percent(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ip", get_1_percent(), "", &index));
    }

    #[bench]
    fn bench_ip_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ip", get_10_percent(), "AND id:few", &index));
    }

    #[bench]
    fn bench_ip_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ip", get_1_percent(), "AND id:few", &index));
    }

    #[bench]
    fn bench_ip_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ip", get_1_percent(), "AND id:many", &index));
    }

    #[bench]
    fn bench_ip_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ip", get_1_percent(), "AND id:veryfew", &index));
    }

    #[bench]
    fn bench_ip_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ip", get_10_percent(), "AND id:many", &index));
    }

    #[bench]
    fn bench_ip_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ip", get_90_percent(), "AND id:many", &index));
    }

    #[bench]
    fn bench_ip_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ip", get_90_percent(), "AND id:few", &index));
    }

    #[bench]
    fn bench_ip_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ip", get_90_percent(), "AND id:veryfew", &index));
    }

    // Multi-valued field `ips`.

    #[bench]
    fn bench_ip_range_hit_90_percent_multi(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ips", get_90_percent(), "", &index));
    }

    #[bench]
    fn bench_ip_range_hit_10_percent_multi(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ips", get_10_percent(), "", &index));
    }

    #[bench]
    fn bench_ip_range_hit_1_percent_multi(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ips", get_1_percent(), "", &index));
    }

    #[bench]
    fn bench_ip_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ips", get_10_percent(), "AND id:few", &index));
    }

    #[bench]
    fn bench_ip_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ips", get_1_percent(), "AND id:few", &index));
    }

    #[bench]
    fn bench_ip_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ips", get_1_percent(), "AND id:many", &index));
    }

    #[bench]
    fn bench_ip_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ips", get_1_percent(), "AND id:veryfew", &index));
    }

    #[bench]
    fn bench_ip_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ips", get_10_percent(), "AND id:many", &index));
    }

    #[bench]
    fn bench_ip_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ips", get_90_percent(), "AND id:many", &index));
    }

    #[bench]
    fn bench_ip_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ips", get_90_percent(), "AND id:few", &index));
    }

    #[bench]
    fn bench_ip_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
        let index = get_index_0_to_100();

        bench.iter(|| execute_query("ips", get_90_percent(), "AND id:veryfew", &index));
    }
}
|
|
||||||
@@ -2,54 +2,34 @@
|
|||||||
//! We use this variant only if the fastfield exists, otherwise the default in `range_query` is
|
//! We use this variant only if the fastfield exists, otherwise the default in `range_query` is
|
||||||
//! used, which uses the term dictionary + postings.
|
//! used, which uses the term dictionary + postings.
|
||||||
|
|
||||||
|
use std::net::Ipv6Addr;
|
||||||
use std::ops::{Bound, RangeInclusive};
|
use std::ops::{Bound, RangeInclusive};
|
||||||
|
|
||||||
use columnar::{ColumnType, HasAssociatedColumnType, MonotonicallyMappableToU64};
|
use columnar::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64, StrColumn};
|
||||||
|
use common::BinarySerializable;
|
||||||
|
|
||||||
use super::fast_field_range_query::RangeDocSet;
|
use super::fast_field_range_doc_set::RangeDocSet;
|
||||||
use super::map_bound;
|
use super::{map_bound, map_bound_res};
|
||||||
use crate::query::{ConstScorer, EmptyScorer, Explanation, Query, Scorer, Weight};
|
use crate::query::range_query::range_query::inner_bound;
|
||||||
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
|
use crate::query::{AllScorer, ConstScorer, EmptyScorer, Explanation, Query, Scorer, Weight};
|
||||||
|
use crate::schema::{Field, Type};
|
||||||
|
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, Term};
|
||||||
|
|
||||||
/// `FastFieldRangeWeight` uses the fast field to execute range queries.
|
/// `FastFieldRangeWeight` uses the fast field to execute range queries.
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub struct FastFieldRangeWeight {
|
pub struct FastFieldRangeWeight {
|
||||||
field: String,
|
lower_bound: Bound<Term>,
|
||||||
lower_bound: Bound<u64>,
|
upper_bound: Bound<Term>,
|
||||||
upper_bound: Bound<u64>,
|
field: Field,
|
||||||
column_type_opt: Option<ColumnType>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl FastFieldRangeWeight {
|
impl FastFieldRangeWeight {
|
||||||
/// Create a new FastFieldRangeWeight, using the u64 representation of any fast field.
|
/// Create a new FastFieldRangeWeight
|
||||||
pub(crate) fn new_u64_lenient(
|
pub(crate) fn new(field: Field, lower_bound: Bound<Term>, upper_bound: Bound<Term>) -> Self {
|
||||||
field: String,
|
|
||||||
lower_bound: Bound<u64>,
|
|
||||||
upper_bound: Bound<u64>,
|
|
||||||
) -> Self {
|
|
||||||
let lower_bound = map_bound(&lower_bound, |val| *val);
|
|
||||||
let upper_bound = map_bound(&upper_bound, |val| *val);
|
|
||||||
Self {
|
Self {
|
||||||
field,
|
|
||||||
lower_bound,
|
lower_bound,
|
||||||
upper_bound,
|
upper_bound,
|
||||||
column_type_opt: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a new `FastFieldRangeWeight` for a range of a u64-mappable type .
|
|
||||||
pub fn new<T: HasAssociatedColumnType + MonotonicallyMappableToU64>(
|
|
||||||
field: String,
|
|
||||||
lower_bound: Bound<T>,
|
|
||||||
upper_bound: Bound<T>,
|
|
||||||
) -> Self {
|
|
||||||
let lower_bound = map_bound(&lower_bound, |val| val.to_u64());
|
|
||||||
let upper_bound = map_bound(&upper_bound, |val| val.to_u64());
|
|
||||||
Self {
|
|
||||||
field,
|
field,
|
||||||
lower_bound,
|
|
||||||
upper_bound,
|
|
||||||
column_type_opt: Some(T::column_type()),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -65,30 +45,101 @@ impl Query for FastFieldRangeWeight {
|
|||||||
|
|
||||||
impl Weight for FastFieldRangeWeight {
|
impl Weight for FastFieldRangeWeight {
|
||||||
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
||||||
let fast_field_reader = reader.fast_fields();
|
// Check if both bounds are Bound::Unbounded
|
||||||
let column_type_opt: Option<[ColumnType; 1]> =
|
if self.lower_bound == Bound::Unbounded && self.upper_bound == Bound::Unbounded {
|
||||||
self.column_type_opt.map(|column_type| [column_type]);
|
return Ok(Box::new(AllScorer::new(reader.max_doc())));
|
||||||
let column_type_opt_ref: Option<&[ColumnType]> = column_type_opt
|
}
|
||||||
.as_ref()
|
let field_name = reader.schema().get_field_name(self.field);
|
||||||
.map(|column_types| column_types.as_slice());
|
let field_type = reader.schema().get_field_entry(self.field).field_type();
|
||||||
let Some((column, _)) =
|
|
||||||
fast_field_reader.u64_lenient_for_type(column_type_opt_ref, &self.field)?
|
let term = inner_bound(&self.lower_bound)
|
||||||
else {
|
.or(inner_bound(&self.upper_bound))
|
||||||
return Ok(Box::new(EmptyScorer));
|
.expect("At least one bound must be set");
|
||||||
};
|
assert_eq!(
|
||||||
#[allow(clippy::reversed_empty_ranges)]
|
term.typ(),
|
||||||
let value_range = bound_to_value_range(
|
field_type.value_type(),
|
||||||
&self.lower_bound,
|
"Field is of type {:?}, but got term of type {:?}",
|
||||||
&self.upper_bound,
|
field_type,
|
||||||
column.min_value(),
|
term.typ()
|
||||||
column.max_value(),
|
);
|
||||||
)
|
if field_type.is_ip_addr() {
|
||||||
.unwrap_or(1..=0); // empty range
|
let parse_ip_from_bytes = |term: &Term| {
|
||||||
if value_range.is_empty() {
|
term.value().as_ip_addr().ok_or_else(|| {
|
||||||
return Ok(Box::new(EmptyScorer));
|
crate::TantivyError::InvalidArgument("Expected ip address".to_string())
|
||||||
|
})
|
||||||
|
};
|
||||||
|
let lower_bound = map_bound_res(&self.lower_bound, parse_ip_from_bytes)?;
|
||||||
|
let upper_bound = map_bound_res(&self.upper_bound, parse_ip_from_bytes)?;
|
||||||
|
|
||||||
|
let Some(ip_addr_column): Option<Column<Ipv6Addr>> =
|
||||||
|
reader.fast_fields().column_opt(field_name)?
|
||||||
|
else {
|
||||||
|
return Ok(Box::new(EmptyScorer));
|
||||||
|
};
|
||||||
|
let value_range = bound_to_value_range_ip(
|
||||||
|
&lower_bound,
|
||||||
|
&upper_bound,
|
||||||
|
ip_addr_column.min_value(),
|
||||||
|
ip_addr_column.max_value(),
|
||||||
|
);
|
||||||
|
let docset = RangeDocSet::new(value_range, ip_addr_column);
|
||||||
|
Ok(Box::new(ConstScorer::new(docset, boost)))
|
||||||
|
} else {
|
||||||
|
let (lower_bound, upper_bound) = if field_type.is_str() {
|
||||||
|
let Some(str_dict_column): Option<StrColumn> =
|
||||||
|
reader.fast_fields().str(field_name)?
|
||||||
|
else {
|
||||||
|
return Ok(Box::new(EmptyScorer));
|
||||||
|
};
|
||||||
|
let dict = str_dict_column.dictionary();
|
||||||
|
|
||||||
|
let lower_bound = map_bound(&self.lower_bound, |term| {
|
||||||
|
term.serialized_value_bytes().to_vec()
|
||||||
|
});
|
||||||
|
let upper_bound = map_bound(&self.upper_bound, |term| {
|
||||||
|
term.serialized_value_bytes().to_vec()
|
||||||
|
});
|
||||||
|
// Get term ids for terms
|
||||||
|
let (lower_bound, upper_bound) =
|
||||||
|
dict.term_bounds_to_ord(lower_bound, upper_bound)?;
|
||||||
|
(lower_bound, upper_bound)
|
||||||
|
} else {
|
||||||
|
assert!(
|
||||||
|
maps_to_u64_fastfield(field_type.value_type()),
|
||||||
|
"{:?}",
|
||||||
|
field_type
|
||||||
|
);
|
||||||
|
let parse_from_bytes = |term: &Term| {
|
||||||
|
u64::from_be(
|
||||||
|
BinarySerializable::deserialize(&mut &term.serialized_value_bytes()[..])
|
||||||
|
.unwrap(),
|
||||||
|
)
|
||||||
|
};
|
||||||
|
|
||||||
|
let lower_bound = map_bound(&self.lower_bound, parse_from_bytes);
|
||||||
|
let upper_bound = map_bound(&self.upper_bound, parse_from_bytes);
|
||||||
|
(lower_bound, upper_bound)
|
||||||
|
};
|
||||||
|
|
||||||
|
let fast_field_reader = reader.fast_fields();
|
||||||
|
let Some((column, _)) = fast_field_reader.u64_lenient_for_type(None, field_name)?
|
||||||
|
else {
|
||||||
|
return Ok(Box::new(EmptyScorer));
|
||||||
|
};
|
||||||
|
#[allow(clippy::reversed_empty_ranges)]
|
||||||
|
let value_range = bound_to_value_range(
|
||||||
|
&lower_bound,
|
||||||
|
&upper_bound,
|
||||||
|
column.min_value(),
|
||||||
|
column.max_value(),
|
||||||
|
)
|
||||||
|
.unwrap_or(1..=0); // empty range
|
||||||
|
if value_range.is_empty() {
|
||||||
|
return Ok(Box::new(EmptyScorer));
|
||||||
|
}
|
||||||
|
let docset = RangeDocSet::new(value_range, column);
|
||||||
|
Ok(Box::new(ConstScorer::new(docset, boost)))
|
||||||
}
|
}
|
||||||
let docset = RangeDocSet::new(value_range, column);
|
|
||||||
Ok(Box::new(ConstScorer::new(docset, boost)))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
|
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
|
||||||
@@ -104,6 +155,35 @@ impl Weight for FastFieldRangeWeight {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns true if the type maps to a u64 fast field
|
||||||
|
pub(crate) fn maps_to_u64_fastfield(typ: Type) -> bool {
|
||||||
|
match typ {
|
||||||
|
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
|
||||||
|
Type::IpAddr => false,
|
||||||
|
Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn bound_to_value_range_ip(
|
||||||
|
lower_bound: &Bound<Ipv6Addr>,
|
||||||
|
upper_bound: &Bound<Ipv6Addr>,
|
||||||
|
min_value: Ipv6Addr,
|
||||||
|
max_value: Ipv6Addr,
|
||||||
|
) -> RangeInclusive<Ipv6Addr> {
|
||||||
|
let start_value = match lower_bound {
|
||||||
|
Bound::Included(ip_addr) => *ip_addr,
|
||||||
|
Bound::Excluded(ip_addr) => Ipv6Addr::from(ip_addr.to_u128() + 1),
|
||||||
|
Bound::Unbounded => min_value,
|
||||||
|
};
|
||||||
|
|
||||||
|
let end_value = match upper_bound {
|
||||||
|
Bound::Included(ip_addr) => *ip_addr,
|
||||||
|
Bound::Excluded(ip_addr) => Ipv6Addr::from(ip_addr.to_u128() - 1),
|
||||||
|
Bound::Unbounded => max_value,
|
||||||
|
};
|
||||||
|
start_value..=end_value
|
||||||
|
}
|
||||||
|
|
||||||
// Returns None, if the range cannot be converted to a inclusive range (which equals to a empty
|
// Returns None, if the range cannot be converted to a inclusive range (which equals to a empty
|
||||||
// range).
|
// range).
|
||||||
fn bound_to_value_range<T: MonotonicallyMappableToU64>(
|
fn bound_to_value_range<T: MonotonicallyMappableToU64>(
|
||||||
@@ -137,11 +217,72 @@ pub mod tests {
|
|||||||
use rand::seq::SliceRandom;
|
use rand::seq::SliceRandom;
|
||||||
use rand::SeedableRng;
|
use rand::SeedableRng;
|
||||||
|
|
||||||
use crate::collector::Count;
|
use crate::collector::{Count, TopDocs};
|
||||||
use crate::query::range_query::range_query_u64_fastfield::FastFieldRangeWeight;
|
use crate::query::range_query::range_query_u64_fastfield::FastFieldRangeWeight;
|
||||||
use crate::query::{QueryParser, Weight};
|
use crate::query::{QueryParser, Weight};
|
||||||
use crate::schema::{NumericOptions, Schema, SchemaBuilder, FAST, INDEXED, STORED, STRING};
|
use crate::schema::{
|
||||||
use crate::{Index, IndexWriter, TERMINATED};
|
NumericOptions, Schema, SchemaBuilder, FAST, INDEXED, STORED, STRING, TEXT,
|
||||||
|
};
|
||||||
|
use crate::{Index, IndexWriter, Term, TERMINATED};
|
||||||
|
|
||||||
|
#[test]
fn test_text_field_ff_range_query() -> crate::Result<()> {
    // Index with a single tokenized fast text field holding the two titles "bbb" and "ddd".
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT | FAST);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema.clone());
    let mut index_writer = index.writer_for_tests()?;
    let title = schema.get_field("title").unwrap();
    index_writer.add_document(doc!(title => "bbb"))?;
    index_writer.add_document(doc!(title => "ddd"))?;
    index_writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher();
    let query_parser = QueryParser::for_index(&index, vec![title]);

    // (query text, expected number of matching documents), checked in order.
    let cases: &[(&str, usize)] = &[
        // Both ends bounded.
        ("title:[aaa TO ccc]", 1),
        ("title:[aaa TO bbb]", 1),
        ("title:[bbb TO bbb]", 1),
        ("title:[bbb TO ddd]", 2),
        ("title:[bbb TO eee]", 2),
        ("title:[bb TO eee]", 2),
        ("title:[ccc TO ccc]", 0),
        ("title:[ccc TO ddd]", 1),
        ("title:[ccc TO eee]", 1),
        // Inclusive lower bound, open upper end.
        ("title:[aaa TO *}", 2),
        ("title:[bbb TO *]", 2),
        ("title:[bb TO *]", 2),
        ("title:[ccc TO *]", 1),
        ("title:[ddd TO *]", 1),
        ("title:[dddd TO *]", 0),
        // Exclusive lower bound, open upper end.
        ("title:{aaa TO *}", 2),
        ("title:{bbb TO *]", 1),
        ("title:{bb TO *]", 2),
        ("title:{ccc TO *]", 1),
        ("title:{ddd TO *]", 0),
        ("title:{dddd TO *]", 0),
        // Open lower end.
        ("title:[* TO bb]", 0),
        ("title:[* TO bbb]", 1),
        ("title:[* TO ccc]", 1),
        ("title:[* TO ddd]", 2),
        ("title:[* TO ddd}", 1),
        ("title:[* TO eee]", 2),
    ];

    for &(query_text, expected_hits) in cases {
        let query = query_parser.parse_query(query_text).unwrap();
        let top_docs = searcher.search(&query, &TopDocs::with_limit(10)).unwrap();
        assert_eq!(top_docs.len(), expected_hits);
    }

    Ok(())
}
|
||||||
|
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub struct Doc {
|
pub struct Doc {
|
||||||
@@ -159,14 +300,14 @@ pub mod tests {
|
|||||||
fn doc_from_id_1(id: u64) -> Doc {
|
fn doc_from_id_1(id: u64) -> Doc {
|
||||||
let id = id * 1000;
|
let id = id * 1000;
|
||||||
Doc {
|
Doc {
|
||||||
id_name: id.to_string(),
|
id_name: format!("id_name{:010}", id),
|
||||||
id,
|
id,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fn doc_from_id_2(id: u64) -> Doc {
|
fn doc_from_id_2(id: u64) -> Doc {
|
||||||
let id = id * 1000;
|
let id = id * 1000;
|
||||||
Doc {
|
Doc {
|
||||||
id_name: (id - 1).to_string(),
|
id_name: format!("id_name{:010}", id - 1),
|
||||||
id,
|
id,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -213,10 +354,10 @@ pub mod tests {
|
|||||||
writer.add_document(doc!(field=>52_000u64)).unwrap();
|
writer.add_document(doc!(field=>52_000u64)).unwrap();
|
||||||
writer.commit().unwrap();
|
writer.commit().unwrap();
|
||||||
let searcher = index.reader().unwrap().searcher();
|
let searcher = index.reader().unwrap().searcher();
|
||||||
let range_query = FastFieldRangeWeight::new_u64_lenient(
|
let range_query = FastFieldRangeWeight::new(
|
||||||
"test_field".to_string(),
|
field,
|
||||||
Bound::Included(50_000),
|
Bound::Included(Term::from_field_u64(field, 50_000)),
|
||||||
Bound::Included(50_002),
|
Bound::Included(Term::from_field_u64(field, 50_002)),
|
||||||
);
|
);
|
||||||
let scorer = range_query
|
let scorer = range_query
|
||||||
.scorer(searcher.segment_reader(0), 1.0f32)
|
.scorer(searcher.segment_reader(0), 1.0f32)
|
||||||
@@ -254,7 +395,8 @@ pub mod tests {
|
|||||||
NumericOptions::default().set_fast().set_indexed(),
|
NumericOptions::default().set_fast().set_indexed(),
|
||||||
);
|
);
|
||||||
|
|
||||||
let text_field = schema_builder.add_text_field("id_name", STRING | STORED);
|
let text_field = schema_builder.add_text_field("id_name", STRING | STORED | FAST);
|
||||||
|
let text_field2 = schema_builder.add_text_field("id_name_fast", STRING | STORED | FAST);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
|
|
||||||
@@ -273,6 +415,7 @@ pub mod tests {
|
|||||||
id_f64_field => doc.id as f64,
|
id_f64_field => doc.id as f64,
|
||||||
id_i64_field => doc.id as i64,
|
id_i64_field => doc.id as i64,
|
||||||
text_field => doc.id_name.to_string(),
|
text_field => doc.id_name.to_string(),
|
||||||
|
text_field2 => doc.id_name.to_string(),
|
||||||
))
|
))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
@@ -317,6 +460,24 @@ pub mod tests {
|
|||||||
let query = gen_query_inclusive("ids", ids[0]..=ids[1]);
|
let query = gen_query_inclusive("ids", ids[0]..=ids[1]);
|
||||||
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
|
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
|
||||||
|
|
||||||
|
// Text query
|
||||||
|
{
|
||||||
|
let test_text_query = |field_name: &str| {
|
||||||
|
let mut id_names: Vec<&str> =
|
||||||
|
sample_docs.iter().map(|doc| doc.id_name.as_str()).collect();
|
||||||
|
id_names.sort();
|
||||||
|
let expected_num_hits = docs
|
||||||
|
.iter()
|
||||||
|
.filter(|doc| (id_names[0]..=id_names[1]).contains(&doc.id_name.as_str()))
|
||||||
|
.count();
|
||||||
|
let query = format!("{}:[{} TO {}]", field_name, id_names[0], id_names[1]);
|
||||||
|
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
|
||||||
|
};
|
||||||
|
|
||||||
|
test_text_query("id_name");
|
||||||
|
test_text_query("id_name_fast");
|
||||||
|
}
|
||||||
|
|
||||||
// Exclusive range
|
// Exclusive range
|
||||||
let expected_num_hits = docs
|
let expected_num_hits = docs
|
||||||
.iter()
|
.iter()
|
||||||
@@ -394,6 +555,202 @@ pub mod tests {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
pub mod ip_range_tests {
    use proptest::prelude::ProptestConfig;
    use proptest::strategy::Strategy;
    use proptest::{prop_oneof, proptest};

    use super::*;
    use crate::collector::Count;
    use crate::query::QueryParser;
    use crate::schema::{Schema, FAST, INDEXED, STORED, STRING};
    use crate::{Index, IndexWriter};

    /// Test document: a textual id plus a single IP address.
    #[derive(Clone, Debug)]
    pub struct Doc {
        pub id: String,
        pub ip: Ipv6Addr,
    }

    /// Proptest strategy producing documents from either of the two constructors below.
    fn operation_strategy() -> impl Strategy<Value = Doc> {
        prop_oneof![
            (0u64..10_000u64).prop_map(doc_from_id_1),
            (1u64..10_000u64).prop_map(doc_from_id_2),
        ]
    }

    /// Builds a doc whose id string and ip both carry the value `id * 1000`.
    pub fn doc_from_id_1(id: u64) -> Doc {
        let scaled = id * 1000;
        Doc {
            id: scaled.to_string(),
            ip: Ipv6Addr::from_u128(u128::from(scaled)),
        }
    }

    /// Builds a doc whose ip carries `id * 1000` while the id string is one less, so that the
    /// id and the ip differ.
    fn doc_from_id_2(id: u64) -> Doc {
        let scaled = id * 1000;
        Doc {
            id: (scaled - 1).to_string(),
            ip: Ipv6Addr::from_u128(u128::from(scaled)),
        }
    }

    proptest! {
        #![proptest_config(ProptestConfig::with_cases(10))]
        #[test]
        fn test_ip_range_for_docs_prop(ops in proptest::collection::vec(operation_strategy(), 1..1000)) {
            assert!(test_ip_range_for_docs(&ops).is_ok());
        }
    }

    #[test]
    fn test_ip_range_regression1() {
        let docs = [doc_from_id_1(0)];
        assert!(test_ip_range_for_docs(&docs).is_ok());
    }

    #[test]
    fn test_ip_range_regression2() {
        let docs = [
            doc_from_id_1(52),
            doc_from_id_1(63),
            doc_from_id_1(12),
            doc_from_id_2(91),
            doc_from_id_2(33),
        ];
        assert!(test_ip_range_for_docs(&docs).is_ok());
    }

    #[test]
    fn test_ip_range_regression3() {
        let docs = [doc_from_id_1(1), doc_from_id_1(2), doc_from_id_1(3)];
        assert!(test_ip_range_for_docs(&docs).is_ok());
    }

    #[test]
    fn test_ip_range_regression3_simple() {
        let mut schema_builder = Schema::builder();
        let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
        let index = Index::create_in_ram(schema_builder.build());
        let mut writer: IndexWriter = index.writer_for_tests().unwrap();

        let ip_addrs: Vec<Ipv6Addr> = [1000, 2000, 3000]
            .into_iter()
            .map(Ipv6Addr::from_u128)
            .collect();
        // Every document stores its address twice, making `ips` multi-valued.
        for ip_addr in ip_addrs.iter().copied() {
            let document = doc!(ips_field=>ip_addr, ips_field=>ip_addr);
            writer.add_document(document).unwrap();
        }
        writer.commit().unwrap();

        let searcher = index.reader().unwrap().searcher();
        let lower = Bound::Included(Term::from_field_ip_addr(ips_field, ip_addrs[1]));
        let upper = Bound::Included(Term::from_field_ip_addr(ips_field, ip_addrs[2]));
        let range_weight = FastFieldRangeWeight::new(ips_field, lower, upper);

        let count =
            crate::query::weight::Weight::count(&range_weight, searcher.segment_reader(0)).unwrap();
        assert_eq!(count, 2);
    }

    /// Creates an in-RAM index with a single-valued `ip` field, a multi-valued `ips` field
    /// (each address added twice) and a string `id` field, and indexes `docs` into it.
    pub fn create_index_from_ip_docs(docs: &[Doc]) -> Index {
        let mut schema_builder = Schema::builder();
        let ip_field = schema_builder.add_ip_addr_field("ip", STORED | FAST);
        let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
        let text_field = schema_builder.add_text_field("id", STRING | STORED);
        let index = Index::create_in_ram(schema_builder.build());

        {
            let mut index_writer = index.writer_with_num_threads(2, 60_000_000).unwrap();
            for doc in docs {
                let document = doc!(
                    ips_field => doc.ip,
                    ips_field => doc.ip,
                    ip_field => doc.ip,
                    text_field => doc.id.to_string(),
                );
                index_writer.add_document(document).unwrap();
            }

            index_writer.commit().unwrap();
        }
        index
    }

    /// Indexes `docs` and checks that inclusive IP range queries, alone and intersected with an
    /// `id` term, return the hit counts computed directly from `docs`.
    fn test_ip_range_for_docs(docs: &[Doc]) -> crate::Result<()> {
        let index = create_index_from_ip_docs(docs);
        let reader = index.reader().unwrap();
        let searcher = reader.searcher();

        // Parses the query text and returns the number of matching documents.
        let count_hits = |text: &str| {
            let query = QueryParser::for_index(&index, vec![])
                .parse_query(text)
                .unwrap();
            searcher.search(&query, &Count).unwrap()
        };

        let inclusive_query = |field: &str, ip_range: &RangeInclusive<Ipv6Addr>| {
            format!("{field}:[{} TO {}]", ip_range.start(), ip_range.end())
        };

        let check_sample = |sample_docs: &[Doc]| {
            let mut ips: Vec<Ipv6Addr> = sample_docs.iter().map(|doc| doc.ip).collect();
            ips.sort();
            let ip_range = ips[0]..=ips[1];

            // Plain range search on the single-valued, then the multi-valued field.
            let hits_in_range = docs
                .iter()
                .filter(|doc| ip_range.contains(&doc.ip))
                .count();
            for field in ["ip", "ips"] {
                let query = inclusive_query(field, &ip_range);
                assert_eq!(count_hits(&query), hits_in_range);
            }

            // Intersection search, again on both fields.
            let id_filter = sample_docs[0].id.to_string();
            let hits_in_range_with_id = docs
                .iter()
                .filter(|doc| ip_range.contains(&doc.ip) && doc.id == id_filter)
                .count();
            for field in ["ip", "ips"] {
                let query = format!(
                    "{} AND id:{}",
                    inclusive_query(field, &ip_range),
                    &id_filter
                );
                assert_eq!(count_hits(&query), hits_in_range_with_id);
            }
        };

        check_sample(&[docs[0].clone(), docs[0].clone()]);
        if docs.len() > 1 {
            check_sample(&[docs[0].clone(), docs[1].clone()]);
            check_sample(&[docs[1].clone(), docs[1].clone()]);
        }
        if docs.len() > 2 {
            check_sample(&[docs[1].clone(), docs[2].clone()]);
        }

        Ok(())
    }
}
|
||||||
|
|
||||||
#[cfg(all(test, feature = "unstable"))]
|
#[cfg(all(test, feature = "unstable"))]
|
||||||
mod bench {
|
mod bench {
|
||||||
|
|
||||||
@@ -601,3 +958,242 @@ mod bench {
|
|||||||
bench.iter(|| execute_query("ids", get_90_percent(), "AND id_name:veryfew", &index));
|
bench.iter(|| execute_query("ids", get_90_percent(), "AND id_name:veryfew", &index));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(all(test, feature = "unstable"))]
|
||||||
|
mod bench_ip {
|
||||||
|
|
||||||
|
use rand::rngs::StdRng;
|
||||||
|
use rand::{Rng, SeedableRng};
|
||||||
|
use test::Bencher;
|
||||||
|
|
||||||
|
use super::ip_range_tests::*;
|
||||||
|
use super::*;
|
||||||
|
use crate::collector::Count;
|
||||||
|
use crate::query::QueryParser;
|
||||||
|
use crate::Index;
|
||||||
|
|
||||||
|
fn get_index_0_to_100() -> Index {
|
||||||
|
let mut rng = StdRng::from_seed([1u8; 32]);
|
||||||
|
let num_vals = 100_000;
|
||||||
|
let docs: Vec<_> = (0..num_vals)
|
||||||
|
.map(|_i| {
|
||||||
|
let id = if rng.gen_bool(0.01) {
|
||||||
|
"veryfew".to_string() // 1%
|
||||||
|
} else if rng.gen_bool(0.1) {
|
||||||
|
"few".to_string() // 9%
|
||||||
|
} else {
|
||||||
|
"many".to_string() // 90%
|
||||||
|
};
|
||||||
|
Doc {
|
||||||
|
id,
|
||||||
|
// Multiply by 1000, so that we create many buckets in the compact space
|
||||||
|
// The benches depend on this range to select n-percent of elements with the
|
||||||
|
// methods below.
|
||||||
|
ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
create_index_from_ip_docs(&docs)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_90_percent() -> RangeInclusive<Ipv6Addr> {
|
||||||
|
let start = Ipv6Addr::from_u128(0);
|
||||||
|
let end = Ipv6Addr::from_u128(90 * 1000);
|
||||||
|
start..=end
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_10_percent() -> RangeInclusive<Ipv6Addr> {
|
||||||
|
let start = Ipv6Addr::from_u128(0);
|
||||||
|
let end = Ipv6Addr::from_u128(10 * 1000);
|
||||||
|
start..=end
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_1_percent() -> RangeInclusive<Ipv6Addr> {
|
||||||
|
let start = Ipv6Addr::from_u128(10 * 1000);
|
||||||
|
let end = Ipv6Addr::from_u128(10 * 1000);
|
||||||
|
start..=end
|
||||||
|
}
|
||||||
|
|
||||||
|
fn excute_query(
|
||||||
|
field: &str,
|
||||||
|
ip_range: RangeInclusive<Ipv6Addr>,
|
||||||
|
suffix: &str,
|
||||||
|
index: &Index,
|
||||||
|
) -> usize {
|
||||||
|
let gen_query_inclusive = |from: &Ipv6Addr, to: &Ipv6Addr| {
|
||||||
|
format!(
|
||||||
|
"{}:[{} TO {}] {}",
|
||||||
|
field,
|
||||||
|
&from.to_string(),
|
||||||
|
&to.to_string(),
|
||||||
|
suffix
|
||||||
|
)
|
||||||
|
};
|
||||||
|
|
||||||
|
let query = gen_query_inclusive(ip_range.start(), ip_range.end());
|
||||||
|
let query_from_text = |text: &str| {
|
||||||
|
QueryParser::for_index(index, vec![])
|
||||||
|
.parse_query(text)
|
||||||
|
.unwrap()
|
||||||
|
};
|
||||||
|
let query = query_from_text(&query);
|
||||||
|
let reader = index.reader().unwrap();
|
||||||
|
let searcher = reader.searcher();
|
||||||
|
searcher.search(&query, &(Count)).unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_90_percent(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ip", get_90_percent(), "", &index));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_10_percent(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ip", get_10_percent(), "", &index));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_1_percent(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ip", get_1_percent(), "", &index));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ip", get_10_percent(), "AND id:few", &index));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ip", get_1_percent(), "AND id:few", &index));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ip", get_1_percent(), "AND id:many", &index));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ip", get_1_percent(), "AND id:veryfew", &index));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ip", get_10_percent(), "AND id:many", &index));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ip", get_90_percent(), "AND id:many", &index));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ip", get_90_percent(), "AND id:few", &index));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ip", get_90_percent(), "AND id:veryfew", &index));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_90_percent_multi(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ips", get_90_percent(), "", &index));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_10_percent_multi(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ips", get_10_percent(), "", &index));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_1_percent_multi(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ips", get_1_percent(), "", &index));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ips", get_10_percent(), "AND id:few", &index));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ips", get_1_percent(), "AND id:few", &index));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ips", get_1_percent(), "AND id:many", &index));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ips", get_1_percent(), "AND id:veryfew", &index));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ips", get_10_percent(), "AND id:many", &index));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ips", get_90_percent(), "AND id:many", &index));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ips", get_90_percent(), "AND id:few", &index));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_ip_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
|
||||||
|
let index = get_index_0_to_100();
|
||||||
|
|
||||||
|
bench.iter(|| excute_query("ips", get_90_percent(), "AND id:veryfew", &index));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -508,7 +508,7 @@ impl std::fmt::Debug for ValueAddr {
|
|||||||
|
|
||||||
/// A enum representing a value for tantivy to index.
|
/// A enum representing a value for tantivy to index.
|
||||||
///
|
///
|
||||||
/// Any changes need to be reflected in `BinarySerializable` for `ValueType`
|
/// ** Any changes need to be reflected in `BinarySerializable` for `ValueType` **
|
||||||
///
|
///
|
||||||
/// We can't use [schema::Type] or [columnar::ColumnType] here, because they are missing
|
/// We can't use [schema::Type] or [columnar::ColumnType] here, because they are missing
|
||||||
/// some items like Array and PreTokStr.
|
/// some items like Array and PreTokStr.
|
||||||
@@ -553,7 +553,7 @@ impl BinarySerializable for ValueType {
|
|||||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||||
let num = u8::deserialize(reader)?;
|
let num = u8::deserialize(reader)?;
|
||||||
let type_id = if (0..=12).contains(&num) {
|
let type_id = if (0..=12).contains(&num) {
|
||||||
unsafe { std::mem::transmute(num) }
|
unsafe { std::mem::transmute::<u8, ValueType>(num) }
|
||||||
} else {
|
} else {
|
||||||
return Err(io::Error::new(
|
return Err(io::Error::new(
|
||||||
io::ErrorKind::InvalidData,
|
io::ErrorKind::InvalidData,
|
||||||
|
|||||||
@@ -201,6 +201,11 @@ impl FieldType {
|
|||||||
matches!(self, FieldType::IpAddr(_))
|
matches!(self, FieldType::IpAddr(_))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// returns true if this is an str field
|
||||||
|
pub fn is_str(&self) -> bool {
|
||||||
|
matches!(self, FieldType::Str(_))
|
||||||
|
}
|
||||||
|
|
||||||
/// returns true if this is an date field
|
/// returns true if this is an date field
|
||||||
pub fn is_date(&self) -> bool {
|
pub fn is_date(&self) -> bool {
|
||||||
matches!(self, FieldType::Date(_))
|
matches!(self, FieldType::Date(_))
|
||||||
|
|||||||
@@ -249,12 +249,8 @@ impl Term {
|
|||||||
#[inline]
|
#[inline]
|
||||||
pub fn append_path(&mut self, bytes: &[u8]) -> &mut [u8] {
|
pub fn append_path(&mut self, bytes: &[u8]) -> &mut [u8] {
|
||||||
let len_before = self.0.len();
|
let len_before = self.0.len();
|
||||||
if bytes.contains(&0u8) {
|
assert!(!bytes.contains(&JSON_END_OF_PATH));
|
||||||
self.0
|
self.0.extend_from_slice(bytes);
|
||||||
.extend(bytes.iter().map(|&b| if b == 0 { b'0' } else { b }));
|
|
||||||
} else {
|
|
||||||
self.0.extend_from_slice(bytes);
|
|
||||||
}
|
|
||||||
&mut self.0[len_before..]
|
&mut self.0[len_before..]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,9 +16,7 @@ fn make_test_sstable(suffix: &str) -> FileSlice {
|
|||||||
|
|
||||||
let table = builder.finish().unwrap();
|
let table = builder.finish().unwrap();
|
||||||
let table = Arc::new(OwnedBytes::new(table));
|
let table = Arc::new(OwnedBytes::new(table));
|
||||||
let slice = common::file_slice::FileSlice::new(table.clone());
|
common::file_slice::FileSlice::new(table.clone())
|
||||||
|
|
||||||
slice
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn criterion_benchmark(c: &mut Criterion) {
|
pub fn criterion_benchmark(c: &mut Criterion) {
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ use rand::rngs::StdRng;
|
|||||||
use rand::{Rng, SeedableRng};
|
use rand::{Rng, SeedableRng};
|
||||||
use tantivy_sstable::{Dictionary, MonotonicU64SSTable};
|
use tantivy_sstable::{Dictionary, MonotonicU64SSTable};
|
||||||
|
|
||||||
const CHARSET: &'static [u8] = b"abcdefghij";
|
const CHARSET: &[u8] = b"abcdefghij";
|
||||||
|
|
||||||
fn generate_key(rng: &mut impl Rng) -> String {
|
fn generate_key(rng: &mut impl Rng) -> String {
|
||||||
let len = rng.gen_range(3..12);
|
let len = rng.gen_range(3..12);
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user