Compare commits

..

14 Commits

Author SHA1 Message Date
Paul Masurel
f820d42151 oneshot 0.1.7
Now that the is_closed changed has been merge upstream, we can rely on
that.

This commit is a "hotfix" because we don't want to rely on
some of the commit in main just yet
2024-05-31 07:54:56 +04:00
PSeitz
5b7cca13e5 lower contention on AggregationLimits (#2394)
PR https://github.com/quickwit-oss/quickwit/pull/4962 fixes an issue
where the AggregationLimits are not passed correctly. Since the
AggregationLimits are shared properly we run into contention issues.

This PR includes some straightforward improvement to reduce contention,
by only calling if the memory changed and avoiding the second read.

We probably need some sharding with multiple counters or local caching before updating the
global after some threshold.
2024-05-15 12:25:40 +02:00
dependabot[bot]
a79590477e Update binggan requirement from 0.5.2 to 0.6.2 (#2399)
---
updated-dependencies:
- dependency-name: binggan
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-05-15 05:40:37 +02:00
Paul Masurel
6181c1eb5e Small changes in the Executor API. (#2391)
Warning, this change is mildly not backward compatible
so I bumped tantivy's version.
2024-05-10 17:19:12 +09:00
Adam Reichold
1ee5f90761 Give allocation control to the caller instead of force a clone (#2389)
Achieved by moving the boxes out of the temporary reference wrappers which are
cloneable themselves, i.e. if required the caller can clone them already or
consume them to reuse existing allocations.
2024-05-09 16:01:13 +09:00
PSeitz
71f3b4e4e3 fix ReferenceValue API flaw (#2372)
* fix ReferenceValue API flaw

Remove `Facet` and `TokenizedString` values from the `ReferenceValue` API,
as this requires the trait value to have them stored somewhere.

Since `TokenizedString` is quite niche, I just copy it into a Box,
instead of designing a reference API around it.

* fix comment link
2024-05-09 06:14:42 +02:00
trinity-1686a
8cd7ddc535 run block decompression from executor (#2386)
* run block decompression from executor

* add a wrapper with is_closed to oneshot channel

* add cancelation test to Executor::spawn_blocking
2024-05-08 12:22:44 +02:00
Paul Masurel
2b76335a95 Removed usage of num_cpus (#2387)
* Removed usage of num_cpus
* handling error
2024-05-08 13:32:52 +09:00
PSeitz
c6b213d8f0 use bingang for agg benchmark (#2378)
* use bingang for agg benchmark

use bingang for agg benchmark, which includes memory consumption

Output:
```
full
histogram                     Memory: 15.8 KB              Avg: 10.9322ms  (+5.44%)    Median: 10.8790ms  (+9.28%)     Min: 10.7470ms    Max: 11.3263ms
histogram_hard_bounds         Memory: 15.5 KB              Avg: 5.1939ms  (+6.61%)     Median: 5.1722ms  (+10.98%)     Min: 5.0432ms     Max: 5.3910ms
histogram_with_avg_sub_agg    Memory: 48.7 KB              Avg: 23.8165ms  (+4.57%)    Median: 23.7264ms  (+10.06%)    Min: 23.4995ms    Max: 24.8107ms
dense
histogram                     Memory: 17.3 KB              Avg: 15.6810ms  (-8.54%)    Median: 15.6174ms  (-8.89%)    Min: 15.4953ms    Max: 16.0702ms
histogram_hard_bounds         Memory: 15.4 KB              Avg: 10.0720ms  (-7.33%)    Median: 10.0572ms  (-7.06%)    Min: 9.8500ms     Max: 10.4819ms
histogram_with_avg_sub_agg    Memory: 50.1 KB              Avg: 33.0993ms  (-7.04%)    Median: 32.9499ms  (-6.86%)    Min: 32.8284ms    Max: 34.0529ms
sparse
histogram                     Memory: 16.3 KB              Avg: 19.2325ms  (-0.44%)    Median: 19.1211ms  (-1.26%)    Min: 19.0348ms    Max: 19.7902ms
histogram_hard_bounds         Memory: 16.1 KB              Avg: 18.5179ms  (-0.61%)    Median: 18.4552ms  (-0.90%)    Min: 18.3799ms    Max: 19.0535ms
histogram_with_avg_sub_agg    Memory: 34.7 KB              Avg: 21.2589ms  (-0.69%)    Median: 21.1867ms  (-1.05%)    Min: 21.0342ms    Max: 21.9900ms
```

* add more bench with term as sub agg
2024-05-07 11:29:49 +02:00
PSeitz
eea70030bf cleanup top level exports (#2382)
remove some top level exports
2024-05-07 09:59:41 +02:00
PSeitz
92b5526310 allow more JSON values, fix i64 special case (#2383)
This changes three things:
- Reuse positions_per_path hashmap instead of allocating one per
  indexed JSON value
- Try to cast u64 values to i64 to streamline with search behaviour
- Allow top level json values to be of any type, instead of limiting it
  to JSON objects. Remove special JSON object handling method.

TODO: We probably should also try to check f64 to i64 and u64 when
indexing, as values may get converted to f64 by the JSON parser
2024-05-01 12:08:12 +02:00
PSeitz
99a59ad37e remove zero byte check (#2379)
remove zero byte checks in columnar. zero bytes are converted during serialization now.
unify code paths
extend test for expected column names
2024-04-26 06:03:28 +02:00
trinity-1686a
6a66a71cbb modify fastfield range query heuristic (#2375) 2024-04-25 10:06:11 +02:00
PSeitz
ff40764204 make convert_to_fast_value_and_append_to_json_term pub (#2370)
* make convert_to_fast_value_and_append_to_json_term pub

* clippy
2024-04-23 04:05:41 +02:00
46 changed files with 861 additions and 1205 deletions

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.22.0"
version = "0.23.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -15,12 +15,16 @@ rust-version = "1.63"
exclude = ["benches/*.json", "benches/*.txt"]
[dependencies]
oneshot = "0.1.5"
# Switch back to the non-forked oneshot crate once https://github.com/faern/oneshot/pull/35 is merged
oneshot = "0.1.7"
base64 = "0.22.0"
byteorder = "1.4.3"
crc32fast = "1.3.2"
once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
regex = { version = "1.5.5", default-features = false, features = [
"std",
"unicode",
] }
aho-corasick = "1.0"
tantivy-fst = "0.5"
memmap2 = { version = "0.9.0", optional = true }
@@ -30,14 +34,15 @@ tempfile = { version = "3.3.0", optional = true }
log = "0.4.16"
serde = { version = "1.0.136", features = ["derive"] }
serde_json = "1.0.79"
num_cpus = "1.13.1"
fs4 = { version = "0.8.0", optional = true }
levenshtein_automata = "0.2.1"
uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4"
rust-stemmers = "1.2.0"
downcast-rs = "1.2.0"
bitpacking = { version = "0.9.2", default-features = false, features = ["bitpacker4x"] }
bitpacking = { version = "0.9.2", default-features = false, features = [
"bitpacker4x",
] }
census = "0.4.2"
rustc-hash = "1.1.0"
thiserror = "1.0.30"
@@ -52,22 +57,22 @@ itertools = "0.12.0"
measure_time = "0.8.2"
arc-swap = "1.5.0"
columnar = { version= "0.3", path="./columnar", package ="tantivy-columnar" }
sstable = { version= "0.3", path="./sstable", package ="tantivy-sstable", optional = true }
stacker = { version= "0.3", path="./stacker", package ="tantivy-stacker" }
query-grammar = { version= "0.22.0", path="./query-grammar", package = "tantivy-query-grammar" }
tantivy-bitpacker = { version= "0.6", path="./bitpacker" }
common = { version= "0.7", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version= "0.3", path="./tokenizer-api", package="tantivy-tokenizer-api" }
columnar = { version = "0.3", path = "./columnar", package = "tantivy-columnar" }
sstable = { version = "0.3", path = "./sstable", package = "tantivy-sstable", optional = true }
stacker = { version = "0.3", path = "./stacker", package = "tantivy-stacker" }
query-grammar = { version = "0.22.0", path = "./query-grammar", package = "tantivy-query-grammar" }
tantivy-bitpacker = { version = "0.6", path = "./bitpacker" }
common = { version = "0.7", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version = "0.3", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
futures-util = { version = "0.3.28", optional = true }
fnv = "1.0.7"
mediumvec = "1.3.0"
[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"
[dev-dependencies]
binggan = "0.6.2"
rand = "0.8.5"
maplit = "1.0.2"
matches = "0.1.9"
@@ -82,7 +87,6 @@ time = { version = "0.3.10", features = ["serde-well-known", "macros"] }
postcard = { version = "1.0.4", features = [
"use-std",
], default-features = false }
peakmem-alloc = "0.3.0"
[target.'cfg(not(windows))'.dev-dependencies]
criterion = { version = "0.5", default-features = false }
@@ -114,17 +118,26 @@ lz4-compression = ["lz4_flex"]
zstd-compression = ["zstd"]
failpoints = ["fail", "fail/failpoints"]
unstable = [] # useful for benches.
unstable = [] # useful for benches.
quickwit = ["sstable", "futures-util"]
# Compares only the hash of a string when indexing data.
# Compares only the hash of a string when indexing data.
# Increases indexing speed, but may lead to extremely rare missing terms, when there's a hash collision.
# Uses 64bit ahash.
compare_hash_only = ["stacker/compare_hash_only"]
[workspace]
members = ["query-grammar", "bitpacker", "common", "ownedbytes", "stacker", "sstable", "tokenizer-api", "columnar"]
members = [
"query-grammar",
"bitpacker",
"common",
"ownedbytes",
"stacker",
"sstable",
"tokenizer-api",
"columnar",
]
# Following the "fail" crate best practises, we isolate
# tests that define specific behavior in fail check points
@@ -145,3 +158,7 @@ harness = false
[[bench]]
name = "index-bench"
harness = false
[[bench]]
name = "agg_bench"
harness = false

413
benches/agg_bench.rs Normal file
View File

@@ -0,0 +1,413 @@
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use rand_distr::Distribution;
use serde_json::json;
use tantivy::aggregation::agg_req::Aggregations;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::{AllQuery, TermQuery};
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
use tantivy::{doc, Index, Term};
#[global_allocator]
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
/// Mini macro to register a function via its name
/// runner.register("average_u64", move |index| average_u64(index));
macro_rules! register {
($runner:expr, $func:ident) => {
$runner.register(stringify!($func), move |index| $func(index))
};
}
fn main() {
let inputs = vec![
("full", get_test_index_bench(Cardinality::Full).unwrap()),
(
"dense",
get_test_index_bench(Cardinality::OptionalDense).unwrap(),
),
(
"sparse",
get_test_index_bench(Cardinality::OptionalSparse).unwrap(),
),
(
"multivalue",
get_test_index_bench(Cardinality::Multivalued).unwrap(),
),
];
bench_agg(InputGroup::new_with_inputs(inputs));
}
fn bench_agg(mut group: InputGroup<Index>) {
group.set_alloc(GLOBAL); // Set the peak mem allocator. This will enable peak memory reporting.
register!(group, average_u64);
register!(group, average_f64);
register!(group, average_f64_u64);
register!(group, stats_f64);
register!(group, percentiles_f64);
register!(group, terms_few);
register!(group, terms_many);
register!(group, terms_many_order_by_term);
register!(group, terms_many_with_top_hits);
register!(group, terms_many_with_avg_sub_agg);
register!(group, terms_many_json_mixed_type_with_sub_agg_card);
register!(group, range_agg);
register!(group, range_agg_with_avg_sub_agg);
register!(group, range_agg_with_term_agg_few);
register!(group, range_agg_with_term_agg_many);
register!(group, histogram);
register!(group, histogram_hard_bounds);
register!(group, histogram_with_avg_sub_agg);
register!(group, avg_and_range_with_avg_sub_agg);
group.run();
}
fn exec_term_with_agg(index: &Index, agg_req: serde_json::Value) {
let agg_req: Aggregations = serde_json::from_value(agg_req).unwrap();
let reader = index.reader().unwrap();
let text_field = reader.searcher().schema().get_field("text").unwrap();
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
let collector = get_collector(agg_req);
let searcher = reader.searcher();
black_box(searcher.search(&term_query, &collector).unwrap());
}
fn average_u64(index: &Index) {
let agg_req = json!({
"average": { "avg": { "field": "score", } }
});
exec_term_with_agg(index, agg_req)
}
fn average_f64(index: &Index) {
let agg_req = json!({
"average": { "avg": { "field": "score_f64", } }
});
exec_term_with_agg(index, agg_req)
}
fn average_f64_u64(index: &Index) {
let agg_req = json!({
"average_f64": { "avg": { "field": "score_f64" } },
"average": { "avg": { "field": "score" } },
});
exec_term_with_agg(index, agg_req)
}
fn stats_f64(index: &Index) {
let agg_req = json!({
"average_f64": { "stats": { "field": "score_f64", } }
});
exec_term_with_agg(index, agg_req)
}
fn percentiles_f64(index: &Index) {
let agg_req = json!({
"mypercentiles": {
"percentiles": {
"field": "score_f64",
"percents": [ 95, 99, 99.9 ]
}
}
});
execute_agg(index, agg_req);
}
fn terms_few(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_few_terms" } },
});
execute_agg(index, agg_req);
}
fn terms_many(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_many_terms" } },
});
execute_agg(index, agg_req);
}
fn terms_many_order_by_term(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_many_terms", "order": { "_key": "desc" } } },
});
execute_agg(index, agg_req);
}
fn terms_many_with_top_hits(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "text_many_terms" },
"aggs": {
"top_hits": { "top_hits":
{
"sort": [
{ "score": "desc" }
],
"size": 2,
"doc_value_fields": ["score_f64"]
}
}
}
},
});
execute_agg(index, agg_req);
}
fn terms_many_with_avg_sub_agg(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "text_many_terms" },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
});
execute_agg(index, agg_req);
}
fn terms_many_json_mixed_type_with_sub_agg_card(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "json.mixed_type" },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
});
execute_agg(index, agg_req);
}
fn execute_agg(index: &Index, agg_req: serde_json::Value) {
let agg_req: Aggregations = serde_json::from_value(agg_req).unwrap();
let collector = get_collector(agg_req);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
black_box(searcher.search(&AllQuery, &collector).unwrap());
}
fn range_agg(index: &Index) {
let agg_req = json!({
"range_f64": { "range": { "field": "score_f64", "ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
] } },
});
execute_agg(index, agg_req);
}
fn range_agg_with_avg_sub_agg(index: &Index) {
let agg_req = json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
]
},
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
});
execute_agg(index, agg_req);
}
fn range_agg_with_term_agg_few(index: &Index) {
let agg_req = json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
]
},
"aggs": {
"my_texts": { "terms": { "field": "text_few_terms" } },
}
},
});
execute_agg(index, agg_req);
}
fn range_agg_with_term_agg_many(index: &Index) {
let agg_req = json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
]
},
"aggs": {
"my_texts": { "terms": { "field": "text_many_terms" } },
}
},
});
execute_agg(index, agg_req);
}
fn histogram(index: &Index) {
let agg_req = json!({
"rangef64": {
"histogram": {
"field": "score_f64",
"interval": 100 // 1000 buckets
},
}
});
execute_agg(index, agg_req);
}
fn histogram_hard_bounds(index: &Index) {
let agg_req = json!({
"rangef64": { "histogram": { "field": "score_f64", "interval": 100, "hard_bounds": { "min": 1000, "max": 300000 } } },
});
execute_agg(index, agg_req);
}
fn histogram_with_avg_sub_agg(index: &Index) {
let agg_req = json!({
"rangef64": {
"histogram": { "field": "score_f64", "interval": 100 },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
}
});
execute_agg(index, agg_req);
}
fn avg_and_range_with_avg_sub_agg(index: &Index) {
let agg_req = json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 60000 }
]
},
"aggs": {
"average_in_range": { "avg": { "field": "score" } }
}
},
"average": { "avg": { "field": "score" } }
});
execute_agg(index, agg_req);
}
#[derive(Clone, Copy, Hash, Default, Debug, PartialEq, Eq, PartialOrd, Ord)]
enum Cardinality {
/// All documents contain exactly one value.
/// `Full` is the default for auto-detecting the Cardinality, since it is the most strict.
#[default]
Full = 0,
/// All documents contain at most one value.
OptionalDense = 1,
/// All documents may contain any number of values.
Multivalued = 2,
/// 1 / 20 documents has a value
OptionalSparse = 3,
}
fn get_collector(agg_req: Aggregations) -> AggregationCollector {
AggregationCollector::from_aggs(agg_req, Default::default())
}
fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
let mut schema_builder = Schema::builder();
let text_fieldtype = tantivy::schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let json_field = schema_builder.add_json_field("json", FAST);
let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST);
let text_field_few_terms = schema_builder.add_text_field("text_few_terms", STRING | FAST);
let score_fieldtype = tantivy::schema::NumericOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
let index = Index::create_from_tempdir(schema_builder.build())?;
let few_terms_data = ["INFO", "ERROR", "WARN", "DEBUG"];
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
let many_terms_data = (0..150_000)
.map(|num| format!("author{}", num))
.collect::<Vec<_>>();
{
let mut rng = StdRng::from_seed([1u8; 32]);
let mut index_writer = index.writer_with_num_threads(1, 200_000_000)?;
// To make the different test cases comparable we just change one doc to force the
// cardinality
if cardinality == Cardinality::OptionalDense {
index_writer.add_document(doc!())?;
}
if cardinality == Cardinality::Multivalued {
index_writer.add_document(doc!(
json_field => json!({"mixed_type": 10.0}),
json_field => json!({"mixed_type": 10.0}),
text_field => "cool",
text_field => "cool",
text_field_many_terms => "cool",
text_field_many_terms => "cool",
text_field_few_terms => "cool",
text_field_few_terms => "cool",
score_field => 1u64,
score_field => 1u64,
score_field_f64 => lg_norm.sample(&mut rng),
score_field_f64 => lg_norm.sample(&mut rng),
score_field_i64 => 1i64,
score_field_i64 => 1i64,
))?;
}
let mut doc_with_value = 1_000_000;
if cardinality == Cardinality::OptionalSparse {
doc_with_value /= 20;
}
let _val_max = 1_000_000.0;
for _ in 0..doc_with_value {
let val: f64 = rng.gen_range(0.0..1_000_000.0);
let json = if rng.gen_bool(0.1) {
// 10% are numeric values
json!({ "mixed_type": val })
} else {
json!({"mixed_type": many_terms_data.choose(&mut rng).unwrap().to_string()})
};
index_writer.add_document(doc!(
text_field => "cool",
json_field => json,
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
score_field => val as u64,
score_field_f64 => lg_norm.sample(&mut rng),
score_field_i64 => val as i64,
))?;
if cardinality == Cardinality::OptionalSparse {
for _ in 0..20 {
index_writer.add_document(doc!(text_field => "cool"))?;
}
}
}
// writing the segment
index_writer.commit()?;
}
Ok(index)
}

View File

@@ -59,22 +59,6 @@ pub struct ColumnarWriter {
buffers: SpareBuffers,
}
#[inline]
fn mutate_or_create_column<V, TMutator>(
arena_hash_map: &mut ArenaHashMap,
column_name: &str,
updater: TMutator,
) where
V: Copy + 'static,
TMutator: FnMut(Option<V>) -> V,
{
assert!(
!column_name.as_bytes().contains(&0u8),
"key may not contain the 0 byte"
);
arena_hash_map.mutate_or_create(column_name.as_bytes(), updater);
}
impl ColumnarWriter {
pub fn mem_usage(&self) -> usize {
self.arena.mem_usage()
@@ -175,9 +159,8 @@ impl ColumnarWriter {
},
&mut self.dictionaries,
);
mutate_or_create_column(
hash_map,
column_name,
hash_map.mutate_or_create(
column_name.as_bytes(),
|column_opt: Option<StrOrBytesColumnWriter>| {
let mut column_writer = if let Some(column_writer) = column_opt {
column_writer
@@ -192,24 +175,21 @@ impl ColumnarWriter {
);
}
ColumnType::Bool => {
mutate_or_create_column(
&mut self.bool_field_hash_map,
column_name,
self.bool_field_hash_map.mutate_or_create(
column_name.as_bytes(),
|column_opt: Option<ColumnWriter>| column_opt.unwrap_or_default(),
);
}
ColumnType::DateTime => {
mutate_or_create_column(
&mut self.datetime_field_hash_map,
column_name,
self.datetime_field_hash_map.mutate_or_create(
column_name.as_bytes(),
|column_opt: Option<ColumnWriter>| column_opt.unwrap_or_default(),
);
}
ColumnType::I64 | ColumnType::F64 | ColumnType::U64 => {
let numerical_type = column_type.numerical_type().unwrap();
mutate_or_create_column(
&mut self.numerical_field_hash_map,
column_name,
self.numerical_field_hash_map.mutate_or_create(
column_name.as_bytes(),
|column_opt: Option<NumericalColumnWriter>| {
let mut column: NumericalColumnWriter = column_opt.unwrap_or_default();
column.force_numerical_type(numerical_type);
@@ -217,9 +197,8 @@ impl ColumnarWriter {
},
);
}
ColumnType::IpAddr => mutate_or_create_column(
&mut self.ip_addr_field_hash_map,
column_name,
ColumnType::IpAddr => self.ip_addr_field_hash_map.mutate_or_create(
column_name.as_bytes(),
|column_opt: Option<ColumnWriter>| column_opt.unwrap_or_default(),
),
}
@@ -232,9 +211,8 @@ impl ColumnarWriter {
numerical_value: T,
) {
let (hash_map, arena) = (&mut self.numerical_field_hash_map, &mut self.arena);
mutate_or_create_column(
hash_map,
column_name,
hash_map.mutate_or_create(
column_name.as_bytes(),
|column_opt: Option<NumericalColumnWriter>| {
let mut column: NumericalColumnWriter = column_opt.unwrap_or_default();
column.record_numerical_value(doc, numerical_value.into(), arena);
@@ -244,10 +222,6 @@ impl ColumnarWriter {
}
pub fn record_ip_addr(&mut self, doc: RowId, column_name: &str, ip_addr: Ipv6Addr) {
assert!(
!column_name.as_bytes().contains(&0u8),
"key may not contain the 0 byte"
);
let (hash_map, arena) = (&mut self.ip_addr_field_hash_map, &mut self.arena);
hash_map.mutate_or_create(
column_name.as_bytes(),
@@ -261,24 +235,30 @@ impl ColumnarWriter {
pub fn record_bool(&mut self, doc: RowId, column_name: &str, val: bool) {
let (hash_map, arena) = (&mut self.bool_field_hash_map, &mut self.arena);
mutate_or_create_column(hash_map, column_name, |column_opt: Option<ColumnWriter>| {
let mut column: ColumnWriter = column_opt.unwrap_or_default();
column.record(doc, val, arena);
column
});
hash_map.mutate_or_create(
column_name.as_bytes(),
|column_opt: Option<ColumnWriter>| {
let mut column: ColumnWriter = column_opt.unwrap_or_default();
column.record(doc, val, arena);
column
},
);
}
pub fn record_datetime(&mut self, doc: RowId, column_name: &str, datetime: common::DateTime) {
let (hash_map, arena) = (&mut self.datetime_field_hash_map, &mut self.arena);
mutate_or_create_column(hash_map, column_name, |column_opt: Option<ColumnWriter>| {
let mut column: ColumnWriter = column_opt.unwrap_or_default();
column.record(
doc,
NumericalValue::I64(datetime.into_timestamp_nanos()),
arena,
);
column
});
hash_map.mutate_or_create(
column_name.as_bytes(),
|column_opt: Option<ColumnWriter>| {
let mut column: ColumnWriter = column_opt.unwrap_or_default();
column.record(
doc,
NumericalValue::I64(datetime.into_timestamp_nanos()),
arena,
);
column
},
);
}
pub fn record_str(&mut self, doc: RowId, column_name: &str, value: &str) {
@@ -303,10 +283,6 @@ impl ColumnarWriter {
}
pub fn record_bytes(&mut self, doc: RowId, column_name: &str, value: &[u8]) {
assert!(
!column_name.as_bytes().contains(&0u8),
"key may not contain the 0 byte"
);
let (hash_map, arena, dictionaries) = (
&mut self.bytes_field_hash_map,
&mut self.arena,

View File

@@ -11,9 +11,10 @@ use columnar::Column;
// ---
// Importing tantivy...
use tantivy::collector::{Collector, SegmentCollector};
use tantivy::index::SegmentReader;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
use tantivy::{doc, Index, IndexWriter, Score, SegmentReader};
use tantivy::{doc, Index, IndexWriter, Score};
#[derive(Default)]
struct Stats {

View File

@@ -13,7 +13,7 @@ fn main() -> tantivy::Result<()> {
let opts = DateOptions::from(INDEXED)
.set_stored()
.set_fast()
.set_precision(tantivy::DateTimePrecision::Seconds);
.set_precision(tantivy::schema::DateTimePrecision::Seconds);
// Add `occurred_at` date field type
let occurred_at = schema_builder.add_date_field("occurred_at", opts);
let event_type = schema_builder.add_text_field("event", STRING | STORED);

View File

@@ -1,335 +0,0 @@
#![allow(unused_imports)]
#![allow(dead_code)]
use std::alloc::System;
use std::env::args;
use std::net::Ipv6Addr;
use columnar::{MonotonicallyMappableToU128, MonotonicallyMappableToU64};
use common::{BinarySerializable, CountingWriter, DateTime, FixedSize};
use peakmem_alloc::*;
use tantivy::schema::{Field, FieldValue, OwnedValue, FAST, INDEXED, STRING, TEXT};
use tantivy::tokenizer::PreTokenizedString;
use tantivy::{doc, TantivyDocument};
const GH_LOGS: &str = include_str!("../benches/gh.json");
const HDFS_LOGS: &str = include_str!("../benches/hdfs.json");
#[global_allocator]
static GLOBAL: &PeakMemAlloc<System> = &INSTRUMENTED_SYSTEM;
fn main() {
dbg!(std::mem::size_of::<TantivyDocument>());
dbg!(std::mem::size_of::<DocContainerRef>());
dbg!(std::mem::size_of::<OwnedValue>());
dbg!(std::mem::size_of::<OwnedValueMedVec>());
dbg!(std::mem::size_of::<ValueContainerRef>());
dbg!(std::mem::size_of::<mediumvec::vec32::Vec32::<u8>>());
let filter = args().nth(1);
measure_fn(
test_hdfs::<TantivyDocument>,
"hdfs TantivyDocument",
&filter,
);
measure_fn(
test_hdfs::<TantivyDocumentMedVec>,
"hdfs TantivyDocumentMedVec",
&filter,
);
measure_fn(
test_hdfs::<DocContainerRef>,
"hdfs DocContainerRef",
&filter,
);
measure_fn(test_gh::<TantivyDocument>, "gh TantivyDocument", &filter);
measure_fn(
test_gh::<TantivyDocumentMedVec>,
"gh TantivyDocumentMedVec",
&filter,
);
measure_fn(test_gh::<DocContainerRef>, "gh DocContainerRef", &filter);
}
fn measure_fn<F: FnOnce()>(f: F, name: &str, filter: &Option<std::string::String>) {
if let Some(filter) = filter {
if !name.contains(filter) {
return;
}
}
GLOBAL.reset_peak_memory();
f();
println!("Peak Memory {} : {:#?}", GLOBAL.get_peak_memory(), name);
}
fn test_hdfs<T: From<TantivyDocument>>() {
let schema = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_u64_field("timestamp", INDEXED);
schema_builder.add_text_field("body", TEXT);
schema_builder.add_text_field("severity", STRING);
schema_builder.build()
};
let mut docs: Vec<T> = Vec::with_capacity(HDFS_LOGS.lines().count());
for doc_json in HDFS_LOGS.lines() {
let doc = TantivyDocument::parse_json(&schema, doc_json)
.unwrap()
.into();
docs.push(doc);
}
}
fn test_gh<T: From<TantivyDocument>>() {
let schema = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_json_field("json", FAST);
schema_builder.build()
};
let mut docs: Vec<T> = Vec::with_capacity(GH_LOGS.lines().count());
for doc_json in GH_LOGS.lines() {
let json_field = schema.get_field("json").unwrap();
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val).into();
docs.push(doc);
}
}
#[derive(Clone, Debug, Default)]
#[allow(dead_code)]
pub struct TantivyDocumentMedVec {
field_values: mediumvec::Vec32<FieldValueMedVec>,
}
#[derive(Debug, Clone, PartialEq)]
pub struct FieldValueMedVec {
pub field: Field,
pub value: OwnedValueMedVec,
}
/// This is a owned variant of `Value`, that can be passed around without lifetimes.
/// Represents the value of a any field.
/// It is an enum over all over all of the possible field type.
#[derive(Debug, Clone, PartialEq)]
pub enum OwnedValueMedVec {
/// A null value.
Null,
/// The str type is used for any text information.
Str(mediumvec::vec32::Vec32<u8>),
/// Unsigned 64-bits Integer `u64`
U64(u64),
/// Signed 64-bits Integer `i64`
I64(i64),
/// 64-bits Float `f64`
F64(f64),
/// Bool value
Bool(bool),
/// Date/time with nanoseconds precision
Date(DateTime),
Array(mediumvec::vec32::Vec32<Self>),
/// Dynamic object value.
Object(mediumvec::vec32::Vec32<(String, Self)>),
/// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`.
IpAddr(Ipv6Addr),
/// Pre-tokenized str type,
PreTokStr(Box<PreTokenizedString>),
/// Arbitrarily sized byte array
Bytes(mediumvec::vec32::Vec32<u8>),
}
impl From<TantivyDocument> for TantivyDocumentMedVec {
fn from(doc: TantivyDocument) -> Self {
let field_values = doc
.into_iter()
.map(|fv| FieldValueMedVec {
field: fv.field,
value: fv.value.into(),
})
.collect();
TantivyDocumentMedVec { field_values }
}
}
impl From<OwnedValue> for OwnedValueMedVec {
fn from(value: OwnedValue) -> Self {
match value {
OwnedValue::Null => OwnedValueMedVec::Null,
OwnedValue::Str(s) => {
let bytes = s.into_bytes();
let vec = mediumvec::vec32::Vec32::from_vec(bytes);
OwnedValueMedVec::Str(vec)
}
OwnedValue::U64(u) => OwnedValueMedVec::U64(u),
OwnedValue::I64(i) => OwnedValueMedVec::I64(i),
OwnedValue::F64(f) => OwnedValueMedVec::F64(f),
OwnedValue::Bool(b) => OwnedValueMedVec::Bool(b),
OwnedValue::Date(d) => OwnedValueMedVec::Date(d),
OwnedValue::Array(arr) => {
let arr = arr.into_iter().map(|v| v.into()).collect();
OwnedValueMedVec::Array(arr)
}
OwnedValue::Object(obj) => {
let obj = obj.into_iter().map(|(k, v)| (k, v.into())).collect();
OwnedValueMedVec::Object(obj)
}
OwnedValue::IpAddr(ip) => OwnedValueMedVec::IpAddr(ip),
_ => panic!("Unsupported value type {:?}", value),
}
}
}
#[repr(packed)]
pub struct FieldValueContainerRef {
pub field: u16,
pub value: ValueContainerRef,
}
#[repr(packed)]
struct DocContainerRef {
container: OwnedValueRefContainer,
field_values: mediumvec::Vec32<FieldValueContainerRef>,
}
#[derive(Default)]
struct OwnedValueRefContainer {
nodes: mediumvec::Vec32<ValueContainerRef>,
node_data: mediumvec::Vec32<u8>,
}
impl OwnedValueRefContainer {
fn shrink_to_fit(&mut self) {
self.nodes.shrink_to_fit();
self.node_data.shrink_to_fit();
}
}
impl From<TantivyDocument> for DocContainerRef {
fn from(doc: TantivyDocument) -> Self {
let mut container = OwnedValueRefContainer::default();
let field_values = doc
.into_iter()
.map(|fv| FieldValueContainerRef {
field: fv.field.field_id().try_into().unwrap(),
value: container.add_value(fv.value),
})
.collect();
container.shrink_to_fit();
Self {
field_values,
container,
}
}
}
// References to positions in two array, one for the OwnedValueRef and the other for the encoded
// bytes
#[derive(Debug, Clone, PartialEq)]
pub enum ValueContainerRef {
/// A null value.
Null,
/// The str type is used for any text information.
Str(u32),
/// Unsigned 64-bits Integer `u64`
U64(u32), // position of the serialized 8 bytes in the data array
/// Signed 64-bits Integer `i64`
I64(u32), // position of the serialized 8 bytes in the data array
/// 64-bits Float `f64`
F64(u32), // position of the serialized 8 bytes in the data array
/// Bool value
Bool(bool), // inlined bool
/// Date/time with nanoseconds precision
Date(u32), // position of the serialized 8 byte in the data array
Array(NodeAddress),
/// Dynamic object value.
Object(NodeAddress),
/// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`.
IpAddr(u32), // position of the serialized 16 bytes in the data array
/// Arbitrarily sized byte array
Bytes(u32),
}
#[derive(Debug, Clone, PartialEq)]
pub struct NodeAddress {
pos: u32,
num_nodes: u32,
}
impl OwnedValueRefContainer {
pub fn add_value(&mut self, value: OwnedValue) -> ValueContainerRef {
match value {
OwnedValue::Null => ValueContainerRef::Null,
OwnedValue::U64(num) => ValueContainerRef::U64(write_into(&mut self.node_data, num)),
OwnedValue::I64(num) => ValueContainerRef::I64(write_into(&mut self.node_data, num)),
OwnedValue::F64(num) => ValueContainerRef::F64(write_into(&mut self.node_data, num)),
OwnedValue::Bool(b) => ValueContainerRef::Bool(b),
OwnedValue::Date(date) => ValueContainerRef::Date(write_into(
&mut self.node_data,
date.into_timestamp_nanos(),
)),
OwnedValue::Str(bytes) => {
ValueContainerRef::Str(write_into(&mut self.node_data, bytes))
}
OwnedValue::Bytes(bytes) => {
ValueContainerRef::Bytes(write_into(&mut self.node_data, bytes))
}
OwnedValue::Array(elements) => {
let pos = self.nodes.len() as u32;
let len = elements.len() as u32;
for elem in elements {
let ref_elem = self.add_value(elem);
self.nodes.push(ref_elem);
}
ValueContainerRef::Array(NodeAddress {
pos,
num_nodes: len,
})
}
OwnedValue::Object(entries) => {
let pos = self.nodes.len() as u32;
let len = entries.len() as u32;
for (key, value) in entries {
let ref_key = self.add_value(OwnedValue::Str(key));
let ref_value = self.add_value(value);
self.nodes.push(ref_key);
self.nodes.push(ref_value);
}
ValueContainerRef::Object(NodeAddress {
pos,
num_nodes: len,
})
}
OwnedValue::IpAddr(num) => {
ValueContainerRef::IpAddr(write_into(&mut self.node_data, num.to_u128()))
}
OwnedValue::PreTokStr(_) => todo!(),
OwnedValue::Facet(_) => todo!(),
}
}
}
fn write_into<T: BinarySerializable>(data: &mut mediumvec::Vec32<u8>, value: T) -> u32 {
let pos = data.len() as u32;
data.as_vec(|vec| value.serialize(vec).unwrap());
pos
}
fn write_into_2<T: BinarySerializable>(data: &mut mediumvec::Vec32<u8>, value: T) -> NodeAddress {
let pos = data.len() as u32;
let mut len = 0;
data.as_vec(|vec| {
let mut wrt = CountingWriter::wrap(vec);
value.serialize(&mut wrt).unwrap();
len = wrt.written_bytes() as u32;
});
NodeAddress {
pos,
num_nodes: len,
}
}
// impl From<ContainerDocRef> for TantivyDocument {
// fn from(doc: ContainerDocRef) -> Self {
// let mut doc2 = TantivyDocument::new();
// for fv in doc.field_values {
// let field = Field::from_field_id(fv.field as u32);
// let value = doc.container.get_value(fv.value);
// doc2.add(FieldValue::new(field, value));
//}
// doc2
//}

View File

@@ -7,10 +7,11 @@
// the list of documents containing a term, getting
// its term frequency, and accessing its positions.
use tantivy::postings::Postings;
// ---
// Importing tantivy...
use tantivy::schema::*;
use tantivy::{doc, DocSet, Index, IndexWriter, Postings, TERMINATED};
use tantivy::{doc, DocSet, Index, IndexWriter, TERMINATED};
fn main() -> tantivy::Result<()> {
// We first create a schema for the sake of the

View File

@@ -3,10 +3,11 @@ use std::collections::{HashMap, HashSet};
use std::sync::{Arc, RwLock, Weak};
use tantivy::collector::TopDocs;
use tantivy::index::SegmentId;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{
doc, DocAddress, DocId, Index, IndexWriter, Opstamp, Searcher, SearcherGeneration, SegmentId,
doc, DocAddress, DocId, Index, IndexWriter, Opstamp, Searcher, SearcherGeneration,
SegmentReader, Warmer,
};

View File

@@ -1,585 +0,0 @@
#[cfg(all(test, feature = "unstable"))]
mod bench {
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use rand_distr::Distribution;
use serde_json::json;
use test::{self, Bencher};
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::AggregationCollector;
use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
use crate::{Index, Term};
#[derive(Clone, Copy, Hash, Default, Debug, PartialEq, Eq, PartialOrd, Ord)]
enum Cardinality {
/// All documents contain exactly one value.
/// `Full` is the default for auto-detecting the Cardinality, since it is the most strict.
#[default]
Full = 0,
/// All documents contain at most one value.
Optional = 1,
/// All documents may contain any number of values.
Multivalued = 2,
/// 1 / 20 documents has a value
Sparse = 3,
}
fn get_collector(agg_req: Aggregations) -> AggregationCollector {
AggregationCollector::from_aggs(agg_req, Default::default())
}
fn get_test_index_bench(cardinality: Cardinality) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
let text_fieldtype = crate::schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let json_field = schema_builder.add_json_field("json", FAST);
let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST);
let text_field_few_terms = schema_builder.add_text_field("text_few_terms", STRING | FAST);
let score_fieldtype = crate::schema::NumericOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
let index = Index::create_from_tempdir(schema_builder.build())?;
let few_terms_data = ["INFO", "ERROR", "WARN", "DEBUG"];
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
let many_terms_data = (0..150_000)
.map(|num| format!("author{}", num))
.collect::<Vec<_>>();
{
let mut rng = StdRng::from_seed([1u8; 32]);
let mut index_writer = index.writer_with_num_threads(1, 200_000_000)?;
// To make the different test cases comparable we just change one doc to force the
// cardinality
if cardinality == Cardinality::Optional {
index_writer.add_document(doc!())?;
}
if cardinality == Cardinality::Multivalued {
index_writer.add_document(doc!(
json_field => json!({"mixed_type": 10.0}),
json_field => json!({"mixed_type": 10.0}),
text_field => "cool",
text_field => "cool",
text_field_many_terms => "cool",
text_field_many_terms => "cool",
text_field_few_terms => "cool",
text_field_few_terms => "cool",
score_field => 1u64,
score_field => 1u64,
score_field_f64 => lg_norm.sample(&mut rng),
score_field_f64 => lg_norm.sample(&mut rng),
score_field_i64 => 1i64,
score_field_i64 => 1i64,
))?;
}
let mut doc_with_value = 1_000_000;
if cardinality == Cardinality::Sparse {
doc_with_value /= 20;
}
let _val_max = 1_000_000.0;
for _ in 0..doc_with_value {
let val: f64 = rng.gen_range(0.0..1_000_000.0);
let json = if rng.gen_bool(0.1) {
// 10% are numeric values
json!({ "mixed_type": val })
} else {
json!({"mixed_type": many_terms_data.choose(&mut rng).unwrap().to_string()})
};
index_writer.add_document(doc!(
text_field => "cool",
json_field => json,
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
score_field => val as u64,
score_field_f64 => lg_norm.sample(&mut rng),
score_field_i64 => val as i64,
))?;
if cardinality == Cardinality::Sparse {
for _ in 0..20 {
index_writer.add_document(doc!(text_field => "cool"))?;
}
}
}
// writing the segment
index_writer.commit()?;
}
Ok(index)
}
use paste::paste;
#[macro_export]
macro_rules! bench_all_cardinalities {
( $x:ident ) => {
paste! {
#[bench]
fn $x(b: &mut Bencher) {
[<$x _card>](b, Cardinality::Full)
}
#[bench]
fn [<$x _opt>](b: &mut Bencher) {
[<$x _card>](b, Cardinality::Optional)
}
#[bench]
fn [<$x _multi>](b: &mut Bencher) {
[<$x _card>](b, Cardinality::Multivalued)
}
#[bench]
fn [<$x _sparse>](b: &mut Bencher) {
[<$x _card>](b, Cardinality::Sparse)
}
}
};
}
bench_all_cardinalities!(bench_aggregation_average_u64);
fn bench_aggregation_average_u64_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
let text_field = reader.searcher().schema().get_field("text").unwrap();
b.iter(|| {
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average": { "avg": { "field": "score", } }
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_stats_f64);
fn bench_aggregation_stats_f64_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
let text_field = reader.searcher().schema().get_field("text").unwrap();
b.iter(|| {
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average_f64": { "stats": { "field": "score_f64", } }
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_average_f64);
fn bench_aggregation_average_f64_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
let text_field = reader.searcher().schema().get_field("text").unwrap();
b.iter(|| {
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average_f64": { "avg": { "field": "score_f64", } }
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_percentiles_f64);
fn bench_aggregation_percentiles_f64_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_str = r#"
{
"mypercentiles": {
"percentiles": {
"field": "score_f64",
"percents": [ 95, 99, 99.9 ]
}
}
} "#;
let agg_req_1: Aggregations = serde_json::from_str(agg_req_str).unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_average_u64_and_f64);
fn bench_aggregation_average_u64_and_f64_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
let text_field = reader.searcher().schema().get_field("text").unwrap();
b.iter(|| {
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average_f64": { "avg": { "field": "score_f64" } },
"average": { "avg": { "field": "score" } },
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_terms_few);
fn bench_aggregation_terms_few_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": { "terms": { "field": "text_few_terms" } },
}))
.unwrap();
let collector = get_collector(agg_req);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_terms_many_with_top_hits_agg);
fn bench_aggregation_terms_many_with_top_hits_agg_card(
b: &mut Bencher,
cardinality: Cardinality,
) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": {
"terms": { "field": "text_many_terms" },
"aggs": {
"top_hits": { "top_hits":
{
"sort": [
{ "score": "desc" }
],
"size": 2,
"doc_value_fields": ["score_f64"]
}
}
}
},
}))
.unwrap();
let collector = get_collector(agg_req);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_terms_many_with_sub_agg);
fn bench_aggregation_terms_many_with_sub_agg_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": {
"terms": { "field": "text_many_terms" },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
}))
.unwrap();
let collector = get_collector(agg_req);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_terms_many_json_mixed_type_with_sub_agg);
fn bench_aggregation_terms_many_json_mixed_type_with_sub_agg_card(
b: &mut Bencher,
cardinality: Cardinality,
) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": {
"terms": { "field": "json.mixed_type" },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
}))
.unwrap();
let collector = get_collector(agg_req);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_terms_many2);
fn bench_aggregation_terms_many2_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": { "terms": { "field": "text_many_terms" } },
}))
.unwrap();
let collector = get_collector(agg_req);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_terms_many_order_by_term);
fn bench_aggregation_terms_many_order_by_term_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": { "terms": { "field": "text_many_terms", "order": { "_key": "desc" } } },
}))
.unwrap();
let collector = get_collector(agg_req);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_range_only);
fn bench_aggregation_range_only_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = serde_json::from_value(json!({
"range_f64": { "range": { "field": "score_f64", "ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
] } },
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_range_with_avg);
fn bench_aggregation_range_with_avg_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
]
},
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
// hard bounds has a different algorithm, because it actually limits collection range
//
bench_all_cardinalities!(bench_aggregation_histogram_only_hard_bounds);
fn bench_aggregation_histogram_only_hard_bounds_card(
b: &mut Bencher,
cardinality: Cardinality,
) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": { "histogram": { "field": "score_f64", "interval": 100, "hard_bounds": { "min": 1000, "max": 300000 } } },
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_histogram_with_avg);
fn bench_aggregation_histogram_with_avg_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": {
"histogram": { "field": "score_f64", "interval": 100 },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
}
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_histogram_only);
fn bench_aggregation_histogram_only_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": {
"histogram": {
"field": "score_f64",
"interval": 100 // 1000 buckets
},
}
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_avg_and_range_with_avg);
fn bench_aggregation_avg_and_range_with_avg_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
let text_field = reader.searcher().schema().get_field("text").unwrap();
b.iter(|| {
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 60000 }
]
},
"aggs": {
"average_in_range": { "avg": { "field": "score" } }
}
},
"average": { "avg": { "field": "score" } }
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap()
});
}
}

View File

@@ -81,10 +81,11 @@ impl AggregationLimits {
}
}
pub(crate) fn add_memory_consumed(&self, num_bytes: u64) -> crate::Result<()> {
self.memory_consumption
.fetch_add(num_bytes, Ordering::Relaxed);
validate_memory_consumption(&self.memory_consumption, self.memory_limit)?;
pub(crate) fn add_memory_consumed(&self, add_num_bytes: u64) -> crate::Result<()> {
let prev_value = self
.memory_consumption
.fetch_add(add_num_bytes, Ordering::Relaxed);
validate_memory_consumption(prev_value + add_num_bytes, self.memory_limit)?;
Ok(())
}
@@ -94,11 +95,11 @@ impl AggregationLimits {
}
fn validate_memory_consumption(
memory_consumption: &AtomicU64,
memory_consumption: u64,
memory_limit: ByteCount,
) -> Result<(), AggregationError> {
// Load the estimated memory consumed by the aggregations
let memory_consumed: ByteCount = memory_consumption.load(Ordering::Relaxed).into();
let memory_consumed: ByteCount = memory_consumption.into();
if memory_consumed > memory_limit {
return Err(AggregationError::MemoryExceeded {
limit: memory_limit,
@@ -118,10 +119,11 @@ pub struct ResourceLimitGuard {
}
impl ResourceLimitGuard {
pub(crate) fn add_memory_consumed(&self, num_bytes: u64) -> crate::Result<()> {
self.memory_consumption
.fetch_add(num_bytes, Ordering::Relaxed);
validate_memory_consumption(&self.memory_consumption, self.memory_limit)?;
pub(crate) fn add_memory_consumed(&self, add_num_bytes: u64) -> crate::Result<()> {
let prev_value = self
.memory_consumption
.fetch_add(add_num_bytes, Ordering::Relaxed);
validate_memory_consumption(prev_value + add_num_bytes, self.memory_limit)?;
Ok(())
}
}

View File

@@ -17,7 +17,8 @@ use super::metric::{
use super::segment_agg_result::AggregationLimits;
use super::VecWithNames;
use crate::aggregation::{f64_to_fastfield_u64, Key};
use crate::{SegmentOrdinal, SegmentReader};
use crate::index::SegmentReader;
use crate::SegmentOrdinal;
#[derive(Default)]
pub(crate) struct AggregationsWithAccessor {

View File

@@ -331,9 +331,11 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
}
let mem_delta = self.get_memory_consumption() - mem_pre;
bucket_agg_accessor
.limits
.add_memory_consumed(mem_delta as u64)?;
if mem_delta > 0 {
bucket_agg_accessor
.limits
.add_memory_consumed(mem_delta as u64)?;
}
Ok(())
}

View File

@@ -324,9 +324,11 @@ impl SegmentAggregationCollector for SegmentTermCollector {
}
let mem_delta = self.get_memory_consumption() - mem_pre;
bucket_agg_accessor
.limits
.add_memory_consumed(mem_delta as u64)?;
if mem_delta > 0 {
bucket_agg_accessor
.limits
.add_memory_consumed(mem_delta as u64)?;
}
Ok(())
}

View File

@@ -8,7 +8,8 @@ use super::segment_agg_result::{
};
use crate::aggregation::agg_req_with_accessor::get_aggs_with_segment_accessor_and_validate;
use crate::collector::{Collector, SegmentCollector};
use crate::{DocId, SegmentOrdinal, SegmentReader, TantivyError};
use crate::index::SegmentReader;
use crate::{DocId, SegmentOrdinal, TantivyError};
/// The default max bucket count, before the aggregation fails.
pub const DEFAULT_BUCKET_LIMIT: u32 = 65000;

View File

@@ -143,8 +143,6 @@ use std::fmt::Display;
#[cfg(test)]
mod agg_tests;
mod agg_bench;
use core::fmt;
pub use agg_limits::AggregationLimits;

View File

@@ -4,7 +4,8 @@ use std::marker::PhantomData;
use serde::{Deserialize, Serialize};
use super::top_score_collector::TopNComputer;
use crate::{DocAddress, DocId, SegmentOrdinal, SegmentReader};
use crate::index::SegmentReader;
use crate::{DocAddress, DocId, SegmentOrdinal};
/// Contains a feature (field, score, etc.) of a document along with the document address.
///

View File

@@ -1,19 +1,25 @@
use rayon::{ThreadPool, ThreadPoolBuilder};
use std::sync::Arc;
#[cfg(feature = "quickwit")]
use futures_util::{future::Either, FutureExt};
use crate::TantivyError;
/// Search executor whether search request are single thread or multithread.
///
/// We don't expose Rayon thread pool directly here for several reasons.
///
/// First dependency hell. It is not a good idea to expose the
/// API of a dependency, knowing it might conflict with a different version
/// used by the client. Second, we may stop using rayon in the future.
/// Executor makes it possible to run tasks in single thread or
/// in a thread pool.
#[derive(Clone)]
pub enum Executor {
/// Single thread variant of an Executor
SingleThread,
/// Thread pool variant of an Executor
ThreadPool(ThreadPool),
ThreadPool(Arc<rayon::ThreadPool>),
}
#[cfg(feature = "quickwit")]
impl From<Arc<rayon::ThreadPool>> for Executor {
fn from(thread_pool: Arc<rayon::ThreadPool>) -> Self {
Executor::ThreadPool(thread_pool)
}
}
impl Executor {
@@ -24,11 +30,11 @@ impl Executor {
/// Creates an Executor that dispatches the tasks in a thread pool.
pub fn multi_thread(num_threads: usize, prefix: &'static str) -> crate::Result<Executor> {
let pool = ThreadPoolBuilder::new()
let pool = rayon::ThreadPoolBuilder::new()
.num_threads(num_threads)
.thread_name(move |num| format!("{prefix}{num}"))
.build()?;
Ok(Executor::ThreadPool(pool))
Ok(Executor::ThreadPool(Arc::new(pool)))
}
/// Perform a map in the thread pool.
@@ -91,11 +97,36 @@ impl Executor {
}
}
}
/// Spawn a task on the pool, returning a future completing on task success.
///
/// If the task panic, returns `Err(())`.
#[cfg(feature = "quickwit")]
pub fn spawn_blocking<T: Send + 'static>(
&self,
cpu_intensive_task: impl FnOnce() -> T + Send + 'static,
) -> impl std::future::Future<Output = Result<T, ()>> {
match self {
Executor::SingleThread => Either::Left(std::future::ready(Ok(cpu_intensive_task()))),
Executor::ThreadPool(pool) => {
let (sender, receiver) = oneshot::channel();
pool.spawn(|| {
if sender.is_closed() {
return;
}
let task_result = cpu_intensive_task();
let _ = sender.send(task_result);
});
let res = receiver.map(|res| res.map_err(|_| ()));
Either::Right(res)
}
}
}
}
#[cfg(test)]
mod tests {
use super::Executor;
#[test]
@@ -147,4 +178,34 @@ mod tests {
assert_eq!(result[i], i * 2);
}
}
#[cfg(feature = "quickwit")]
#[test]
fn test_cancel_cpu_intensive_tasks() {
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Duration;
let counter: Arc<AtomicU64> = Default::default();
let mut futures = Vec::new();
let executor = Executor::multi_thread(3, "search-test").unwrap();
for _ in 0..1_000 {
let counter_clone = counter.clone();
let fut = executor.spawn_blocking(move || {
std::thread::sleep(Duration::from_millis(4));
counter_clone.fetch_add(1, Ordering::SeqCst)
});
futures.push(fut);
}
std::thread::sleep(Duration::from_millis(5));
// The first few num_cores tasks should run, but the other should get cancelled.
drop(futures);
while Arc::strong_count(&counter) > 1 {
std::thread::sleep(Duration::from_millis(10));
}
// with ideal timing, we expect the result to always be 6, but as long as we run some, and
// cancelled most, the test is a success
assert!(counter.load(Ordering::SeqCst) > 0);
assert!(counter.load(Ordering::SeqCst) < 50);
}
}

View File

@@ -31,7 +31,7 @@ use crate::{DateTime, DocId, Term};
/// position 1.
/// As a result, with lemmatization, "The Smiths" will match our object.
///
/// Worse, if a same term is appears in the second object, a non increasing value would be pushed
/// Worse, if a same term appears in the second object, a non increasing value would be pushed
/// to the position recorder probably provoking a panic.
///
/// This problem is solved for regular multivalued object by offsetting the position
@@ -50,7 +50,7 @@ use crate::{DateTime, DocId, Term};
/// We can therefore afford working with a map that is not imperfect. It is fine if several
/// path map to the same index position as long as the probability is relatively low.
#[derive(Default)]
struct IndexingPositionsPerPath {
pub(crate) struct IndexingPositionsPerPath {
positions_per_path: FxHashMap<u32, IndexingPosition>,
}
@@ -58,6 +58,9 @@ impl IndexingPositionsPerPath {
fn get_position_from_id(&mut self, id: u32) -> &mut IndexingPosition {
self.positions_per_path.entry(id).or_default()
}
pub fn clear(&mut self) {
self.positions_per_path.clear();
}
}
/// Convert JSON_PATH_SEGMENT_SEP to a dot.
@@ -68,36 +71,6 @@ pub fn json_path_sep_to_dot(path: &mut str) {
}
}
#[allow(clippy::too_many_arguments)]
pub(crate) fn index_json_values<'a, V: Value<'a>>(
doc: DocId,
json_visitors: impl Iterator<Item = crate::Result<V::ObjectIter>>,
text_analyzer: &mut TextAnalyzer,
expand_dots_enabled: bool,
term_buffer: &mut Term,
postings_writer: &mut dyn PostingsWriter,
json_path_writer: &mut JsonPathWriter,
ctx: &mut IndexingContext,
) -> crate::Result<()> {
json_path_writer.clear();
json_path_writer.set_expand_dots(expand_dots_enabled);
let mut positions_per_path: IndexingPositionsPerPath = Default::default();
for json_visitor_res in json_visitors {
let json_visitor = json_visitor_res?;
index_json_object::<V>(
doc,
json_visitor,
text_analyzer,
term_buffer,
json_path_writer,
postings_writer,
ctx,
&mut positions_per_path,
);
}
Ok(())
}
#[allow(clippy::too_many_arguments)]
fn index_json_object<'a, V: Value<'a>>(
doc: DocId,
@@ -126,7 +99,7 @@ fn index_json_object<'a, V: Value<'a>>(
}
#[allow(clippy::too_many_arguments)]
fn index_json_value<'a, V: Value<'a>>(
pub(crate) fn index_json_value<'a, V: Value<'a>>(
doc: DocId,
json_value: V,
text_analyzer: &mut TextAnalyzer,
@@ -166,12 +139,18 @@ fn index_json_value<'a, V: Value<'a>>(
);
}
ReferenceValueLeaf::U64(val) => {
// try to parse to i64, since when querying we will apply the same logic and prefer
// i64 values
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
);
term_buffer.append_type_and_fast_value(val);
if let Ok(i64_val) = val.try_into() {
term_buffer.append_type_and_fast_value::<i64>(i64_val);
} else {
term_buffer.append_type_and_fast_value(val);
}
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::I64(val) => {
@@ -257,10 +236,7 @@ fn index_json_value<'a, V: Value<'a>>(
/// Tries to infer a JSON type from a string and append it to the term.
///
/// The term must be json + JSON path.
pub(crate) fn convert_to_fast_value_and_append_to_json_term(
mut term: Term,
phrase: &str,
) -> Option<Term> {
pub fn convert_to_fast_value_and_append_to_json_term(mut term: Term, phrase: &str) -> Option<Term> {
assert_eq!(
term.value()
.as_json_value_bytes()

View File

@@ -4,13 +4,13 @@ use std::{fmt, io};
use crate::collector::Collector;
use crate::core::Executor;
use crate::index::SegmentReader;
use crate::index::{SegmentId, SegmentReader};
use crate::query::{Bm25StatisticsProvider, EnableScoring, Query};
use crate::schema::document::DocumentDeserialize;
use crate::schema::{Schema, Term};
use crate::space_usage::SearcherSpaceUsage;
use crate::store::{CacheStats, StoreReader};
use crate::{DocAddress, Index, Opstamp, SegmentId, TrackedObject};
use crate::{DocAddress, Index, Opstamp, TrackedObject};
/// Identifies the searcher generation accessed by a [`Searcher`].
///
@@ -109,8 +109,9 @@ impl Searcher {
&self,
doc_address: DocAddress,
) -> crate::Result<D> {
let executor = self.inner.index.search_executor();
let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize];
store_reader.get_async(doc_address.doc_id).await
store_reader.get_async(doc_address.doc_id, executor).await
}
/// Access the schema associated with the index of this searcher.

View File

@@ -1,12 +1,14 @@
use crate::collector::Count;
use crate::directory::{RamDirectory, WatchCallback};
use crate::index::SegmentId;
use crate::indexer::{LogMergePolicy, NoMergePolicy};
use crate::postings::Postings;
use crate::query::TermQuery;
use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, STRING, TEXT};
use crate::tokenizer::TokenizerManager;
use crate::{
Directory, DocSet, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, Postings,
ReloadPolicy, SegmentId, TantivyDocument, Term,
Directory, DocSet, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, ReloadPolicy,
TantivyDocument, Term,
};
#[test]
@@ -417,7 +419,7 @@ fn test_non_text_json_term_freq() {
let inv_idx = segment_reader.inverted_index(field).unwrap();
let mut term = Term::from_field_json_path(field, "tenant_id", false);
term.append_type_and_fast_value(75u64);
term.append_type_and_fast_value(75i64);
let postings = inv_idx
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
@@ -451,7 +453,7 @@ fn test_non_text_json_term_freq_bitpacked() {
let inv_idx = segment_reader.inverted_index(field).unwrap();
let mut term = Term::from_field_json_path(field, "tenant_id", false);
term.append_type_and_fast_value(75u64);
term.append_type_and_fast_value(75i64);
let mut postings = inv_idx
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)

View File

@@ -146,8 +146,11 @@ mod tests {
facet_ords.extend(facet_reader.facet_ords(0u32));
assert_eq!(&facet_ords, &[0u64]);
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))?;
let value: Option<&Facet> = doc.get_first(facet_field).and_then(|v| v.as_facet());
assert_eq!(value, Facet::from_text("/a/b").ok().as_ref());
let value: Option<Facet> = doc
.get_first(facet_field)
.and_then(|v| v.as_facet())
.map(|facet| Facet::from_encoded_string(facet.to_string()));
assert_eq!(value, Facet::from_text("/a/b").ok());
Ok(())
}

View File

@@ -80,7 +80,7 @@ mod tests {
use std::path::Path;
use columnar::StrColumn;
use common::{ByteCount, HasLen, TerminatingWrite};
use common::{ByteCount, DateTimePrecision, HasLen, TerminatingWrite};
use once_cell::sync::Lazy;
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
@@ -88,14 +88,15 @@ mod tests {
use super::*;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::index::SegmentId;
use crate::merge_policy::NoMergePolicy;
use crate::schema::{
Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder, TantivyDocument,
TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
DateOptions, Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder,
TantivyDocument, TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
};
use crate::time::OffsetDateTime;
use crate::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager};
use crate::{DateOptions, DateTimePrecision, Index, IndexWriter, SegmentId, SegmentReader};
use crate::{Index, IndexWriter, SegmentReader};
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
let mut schema_builder = Schema::builder();

View File

@@ -1,14 +1,14 @@
use std::io;
use columnar::{ColumnarWriter, NumericalValue};
use common::JsonPathWriter;
use common::{DateTimePrecision, JsonPathWriter};
use tokenizer_api::Token;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::schema::document::{Document, ReferenceValue, ReferenceValueLeaf, Value};
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema, Type};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::{DateTimePrecision, DocId, TantivyError};
use crate::{DocId, TantivyError};
/// Only index JSON down to a depth of 20.
/// This is mostly to guard us from a stack overflow triggered by malicious input.
@@ -183,8 +183,7 @@ impl FastFieldsWriter {
.record_datetime(doc_id, field_name, truncated_datetime);
}
ReferenceValueLeaf::Facet(val) => {
self.columnar_writer
.record_str(doc_id, field_name, val.encoded_str());
self.columnar_writer.record_str(doc_id, field_name, val);
}
ReferenceValueLeaf::Bytes(val) => {
self.columnar_writer.record_bytes(doc_id, field_name, val);

View File

@@ -3,7 +3,7 @@ use std::fmt;
#[cfg(feature = "mmap")]
use std::path::Path;
use std::path::PathBuf;
use std::sync::Arc;
use std::thread::available_parallelism;
use super::segment::Segment;
use super::segment_reader::merge_field_meta_data;
@@ -293,7 +293,7 @@ pub struct Index {
directory: ManagedDirectory,
schema: Schema,
settings: IndexSettings,
executor: Arc<Executor>,
executor: Executor,
tokenizers: TokenizerManager,
fast_field_tokenizers: TokenizerManager,
inventory: SegmentMetaInventory,
@@ -318,29 +318,25 @@ impl Index {
///
/// By default the executor is single thread, and simply runs in the calling thread.
pub fn search_executor(&self) -> &Executor {
self.executor.as_ref()
&self.executor
}
/// Replace the default single thread search executor pool
/// by a thread pool with a given number of threads.
pub fn set_multithread_executor(&mut self, num_threads: usize) -> crate::Result<()> {
self.executor = Arc::new(Executor::multi_thread(num_threads, "tantivy-search-")?);
self.executor = Executor::multi_thread(num_threads, "tantivy-search-")?;
Ok(())
}
/// Custom thread pool by a outer thread pool.
pub fn set_shared_multithread_executor(
&mut self,
shared_thread_pool: Arc<Executor>,
) -> crate::Result<()> {
self.executor = shared_thread_pool.clone();
Ok(())
pub fn set_executor(&mut self, executor: Executor) {
self.executor = executor;
}
/// Replace the default single thread search executor pool
/// by a thread pool with as many threads as there are CPUs on the system.
pub fn set_default_multithread_executor(&mut self) -> crate::Result<()> {
let default_num_threads = num_cpus::get();
let default_num_threads = available_parallelism()?.get();
self.set_multithread_executor(default_num_threads)
}
@@ -418,7 +414,7 @@ impl Index {
schema,
tokenizers: TokenizerManager::default(),
fast_field_tokenizers: TokenizerManager::default(),
executor: Arc::new(Executor::single_thread()),
executor: Executor::single_thread(),
inventory,
}
}
@@ -621,7 +617,7 @@ impl Index {
&self,
memory_budget_in_bytes: usize,
) -> crate::Result<IndexWriter<D>> {
let mut num_threads = std::cmp::min(num_cpus::get(), MAX_NUM_THREAD);
let mut num_threads = std::cmp::min(available_parallelism()?.get(), MAX_NUM_THREAD);
let memory_budget_num_bytes_per_thread = memory_budget_in_bytes / num_threads;
if memory_budget_num_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN {
num_threads = (memory_budget_in_bytes / MEMORY_BUDGET_NUM_BYTES_MIN).max(1);

View File

@@ -1,5 +1,3 @@
//! # Index Module
//!
//! The `index` module in Tantivy contains core components to read and write indexes.
//!
//! It contains `Index` and `Segment`, where a `Index` consists of one or more `Segment`s.

View File

@@ -246,8 +246,9 @@ impl DeleteCursor {
mod tests {
use super::{DeleteOperation, DeleteQueue};
use crate::index::SegmentReader;
use crate::query::{Explanation, Scorer, Weight};
use crate::{DocId, Score, SegmentReader};
use crate::{DocId, Score};
struct DummyWeight;
impl Weight for DummyWeight {

View File

@@ -144,9 +144,9 @@ mod tests {
use once_cell::sync::Lazy;
use super::*;
use crate::index::SegmentMetaInventory;
use crate::index::{SegmentId, SegmentMetaInventory};
use crate::schema;
use crate::schema::INDEXED;
use crate::{schema, SegmentId};
static INVENTORY: Lazy<SegmentMetaInventory> = Lazy::new(SegmentMetaInventory::default);

View File

@@ -1,7 +1,8 @@
use std::collections::HashSet;
use std::ops::Deref;
use crate::{Inventory, Opstamp, SegmentId, TrackedObject};
use crate::index::SegmentId;
use crate::{Inventory, Opstamp, TrackedObject};
#[derive(Default)]
pub(crate) struct MergeOperationInventory(Inventory<InnerMergeOperation>);

View File

@@ -13,7 +13,7 @@ use crate::docset::{DocSet, TERMINATED};
use crate::error::DataCorruption;
use crate::fastfield::{AliveBitSet, FastFieldNotAvailableError};
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
use crate::index::{Segment, SegmentReader};
use crate::index::{Segment, SegmentComponent, SegmentReader};
use crate::indexer::doc_id_mapping::{MappingType, SegmentDocIdMapping};
use crate::indexer::SegmentSerializer;
use crate::postings::{InvertedIndexSerializer, Postings, SegmentPostings};
@@ -21,8 +21,7 @@ use crate::schema::{value_type_to_column_type, Field, FieldType, Schema};
use crate::store::StoreWriter;
use crate::termdict::{TermMerger, TermOrdinal};
use crate::{
DocAddress, DocId, IndexSettings, IndexSortByField, InvertedIndexReader, Order,
SegmentComponent, SegmentOrdinal,
DocAddress, DocId, IndexSettings, IndexSortByField, InvertedIndexReader, Order, SegmentOrdinal,
};
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
@@ -794,7 +793,7 @@ mod tests {
BytesFastFieldTestCollector, FastFieldTestCollector, TEST_COLLECTOR_WITH_SCORE,
};
use crate::collector::{Count, FacetCollector};
use crate::index::Index;
use crate::index::{Index, SegmentId};
use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
use crate::schema::document::Value;
use crate::schema::{
@@ -804,7 +803,7 @@ mod tests {
use crate::time::OffsetDateTime;
use crate::{
assert_nearly_equals, schema, DateTime, DocAddress, DocId, DocSet, IndexSettings,
IndexSortByField, IndexWriter, Order, Searcher, SegmentId,
IndexSortByField, IndexWriter, Order, Searcher,
};
#[test]

View File

@@ -3,6 +3,7 @@ mod tests {
use crate::collector::TopDocs;
use crate::fastfield::AliveBitSet;
use crate::index::Index;
use crate::postings::Postings;
use crate::query::QueryParser;
use crate::schema::document::Value;
use crate::schema::{
@@ -10,8 +11,8 @@ mod tests {
TextFieldIndexing, TextOptions,
};
use crate::{
DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, Postings,
TantivyDocument, Term,
DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, TantivyDocument,
Term,
};
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {

View File

@@ -182,7 +182,7 @@ mod tests_mmap {
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(field=>json!({format!("{field_name_in}"): "test1"})))
.add_document(doc!(field=>json!({format!("{field_name_in}"): "test1", format!("num{field_name_in}"): 10})))
.unwrap();
index_writer
.add_document(doc!(field=>json!({format!("a{field_name_in}"): "test2"})))
@@ -260,6 +260,64 @@ mod tests_mmap {
"test6",
);
test_agg(format!("json.{field_name_out}a").as_str(), "test7");
// `.` is stored as `\u{0001}` internally in tantivy
let field_name_out_internal = if field_name_out == "." {
"\u{0001}"
} else {
field_name_out
};
let mut fields = reader.searcher().segment_readers()[0]
.inverted_index(field)
.unwrap()
.list_encoded_fields()
.unwrap();
assert_eq!(fields.len(), 8);
fields.sort();
let mut expected_fields = vec![
(format!("a{field_name_out_internal}"), Type::Str),
(format!("a{field_name_out_internal}a"), Type::Str),
(
format!("a{field_name_out_internal}a{field_name_out_internal}"),
Type::Str,
),
(
format!("a{field_name_out_internal}\u{1}ab{field_name_out_internal}"),
Type::Str,
),
(
format!("a{field_name_out_internal}\u{1}a{field_name_out_internal}"),
Type::Str,
),
(format!("{field_name_out_internal}a"), Type::Str),
(format!("{field_name_out_internal}"), Type::Str),
(format!("num{field_name_out_internal}"), Type::I64),
];
expected_fields.sort();
assert_eq!(fields, expected_fields);
// Check columnar reader
let mut columns = reader.searcher().segment_readers()[0]
.fast_fields()
.columnar()
.list_columns()
.unwrap()
.into_iter()
.map(|(name, _)| name)
.collect::<Vec<_>>();
let mut expected_columns = vec![
format!("json\u{1}{field_name_out_internal}"),
format!("json\u{1}{field_name_out_internal}a"),
format!("json\u{1}a{field_name_out_internal}"),
format!("json\u{1}a{field_name_out_internal}a"),
format!("json\u{1}a{field_name_out_internal}a{field_name_out_internal}"),
format!("json\u{1}a{field_name_out_internal}\u{1}ab{field_name_out_internal}"),
format!("json\u{1}a{field_name_out_internal}\u{1}a{field_name_out_internal}"),
format!("json\u{1}num{field_name_out_internal}"),
];
columns.sort();
expected_columns.sort();
assert_eq!(columns, expected_columns);
}
#[test]

View File

@@ -5,20 +5,20 @@ use tokenizer_api::BoxTokenStream;
use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};
use super::operation::AddOperation;
use crate::core::json_utils::index_json_values;
use crate::fastfield::FastFieldsWriter;
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
use crate::index::Segment;
use crate::index::{Segment, SegmentComponent};
use crate::indexer::segment_serializer::SegmentSerializer;
use crate::json_utils::{index_json_value, IndexingPositionsPerPath};
use crate::postings::{
compute_table_memory_size, serialize_postings, IndexingContext, IndexingPosition,
PerFieldPostingsWriter, PostingsWriter,
};
use crate::schema::document::{Document, ReferenceValue, Value};
use crate::schema::document::{Document, Value};
use crate::schema::{FieldEntry, FieldType, Schema, Term, DATE_TIME_PRECISION_INDEXED};
use crate::store::{StoreReader, StoreWriter};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
use crate::{DocId, Opstamp, SegmentComponent, TantivyError};
use crate::{DocId, Opstamp, TantivyError};
/// Computes the initial size of the hash table.
///
@@ -68,6 +68,7 @@ pub struct SegmentWriter {
pub(crate) fast_field_writers: FastFieldsWriter,
pub(crate) fieldnorms_writer: FieldNormsWriter,
pub(crate) json_path_writer: JsonPathWriter,
pub(crate) json_positions_per_path: IndexingPositionsPerPath,
pub(crate) doc_opstamps: Vec<Opstamp>,
per_field_text_analyzers: Vec<TextAnalyzer>,
term_buffer: Term,
@@ -119,6 +120,7 @@ impl SegmentWriter {
per_field_postings_writers,
fieldnorms_writer: FieldNormsWriter::for_schema(&schema),
json_path_writer: JsonPathWriter::default(),
json_positions_per_path: IndexingPositionsPerPath::default(),
segment_serializer,
fast_field_writers: FastFieldsWriter::from_schema_and_tokenizer_manager(
&schema,
@@ -204,8 +206,7 @@ impl SegmentWriter {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
let facet = value.as_facet().ok_or_else(make_schema_error)?;
let facet_str = facet.encoded_str();
let facet_str = value.as_facet().ok_or_else(make_schema_error)?;
let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
let mut indexing_position = IndexingPosition::default();
postings_writer.index_text(
@@ -228,7 +229,7 @@ impl SegmentWriter {
&mut self.per_field_text_analyzers[field.field_id() as usize];
text_analyzer.token_stream(text)
} else if let Some(tok_str) = value.as_pre_tokenized_text() {
BoxTokenStream::new(PreTokenizedStream::from(tok_str.clone()))
BoxTokenStream::new(PreTokenizedStream::from(*tok_str.clone()))
} else {
continue;
};
@@ -342,26 +343,24 @@ impl SegmentWriter {
FieldType::JsonObject(json_options) => {
let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize];
let json_values_it = values.map(|value_access| {
// Used to help with linting and type checking.
let value_access = value_access as D::Value<'_>;
let value = value_access.as_value();
match value {
ReferenceValue::Object(object_iter) => Ok(object_iter),
_ => Err(make_schema_error()),
}
});
index_json_values::<D::Value<'_>>(
doc_id,
json_values_it,
text_analyzer,
json_options.is_expand_dots_enabled(),
term_buffer,
postings_writer,
&mut self.json_path_writer,
ctx,
)?;
self.json_positions_per_path.clear();
self.json_path_writer
.set_expand_dots(json_options.is_expand_dots_enabled());
for json_value in values {
self.json_path_writer.clear();
index_json_value(
doc_id,
json_value,
text_analyzer,
term_buffer,
&mut self.json_path_writer,
postings_writer,
ctx,
&mut self.json_positions_per_path,
);
}
}
FieldType::IpAddr(_) => {
let mut num_vals = 0;
@@ -498,19 +497,20 @@ mod tests {
use crate::collector::{Count, TopDocs};
use crate::directory::RamDirectory;
use crate::fastfield::FastValue;
use crate::postings::TermInfo;
use crate::postings::{Postings, TermInfo};
use crate::query::{PhraseQuery, QueryParser};
use crate::schema::document::Value;
use crate::schema::{
Document, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, STORED, STRING, TEXT,
Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, STORED,
STRING, TEXT,
};
use crate::store::{Compressor, StoreReader, StoreWriter};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::tokenizer::{PreTokenizedString, Token};
use crate::{
DateTime, Directory, DocAddress, DocSet, Index, IndexWriter, Postings, TantivyDocument,
Term, TERMINATED,
DateTime, Directory, DocAddress, DocSet, Index, IndexWriter, TantivyDocument, Term,
TERMINATED,
};
#[test]
@@ -597,6 +597,45 @@ mod tests {
assert_eq!(score_docs.len(), 2);
}
#[test]
fn test_flat_json_indexing() {
// A JSON Object that contains mixed values on the first level
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STORED | STRING);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let mut writer = index.writer_for_tests().unwrap();
// Text, i64, u64
writer.add_document(doc!(json_field=>"b")).unwrap();
writer
.add_document(doc!(json_field=>OwnedValue::I64(10i64)))
.unwrap();
writer
.add_document(doc!(json_field=>OwnedValue::U64(55u64)))
.unwrap();
writer
.add_document(doc!(json_field=>json!({"my_field": "a"})))
.unwrap();
writer.commit().unwrap();
let search_and_expect = |query| {
let query_parser = QueryParser::for_index(&index, vec![json_field]);
let text_query = query_parser.parse_query(query).unwrap();
let score_docs: Vec<(_, DocAddress)> = index
.reader()
.unwrap()
.searcher()
.search(&text_query, &TopDocs::with_limit(4))
.unwrap();
assert_eq!(score_docs.len(), 1);
};
search_and_expect("my_field:a");
search_and_expect("b");
search_and_expect("10");
search_and_expect("55");
}
#[test]
fn test_json_indexing() {
let mut schema_builder = Schema::builder();

View File

@@ -216,11 +216,6 @@ use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
pub use self::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED};
#[deprecated(
since = "0.22.0",
note = "Will be removed in tantivy 0.23. Use export from snippet module instead"
)]
pub use self::snippet::{Snippet, SnippetGenerator};
#[doc(hidden)]
pub use crate::core::json_utils;
pub use crate::core::{Executor, Searcher, SearcherGeneration};
@@ -228,16 +223,10 @@ pub use crate::directory::Directory;
#[allow(deprecated)] // Remove with index sorting
pub use crate::index::{
Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader, Order,
Segment, SegmentComponent, SegmentId, SegmentMeta, SegmentReader,
Segment, SegmentMeta, SegmentReader,
};
#[deprecated(
since = "0.22.0",
note = "Will be removed in tantivy 0.23. Use export from indexer module instead"
)]
pub use crate::indexer::PreparedCommit;
pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
pub use crate::postings::Postings;
pub use crate::schema::{DateOptions, DateTimePrecision, Document, TantivyDocument, Term};
pub use crate::schema::{Document, TantivyDocument, Term};
/// Index format version.
const INDEX_FORMAT_VERSION: u32 = 6;
@@ -392,9 +381,10 @@ pub mod tests {
use crate::docset::{DocSet, TERMINATED};
use crate::index::SegmentReader;
use crate::merge_policy::NoMergePolicy;
use crate::postings::Postings;
use crate::query::BooleanQuery;
use crate::schema::*;
use crate::{DateTime, DocAddress, Index, IndexWriter, Postings, ReloadPolicy};
use crate::{DateTime, DocAddress, Index, IndexWriter, ReloadPolicy};
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
@@ -446,7 +436,6 @@ pub mod tests {
}
#[test]
#[cfg(not(feature = "lz4"))]
fn test_version_string() {
use regex::Regex;
let regex_ptn = Regex::new(
@@ -1109,9 +1098,9 @@ pub mod tests {
#[test]
fn test_update_via_delete_insert() -> crate::Result<()> {
use crate::collector::Count;
use crate::index::SegmentId;
use crate::indexer::NoMergePolicy;
use crate::query::AllQuery;
use crate::SegmentId;
const DOC_COUNT: u64 = 2u64;

View File

@@ -56,7 +56,7 @@ pub struct InvertedIndexSerializer {
impl InvertedIndexSerializer {
/// Open a new `InvertedIndexSerializer` for the given segment
pub fn open(segment: &mut Segment) -> crate::Result<InvertedIndexSerializer> {
use crate::SegmentComponent::{Positions, Postings, Terms};
use crate::index::SegmentComponent::{Positions, Postings, Terms};
let inv_index_serializer = InvertedIndexSerializer {
terms_write: CompositeWrite::wrap(segment.open_write(Terms)?),
postings_write: CompositeWrite::wrap(segment.open_write(Postings)?),

View File

@@ -1,8 +1,9 @@
use super::Scorer;
use crate::docset::TERMINATED;
use crate::index::SegmentReader;
use crate::query::explanation::does_not_match;
use crate::query::{EnableScoring, Explanation, Query, Weight};
use crate::{DocId, DocSet, Score, Searcher, SegmentReader};
use crate::{DocId, DocSet, Score, Searcher};
/// `EmptyQuery` is a dummy `Query` in which no document matches.
///

View File

@@ -180,7 +180,7 @@ impl MoreLikeThis {
let facets: Vec<&str> = values
.iter()
.map(|value| {
value.as_facet().map(|f| f.encoded_str()).ok_or_else(|| {
value.as_facet().ok_or_else(|| {
TantivyError::InvalidArgument("invalid field value".to_string())
})
})
@@ -220,7 +220,7 @@ impl MoreLikeThis {
let mut token_stream = tokenizer.token_stream(text);
token_stream.process(sink);
} else if let Some(tok_str) = value.as_pre_tokenized_text() {
let mut token_stream = PreTokenizedStream::from(tok_str.clone());
let mut token_stream = PreTokenizedStream::from(*tok_str.clone());
token_stream.process(sink);
}
}

View File

@@ -174,7 +174,7 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
}
fn size_hint(&self) -> u32 {
0 // heuristic possible by checking number of hits when fetching a block
self.column.num_docs()
}
}

View File

@@ -127,6 +127,7 @@ impl Scorer for TermScorer {
mod tests {
use proptest::prelude::*;
use crate::index::SegmentId;
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::merge_policy::NoMergePolicy;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
@@ -134,8 +135,7 @@ mod tests {
use crate::query::{Bm25Weight, EnableScoring, Scorer, TermQuery};
use crate::schema::{IndexRecordOption, Schema, TEXT};
use crate::{
assert_nearly_equals, DocId, DocSet, Index, IndexWriter, Score, Searcher, SegmentId, Term,
TERMINATED,
assert_nearly_equals, DocId, DocSet, Index, IndexWriter, Score, Searcher, Term, TERMINATED,
};
#[test]

View File

@@ -179,9 +179,10 @@ mod tests {
use super::Warmer;
use crate::core::searcher::SearcherGeneration;
use crate::directory::RamDirectory;
use crate::index::SegmentId;
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::schema::{Schema, INDEXED};
use crate::{Index, IndexSettings, ReloadPolicy, Searcher, SegmentId};
use crate::{Index, IndexSettings, ReloadPolicy, Searcher};
#[derive(Default)]
struct TestWarmer {

View File

@@ -873,7 +873,7 @@ mod tests {
);
let facet = Facet::from_text("/hello/world").unwrap();
let result = serialize_value(ReferenceValueLeaf::Facet(&facet).into());
let result = serialize_value(ReferenceValueLeaf::Facet(facet.encoded_str()).into());
let value = deserialize_value(result);
assert_eq!(value, crate::schema::OwnedValue::Facet(facet));
@@ -881,7 +881,8 @@ mod tests {
text: "hello, world".to_string(),
tokens: vec![Token::default(), Token::default()],
};
let result = serialize_value(ReferenceValueLeaf::PreTokStr(&pre_tok_str).into());
let result =
serialize_value(ReferenceValueLeaf::PreTokStr(pre_tok_str.clone().into()).into());
let value = deserialize_value(result);
assert_eq!(value, crate::schema::OwnedValue::PreTokStr(pre_tok_str));
}

View File

@@ -65,13 +65,13 @@ impl<'a> Value<'a> for &'a OwnedValue {
match self {
OwnedValue::Null => ReferenceValueLeaf::Null.into(),
OwnedValue::Str(val) => ReferenceValueLeaf::Str(val).into(),
OwnedValue::PreTokStr(val) => ReferenceValueLeaf::PreTokStr(val).into(),
OwnedValue::PreTokStr(val) => ReferenceValueLeaf::PreTokStr(val.clone().into()).into(),
OwnedValue::U64(val) => ReferenceValueLeaf::U64(*val).into(),
OwnedValue::I64(val) => ReferenceValueLeaf::I64(*val).into(),
OwnedValue::F64(val) => ReferenceValueLeaf::F64(*val).into(),
OwnedValue::Bool(val) => ReferenceValueLeaf::Bool(*val).into(),
OwnedValue::Date(val) => ReferenceValueLeaf::Date(*val).into(),
OwnedValue::Facet(val) => ReferenceValueLeaf::Facet(val).into(),
OwnedValue::Facet(val) => ReferenceValueLeaf::Facet(val.encoded_str()).into(),
OwnedValue::Bytes(val) => ReferenceValueLeaf::Bytes(val).into(),
OwnedValue::IpAddr(val) => ReferenceValueLeaf::IpAddr(*val).into(),
OwnedValue::Array(array) => ReferenceValue::Array(array.iter()),
@@ -183,7 +183,7 @@ impl serde::Serialize for OwnedValue {
OwnedValue::Bytes(ref bytes) => serializer.serialize_str(&BASE64.encode(bytes)),
OwnedValue::Object(ref obj) => {
let mut map = serializer.serialize_map(Some(obj.len()))?;
for &(ref k, ref v) in obj {
for (k, v) in obj {
map.serialize_entry(k, v)?;
}
map.end()
@@ -277,11 +277,13 @@ impl<'a, V: Value<'a>> From<ReferenceValue<'a, V>> for OwnedValue {
ReferenceValueLeaf::I64(val) => OwnedValue::I64(val),
ReferenceValueLeaf::F64(val) => OwnedValue::F64(val),
ReferenceValueLeaf::Date(val) => OwnedValue::Date(val),
ReferenceValueLeaf::Facet(val) => OwnedValue::Facet(val.clone()),
ReferenceValueLeaf::Facet(val) => {
OwnedValue::Facet(Facet::from_encoded_string(val.to_string()))
}
ReferenceValueLeaf::Bytes(val) => OwnedValue::Bytes(val.to_vec()),
ReferenceValueLeaf::IpAddr(val) => OwnedValue::IpAddr(val),
ReferenceValueLeaf::Bool(val) => OwnedValue::Bool(val),
ReferenceValueLeaf::PreTokStr(val) => OwnedValue::PreTokStr(val.clone()),
ReferenceValueLeaf::PreTokStr(val) => OwnedValue::PreTokStr(*val.clone()),
},
ReferenceValue::Array(val) => {
OwnedValue::Array(val.map(|v| v.as_value().into()).collect())

View File

@@ -121,7 +121,7 @@ where W: Write
ReferenceValueLeaf::Facet(val) => {
self.write_type_code(type_codes::HIERARCHICAL_FACET_CODE)?;
val.serialize(self.writer)
Cow::Borrowed(val).serialize(self.writer)
}
ReferenceValueLeaf::Bytes(val) => {
self.write_type_code(type_codes::BYTES_CODE)?;
@@ -428,7 +428,7 @@ mod tests {
);
let facet = Facet::from_text("/hello/world").unwrap();
let result = serialize_value(ReferenceValueLeaf::Facet(&facet).into());
let result = serialize_value(ReferenceValueLeaf::Facet(facet.encoded_str()).into());
let expected = binary_repr!(
type_codes::HIERARCHICAL_FACET_CODE => Facet::from_text("/hello/world").unwrap(),
);
@@ -441,7 +441,8 @@ mod tests {
text: "hello, world".to_string(),
tokens: vec![Token::default(), Token::default()],
};
let result = serialize_value(ReferenceValueLeaf::PreTokStr(&pre_tok_str).into());
let result =
serialize_value(ReferenceValueLeaf::PreTokStr(pre_tok_str.clone().into()).into());
let expected = binary_repr!(
type_codes::EXT_CODE, type_codes::TOK_STR_EXT_CODE => pre_tok_str,
);

View File

@@ -3,7 +3,6 @@ use std::net::Ipv6Addr;
use common::DateTime;
use crate::schema::Facet;
use crate::tokenizer::PreTokenizedString;
/// A single field value.
@@ -28,7 +27,7 @@ pub trait Value<'a>: Send + Sync + Debug {
}
#[inline]
/// If the Value is a String, returns the associated str. Returns None otherwise.
/// If the Value is a leaf, returns the associated leaf. Returns None otherwise.
fn as_leaf(&self) -> Option<ReferenceValueLeaf<'a>> {
if let ReferenceValue::Leaf(val) = self.as_value() {
Some(val)
@@ -82,8 +81,9 @@ pub trait Value<'a>: Send + Sync + Debug {
#[inline]
/// If the Value is a pre-tokenized string, returns the associated string. Returns None
/// otherwise.
fn as_pre_tokenized_text(&self) -> Option<&'a PreTokenizedString> {
self.as_leaf().and_then(|leaf| leaf.as_pre_tokenized_text())
fn as_pre_tokenized_text(&self) -> Option<Box<PreTokenizedString>> {
self.as_leaf()
.and_then(|leaf| leaf.into_pre_tokenized_text())
}
#[inline]
@@ -94,7 +94,7 @@ pub trait Value<'a>: Send + Sync + Debug {
#[inline]
/// If the Value is a facet, returns the associated facet. Returns None otherwise.
fn as_facet(&self) -> Option<&'a Facet> {
fn as_facet(&self) -> Option<&'a str> {
self.as_leaf().and_then(|leaf| leaf.as_facet())
}
@@ -132,7 +132,7 @@ pub trait Value<'a>: Send + Sync + Debug {
}
/// A enum representing a leaf value for tantivy to index.
#[derive(Clone, Copy, Debug, PartialEq)]
#[derive(Clone, Debug, PartialEq)]
pub enum ReferenceValueLeaf<'a> {
/// A null value.
Null,
@@ -146,8 +146,9 @@ pub enum ReferenceValueLeaf<'a> {
F64(f64),
/// Date/time with nanoseconds precision
Date(DateTime),
/// Facet
Facet(&'a Facet),
/// Facet string needs to match the format of
/// [Facet::encoded_str](crate::schema::Facet::encoded_str).
Facet(&'a str),
/// Arbitrarily sized byte array
Bytes(&'a [u8]),
/// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`.
@@ -155,7 +156,7 @@ pub enum ReferenceValueLeaf<'a> {
/// Bool value
Bool(bool),
/// Pre-tokenized str type,
PreTokStr(&'a PreTokenizedString),
PreTokStr(Box<PreTokenizedString>),
}
impl<'a, T: Value<'a> + ?Sized> From<ReferenceValueLeaf<'a>> for ReferenceValue<'a, T> {
@@ -259,9 +260,9 @@ impl<'a> ReferenceValueLeaf<'a> {
}
#[inline]
/// If the Value is a pre-tokenized string, returns the associated string. Returns None
/// otherwise.
pub fn as_pre_tokenized_text(&self) -> Option<&'a PreTokenizedString> {
/// If the Value is a pre-tokenized string, consumes it and returns the string.
/// Returns None otherwise.
pub fn into_pre_tokenized_text(self) -> Option<Box<PreTokenizedString>> {
if let Self::PreTokStr(val) = self {
Some(val)
} else {
@@ -281,7 +282,7 @@ impl<'a> ReferenceValueLeaf<'a> {
#[inline]
/// If the Value is a facet, returns the associated facet. Returns None otherwise.
pub fn as_facet(&self) -> Option<&'a Facet> {
pub fn as_facet(&self) -> Option<&'a str> {
if let Self::Facet(val) = self {
Some(val)
} else {
@@ -322,6 +323,16 @@ where V: Value<'a>
}
}
#[inline]
/// If the Value is a leaf, consume it and return the leaf. Returns None otherwise.
pub fn into_leaf(self) -> Option<ReferenceValueLeaf<'a>> {
if let Self::Leaf(val) = self {
Some(val)
} else {
None
}
}
#[inline]
/// If the Value is a String, returns the associated str. Returns None otherwise.
pub fn as_str(&self) -> Option<&'a str> {
@@ -365,10 +376,11 @@ where V: Value<'a>
}
#[inline]
/// If the Value is a pre-tokenized string, returns the associated string. Returns None
/// otherwise.
pub fn as_pre_tokenized_text(&self) -> Option<&'a PreTokenizedString> {
self.as_leaf().and_then(|leaf| leaf.as_pre_tokenized_text())
/// If the Value is a pre-tokenized string, consumes it and returns the string.
/// Returns None otherwise.
pub fn into_pre_tokenized_text(self) -> Option<Box<PreTokenizedString>> {
self.into_leaf()
.and_then(|leaf| leaf.into_pre_tokenized_text())
}
#[inline]
@@ -379,7 +391,7 @@ where V: Value<'a>
#[inline]
/// If the Value is a facet, returns the associated facet. Returns None otherwise.
pub fn as_facet(&self) -> Option<&'a Facet> {
pub fn as_facet(&self) -> Option<&'a str> {
self.as_leaf().and_then(|leaf| leaf.as_facet())
}

View File

@@ -1,7 +1,6 @@
use std::ops::BitOr;
use crate::schema::{NumericOptions, TextOptions};
use crate::DateOptions;
use crate::schema::{DateOptions, NumericOptions, TextOptions};
#[derive(Clone)]
pub struct StoredFlag;

View File

@@ -12,8 +12,8 @@ use std::collections::HashMap;
use common::ByteCount;
use serde::{Deserialize, Serialize};
use crate::index::SegmentComponent;
use crate::schema::Field;
use crate::SegmentComponent;
/// Enum containing any of the possible space usage results for segment components.
pub enum ComponentSpaceUsage {
@@ -115,7 +115,7 @@ impl SegmentSpaceUsage {
/// Use the components directly if this is somehow in performance critical code.
pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage {
use self::ComponentSpaceUsage::*;
use crate::SegmentComponent::*;
use crate::index::SegmentComponent::*;
match component {
Postings => PerField(self.postings().clone()),
Positions => PerField(self.positions().clone()),

View File

@@ -18,6 +18,8 @@ use crate::schema::document::{BinaryDocumentDeserializer, DocumentDeserialize};
use crate::space_usage::StoreSpaceUsage;
use crate::store::index::Checkpoint;
use crate::DocId;
#[cfg(feature = "quickwit")]
use crate::Executor;
pub(crate) const DOCSTORE_CACHE_CAPACITY: usize = 100;
@@ -341,7 +343,11 @@ impl StoreReader {
/// In most cases use [`get_async`](Self::get_async)
///
/// Loads and decompresses a block asynchronously.
async fn read_block_async(&self, checkpoint: &Checkpoint) -> io::Result<Block> {
async fn read_block_async(
&self,
checkpoint: &Checkpoint,
executor: &Executor,
) -> io::Result<Block> {
let cache_key = checkpoint.byte_range.start;
if let Some(block) = self.cache.get_from_cache(checkpoint.byte_range.start) {
return Ok(block);
@@ -353,8 +359,12 @@ impl StoreReader {
.read_bytes_async()
.await?;
let decompressed_block =
OwnedBytes::new(self.decompressor.decompress(compressed_block.as_ref())?);
let decompressor = self.decompressor;
let maybe_decompressed_block = executor
.spawn_blocking(move || decompressor.decompress(compressed_block.as_ref()))
.await
.expect("decompression panicked");
let decompressed_block = OwnedBytes::new(maybe_decompressed_block?);
self.cache
.put_into_cache(cache_key, decompressed_block.clone());
@@ -363,15 +373,23 @@ impl StoreReader {
}
/// Reads raw bytes of a given document asynchronously.
pub async fn get_document_bytes_async(&self, doc_id: DocId) -> crate::Result<OwnedBytes> {
pub async fn get_document_bytes_async(
&self,
doc_id: DocId,
executor: &Executor,
) -> crate::Result<OwnedBytes> {
let checkpoint = self.block_checkpoint(doc_id)?;
let block = self.read_block_async(&checkpoint).await?;
let block = self.read_block_async(&checkpoint, executor).await?;
Self::get_document_bytes_from_block(block, doc_id, &checkpoint)
}
/// Fetches a document asynchronously. Async version of [`get`](Self::get).
pub async fn get_async<D: DocumentDeserialize>(&self, doc_id: DocId) -> crate::Result<D> {
let mut doc_bytes = self.get_document_bytes_async(doc_id).await?;
pub async fn get_async<D: DocumentDeserialize>(
&self,
doc_id: DocId,
executor: &Executor,
) -> crate::Result<D> {
let mut doc_bytes = self.get_document_bytes_async(doc_id, executor).await?;
let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;