mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-26 13:10:41 +00:00
Merge branch 'main' into issue_1787_extended_stats
This commit is contained in:
@@ -30,7 +30,6 @@ tempfile = { version = "3.3.0", optional = true }
|
||||
log = "0.4.16"
|
||||
serde = { version = "1.0.136", features = ["derive"] }
|
||||
serde_json = "1.0.79"
|
||||
num_cpus = "1.13.1"
|
||||
fs4 = { version = "0.8.0", optional = true }
|
||||
levenshtein_automata = "0.2.1"
|
||||
uuid = { version = "1.0.0", features = ["v4", "serde"] }
|
||||
@@ -67,6 +66,7 @@ fnv = "1.0.7"
|
||||
winapi = "0.3.9"
|
||||
|
||||
[dev-dependencies]
|
||||
binggan = "0.5.1"
|
||||
rand = "0.8.5"
|
||||
maplit = "1.0.2"
|
||||
matches = "0.1.9"
|
||||
@@ -116,7 +116,7 @@ unstable = [] # useful for benches.
|
||||
|
||||
quickwit = ["sstable", "futures-util"]
|
||||
|
||||
# Compares only the hash of a string when indexing data.
|
||||
# Compares only the hash of a string when indexing data.
|
||||
# Increases indexing speed, but may lead to extremely rare missing terms, when there's a hash collision.
|
||||
# Uses 64bit ahash.
|
||||
compare_hash_only = ["stacker/compare_hash_only"]
|
||||
@@ -143,3 +143,8 @@ harness = false
|
||||
[[bench]]
|
||||
name = "index-bench"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "agg_bench"
|
||||
harness = false
|
||||
|
||||
|
||||
413
benches/agg_bench.rs
Normal file
413
benches/agg_bench.rs
Normal file
@@ -0,0 +1,413 @@
|
||||
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
use rand_distr::Distribution;
|
||||
use serde_json::json;
|
||||
use tantivy::aggregation::agg_req::Aggregations;
|
||||
use tantivy::aggregation::AggregationCollector;
|
||||
use tantivy::query::{AllQuery, TermQuery};
|
||||
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
|
||||
use tantivy::{doc, Index, Term};
|
||||
|
||||
#[global_allocator]
|
||||
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
|
||||
|
||||
/// Mini macro to register a function via its name
|
||||
/// runner.register("average_u64", move |index| average_u64(index));
|
||||
macro_rules! register {
|
||||
($runner:expr, $func:ident) => {
|
||||
$runner.register(stringify!($func), move |index| $func(index))
|
||||
};
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let inputs = vec![
|
||||
("full", get_test_index_bench(Cardinality::Full).unwrap()),
|
||||
(
|
||||
"dense",
|
||||
get_test_index_bench(Cardinality::OptionalDense).unwrap(),
|
||||
),
|
||||
(
|
||||
"sparse",
|
||||
get_test_index_bench(Cardinality::OptionalSparse).unwrap(),
|
||||
),
|
||||
(
|
||||
"multivalue",
|
||||
get_test_index_bench(Cardinality::Multivalued).unwrap(),
|
||||
),
|
||||
];
|
||||
|
||||
bench_agg(InputGroup::new_with_inputs(inputs));
|
||||
}
|
||||
|
||||
fn bench_agg(mut group: InputGroup<Index>) {
|
||||
group.set_alloc(GLOBAL); // Set the peak mem allocator. This will enable peak memory reporting.
|
||||
register!(group, average_u64);
|
||||
register!(group, average_f64);
|
||||
register!(group, average_f64_u64);
|
||||
register!(group, stats_f64);
|
||||
register!(group, percentiles_f64);
|
||||
register!(group, terms_few);
|
||||
register!(group, terms_many);
|
||||
register!(group, terms_many_order_by_term);
|
||||
register!(group, terms_many_with_top_hits);
|
||||
register!(group, terms_many_with_avg_sub_agg);
|
||||
register!(group, terms_many_json_mixed_type_with_sub_agg_card);
|
||||
register!(group, range_agg);
|
||||
register!(group, range_agg_with_avg_sub_agg);
|
||||
register!(group, range_agg_with_term_agg_few);
|
||||
register!(group, range_agg_with_term_agg_many);
|
||||
register!(group, histogram);
|
||||
register!(group, histogram_hard_bounds);
|
||||
register!(group, histogram_with_avg_sub_agg);
|
||||
register!(group, avg_and_range_with_avg_sub_agg);
|
||||
|
||||
group.run();
|
||||
}
|
||||
|
||||
fn exec_term_with_agg(index: &Index, agg_req: serde_json::Value) {
|
||||
let agg_req: Aggregations = serde_json::from_value(agg_req).unwrap();
|
||||
|
||||
let reader = index.reader().unwrap();
|
||||
let text_field = reader.searcher().schema().get_field("text").unwrap();
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, "cool"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
let collector = get_collector(agg_req);
|
||||
let searcher = reader.searcher();
|
||||
black_box(searcher.search(&term_query, &collector).unwrap());
|
||||
}
|
||||
|
||||
fn average_u64(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"average": { "avg": { "field": "score", } }
|
||||
});
|
||||
exec_term_with_agg(index, agg_req)
|
||||
}
|
||||
fn average_f64(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"average": { "avg": { "field": "score_f64", } }
|
||||
});
|
||||
exec_term_with_agg(index, agg_req)
|
||||
}
|
||||
fn average_f64_u64(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"average_f64": { "avg": { "field": "score_f64" } },
|
||||
"average": { "avg": { "field": "score" } },
|
||||
});
|
||||
exec_term_with_agg(index, agg_req)
|
||||
}
|
||||
fn stats_f64(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"average_f64": { "stats": { "field": "score_f64", } }
|
||||
});
|
||||
exec_term_with_agg(index, agg_req)
|
||||
}
|
||||
|
||||
fn percentiles_f64(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"mypercentiles": {
|
||||
"percentiles": {
|
||||
"field": "score_f64",
|
||||
"percents": [ 95, 99, 99.9 ]
|
||||
}
|
||||
}
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
fn terms_few(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": { "terms": { "field": "text_few_terms" } },
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
fn terms_many(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": { "terms": { "field": "text_many_terms" } },
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
fn terms_many_order_by_term(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": { "terms": { "field": "text_many_terms", "order": { "_key": "desc" } } },
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
fn terms_many_with_top_hits(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": {
|
||||
"terms": { "field": "text_many_terms" },
|
||||
"aggs": {
|
||||
"top_hits": { "top_hits":
|
||||
{
|
||||
"sort": [
|
||||
{ "score": "desc" }
|
||||
],
|
||||
"size": 2,
|
||||
"doc_value_fields": ["score_f64"]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
fn terms_many_with_avg_sub_agg(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": {
|
||||
"terms": { "field": "text_many_terms" },
|
||||
"aggs": {
|
||||
"average_f64": { "avg": { "field": "score_f64" } }
|
||||
}
|
||||
},
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
fn terms_many_json_mixed_type_with_sub_agg_card(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": {
|
||||
"terms": { "field": "json.mixed_type" },
|
||||
"aggs": {
|
||||
"average_f64": { "avg": { "field": "score_f64" } }
|
||||
}
|
||||
},
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
fn execute_agg(index: &Index, agg_req: serde_json::Value) {
|
||||
let agg_req: Aggregations = serde_json::from_value(agg_req).unwrap();
|
||||
let collector = get_collector(agg_req);
|
||||
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
black_box(searcher.search(&AllQuery, &collector).unwrap());
|
||||
}
|
||||
fn range_agg(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"range_f64": { "range": { "field": "score_f64", "ranges": [
|
||||
{ "from": 3, "to": 7000 },
|
||||
{ "from": 7000, "to": 20000 },
|
||||
{ "from": 20000, "to": 30000 },
|
||||
{ "from": 30000, "to": 40000 },
|
||||
{ "from": 40000, "to": 50000 },
|
||||
{ "from": 50000, "to": 60000 }
|
||||
] } },
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
fn range_agg_with_avg_sub_agg(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"rangef64": {
|
||||
"range": {
|
||||
"field": "score_f64",
|
||||
"ranges": [
|
||||
{ "from": 3, "to": 7000 },
|
||||
{ "from": 7000, "to": 20000 },
|
||||
{ "from": 20000, "to": 30000 },
|
||||
{ "from": 30000, "to": 40000 },
|
||||
{ "from": 40000, "to": 50000 },
|
||||
{ "from": 50000, "to": 60000 }
|
||||
]
|
||||
},
|
||||
"aggs": {
|
||||
"average_f64": { "avg": { "field": "score_f64" } }
|
||||
}
|
||||
},
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
fn range_agg_with_term_agg_few(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"rangef64": {
|
||||
"range": {
|
||||
"field": "score_f64",
|
||||
"ranges": [
|
||||
{ "from": 3, "to": 7000 },
|
||||
{ "from": 7000, "to": 20000 },
|
||||
{ "from": 20000, "to": 30000 },
|
||||
{ "from": 30000, "to": 40000 },
|
||||
{ "from": 40000, "to": 50000 },
|
||||
{ "from": 50000, "to": 60000 }
|
||||
]
|
||||
},
|
||||
"aggs": {
|
||||
"my_texts": { "terms": { "field": "text_few_terms" } },
|
||||
}
|
||||
},
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
fn range_agg_with_term_agg_many(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"rangef64": {
|
||||
"range": {
|
||||
"field": "score_f64",
|
||||
"ranges": [
|
||||
{ "from": 3, "to": 7000 },
|
||||
{ "from": 7000, "to": 20000 },
|
||||
{ "from": 20000, "to": 30000 },
|
||||
{ "from": 30000, "to": 40000 },
|
||||
{ "from": 40000, "to": 50000 },
|
||||
{ "from": 50000, "to": 60000 }
|
||||
]
|
||||
},
|
||||
"aggs": {
|
||||
"my_texts": { "terms": { "field": "text_many_terms" } },
|
||||
}
|
||||
},
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
fn histogram(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"rangef64": {
|
||||
"histogram": {
|
||||
"field": "score_f64",
|
||||
"interval": 100 // 1000 buckets
|
||||
},
|
||||
}
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
fn histogram_hard_bounds(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"rangef64": { "histogram": { "field": "score_f64", "interval": 100, "hard_bounds": { "min": 1000, "max": 300000 } } },
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
fn histogram_with_avg_sub_agg(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"rangef64": {
|
||||
"histogram": { "field": "score_f64", "interval": 100 },
|
||||
"aggs": {
|
||||
"average_f64": { "avg": { "field": "score_f64" } }
|
||||
}
|
||||
}
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
fn avg_and_range_with_avg_sub_agg(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"rangef64": {
|
||||
"range": {
|
||||
"field": "score_f64",
|
||||
"ranges": [
|
||||
{ "from": 3, "to": 7000 },
|
||||
{ "from": 7000, "to": 20000 },
|
||||
{ "from": 20000, "to": 60000 }
|
||||
]
|
||||
},
|
||||
"aggs": {
|
||||
"average_in_range": { "avg": { "field": "score" } }
|
||||
}
|
||||
},
|
||||
"average": { "avg": { "field": "score" } }
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Hash, Default, Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||
enum Cardinality {
|
||||
/// All documents contain exactly one value.
|
||||
/// `Full` is the default for auto-detecting the Cardinality, since it is the most strict.
|
||||
#[default]
|
||||
Full = 0,
|
||||
/// All documents contain at most one value.
|
||||
OptionalDense = 1,
|
||||
/// All documents may contain any number of values.
|
||||
Multivalued = 2,
|
||||
/// 1 / 20 documents has a value
|
||||
OptionalSparse = 3,
|
||||
}
|
||||
|
||||
fn get_collector(agg_req: Aggregations) -> AggregationCollector {
|
||||
AggregationCollector::from_aggs(agg_req, Default::default())
|
||||
}
|
||||
|
||||
fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_fieldtype = tantivy::schema::TextOptions::default()
|
||||
.set_indexing_options(
|
||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
|
||||
)
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let json_field = schema_builder.add_json_field("json", FAST);
|
||||
let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST);
|
||||
let text_field_few_terms = schema_builder.add_text_field("text_few_terms", STRING | FAST);
|
||||
let score_fieldtype = tantivy::schema::NumericOptions::default().set_fast();
|
||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
|
||||
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
|
||||
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
|
||||
let index = Index::create_from_tempdir(schema_builder.build())?;
|
||||
let few_terms_data = ["INFO", "ERROR", "WARN", "DEBUG"];
|
||||
|
||||
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
|
||||
|
||||
let many_terms_data = (0..150_000)
|
||||
.map(|num| format!("author{}", num))
|
||||
.collect::<Vec<_>>();
|
||||
{
|
||||
let mut rng = StdRng::from_seed([1u8; 32]);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 200_000_000)?;
|
||||
// To make the different test cases comparable we just change one doc to force the
|
||||
// cardinality
|
||||
if cardinality == Cardinality::OptionalDense {
|
||||
index_writer.add_document(doc!())?;
|
||||
}
|
||||
if cardinality == Cardinality::Multivalued {
|
||||
index_writer.add_document(doc!(
|
||||
json_field => json!({"mixed_type": 10.0}),
|
||||
json_field => json!({"mixed_type": 10.0}),
|
||||
text_field => "cool",
|
||||
text_field => "cool",
|
||||
text_field_many_terms => "cool",
|
||||
text_field_many_terms => "cool",
|
||||
text_field_few_terms => "cool",
|
||||
text_field_few_terms => "cool",
|
||||
score_field => 1u64,
|
||||
score_field => 1u64,
|
||||
score_field_f64 => lg_norm.sample(&mut rng),
|
||||
score_field_f64 => lg_norm.sample(&mut rng),
|
||||
score_field_i64 => 1i64,
|
||||
score_field_i64 => 1i64,
|
||||
))?;
|
||||
}
|
||||
let mut doc_with_value = 1_000_000;
|
||||
if cardinality == Cardinality::OptionalSparse {
|
||||
doc_with_value /= 20;
|
||||
}
|
||||
let _val_max = 1_000_000.0;
|
||||
for _ in 0..doc_with_value {
|
||||
let val: f64 = rng.gen_range(0.0..1_000_000.0);
|
||||
let json = if rng.gen_bool(0.1) {
|
||||
// 10% are numeric values
|
||||
json!({ "mixed_type": val })
|
||||
} else {
|
||||
json!({"mixed_type": many_terms_data.choose(&mut rng).unwrap().to_string()})
|
||||
};
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
json_field => json,
|
||||
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
|
||||
text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
|
||||
score_field => val as u64,
|
||||
score_field_f64 => lg_norm.sample(&mut rng),
|
||||
score_field_i64 => val as i64,
|
||||
))?;
|
||||
if cardinality == Cardinality::OptionalSparse {
|
||||
for _ in 0..20 {
|
||||
index_writer.add_document(doc!(text_field => "cool"))?;
|
||||
}
|
||||
}
|
||||
}
|
||||
// writing the segment
|
||||
index_writer.commit()?;
|
||||
}
|
||||
|
||||
Ok(index)
|
||||
}
|
||||
@@ -11,9 +11,10 @@ use columnar::Column;
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
use tantivy::collector::{Collector, SegmentCollector};
|
||||
use tantivy::index::SegmentReader;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
|
||||
use tantivy::{doc, Index, IndexWriter, Score, SegmentReader};
|
||||
use tantivy::{doc, Index, IndexWriter, Score};
|
||||
|
||||
#[derive(Default)]
|
||||
struct Stats {
|
||||
|
||||
@@ -13,7 +13,7 @@ fn main() -> tantivy::Result<()> {
|
||||
let opts = DateOptions::from(INDEXED)
|
||||
.set_stored()
|
||||
.set_fast()
|
||||
.set_precision(tantivy::DateTimePrecision::Seconds);
|
||||
.set_precision(tantivy::schema::DateTimePrecision::Seconds);
|
||||
// Add `occurred_at` date field type
|
||||
let occurred_at = schema_builder.add_date_field("occurred_at", opts);
|
||||
let event_type = schema_builder.add_text_field("event", STRING | STORED);
|
||||
|
||||
@@ -7,10 +7,11 @@
|
||||
// the list of documents containing a term, getting
|
||||
// its term frequency, and accessing its positions.
|
||||
|
||||
use tantivy::postings::Postings;
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
use tantivy::schema::*;
|
||||
use tantivy::{doc, DocSet, Index, IndexWriter, Postings, TERMINATED};
|
||||
use tantivy::{doc, DocSet, Index, IndexWriter, TERMINATED};
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// We first create a schema for the sake of the
|
||||
|
||||
@@ -3,10 +3,11 @@ use std::collections::{HashMap, HashSet};
|
||||
use std::sync::{Arc, RwLock, Weak};
|
||||
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::index::SegmentId;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::{Schema, FAST, TEXT};
|
||||
use tantivy::{
|
||||
doc, DocAddress, DocId, Index, IndexWriter, Opstamp, Searcher, SearcherGeneration, SegmentId,
|
||||
doc, DocAddress, DocId, Index, IndexWriter, Opstamp, Searcher, SearcherGeneration,
|
||||
SegmentReader, Warmer,
|
||||
};
|
||||
|
||||
|
||||
@@ -1,610 +0,0 @@
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
use rand_distr::Distribution;
|
||||
use serde_json::json;
|
||||
use test::{self, Bencher};
|
||||
|
||||
use crate::aggregation::agg_req::Aggregations;
|
||||
use crate::aggregation::AggregationCollector;
|
||||
use crate::query::{AllQuery, TermQuery};
|
||||
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
|
||||
use crate::{Index, Term};
|
||||
|
||||
#[derive(Clone, Copy, Hash, Default, Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||
enum Cardinality {
|
||||
/// All documents contain exactly one value.
|
||||
/// `Full` is the default for auto-detecting the Cardinality, since it is the most strict.
|
||||
#[default]
|
||||
Full = 0,
|
||||
/// All documents contain at most one value.
|
||||
Optional = 1,
|
||||
/// All documents may contain any number of values.
|
||||
Multivalued = 2,
|
||||
/// 1 / 20 documents has a value
|
||||
Sparse = 3,
|
||||
}
|
||||
|
||||
fn get_collector(agg_req: Aggregations) -> AggregationCollector {
|
||||
AggregationCollector::from_aggs(agg_req, Default::default())
|
||||
}
|
||||
|
||||
fn get_test_index_bench(cardinality: Cardinality) -> crate::Result<Index> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_fieldtype = crate::schema::TextOptions::default()
|
||||
.set_indexing_options(
|
||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
|
||||
)
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let json_field = schema_builder.add_json_field("json", FAST);
|
||||
let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST);
|
||||
let text_field_few_terms = schema_builder.add_text_field("text_few_terms", STRING | FAST);
|
||||
let score_fieldtype = crate::schema::NumericOptions::default().set_fast();
|
||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
|
||||
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
|
||||
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
|
||||
let index = Index::create_from_tempdir(schema_builder.build())?;
|
||||
let few_terms_data = ["INFO", "ERROR", "WARN", "DEBUG"];
|
||||
|
||||
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
|
||||
|
||||
let many_terms_data = (0..150_000)
|
||||
.map(|num| format!("author{}", num))
|
||||
.collect::<Vec<_>>();
|
||||
{
|
||||
let mut rng = StdRng::from_seed([1u8; 32]);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 200_000_000)?;
|
||||
// To make the different test cases comparable we just change one doc to force the
|
||||
// cardinality
|
||||
if cardinality == Cardinality::Optional {
|
||||
index_writer.add_document(doc!())?;
|
||||
}
|
||||
if cardinality == Cardinality::Multivalued {
|
||||
index_writer.add_document(doc!(
|
||||
json_field => json!({"mixed_type": 10.0}),
|
||||
json_field => json!({"mixed_type": 10.0}),
|
||||
text_field => "cool",
|
||||
text_field => "cool",
|
||||
text_field_many_terms => "cool",
|
||||
text_field_many_terms => "cool",
|
||||
text_field_few_terms => "cool",
|
||||
text_field_few_terms => "cool",
|
||||
score_field => 1u64,
|
||||
score_field => 1u64,
|
||||
score_field_f64 => lg_norm.sample(&mut rng),
|
||||
score_field_f64 => lg_norm.sample(&mut rng),
|
||||
score_field_i64 => 1i64,
|
||||
score_field_i64 => 1i64,
|
||||
))?;
|
||||
}
|
||||
let mut doc_with_value = 1_000_000;
|
||||
if cardinality == Cardinality::Sparse {
|
||||
doc_with_value /= 20;
|
||||
}
|
||||
let _val_max = 1_000_000.0;
|
||||
for _ in 0..doc_with_value {
|
||||
let val: f64 = rng.gen_range(0.0..1_000_000.0);
|
||||
let json = if rng.gen_bool(0.1) {
|
||||
// 10% are numeric values
|
||||
json!({ "mixed_type": val })
|
||||
} else {
|
||||
json!({"mixed_type": many_terms_data.choose(&mut rng).unwrap().to_string()})
|
||||
};
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
json_field => json,
|
||||
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
|
||||
text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
|
||||
score_field => val as u64,
|
||||
score_field_f64 => lg_norm.sample(&mut rng),
|
||||
score_field_i64 => val as i64,
|
||||
))?;
|
||||
if cardinality == Cardinality::Sparse {
|
||||
for _ in 0..20 {
|
||||
index_writer.add_document(doc!(text_field => "cool"))?;
|
||||
}
|
||||
}
|
||||
}
|
||||
// writing the segment
|
||||
index_writer.commit()?;
|
||||
}
|
||||
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
use paste::paste;
|
||||
#[macro_export]
|
||||
macro_rules! bench_all_cardinalities {
|
||||
( $x:ident ) => {
|
||||
paste! {
|
||||
#[bench]
|
||||
fn $x(b: &mut Bencher) {
|
||||
[<$x _card>](b, Cardinality::Full)
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn [<$x _opt>](b: &mut Bencher) {
|
||||
[<$x _card>](b, Cardinality::Optional)
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn [<$x _multi>](b: &mut Bencher) {
|
||||
[<$x _card>](b, Cardinality::Multivalued)
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn [<$x _sparse>](b: &mut Bencher) {
|
||||
[<$x _card>](b, Cardinality::Sparse)
|
||||
}
|
||||
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
bench_all_cardinalities!(bench_aggregation_average_u64);
|
||||
|
||||
fn bench_aggregation_average_u64_card(b: &mut Bencher, cardinality: Cardinality) {
|
||||
let index = get_test_index_bench(cardinality).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let text_field = reader.searcher().schema().get_field("text").unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, "cool"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
let agg_req_1: Aggregations = serde_json::from_value(json!({
|
||||
"average": { "avg": { "field": "score", } }
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let collector = get_collector(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&term_query, &collector).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
bench_all_cardinalities!(bench_aggregation_stats_f64);
|
||||
|
||||
fn bench_aggregation_stats_f64_card(b: &mut Bencher, cardinality: Cardinality) {
|
||||
let index = get_test_index_bench(cardinality).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let text_field = reader.searcher().schema().get_field("text").unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, "cool"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
let agg_req_1: Aggregations = serde_json::from_value(json!({
|
||||
"average_f64": { "stats": { "field": "score_f64", } }
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let collector = get_collector(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&term_query, &collector).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
bench_all_cardinalities!(bench_aggregation_extendedstats_f64);
|
||||
|
||||
fn bench_aggregation_extendedstats_f64_card(b: &mut Bencher, cardinality: Cardinality) {
|
||||
let index = get_test_index_bench(cardinality).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let text_field = reader.searcher().schema().get_field("text").unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, "cool"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
let agg_req_1: Aggregations = serde_json::from_value(json!({
|
||||
"extstat_f64": { "extended_stats": { "field": "score_f64", } }
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let collector = get_collector(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&term_query, &collector).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
bench_all_cardinalities!(bench_aggregation_average_f64);
|
||||
|
||||
fn bench_aggregation_average_f64_card(b: &mut Bencher, cardinality: Cardinality) {
|
||||
let index = get_test_index_bench(cardinality).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let text_field = reader.searcher().schema().get_field("text").unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, "cool"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
let agg_req_1: Aggregations = serde_json::from_value(json!({
|
||||
"average_f64": { "avg": { "field": "score_f64", } }
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let collector = get_collector(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&term_query, &collector).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
bench_all_cardinalities!(bench_aggregation_percentiles_f64);
|
||||
|
||||
fn bench_aggregation_percentiles_f64_card(b: &mut Bencher, cardinality: Cardinality) {
|
||||
let index = get_test_index_bench(cardinality).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let agg_req_str = r#"
|
||||
{
|
||||
"mypercentiles": {
|
||||
"percentiles": {
|
||||
"field": "score_f64",
|
||||
"percents": [ 95, 99, 99.9 ]
|
||||
}
|
||||
}
|
||||
} "#;
|
||||
let agg_req_1: Aggregations = serde_json::from_str(agg_req_str).unwrap();
|
||||
|
||||
let collector = get_collector(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&AllQuery, &collector).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
bench_all_cardinalities!(bench_aggregation_average_u64_and_f64);
|
||||
|
||||
fn bench_aggregation_average_u64_and_f64_card(b: &mut Bencher, cardinality: Cardinality) {
|
||||
let index = get_test_index_bench(cardinality).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let text_field = reader.searcher().schema().get_field("text").unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, "cool"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
let agg_req_1: Aggregations = serde_json::from_value(json!({
|
||||
"average_f64": { "avg": { "field": "score_f64" } },
|
||||
"average": { "avg": { "field": "score" } },
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let collector = get_collector(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&term_query, &collector).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
bench_all_cardinalities!(bench_aggregation_terms_few);
|
||||
|
||||
fn bench_aggregation_terms_few_card(b: &mut Bencher, cardinality: Cardinality) {
|
||||
let index = get_test_index_bench(cardinality).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_texts": { "terms": { "field": "text_few_terms" } },
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let collector = get_collector(agg_req);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&AllQuery, &collector).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
bench_all_cardinalities!(bench_aggregation_terms_many_with_top_hits_agg);
|
||||
|
||||
fn bench_aggregation_terms_many_with_top_hits_agg_card(
|
||||
b: &mut Bencher,
|
||||
cardinality: Cardinality,
|
||||
) {
|
||||
let index = get_test_index_bench(cardinality).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_texts": {
|
||||
"terms": { "field": "text_many_terms" },
|
||||
"aggs": {
|
||||
"top_hits": { "top_hits":
|
||||
{
|
||||
"sort": [
|
||||
{ "score": "desc" }
|
||||
],
|
||||
"size": 2,
|
||||
"doc_value_fields": ["score_f64"]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let collector = get_collector(agg_req);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&AllQuery, &collector).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
bench_all_cardinalities!(bench_aggregation_terms_many_with_sub_agg);
|
||||
|
||||
fn bench_aggregation_terms_many_with_sub_agg_card(b: &mut Bencher, cardinality: Cardinality) {
|
||||
let index = get_test_index_bench(cardinality).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_texts": {
|
||||
"terms": { "field": "text_many_terms" },
|
||||
"aggs": {
|
||||
"average_f64": { "avg": { "field": "score_f64" } }
|
||||
}
|
||||
},
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let collector = get_collector(agg_req);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&AllQuery, &collector).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
bench_all_cardinalities!(bench_aggregation_terms_many_json_mixed_type_with_sub_agg);
|
||||
|
||||
fn bench_aggregation_terms_many_json_mixed_type_with_sub_agg_card(
|
||||
b: &mut Bencher,
|
||||
cardinality: Cardinality,
|
||||
) {
|
||||
let index = get_test_index_bench(cardinality).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_texts": {
|
||||
"terms": { "field": "json.mixed_type" },
|
||||
"aggs": {
|
||||
"average_f64": { "avg": { "field": "score_f64" } }
|
||||
}
|
||||
},
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let collector = get_collector(agg_req);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&AllQuery, &collector).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
bench_all_cardinalities!(bench_aggregation_terms_many2);
|
||||
|
||||
fn bench_aggregation_terms_many2_card(b: &mut Bencher, cardinality: Cardinality) {
|
||||
let index = get_test_index_bench(cardinality).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_texts": { "terms": { "field": "text_many_terms" } },
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let collector = get_collector(agg_req);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&AllQuery, &collector).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
bench_all_cardinalities!(bench_aggregation_terms_many_order_by_term);
|
||||
|
||||
fn bench_aggregation_terms_many_order_by_term_card(b: &mut Bencher, cardinality: Cardinality) {
|
||||
let index = get_test_index_bench(cardinality).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_texts": { "terms": { "field": "text_many_terms", "order": { "_key": "desc" } } },
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let collector = get_collector(agg_req);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&AllQuery, &collector).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
bench_all_cardinalities!(bench_aggregation_range_only);
|
||||
|
||||
fn bench_aggregation_range_only_card(b: &mut Bencher, cardinality: Cardinality) {
|
||||
let index = get_test_index_bench(cardinality).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let agg_req_1: Aggregations = serde_json::from_value(json!({
|
||||
"range_f64": { "range": { "field": "score_f64", "ranges": [
|
||||
{ "from": 3, "to": 7000 },
|
||||
{ "from": 7000, "to": 20000 },
|
||||
{ "from": 20000, "to": 30000 },
|
||||
{ "from": 30000, "to": 40000 },
|
||||
{ "from": 40000, "to": 50000 },
|
||||
{ "from": 50000, "to": 60000 }
|
||||
] } },
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let collector = get_collector(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&AllQuery, &collector).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
bench_all_cardinalities!(bench_aggregation_range_with_avg);
|
||||
|
||||
fn bench_aggregation_range_with_avg_card(b: &mut Bencher, cardinality: Cardinality) {
|
||||
let index = get_test_index_bench(cardinality).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let agg_req_1: Aggregations = serde_json::from_value(json!({
|
||||
"rangef64": {
|
||||
"range": {
|
||||
"field": "score_f64",
|
||||
"ranges": [
|
||||
{ "from": 3, "to": 7000 },
|
||||
{ "from": 7000, "to": 20000 },
|
||||
{ "from": 20000, "to": 30000 },
|
||||
{ "from": 30000, "to": 40000 },
|
||||
{ "from": 40000, "to": 50000 },
|
||||
{ "from": 50000, "to": 60000 }
|
||||
]
|
||||
},
|
||||
"aggs": {
|
||||
"average_f64": { "avg": { "field": "score_f64" } }
|
||||
}
|
||||
},
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let collector = get_collector(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&AllQuery, &collector).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
// hard bounds has a different algorithm, because it actually limits collection range
|
||||
//
|
||||
bench_all_cardinalities!(bench_aggregation_histogram_only_hard_bounds);
|
||||
|
||||
fn bench_aggregation_histogram_only_hard_bounds_card(
|
||||
b: &mut Bencher,
|
||||
cardinality: Cardinality,
|
||||
) {
|
||||
let index = get_test_index_bench(cardinality).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let agg_req_1: Aggregations = serde_json::from_value(json!({
|
||||
"rangef64": { "histogram": { "field": "score_f64", "interval": 100, "hard_bounds": { "min": 1000, "max": 300000 } } },
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let collector = get_collector(agg_req_1);
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&AllQuery, &collector).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
bench_all_cardinalities!(bench_aggregation_histogram_with_avg);
|
||||
|
||||
fn bench_aggregation_histogram_with_avg_card(b: &mut Bencher, cardinality: Cardinality) {
|
||||
let index = get_test_index_bench(cardinality).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let agg_req_1: Aggregations = serde_json::from_value(json!({
|
||||
"rangef64": {
|
||||
"histogram": { "field": "score_f64", "interval": 100 },
|
||||
"aggs": {
|
||||
"average_f64": { "avg": { "field": "score_f64" } }
|
||||
}
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let collector = get_collector(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&AllQuery, &collector).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
bench_all_cardinalities!(bench_aggregation_histogram_only);
|
||||
|
||||
fn bench_aggregation_histogram_only_card(b: &mut Bencher, cardinality: Cardinality) {
|
||||
let index = get_test_index_bench(cardinality).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let agg_req_1: Aggregations = serde_json::from_value(json!({
|
||||
"rangef64": {
|
||||
"histogram": {
|
||||
"field": "score_f64",
|
||||
"interval": 100 // 1000 buckets
|
||||
},
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let collector = get_collector(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&AllQuery, &collector).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
bench_all_cardinalities!(bench_aggregation_avg_and_range_with_avg);
|
||||
|
||||
fn bench_aggregation_avg_and_range_with_avg_card(b: &mut Bencher, cardinality: Cardinality) {
|
||||
let index = get_test_index_bench(cardinality).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let text_field = reader.searcher().schema().get_field("text").unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, "cool"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
let agg_req_1: Aggregations = serde_json::from_value(json!({
|
||||
"rangef64": {
|
||||
"range": {
|
||||
"field": "score_f64",
|
||||
"ranges": [
|
||||
{ "from": 3, "to": 7000 },
|
||||
{ "from": 7000, "to": 20000 },
|
||||
{ "from": 20000, "to": 60000 }
|
||||
]
|
||||
},
|
||||
"aggs": {
|
||||
"average_in_range": { "avg": { "field": "score" } }
|
||||
}
|
||||
},
|
||||
"average": { "avg": { "field": "score" } }
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let collector = get_collector(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&term_query, &collector).unwrap()
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,7 +17,8 @@ use super::metric::{
|
||||
use super::segment_agg_result::AggregationLimits;
|
||||
use super::VecWithNames;
|
||||
use crate::aggregation::{f64_to_fastfield_u64, Key};
|
||||
use crate::{SegmentOrdinal, SegmentReader};
|
||||
use crate::index::SegmentReader;
|
||||
use crate::SegmentOrdinal;
|
||||
|
||||
#[derive(Default)]
|
||||
pub(crate) struct AggregationsWithAccessor {
|
||||
|
||||
@@ -8,7 +8,8 @@ use super::segment_agg_result::{
|
||||
};
|
||||
use crate::aggregation::agg_req_with_accessor::get_aggs_with_segment_accessor_and_validate;
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::{DocId, SegmentOrdinal, SegmentReader, TantivyError};
|
||||
use crate::index::SegmentReader;
|
||||
use crate::{DocId, SegmentOrdinal, TantivyError};
|
||||
|
||||
/// The default max bucket count, before the aggregation fails.
|
||||
pub const DEFAULT_BUCKET_LIMIT: u32 = 65000;
|
||||
|
||||
@@ -143,8 +143,6 @@ use std::fmt::Display;
|
||||
#[cfg(test)]
|
||||
mod agg_tests;
|
||||
|
||||
mod agg_bench;
|
||||
|
||||
use core::fmt;
|
||||
|
||||
pub use agg_limits::AggregationLimits;
|
||||
|
||||
@@ -4,7 +4,8 @@ use std::marker::PhantomData;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::top_score_collector::TopNComputer;
|
||||
use crate::{DocAddress, DocId, SegmentOrdinal, SegmentReader};
|
||||
use crate::index::SegmentReader;
|
||||
use crate::{DocAddress, DocId, SegmentOrdinal};
|
||||
|
||||
/// Contains a feature (field, score, etc.) of a document along with the document address.
|
||||
///
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
#[cfg(feature = "quickwit")]
|
||||
use futures_util::{future::Either, FutureExt};
|
||||
use rayon::{ThreadPool, ThreadPoolBuilder};
|
||||
|
||||
use crate::TantivyError;
|
||||
@@ -91,11 +93,84 @@ impl Executor {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Spawn a task on the pool, returning a future completing on task success.
|
||||
///
|
||||
/// If the task panic, returns `Err(())`.
|
||||
#[cfg(feature = "quickwit")]
|
||||
pub fn spawn_blocking<T: Send + 'static>(
|
||||
&self,
|
||||
cpu_intensive_task: impl FnOnce() -> T + Send + 'static,
|
||||
) -> impl std::future::Future<Output = Result<T, ()>> {
|
||||
match self {
|
||||
Executor::SingleThread => Either::Left(std::future::ready(Ok(cpu_intensive_task()))),
|
||||
Executor::ThreadPool(pool) => {
|
||||
let (sender, receiver) = oneshot_with_sentinel::channel();
|
||||
pool.spawn(|| {
|
||||
if sender.is_closed() {
|
||||
return;
|
||||
}
|
||||
let task_result = cpu_intensive_task();
|
||||
let _ = sender.send(task_result);
|
||||
});
|
||||
|
||||
let res = receiver.map(|res| res.map_err(|_| ()));
|
||||
Either::Right(res)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
mod oneshot_with_sentinel {
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use std::task::{Context, Poll};
|
||||
// TODO get ride of this if oneshot ever gains a is_closed()
|
||||
|
||||
pub struct SenderWithSentinel<T> {
|
||||
tx: oneshot::Sender<T>,
|
||||
guard: Arc<()>,
|
||||
}
|
||||
|
||||
pub struct ReceiverWithSentinel<T> {
|
||||
rx: oneshot::Receiver<T>,
|
||||
_guard: Arc<()>,
|
||||
}
|
||||
|
||||
pub fn channel<T>() -> (SenderWithSentinel<T>, ReceiverWithSentinel<T>) {
|
||||
let (tx, rx) = oneshot::channel();
|
||||
let guard = Arc::new(());
|
||||
(
|
||||
SenderWithSentinel {
|
||||
tx,
|
||||
guard: guard.clone(),
|
||||
},
|
||||
ReceiverWithSentinel { rx, _guard: guard },
|
||||
)
|
||||
}
|
||||
|
||||
impl<T> SenderWithSentinel<T> {
|
||||
pub fn send(self, message: T) -> Result<(), oneshot::SendError<T>> {
|
||||
self.tx.send(message)
|
||||
}
|
||||
|
||||
pub fn is_closed(&self) -> bool {
|
||||
Arc::strong_count(&self.guard) == 1
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> std::future::Future for ReceiverWithSentinel<T> {
|
||||
type Output = Result<T, oneshot::RecvError>;
|
||||
|
||||
fn poll(mut self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll<Self::Output> {
|
||||
Pin::new(&mut self.rx).poll(ctx)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::Executor;
|
||||
|
||||
#[test]
|
||||
@@ -147,4 +222,34 @@ mod tests {
|
||||
assert_eq!(result[i], i * 2);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
#[test]
|
||||
fn test_cancel_cpu_intensive_tasks() {
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
let counter: Arc<AtomicU64> = Default::default();
|
||||
let mut futures = Vec::new();
|
||||
let executor = Executor::multi_thread(3, "search-test").unwrap();
|
||||
for _ in 0..1_000 {
|
||||
let counter_clone = counter.clone();
|
||||
let fut = executor.spawn_blocking(move || {
|
||||
std::thread::sleep(Duration::from_millis(4));
|
||||
counter_clone.fetch_add(1, Ordering::SeqCst)
|
||||
});
|
||||
futures.push(fut);
|
||||
}
|
||||
std::thread::sleep(Duration::from_millis(5));
|
||||
// The first few num_cores tasks should run, but the other should get cancelled.
|
||||
drop(futures);
|
||||
while Arc::strong_count(&counter) > 1 {
|
||||
std::thread::sleep(Duration::from_millis(10));
|
||||
}
|
||||
// with ideal timing, we expect the result to always be 6, but as long as we run some, and
|
||||
// cancelled most, the test is a success
|
||||
assert!(counter.load(Ordering::SeqCst) > 0);
|
||||
assert!(counter.load(Ordering::SeqCst) < 50);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,13 +4,13 @@ use std::{fmt, io};
|
||||
|
||||
use crate::collector::Collector;
|
||||
use crate::core::Executor;
|
||||
use crate::index::SegmentReader;
|
||||
use crate::index::{SegmentId, SegmentReader};
|
||||
use crate::query::{Bm25StatisticsProvider, EnableScoring, Query};
|
||||
use crate::schema::document::DocumentDeserialize;
|
||||
use crate::schema::{Schema, Term};
|
||||
use crate::space_usage::SearcherSpaceUsage;
|
||||
use crate::store::{CacheStats, StoreReader};
|
||||
use crate::{DocAddress, Index, Opstamp, SegmentId, TrackedObject};
|
||||
use crate::{DocAddress, Index, Opstamp, TrackedObject};
|
||||
|
||||
/// Identifies the searcher generation accessed by a [`Searcher`].
|
||||
///
|
||||
@@ -109,8 +109,9 @@ impl Searcher {
|
||||
&self,
|
||||
doc_address: DocAddress,
|
||||
) -> crate::Result<D> {
|
||||
let executor = self.inner.index.search_executor();
|
||||
let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize];
|
||||
store_reader.get_async(doc_address.doc_id).await
|
||||
store_reader.get_async(doc_address.doc_id, executor).await
|
||||
}
|
||||
|
||||
/// Access the schema associated with the index of this searcher.
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
use crate::collector::Count;
|
||||
use crate::directory::{RamDirectory, WatchCallback};
|
||||
use crate::index::SegmentId;
|
||||
use crate::indexer::{LogMergePolicy, NoMergePolicy};
|
||||
use crate::postings::Postings;
|
||||
use crate::query::TermQuery;
|
||||
use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, STRING, TEXT};
|
||||
use crate::tokenizer::TokenizerManager;
|
||||
use crate::{
|
||||
Directory, DocSet, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, Postings,
|
||||
ReloadPolicy, SegmentId, TantivyDocument, Term,
|
||||
Directory, DocSet, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, ReloadPolicy,
|
||||
TantivyDocument, Term,
|
||||
};
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -146,8 +146,11 @@ mod tests {
|
||||
facet_ords.extend(facet_reader.facet_ords(0u32));
|
||||
assert_eq!(&facet_ords, &[0u64]);
|
||||
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))?;
|
||||
let value: Option<&Facet> = doc.get_first(facet_field).and_then(|v| v.as_facet());
|
||||
assert_eq!(value, Facet::from_text("/a/b").ok().as_ref());
|
||||
let value: Option<Facet> = doc
|
||||
.get_first(facet_field)
|
||||
.and_then(|v| v.as_facet())
|
||||
.map(|facet| Facet::from_encoded_string(facet.to_string()));
|
||||
assert_eq!(value, Facet::from_text("/a/b").ok());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -80,7 +80,7 @@ mod tests {
|
||||
use std::path::Path;
|
||||
|
||||
use columnar::StrColumn;
|
||||
use common::{ByteCount, HasLen, TerminatingWrite};
|
||||
use common::{ByteCount, DateTimePrecision, HasLen, TerminatingWrite};
|
||||
use once_cell::sync::Lazy;
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::rngs::StdRng;
|
||||
@@ -88,14 +88,15 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::directory::{Directory, RamDirectory, WritePtr};
|
||||
use crate::index::SegmentId;
|
||||
use crate::merge_policy::NoMergePolicy;
|
||||
use crate::schema::{
|
||||
Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder, TantivyDocument,
|
||||
TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
|
||||
DateOptions, Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder,
|
||||
TantivyDocument, TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
|
||||
};
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager};
|
||||
use crate::{DateOptions, DateTimePrecision, Index, IndexWriter, SegmentId, SegmentReader};
|
||||
use crate::{Index, IndexWriter, SegmentReader};
|
||||
|
||||
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
use std::io;
|
||||
|
||||
use columnar::{ColumnarWriter, NumericalValue};
|
||||
use common::JsonPathWriter;
|
||||
use common::{DateTimePrecision, JsonPathWriter};
|
||||
use tokenizer_api::Token;
|
||||
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::schema::document::{Document, ReferenceValue, ReferenceValueLeaf, Value};
|
||||
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema, Type};
|
||||
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
|
||||
use crate::{DateTimePrecision, DocId, TantivyError};
|
||||
use crate::{DocId, TantivyError};
|
||||
|
||||
/// Only index JSON down to a depth of 20.
|
||||
/// This is mostly to guard us from a stack overflow triggered by malicious input.
|
||||
@@ -183,8 +183,7 @@ impl FastFieldsWriter {
|
||||
.record_datetime(doc_id, field_name, truncated_datetime);
|
||||
}
|
||||
ReferenceValueLeaf::Facet(val) => {
|
||||
self.columnar_writer
|
||||
.record_str(doc_id, field_name, val.encoded_str());
|
||||
self.columnar_writer.record_str(doc_id, field_name, val);
|
||||
}
|
||||
ReferenceValueLeaf::Bytes(val) => {
|
||||
self.columnar_writer.record_bytes(doc_id, field_name, val);
|
||||
|
||||
@@ -4,6 +4,7 @@ use std::fmt;
|
||||
use std::path::Path;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::thread::available_parallelism;
|
||||
|
||||
use super::segment::Segment;
|
||||
use super::segment_reader::merge_field_meta_data;
|
||||
@@ -340,7 +341,7 @@ impl Index {
|
||||
/// Replace the default single thread search executor pool
|
||||
/// by a thread pool with as many threads as there are CPUs on the system.
|
||||
pub fn set_default_multithread_executor(&mut self) -> crate::Result<()> {
|
||||
let default_num_threads = num_cpus::get();
|
||||
let default_num_threads = available_parallelism()?.get();
|
||||
self.set_multithread_executor(default_num_threads)
|
||||
}
|
||||
|
||||
@@ -621,7 +622,7 @@ impl Index {
|
||||
&self,
|
||||
memory_budget_in_bytes: usize,
|
||||
) -> crate::Result<IndexWriter<D>> {
|
||||
let mut num_threads = std::cmp::min(num_cpus::get(), MAX_NUM_THREAD);
|
||||
let mut num_threads = std::cmp::min(available_parallelism()?.get(), MAX_NUM_THREAD);
|
||||
let memory_budget_num_bytes_per_thread = memory_budget_in_bytes / num_threads;
|
||||
if memory_budget_num_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN {
|
||||
num_threads = (memory_budget_in_bytes / MEMORY_BUDGET_NUM_BYTES_MIN).max(1);
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
//! # Index Module
|
||||
//!
|
||||
//! The `index` module in Tantivy contains core components to read and write indexes.
|
||||
//!
|
||||
//! It contains `Index` and `Segment`, where a `Index` consists of one or more `Segment`s.
|
||||
|
||||
@@ -246,8 +246,9 @@ impl DeleteCursor {
|
||||
mod tests {
|
||||
|
||||
use super::{DeleteOperation, DeleteQueue};
|
||||
use crate::index::SegmentReader;
|
||||
use crate::query::{Explanation, Scorer, Weight};
|
||||
use crate::{DocId, Score, SegmentReader};
|
||||
use crate::{DocId, Score};
|
||||
|
||||
struct DummyWeight;
|
||||
impl Weight for DummyWeight {
|
||||
|
||||
@@ -144,9 +144,9 @@ mod tests {
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
use super::*;
|
||||
use crate::index::SegmentMetaInventory;
|
||||
use crate::index::{SegmentId, SegmentMetaInventory};
|
||||
use crate::schema;
|
||||
use crate::schema::INDEXED;
|
||||
use crate::{schema, SegmentId};
|
||||
|
||||
static INVENTORY: Lazy<SegmentMetaInventory> = Lazy::new(SegmentMetaInventory::default);
|
||||
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use std::collections::HashSet;
|
||||
use std::ops::Deref;
|
||||
|
||||
use crate::{Inventory, Opstamp, SegmentId, TrackedObject};
|
||||
use crate::index::SegmentId;
|
||||
use crate::{Inventory, Opstamp, TrackedObject};
|
||||
|
||||
#[derive(Default)]
|
||||
pub(crate) struct MergeOperationInventory(Inventory<InnerMergeOperation>);
|
||||
|
||||
@@ -13,7 +13,7 @@ use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::fastfield::{AliveBitSet, FastFieldNotAvailableError};
|
||||
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
|
||||
use crate::index::{Segment, SegmentReader};
|
||||
use crate::index::{Segment, SegmentComponent, SegmentReader};
|
||||
use crate::indexer::doc_id_mapping::{MappingType, SegmentDocIdMapping};
|
||||
use crate::indexer::SegmentSerializer;
|
||||
use crate::postings::{InvertedIndexSerializer, Postings, SegmentPostings};
|
||||
@@ -21,8 +21,7 @@ use crate::schema::{value_type_to_column_type, Field, FieldType, Schema};
|
||||
use crate::store::StoreWriter;
|
||||
use crate::termdict::{TermMerger, TermOrdinal};
|
||||
use crate::{
|
||||
DocAddress, DocId, IndexSettings, IndexSortByField, InvertedIndexReader, Order,
|
||||
SegmentComponent, SegmentOrdinal,
|
||||
DocAddress, DocId, IndexSettings, IndexSortByField, InvertedIndexReader, Order, SegmentOrdinal,
|
||||
};
|
||||
|
||||
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
|
||||
@@ -794,7 +793,7 @@ mod tests {
|
||||
BytesFastFieldTestCollector, FastFieldTestCollector, TEST_COLLECTOR_WITH_SCORE,
|
||||
};
|
||||
use crate::collector::{Count, FacetCollector};
|
||||
use crate::index::Index;
|
||||
use crate::index::{Index, SegmentId};
|
||||
use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
|
||||
use crate::schema::document::Value;
|
||||
use crate::schema::{
|
||||
@@ -804,7 +803,7 @@ mod tests {
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::{
|
||||
assert_nearly_equals, schema, DateTime, DocAddress, DocId, DocSet, IndexSettings,
|
||||
IndexSortByField, IndexWriter, Order, Searcher, SegmentId,
|
||||
IndexSortByField, IndexWriter, Order, Searcher,
|
||||
};
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -3,6 +3,7 @@ mod tests {
|
||||
use crate::collector::TopDocs;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::index::Index;
|
||||
use crate::postings::Postings;
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::document::Value;
|
||||
use crate::schema::{
|
||||
@@ -10,8 +11,8 @@ mod tests {
|
||||
TextFieldIndexing, TextOptions,
|
||||
};
|
||||
use crate::{
|
||||
DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, Postings,
|
||||
TantivyDocument, Term,
|
||||
DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, TantivyDocument,
|
||||
Term,
|
||||
};
|
||||
|
||||
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
|
||||
|
||||
@@ -7,7 +7,7 @@ use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};
|
||||
use super::operation::AddOperation;
|
||||
use crate::fastfield::FastFieldsWriter;
|
||||
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
|
||||
use crate::index::Segment;
|
||||
use crate::index::{Segment, SegmentComponent};
|
||||
use crate::indexer::segment_serializer::SegmentSerializer;
|
||||
use crate::json_utils::{index_json_value, IndexingPositionsPerPath};
|
||||
use crate::postings::{
|
||||
@@ -18,7 +18,7 @@ use crate::schema::document::{Document, Value};
|
||||
use crate::schema::{FieldEntry, FieldType, Schema, Term, DATE_TIME_PRECISION_INDEXED};
|
||||
use crate::store::{StoreReader, StoreWriter};
|
||||
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
|
||||
use crate::{DocId, Opstamp, SegmentComponent, TantivyError};
|
||||
use crate::{DocId, Opstamp, TantivyError};
|
||||
|
||||
/// Computes the initial size of the hash table.
|
||||
///
|
||||
@@ -206,8 +206,7 @@ impl SegmentWriter {
|
||||
// Used to help with linting and type checking.
|
||||
let value = value_access as D::Value<'_>;
|
||||
|
||||
let facet = value.as_facet().ok_or_else(make_schema_error)?;
|
||||
let facet_str = facet.encoded_str();
|
||||
let facet_str = value.as_facet().ok_or_else(make_schema_error)?;
|
||||
let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
|
||||
let mut indexing_position = IndexingPosition::default();
|
||||
postings_writer.index_text(
|
||||
@@ -230,7 +229,7 @@ impl SegmentWriter {
|
||||
&mut self.per_field_text_analyzers[field.field_id() as usize];
|
||||
text_analyzer.token_stream(text)
|
||||
} else if let Some(tok_str) = value.as_pre_tokenized_text() {
|
||||
BoxTokenStream::new(PreTokenizedStream::from(tok_str.clone()))
|
||||
BoxTokenStream::new(PreTokenizedStream::from(*tok_str.clone()))
|
||||
} else {
|
||||
continue;
|
||||
};
|
||||
@@ -498,7 +497,7 @@ mod tests {
|
||||
use crate::collector::{Count, TopDocs};
|
||||
use crate::directory::RamDirectory;
|
||||
use crate::fastfield::FastValue;
|
||||
use crate::postings::TermInfo;
|
||||
use crate::postings::{Postings, TermInfo};
|
||||
use crate::query::{PhraseQuery, QueryParser};
|
||||
use crate::schema::document::Value;
|
||||
use crate::schema::{
|
||||
@@ -510,8 +509,8 @@ mod tests {
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::tokenizer::{PreTokenizedString, Token};
|
||||
use crate::{
|
||||
DateTime, Directory, DocAddress, DocSet, Index, IndexWriter, Postings, TantivyDocument,
|
||||
Term, TERMINATED,
|
||||
DateTime, Directory, DocAddress, DocSet, Index, IndexWriter, TantivyDocument, Term,
|
||||
TERMINATED,
|
||||
};
|
||||
|
||||
#[test]
|
||||
|
||||
20
src/lib.rs
20
src/lib.rs
@@ -216,11 +216,6 @@ use once_cell::sync::Lazy;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
pub use self::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED};
|
||||
#[deprecated(
|
||||
since = "0.22.0",
|
||||
note = "Will be removed in tantivy 0.23. Use export from snippet module instead"
|
||||
)]
|
||||
pub use self::snippet::{Snippet, SnippetGenerator};
|
||||
#[doc(hidden)]
|
||||
pub use crate::core::json_utils;
|
||||
pub use crate::core::{Executor, Searcher, SearcherGeneration};
|
||||
@@ -228,16 +223,10 @@ pub use crate::directory::Directory;
|
||||
#[allow(deprecated)] // Remove with index sorting
|
||||
pub use crate::index::{
|
||||
Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader, Order,
|
||||
Segment, SegmentComponent, SegmentId, SegmentMeta, SegmentReader,
|
||||
Segment, SegmentMeta, SegmentReader,
|
||||
};
|
||||
#[deprecated(
|
||||
since = "0.22.0",
|
||||
note = "Will be removed in tantivy 0.23. Use export from indexer module instead"
|
||||
)]
|
||||
pub use crate::indexer::PreparedCommit;
|
||||
pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
|
||||
pub use crate::postings::Postings;
|
||||
pub use crate::schema::{DateOptions, DateTimePrecision, Document, TantivyDocument, Term};
|
||||
pub use crate::schema::{Document, TantivyDocument, Term};
|
||||
|
||||
/// Index format version.
|
||||
const INDEX_FORMAT_VERSION: u32 = 6;
|
||||
@@ -392,9 +381,10 @@ pub mod tests {
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::index::SegmentReader;
|
||||
use crate::merge_policy::NoMergePolicy;
|
||||
use crate::postings::Postings;
|
||||
use crate::query::BooleanQuery;
|
||||
use crate::schema::*;
|
||||
use crate::{DateTime, DocAddress, Index, IndexWriter, Postings, ReloadPolicy};
|
||||
use crate::{DateTime, DocAddress, Index, IndexWriter, ReloadPolicy};
|
||||
|
||||
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
|
||||
let mut buffer = Vec::new();
|
||||
@@ -1126,9 +1116,9 @@ pub mod tests {
|
||||
#[test]
|
||||
fn test_update_via_delete_insert() -> crate::Result<()> {
|
||||
use crate::collector::Count;
|
||||
use crate::index::SegmentId;
|
||||
use crate::indexer::NoMergePolicy;
|
||||
use crate::query::AllQuery;
|
||||
use crate::SegmentId;
|
||||
|
||||
const DOC_COUNT: u64 = 2u64;
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ pub struct InvertedIndexSerializer {
|
||||
impl InvertedIndexSerializer {
|
||||
/// Open a new `InvertedIndexSerializer` for the given segment
|
||||
pub fn open(segment: &mut Segment) -> crate::Result<InvertedIndexSerializer> {
|
||||
use crate::SegmentComponent::{Positions, Postings, Terms};
|
||||
use crate::index::SegmentComponent::{Positions, Postings, Terms};
|
||||
let inv_index_serializer = InvertedIndexSerializer {
|
||||
terms_write: CompositeWrite::wrap(segment.open_write(Terms)?),
|
||||
postings_write: CompositeWrite::wrap(segment.open_write(Postings)?),
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
use super::Scorer;
|
||||
use crate::docset::TERMINATED;
|
||||
use crate::index::SegmentReader;
|
||||
use crate::query::explanation::does_not_match;
|
||||
use crate::query::{EnableScoring, Explanation, Query, Weight};
|
||||
use crate::{DocId, DocSet, Score, Searcher, SegmentReader};
|
||||
use crate::{DocId, DocSet, Score, Searcher};
|
||||
|
||||
/// `EmptyQuery` is a dummy `Query` in which no document matches.
|
||||
///
|
||||
|
||||
@@ -180,7 +180,7 @@ impl MoreLikeThis {
|
||||
let facets: Vec<&str> = values
|
||||
.iter()
|
||||
.map(|value| {
|
||||
value.as_facet().map(|f| f.encoded_str()).ok_or_else(|| {
|
||||
value.as_facet().ok_or_else(|| {
|
||||
TantivyError::InvalidArgument("invalid field value".to_string())
|
||||
})
|
||||
})
|
||||
@@ -220,7 +220,7 @@ impl MoreLikeThis {
|
||||
let mut token_stream = tokenizer.token_stream(text);
|
||||
token_stream.process(sink);
|
||||
} else if let Some(tok_str) = value.as_pre_tokenized_text() {
|
||||
let mut token_stream = PreTokenizedStream::from(tok_str.clone());
|
||||
let mut token_stream = PreTokenizedStream::from(*tok_str.clone());
|
||||
token_stream.process(sink);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -127,6 +127,7 @@ impl Scorer for TermScorer {
|
||||
mod tests {
|
||||
use proptest::prelude::*;
|
||||
|
||||
use crate::index::SegmentId;
|
||||
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
|
||||
use crate::merge_policy::NoMergePolicy;
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
@@ -134,8 +135,7 @@ mod tests {
|
||||
use crate::query::{Bm25Weight, EnableScoring, Scorer, TermQuery};
|
||||
use crate::schema::{IndexRecordOption, Schema, TEXT};
|
||||
use crate::{
|
||||
assert_nearly_equals, DocId, DocSet, Index, IndexWriter, Score, Searcher, SegmentId, Term,
|
||||
TERMINATED,
|
||||
assert_nearly_equals, DocId, DocSet, Index, IndexWriter, Score, Searcher, Term, TERMINATED,
|
||||
};
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -179,9 +179,10 @@ mod tests {
|
||||
use super::Warmer;
|
||||
use crate::core::searcher::SearcherGeneration;
|
||||
use crate::directory::RamDirectory;
|
||||
use crate::index::SegmentId;
|
||||
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
|
||||
use crate::schema::{Schema, INDEXED};
|
||||
use crate::{Index, IndexSettings, ReloadPolicy, Searcher, SegmentId};
|
||||
use crate::{Index, IndexSettings, ReloadPolicy, Searcher};
|
||||
|
||||
#[derive(Default)]
|
||||
struct TestWarmer {
|
||||
|
||||
@@ -873,7 +873,7 @@ mod tests {
|
||||
);
|
||||
|
||||
let facet = Facet::from_text("/hello/world").unwrap();
|
||||
let result = serialize_value(ReferenceValueLeaf::Facet(&facet).into());
|
||||
let result = serialize_value(ReferenceValueLeaf::Facet(facet.encoded_str()).into());
|
||||
let value = deserialize_value(result);
|
||||
assert_eq!(value, crate::schema::OwnedValue::Facet(facet));
|
||||
|
||||
@@ -881,7 +881,8 @@ mod tests {
|
||||
text: "hello, world".to_string(),
|
||||
tokens: vec![Token::default(), Token::default()],
|
||||
};
|
||||
let result = serialize_value(ReferenceValueLeaf::PreTokStr(&pre_tok_str).into());
|
||||
let result =
|
||||
serialize_value(ReferenceValueLeaf::PreTokStr(pre_tok_str.clone().into()).into());
|
||||
let value = deserialize_value(result);
|
||||
assert_eq!(value, crate::schema::OwnedValue::PreTokStr(pre_tok_str));
|
||||
}
|
||||
|
||||
@@ -65,13 +65,13 @@ impl<'a> Value<'a> for &'a OwnedValue {
|
||||
match self {
|
||||
OwnedValue::Null => ReferenceValueLeaf::Null.into(),
|
||||
OwnedValue::Str(val) => ReferenceValueLeaf::Str(val).into(),
|
||||
OwnedValue::PreTokStr(val) => ReferenceValueLeaf::PreTokStr(val).into(),
|
||||
OwnedValue::PreTokStr(val) => ReferenceValueLeaf::PreTokStr(val.clone().into()).into(),
|
||||
OwnedValue::U64(val) => ReferenceValueLeaf::U64(*val).into(),
|
||||
OwnedValue::I64(val) => ReferenceValueLeaf::I64(*val).into(),
|
||||
OwnedValue::F64(val) => ReferenceValueLeaf::F64(*val).into(),
|
||||
OwnedValue::Bool(val) => ReferenceValueLeaf::Bool(*val).into(),
|
||||
OwnedValue::Date(val) => ReferenceValueLeaf::Date(*val).into(),
|
||||
OwnedValue::Facet(val) => ReferenceValueLeaf::Facet(val).into(),
|
||||
OwnedValue::Facet(val) => ReferenceValueLeaf::Facet(val.encoded_str()).into(),
|
||||
OwnedValue::Bytes(val) => ReferenceValueLeaf::Bytes(val).into(),
|
||||
OwnedValue::IpAddr(val) => ReferenceValueLeaf::IpAddr(*val).into(),
|
||||
OwnedValue::Array(array) => ReferenceValue::Array(array.iter()),
|
||||
@@ -277,11 +277,13 @@ impl<'a, V: Value<'a>> From<ReferenceValue<'a, V>> for OwnedValue {
|
||||
ReferenceValueLeaf::I64(val) => OwnedValue::I64(val),
|
||||
ReferenceValueLeaf::F64(val) => OwnedValue::F64(val),
|
||||
ReferenceValueLeaf::Date(val) => OwnedValue::Date(val),
|
||||
ReferenceValueLeaf::Facet(val) => OwnedValue::Facet(val.clone()),
|
||||
ReferenceValueLeaf::Facet(val) => {
|
||||
OwnedValue::Facet(Facet::from_encoded_string(val.to_string()))
|
||||
}
|
||||
ReferenceValueLeaf::Bytes(val) => OwnedValue::Bytes(val.to_vec()),
|
||||
ReferenceValueLeaf::IpAddr(val) => OwnedValue::IpAddr(val),
|
||||
ReferenceValueLeaf::Bool(val) => OwnedValue::Bool(val),
|
||||
ReferenceValueLeaf::PreTokStr(val) => OwnedValue::PreTokStr(val.clone()),
|
||||
ReferenceValueLeaf::PreTokStr(val) => OwnedValue::PreTokStr(*val.clone()),
|
||||
},
|
||||
ReferenceValue::Array(val) => {
|
||||
OwnedValue::Array(val.map(|v| v.as_value().into()).collect())
|
||||
|
||||
@@ -121,7 +121,7 @@ where W: Write
|
||||
ReferenceValueLeaf::Facet(val) => {
|
||||
self.write_type_code(type_codes::HIERARCHICAL_FACET_CODE)?;
|
||||
|
||||
val.serialize(self.writer)
|
||||
Cow::Borrowed(val).serialize(self.writer)
|
||||
}
|
||||
ReferenceValueLeaf::Bytes(val) => {
|
||||
self.write_type_code(type_codes::BYTES_CODE)?;
|
||||
@@ -428,7 +428,7 @@ mod tests {
|
||||
);
|
||||
|
||||
let facet = Facet::from_text("/hello/world").unwrap();
|
||||
let result = serialize_value(ReferenceValueLeaf::Facet(&facet).into());
|
||||
let result = serialize_value(ReferenceValueLeaf::Facet(facet.encoded_str()).into());
|
||||
let expected = binary_repr!(
|
||||
type_codes::HIERARCHICAL_FACET_CODE => Facet::from_text("/hello/world").unwrap(),
|
||||
);
|
||||
@@ -441,7 +441,8 @@ mod tests {
|
||||
text: "hello, world".to_string(),
|
||||
tokens: vec![Token::default(), Token::default()],
|
||||
};
|
||||
let result = serialize_value(ReferenceValueLeaf::PreTokStr(&pre_tok_str).into());
|
||||
let result =
|
||||
serialize_value(ReferenceValueLeaf::PreTokStr(pre_tok_str.clone().into()).into());
|
||||
let expected = binary_repr!(
|
||||
type_codes::EXT_CODE, type_codes::TOK_STR_EXT_CODE => pre_tok_str,
|
||||
);
|
||||
|
||||
@@ -3,7 +3,6 @@ use std::net::Ipv6Addr;
|
||||
|
||||
use common::DateTime;
|
||||
|
||||
use crate::schema::Facet;
|
||||
use crate::tokenizer::PreTokenizedString;
|
||||
|
||||
/// A single field value.
|
||||
@@ -82,7 +81,7 @@ pub trait Value<'a>: Send + Sync + Debug {
|
||||
#[inline]
|
||||
/// If the Value is a pre-tokenized string, returns the associated string. Returns None
|
||||
/// otherwise.
|
||||
fn as_pre_tokenized_text(&self) -> Option<&'a PreTokenizedString> {
|
||||
fn as_pre_tokenized_text(&self) -> Option<Box<PreTokenizedString>> {
|
||||
self.as_leaf().and_then(|leaf| leaf.as_pre_tokenized_text())
|
||||
}
|
||||
|
||||
@@ -94,7 +93,7 @@ pub trait Value<'a>: Send + Sync + Debug {
|
||||
|
||||
#[inline]
|
||||
/// If the Value is a facet, returns the associated facet. Returns None otherwise.
|
||||
fn as_facet(&self) -> Option<&'a Facet> {
|
||||
fn as_facet(&self) -> Option<&'a str> {
|
||||
self.as_leaf().and_then(|leaf| leaf.as_facet())
|
||||
}
|
||||
|
||||
@@ -132,7 +131,7 @@ pub trait Value<'a>: Send + Sync + Debug {
|
||||
}
|
||||
|
||||
/// A enum representing a leaf value for tantivy to index.
|
||||
#[derive(Clone, Copy, Debug, PartialEq)]
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub enum ReferenceValueLeaf<'a> {
|
||||
/// A null value.
|
||||
Null,
|
||||
@@ -146,8 +145,9 @@ pub enum ReferenceValueLeaf<'a> {
|
||||
F64(f64),
|
||||
/// Date/time with nanoseconds precision
|
||||
Date(DateTime),
|
||||
/// Facet
|
||||
Facet(&'a Facet),
|
||||
/// Facet string needs to match the format of
|
||||
/// [Facet::encoded_str](crate::schema::Facet::encoded_str).
|
||||
Facet(&'a str),
|
||||
/// Arbitrarily sized byte array
|
||||
Bytes(&'a [u8]),
|
||||
/// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`.
|
||||
@@ -155,7 +155,7 @@ pub enum ReferenceValueLeaf<'a> {
|
||||
/// Bool value
|
||||
Bool(bool),
|
||||
/// Pre-tokenized str type,
|
||||
PreTokStr(&'a PreTokenizedString),
|
||||
PreTokStr(Box<PreTokenizedString>),
|
||||
}
|
||||
|
||||
impl<'a, T: Value<'a> + ?Sized> From<ReferenceValueLeaf<'a>> for ReferenceValue<'a, T> {
|
||||
@@ -261,9 +261,9 @@ impl<'a> ReferenceValueLeaf<'a> {
|
||||
#[inline]
|
||||
/// If the Value is a pre-tokenized string, returns the associated string. Returns None
|
||||
/// otherwise.
|
||||
pub fn as_pre_tokenized_text(&self) -> Option<&'a PreTokenizedString> {
|
||||
pub fn as_pre_tokenized_text(&self) -> Option<Box<PreTokenizedString>> {
|
||||
if let Self::PreTokStr(val) = self {
|
||||
Some(val)
|
||||
Some(val.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
@@ -281,7 +281,7 @@ impl<'a> ReferenceValueLeaf<'a> {
|
||||
|
||||
#[inline]
|
||||
/// If the Value is a facet, returns the associated facet. Returns None otherwise.
|
||||
pub fn as_facet(&self) -> Option<&'a Facet> {
|
||||
pub fn as_facet(&self) -> Option<&'a str> {
|
||||
if let Self::Facet(val) = self {
|
||||
Some(val)
|
||||
} else {
|
||||
@@ -367,7 +367,7 @@ where V: Value<'a>
|
||||
#[inline]
|
||||
/// If the Value is a pre-tokenized string, returns the associated string. Returns None
|
||||
/// otherwise.
|
||||
pub fn as_pre_tokenized_text(&self) -> Option<&'a PreTokenizedString> {
|
||||
pub fn as_pre_tokenized_text(&self) -> Option<Box<PreTokenizedString>> {
|
||||
self.as_leaf().and_then(|leaf| leaf.as_pre_tokenized_text())
|
||||
}
|
||||
|
||||
@@ -379,7 +379,7 @@ where V: Value<'a>
|
||||
|
||||
#[inline]
|
||||
/// If the Value is a facet, returns the associated facet. Returns None otherwise.
|
||||
pub fn as_facet(&self) -> Option<&'a Facet> {
|
||||
pub fn as_facet(&self) -> Option<&'a str> {
|
||||
self.as_leaf().and_then(|leaf| leaf.as_facet())
|
||||
}
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use std::ops::BitOr;
|
||||
|
||||
use crate::schema::{NumericOptions, TextOptions};
|
||||
use crate::DateOptions;
|
||||
use crate::schema::{DateOptions, NumericOptions, TextOptions};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct StoredFlag;
|
||||
|
||||
@@ -12,8 +12,8 @@ use std::collections::HashMap;
|
||||
use common::ByteCount;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::index::SegmentComponent;
|
||||
use crate::schema::Field;
|
||||
use crate::SegmentComponent;
|
||||
|
||||
/// Enum containing any of the possible space usage results for segment components.
|
||||
pub enum ComponentSpaceUsage {
|
||||
@@ -115,7 +115,7 @@ impl SegmentSpaceUsage {
|
||||
/// Use the components directly if this is somehow in performance critical code.
|
||||
pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage {
|
||||
use self::ComponentSpaceUsage::*;
|
||||
use crate::SegmentComponent::*;
|
||||
use crate::index::SegmentComponent::*;
|
||||
match component {
|
||||
Postings => PerField(self.postings().clone()),
|
||||
Positions => PerField(self.positions().clone()),
|
||||
|
||||
@@ -18,6 +18,8 @@ use crate::schema::document::{BinaryDocumentDeserializer, DocumentDeserialize};
|
||||
use crate::space_usage::StoreSpaceUsage;
|
||||
use crate::store::index::Checkpoint;
|
||||
use crate::DocId;
|
||||
#[cfg(feature = "quickwit")]
|
||||
use crate::Executor;
|
||||
|
||||
pub(crate) const DOCSTORE_CACHE_CAPACITY: usize = 100;
|
||||
|
||||
@@ -341,7 +343,11 @@ impl StoreReader {
|
||||
/// In most cases use [`get_async`](Self::get_async)
|
||||
///
|
||||
/// Loads and decompresses a block asynchronously.
|
||||
async fn read_block_async(&self, checkpoint: &Checkpoint) -> io::Result<Block> {
|
||||
async fn read_block_async(
|
||||
&self,
|
||||
checkpoint: &Checkpoint,
|
||||
executor: &Executor,
|
||||
) -> io::Result<Block> {
|
||||
let cache_key = checkpoint.byte_range.start;
|
||||
if let Some(block) = self.cache.get_from_cache(checkpoint.byte_range.start) {
|
||||
return Ok(block);
|
||||
@@ -353,8 +359,12 @@ impl StoreReader {
|
||||
.read_bytes_async()
|
||||
.await?;
|
||||
|
||||
let decompressed_block =
|
||||
OwnedBytes::new(self.decompressor.decompress(compressed_block.as_ref())?);
|
||||
let decompressor = self.decompressor;
|
||||
let maybe_decompressed_block = executor
|
||||
.spawn_blocking(move || decompressor.decompress(compressed_block.as_ref()))
|
||||
.await
|
||||
.expect("decompression panicked");
|
||||
let decompressed_block = OwnedBytes::new(maybe_decompressed_block?);
|
||||
|
||||
self.cache
|
||||
.put_into_cache(cache_key, decompressed_block.clone());
|
||||
@@ -363,15 +373,23 @@ impl StoreReader {
|
||||
}
|
||||
|
||||
/// Reads raw bytes of a given document asynchronously.
|
||||
pub async fn get_document_bytes_async(&self, doc_id: DocId) -> crate::Result<OwnedBytes> {
|
||||
pub async fn get_document_bytes_async(
|
||||
&self,
|
||||
doc_id: DocId,
|
||||
executor: &Executor,
|
||||
) -> crate::Result<OwnedBytes> {
|
||||
let checkpoint = self.block_checkpoint(doc_id)?;
|
||||
let block = self.read_block_async(&checkpoint).await?;
|
||||
let block = self.read_block_async(&checkpoint, executor).await?;
|
||||
Self::get_document_bytes_from_block(block, doc_id, &checkpoint)
|
||||
}
|
||||
|
||||
/// Fetches a document asynchronously. Async version of [`get`](Self::get).
|
||||
pub async fn get_async<D: DocumentDeserialize>(&self, doc_id: DocId) -> crate::Result<D> {
|
||||
let mut doc_bytes = self.get_document_bytes_async(doc_id).await?;
|
||||
pub async fn get_async<D: DocumentDeserialize>(
|
||||
&self,
|
||||
doc_id: DocId,
|
||||
executor: &Executor,
|
||||
) -> crate::Result<D> {
|
||||
let mut doc_bytes = self.get_document_bytes_async(doc_id, executor).await?;
|
||||
|
||||
let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
|
||||
.map_err(crate::TantivyError::from)?;
|
||||
|
||||
Reference in New Issue
Block a user