Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2025-12-31 06:22:54 +00:00)

Compare commits: fix_open_b...issue/1251 (9 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 1fba39965e |  |
|  | 61e955039d |  |
|  | 9815067171 |  |
|  | 972cb6c26d |  |
|  | 4dc80cfa25 |  |
|  | cef145790c |  |
|  | e05e2a0c51 |  |
|  | e028515caf |  |
|  | 850b9eaea4 |  |
@@ -7,6 +7,8 @@ Tantivy 0.17
- Bugfix that could, in theory, impact durability on some filesystems [#1224](https://github.com/quickwit-oss/tantivy/issues/1224)
- Schema now offers the option to not index fieldnorms (@lpouget) [#922](https://github.com/quickwit-oss/tantivy/issues/922)
- Reduce the number of fsync calls [#1225](https://github.com/quickwit-oss/tantivy/issues/1225)
- Fix opening bytes index with dynamic codec (@PSeitz) [#1278](https://github.com/quickwit-oss/tantivy/issues/1278)
- Added an aggregation collector compatible with Elasticsearch (@PSeitz)

Tantivy 0.16.2
================================

@@ -55,6 +55,7 @@ lru = "0.7.0"
fastdivide = "0.4"
itertools = "0.10.0"
measure_time = "0.8.0"
pretty_assertions = "1.1.0"

[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"

@@ -67,6 +68,7 @@ proptest = "1.0"
criterion = "0.3.5"
test-log = "0.2.8"
env_logger = "0.9.0"
pprof = {version= "0.6", features=["flamegraph", "criterion"]}

[dev-dependencies.fail]
version = "0.5"

@@ -110,3 +112,8 @@ required-features = ["fail/failpoints"]
[[bench]]
name = "analyzer"
harness = false

[[bench]]
name = "index-bench"
harness = false

benches/hdfs.json (new file, 100000 lines): file diff suppressed because it is too large.

benches/index-bench.rs (new file, 79 lines)
@@ -0,0 +1,79 @@
use criterion::{criterion_group, criterion_main, Criterion};
use pprof::criterion::{Output, PProfProfiler};
use tantivy::schema::{INDEXED, STORED, STRING, TEXT};
use tantivy::Index;

const HDFS_LOGS: &str = include_str!("hdfs.json");

pub fn hdfs_index_benchmark(c: &mut Criterion) {
    let schema = {
        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
        schema_builder.add_u64_field("timestamp", INDEXED);
        schema_builder.add_text_field("body", TEXT);
        schema_builder.add_text_field("severity", STRING);
        schema_builder.build()
    };
    let schema_with_store = {
        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
        schema_builder.add_u64_field("timestamp", INDEXED | STORED);
        schema_builder.add_text_field("body", TEXT | STORED);
        schema_builder.add_text_field("severity", STRING | STORED);
        schema_builder.build()
    };

    let mut group = c.benchmark_group("index-hdfs");
    group.sample_size(20);
    group.bench_function("index-hdfs-no-commit", |b| {
        b.iter(|| {
            let index = Index::create_in_ram(schema.clone());
            let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
            for _ in 0..10 {
                for doc_json in HDFS_LOGS.trim().split("\n") {
                    let doc = schema.parse_document(doc_json).unwrap();
                    index_writer.add_document(doc).unwrap();
                }
            }
        })
    });
    group.bench_function("index-hdfs-with-commit", |b| {
        b.iter(|| {
            let index = Index::create_in_ram(schema.clone());
            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
            for _ in 0..10 {
                for doc_json in HDFS_LOGS.trim().split("\n") {
                    let doc = schema.parse_document(doc_json).unwrap();
                    index_writer.add_document(doc).unwrap();
                }
            }
            index_writer.commit().unwrap();
        })
    });
    group.bench_function("index-hdfs-no-commit-with-docstore", |b| {
        b.iter(|| {
            let index = Index::create_in_ram(schema_with_store.clone());
            let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
            for doc_json in HDFS_LOGS.trim().split("\n") {
                let doc = schema.parse_document(doc_json).unwrap();
                index_writer.add_document(doc).unwrap();
            }
        })
    });
    group.bench_function("index-hdfs-with-commit-with-docstore", |b| {
        b.iter(|| {
            let index = Index::create_in_ram(schema_with_store.clone());
            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
            for doc_json in HDFS_LOGS.trim().split("\n") {
                let doc = schema.parse_document(doc_json).unwrap();
                index_writer.add_document(doc).unwrap();
            }
            index_writer.commit().unwrap();
        })
    });
}

criterion_group! {
    name = benches;
    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
    targets = hdfs_index_benchmark
}
criterion_main!(benches);
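Assuming the usual Cargo workflow, the new target can be run with `cargo bench --bench index-bench`; the `sample_size(20)` call above lowers criterion's default sample count so that a run, which re-indexes the log file on every iteration, stays manageable.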
@@ -10,6 +10,7 @@
- [Index Sorting](./index_sorting.md)
- [Innerworkings](./innerworkings.md)
- [Inverted index](./inverted_index.md)
- [Json](./json.md)
- [Best practice](./inverted_index.md)

[Frequently Asked Questions](./faq.md)

doc/src/json.md (new file, 82 lines)
@@ -0,0 +1,82 @@
# Json

As of tantivy 0.17, tantivy supports a json object type.
This type can be used to allow for a schema-less search index.

When indexing a json object, we "flatten" the JSON. This operation emits terms that represent a triplet `(json_path, value_type, value)`.

For instance, if `user` is a json field, the following document:

```json
{
  "user": {
    "name": "Paul Masurel",
    "address": {
      "city": "Tokyo",
      "country": "Japan"
    },
    "created_at": "2018-11-12T23:20:50.52Z"
  }
}
```

emits the following tokens:
- ("name", Text, "Paul")
- ("name", Text, "Masurel")
- ("address.city", Text, "Tokyo")
- ("address.country", Text, "Japan")
- ("created_at", Date, 15420648505)

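To make the flattening concrete, here is a minimal, self-contained sketch of the operation described above. It is not tantivy's implementation: it only walks a `serde_json` object and emits `(json_path, value)` pairs, leaving out the type tagging and text tokenization.

```rust
use serde_json::Value;

// Walk a JSON object and emit (json_path, value) pairs, joining nested keys
// with ".". Everything that is not an object is treated as a leaf here.
fn flatten(prefix: &str, value: &Value, out: &mut Vec<(String, Value)>) {
    match value {
        Value::Object(map) => {
            for (key, child) in map {
                let path = if prefix.is_empty() {
                    key.clone()
                } else {
                    format!("{}.{}", prefix, key)
                };
                flatten(&path, child, out);
            }
        }
        _ => out.push((prefix.to_string(), value.clone())),
    }
}

fn main() {
    let doc: Value = serde_json::from_str(
        r#"{"name": "Paul Masurel", "address": {"city": "Tokyo", "country": "Japan"}}"#,
    )
    .unwrap();
    let mut pairs = Vec::new();
    flatten("", &doc, &mut pairs);
    for (path, value) in pairs {
        println!("({}, {})", path, value);
    }
}
```
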
# Bytes-encoding and lexicographical sort.

Like any other terms, these triplets are encoded into a binary format as follows.
- `json_path`: the json path is a sequence of "segments". In the example above, `address.city`
  is just a debug representation of the json path `["address", "city"]`.
  Its representation is done by separating segments by a unicode char `\x01`, and ending the path by `\x00`.
- `value type`: One byte represents the `Value` type.
- `value`: The value representation is just the regular Value representation.

This representation is designed to align the natural sort of Terms with the lexicographical sort
of their binary representation (Tantivy's dictionary (whether fst or sstable) is sorted and does prefix encoding).

In the example above, the terms will be sorted as
- ("address.city", Text, "Tokyo")
- ("address.country", Text, "Japan")
- ("created_at", Date, 15420648505)
- ("name", Text, "Masurel")
- ("name", Text, "Paul")

As seen in the "pitfalls" section, we may end up having to search for a value for the same path under several different types. Putting the type code after the path maximizes compression opportunities and also increases the chance that the two terms end up in the same term dictionary block.

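The layout can be illustrated with a small sketch. The separator and terminator bytes follow the description above; the concrete type byte used here (`0x01` standing for "Text") is a made-up placeholder, not tantivy's real type code, and the value bytes are passed through untouched.

```rust
// Encode a (json_path, value_type, value) triplet into one term, with path
// segments separated by 0x01 and the path terminated by 0x00, followed by a
// single type byte and then the regular value encoding.
fn encode_json_term(path_segments: &[&str], type_code: u8, value: &[u8]) -> Vec<u8> {
    let mut term = Vec::new();
    for (i, segment) in path_segments.iter().enumerate() {
        if i > 0 {
            term.push(0x01); // segment separator
        }
        term.extend_from_slice(segment.as_bytes());
    }
    term.push(0x00); // end of the json path
    term.push(type_code); // one byte for the value type
    term.extend_from_slice(value); // regular Value representation
    term
}

fn main() {
    let term = encode_json_term(&["address", "city"], 0x01, b"tokyo");
    // Because the path comes first, terms sharing a path prefix sort next to
    // each other, which is what the dictionary's prefix encoding exploits.
    println!("{:?}", term);
}
```
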
# Pitfalls and limitations.

Json gives very little information about the type of the literals it stores.
All numeric types end up mapped as a "Number" and there is no type for dates.

At ingestion time, tantivy will try to interpret numbers and strings as different types, following a
priority order.
Numbers will be interpreted as u64, i64 and f64 in that order.
Strings will be interpreted as rfc3339 dates or simple strings.

The first type that works is picked, and only one type will be emitted for indexing.

Note this interpretation happens on a per-document basis, and there is no effort to try to sniff
a consistent field type at the scale of a segment.

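A conceptual sketch of that probing order for numbers (not tantivy's actual code) could look like this; strings would analogously be tried as an RFC 3339 date before falling back to plain text.

```rust
// Probe a JSON number as u64, then i64, then f64, and keep the first
// interpretation that succeeds.
#[derive(Debug)]
enum ProbedType {
    U64(u64),
    I64(i64),
    F64(f64),
}

fn probe_number(num: &serde_json::Number) -> ProbedType {
    if let Some(val) = num.as_u64() {
        ProbedType::U64(val)
    } else if let Some(val) = num.as_i64() {
        ProbedType::I64(val)
    } else {
        // serde_json numbers always fit in f64 as a last resort.
        ProbedType::F64(num.as_f64().unwrap_or(f64::NAN))
    }
}

fn main() {
    for raw in ["233", "-5", "1.5"] {
        let num: serde_json::Number = raw.parse().unwrap();
        println!("{} -> {:?}", raw, probe_number(&num));
    }
}
```
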
On the query parser side, on the other hand, we may end up emitting more than one type.
For instance, we do not even know whether the value is a number or a string.

So the query

```
my_path.my_segment:233
```

should be interpreted as
- `(my_path.my_segment, String, 233)`
- `(my_path.my_segment, u64, 233)`

Likewise, we need to emit two tokens if the query contains an rfc3339 date.
Indeed, the date could actually have been a single token inside the text of a document at ingestion time. Generally speaking, we will always emit at least a string token in query parsing, and sometimes more.
examples/aggregation.rs (new file, 130 lines)
@@ -0,0 +1,130 @@
// # Aggregation example
//
// This example shows how you can use built-in aggregations.
// We will use range buckets and compute the average in each bucket.
//

use serde_json::Value;
use tantivy::aggregation::agg_req::{
    Aggregation, Aggregations, BucketAggregation, BucketAggregationType, MetricAggregation,
    RangeAggregation,
};
use tantivy::aggregation::agg_result::AggregationResults;
use tantivy::aggregation::metric::AverageAggregation;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::TermQuery;
use tantivy::schema::{self, Cardinality, IndexRecordOption, Schema, TextFieldIndexing};
use tantivy::{doc, Index, Term};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text_fieldtype = schema::TextOptions::default()
        .set_indexing_options(
            TextFieldIndexing::default()
                .set_tokenizer("default")
                .set_index_option(IndexRecordOption::WithFreqs),
        )
        .set_stored();
    let text_field = schema_builder.add_text_field("text", text_fieldtype);
    let score_fieldtype = crate::schema::IntOptions::default().set_fast(Cardinality::SingleValue);
    let highscore_field = schema_builder.add_f64_field("highscore", score_fieldtype.clone());
    let price_field = schema_builder.add_f64_field("price", score_fieldtype.clone());

    let schema = schema_builder.build();

    // # Indexing documents
    //
    // Let's index a bunch of documents for this example.
    let index = Index::create_in_ram(schema);

    let mut index_writer = index.writer(50_000_000)?;
    // writing the segment
    index_writer.add_document(doc!(
        text_field => "cool",
        highscore_field => 1f64,
        price_field => 0f64,
    ))?;
    index_writer.add_document(doc!(
        text_field => "cool",
        highscore_field => 3f64,
        price_field => 1f64,
    ))?;
    index_writer.add_document(doc!(
        text_field => "cool",
        highscore_field => 5f64,
        price_field => 1f64,
    ))?;
    index_writer.add_document(doc!(
        text_field => "nohit",
        highscore_field => 6f64,
        price_field => 2f64,
    ))?;
    index_writer.add_document(doc!(
        text_field => "cool",
        highscore_field => 7f64,
        price_field => 2f64,
    ))?;
    index_writer.commit()?;
    index_writer.add_document(doc!(
        text_field => "cool",
        highscore_field => 11f64,
        price_field => 10f64,
    ))?;
    index_writer.add_document(doc!(
        text_field => "cool",
        highscore_field => 14f64,
        price_field => 15f64,
    ))?;

    index_writer.add_document(doc!(
        text_field => "cool",
        highscore_field => 15f64,
        price_field => 20f64,
    ))?;

    index_writer.commit()?;

    let reader = index.reader()?;
    let text_field = reader.searcher().schema().get_field("text").unwrap();

    let term_query = TermQuery::new(
        Term::from_field_text(text_field, "cool"),
        IndexRecordOption::Basic,
    );

    let sub_agg_req_1: Aggregations = vec![(
        "average_price".to_string(),
        Aggregation::Metric(MetricAggregation::Average(
            AverageAggregation::from_field_name("price".to_string()),
        )),
    )]
    .into_iter()
    .collect();

    let agg_req_1: Aggregations = vec![(
        "score_ranges".to_string(),
        Aggregation::Bucket(BucketAggregation {
            bucket_agg: BucketAggregationType::Range(RangeAggregation {
                field: "highscore".to_string(),
                ranges: vec![
                    (-1f64..9f64).into(),
                    (9f64..14f64).into(),
                    (14f64..20f64).into(),
                ],
            }),
            sub_aggregation: sub_agg_req_1.clone(),
        }),
    )]
    .into_iter()
    .collect();

    let collector = AggregationCollector::from_aggs(agg_req_1);

    let searcher = reader.searcher();
    let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();

    let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
    println!("{}", serde_json::to_string_pretty(&res)?);

    Ok(())
}
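For orientation, the pretty-printed output should contain a `score_ranges` entry with one bucket per requested range (plus the automatically added open-ended buckets), each bucket carrying its `key`, `doc_count`, and a flattened `average_price` value; the exact numbers follow from the documents indexed above.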
@@ -419,10 +419,7 @@ mod tests {
        let mut data = (5_000..20_000)
            .map(|_| rand::random::<u32>() as u64)
            .collect::<Vec<_>>();
        let (estimate, actual_compression) = create_and_validate(&data, "random");
        dbg!(estimate);
        dbg!(actual_compression);

        let _ = create_and_validate(&data, "random");
        data.reverse();
        create_and_validate(&data, "random");
    }

src/aggregation/README.md (new file, 36 lines)
@@ -0,0 +1,36 @@
# Contributing

When adding a new bucket aggregation, make sure to extend the "test_aggregation_flushing" test for at least 2 levels.


# Code Organization

Tantivy's aggregations have been designed to mimic the
[aggregations of elasticsearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations.html).

The code is organized in submodules:

#### bucket
Contains all bucket aggregations, like the range aggregation. These bucket aggregations group documents into buckets and can contain sub-aggregations.

#### metric
Contains all metric aggregations, like the average aggregation. Metric aggregations do not have sub-aggregations.

#### agg_req
agg_req contains the user's aggregation request. Deserialization from json is compatible with elasticsearch aggregation requests.

#### agg_req_with_accessor
agg_req_with_accessor contains the user's aggregation request enriched with fast field accessors etc., which are
used during collection.

#### segment_agg_result
segment_agg_result contains the aggregation result tree, which is used for collection of a segment.
The tree from agg_req_with_accessor is passed during collection.

#### intermediate_agg_result
intermediate_agg_result contains the aggregation tree for merging with other trees.

#### agg_result
agg_result contains the final aggregation tree.
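Based on the collector code added in this change set, the intended flow across these submodules looks roughly like the sketch below. It assumes `DistributedAggregationCollector` is exported from `tantivy::aggregation` alongside `AggregationCollector`; the index handles and the query are placeholders.

```rust
use tantivy::aggregation::agg_req::Aggregations;
use tantivy::aggregation::agg_result::AggregationResults;
use tantivy::aggregation::DistributedAggregationCollector;

// Each index produces an IntermediateAggregationResults (see
// intermediate_agg_result), the intermediates are merged, and the merged tree
// is converted into the final AggregationResults (see agg_result).
fn distributed_aggregation(
    indices: &[tantivy::Index],
    query: &dyn tantivy::query::Query,
    agg_req: Aggregations,
) -> tantivy::Result<AggregationResults> {
    let mut intermediate_results = Vec::new();
    for index in indices {
        // The request (see agg_req) is turned into a collector per index.
        let collector = DistributedAggregationCollector::from_aggs(agg_req.clone());
        let searcher = index.reader()?.searcher();
        intermediate_results.push(searcher.search(query, &collector)?);
    }
    let mut merged = intermediate_results.pop().expect("at least one index");
    for other in &intermediate_results {
        merged.merge_fruits(other);
    }
    // The final values (e.g. the actual averages) are computed here.
    Ok(merged.into())
}
```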
src/aggregation/agg_req.rs (new file, 169 lines)
@@ -0,0 +1,169 @@
//! Contains the aggregation request tree. Used to build an
//! [AggregationCollector](super::AggregationCollector).
//!
//! [Aggregations] is the top level entry point to create a request, which is a `HashMap<String,
//! Aggregation>`.
//! Requests are compatible with the json format of elasticsearch.
//!
//! # Example
//!
//! ```
//! use tantivy::aggregation::bucket::RangeAggregation;
//! use tantivy::aggregation::agg_req::BucketAggregationType;
//! use tantivy::aggregation::agg_req::{Aggregation, Aggregations};
//! use tantivy::aggregation::agg_req::BucketAggregation;
//! let agg_req1: Aggregations = vec![
//!     (
//!         "range".to_string(),
//!         Aggregation::Bucket(BucketAggregation {
//!             bucket_agg: BucketAggregationType::Range(RangeAggregation{
//!                 field: "score".to_string(),
//!                 ranges: vec![(3f64..7f64).into(), (7f64..20f64).into()],
//!             }),
//!             sub_aggregation: Default::default(),
//!         }),
//!     ),
//! ]
//! .into_iter()
//! .collect();
//!
//! let elasticsearch_compatible_json_req = r#"
//! {
//!   "range": {
//!     "range": {
//!       "field": "score",
//!       "ranges": [
//!         { "from": 3.0, "to": 7.0 },
//!         { "from": 7.0, "to": 20.0 }
//!       ]
//!     }
//!   }
//! }"#;
//! let agg_req2: Aggregations = serde_json::from_str(elasticsearch_compatible_json_req).unwrap();
//! assert_eq!(agg_req1, agg_req2);
//! ```

use std::collections::HashMap;

use serde::{Deserialize, Serialize};

pub use super::bucket::RangeAggregation;
use super::metric::{AverageAggregation, StatsAggregation};

/// The top-level aggregation request structure, which contains [Aggregation] entries and their
/// user-defined names.
///
/// The key is the user-defined name of the aggregation.
pub type Aggregations = HashMap<String, Aggregation>;

/// Aggregation request of [BucketAggregation] or [MetricAggregation].
///
/// An aggregation is either a bucket or a metric.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
pub enum Aggregation {
    /// Bucket aggregation, see [BucketAggregation] for details.
    Bucket(BucketAggregation),
    /// Metric aggregation, see [MetricAggregation] for details.
    Metric(MetricAggregation),
}

/// BucketAggregations create buckets of documents. Each bucket is associated with a rule which
/// determines whether or not a document falls into it. In other words, the buckets
/// effectively define document sets. Buckets are not necessarily disjoint, therefore a document can
/// fall into multiple buckets. In addition to the buckets themselves, the bucket aggregations also
/// compute and return the number of documents for each bucket. Bucket aggregations, as opposed to
/// metric aggregations, can hold sub-aggregations. These sub-aggregations will be aggregated for
/// the buckets created by their "parent" bucket aggregation. There are different bucket
/// aggregators, each with a different "bucketing" strategy. Some define a single bucket, some
/// define a fixed number of multiple buckets, and others dynamically create the buckets during the
/// aggregation process.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct BucketAggregation {
    /// Bucket aggregation strategy to group documents.
    #[serde(flatten)]
    pub bucket_agg: BucketAggregationType,
    /// The sub_aggregations in the buckets. Each bucket will aggregate on the document set in the
    /// bucket.
    #[serde(rename = "aggs")]
    #[serde(default)]
    #[serde(skip_serializing_if = "Aggregations::is_empty")]
    pub sub_aggregation: Aggregations,
}

/// The bucket aggregation types.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum BucketAggregationType {
    /// Put data into buckets of user-defined ranges.
    #[serde(rename = "range")]
    Range(RangeAggregation),
}

/// The aggregations in this family compute metrics based on values extracted
/// from the documents that are being aggregated. Values are extracted from the fast field of
/// the document.
///
/// Some aggregations output a single numeric metric (e.g. Average) and are called
/// single-value numeric metrics aggregation, others generate multiple metrics (e.g. Stats) and are
/// called multi-value numeric metrics aggregation.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum MetricAggregation {
    /// Calculates the average.
    #[serde(rename = "avg")]
    Average(AverageAggregation),
    /// Calculates stats sum, average, min, max, standard_deviation on a field.
    #[serde(rename = "stats")]
    Stats(StatsAggregation),
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn serialize_to_json_test() {
        let agg_req1: Aggregations = vec![(
            "range".to_string(),
            Aggregation::Bucket(BucketAggregation {
                bucket_agg: BucketAggregationType::Range(RangeAggregation {
                    field: "score".to_string(),
                    ranges: vec![
                        (f64::MIN..3f64).into(),
                        (3f64..7f64).into(),
                        (7f64..20f64).into(),
                        (20f64..f64::MAX).into(),
                    ],
                }),
                sub_aggregation: Default::default(),
            }),
        )]
        .into_iter()
        .collect();

        let elasticsearch_compatible_json_req = r#"{
  "range": {
    "range": {
      "field": "score",
      "ranges": [
        {
          "to": 3.0
        },
        {
          "from": 3.0,
          "to": 7.0
        },
        {
          "from": 7.0,
          "to": 20.0
        },
        {
          "from": 20.0
        }
      ]
    }
  }
}"#;
        let agg_req2: String = serde_json::to_string_pretty(&agg_req1).unwrap();
        assert_eq!(agg_req2, elasticsearch_compatible_json_req);
    }
}
src/aggregation/agg_req_with_accessor.rs (new file, 140 lines)
@@ -0,0 +1,140 @@
//! This will enhance the request tree with access to the fastfield and metadata.

use super::agg_req::{Aggregation, Aggregations, BucketAggregationType, MetricAggregation};
use super::bucket::RangeAggregation;
use super::metric::{AverageAggregation, StatsAggregation};
use super::VecWithNames;
use crate::fastfield::DynamicFastFieldReader;
use crate::schema::Type;
use crate::{SegmentReader, TantivyError};

#[derive(Clone, Default)]
pub(crate) struct AggregationsWithAccessor {
    pub metrics: VecWithNames<MetricAggregationWithAccessor>,
    pub buckets: VecWithNames<BucketAggregationWithAccessor>,
}

impl AggregationsWithAccessor {
    fn from_data(
        metrics: VecWithNames<MetricAggregationWithAccessor>,
        buckets: VecWithNames<BucketAggregationWithAccessor>,
    ) -> Self {
        Self { metrics, buckets }
    }

    pub fn is_empty(&self) -> bool {
        self.metrics.is_empty() && self.buckets.is_empty()
    }
}

#[derive(Clone)]
pub struct BucketAggregationWithAccessor {
    /// In general there can be buckets without fast field access, e.g. buckets that are created
    /// based on search terms. So eventually this needs to be Option or moved.
    pub(crate) accessor: DynamicFastFieldReader<u64>,
    pub(crate) field_type: Type,
    pub(crate) bucket_agg: BucketAggregationType,
    pub(crate) sub_aggregation: AggregationsWithAccessor,
}

impl BucketAggregationWithAccessor {
    fn from_bucket(
        bucket: &BucketAggregationType,
        sub_aggregation: &Aggregations,
        reader: &SegmentReader,
    ) -> crate::Result<BucketAggregationWithAccessor> {
        let (accessor, field_type) = match &bucket {
            BucketAggregationType::Range(RangeAggregation {
                field: field_name,
                ranges: _,
            }) => get_ff_reader_and_validate(reader, field_name)?,
        };
        let sub_aggregation = sub_aggregation.clone();
        Ok(BucketAggregationWithAccessor {
            accessor,
            field_type,
            sub_aggregation: get_aggregations_with_accessor(&sub_aggregation, reader)?,
            bucket_agg: bucket.clone(),
        })
    }
}

/// Contains the metric request and the fast field accessor.
#[derive(Clone)]
pub struct MetricAggregationWithAccessor {
    pub metric: MetricAggregation,
    pub field_type: Type,
    pub accessor: DynamicFastFieldReader<u64>,
}

impl MetricAggregationWithAccessor {
    fn from_metric(
        metric: &MetricAggregation,
        reader: &SegmentReader,
    ) -> crate::Result<MetricAggregationWithAccessor> {
        match &metric {
            MetricAggregation::Average(AverageAggregation { field: field_name })
            | MetricAggregation::Stats(StatsAggregation { field: field_name }) => {
                let (accessor, field_type) = get_ff_reader_and_validate(reader, field_name)?;

                Ok(MetricAggregationWithAccessor {
                    accessor,
                    field_type,
                    metric: metric.clone(),
                })
            }
        }
    }
}

pub(crate) fn get_aggregations_with_accessor(
    aggs: &Aggregations,
    reader: &SegmentReader,
) -> crate::Result<AggregationsWithAccessor> {
    let mut metrics = vec![];
    let mut buckets = vec![];
    for (key, agg) in aggs.iter() {
        match agg {
            Aggregation::Bucket(bucket) => buckets.push((
                key.to_string(),
                BucketAggregationWithAccessor::from_bucket(
                    &bucket.bucket_agg,
                    &bucket.sub_aggregation,
                    reader,
                )?,
            )),
            Aggregation::Metric(metric) => metrics.push((
                key.to_string(),
                MetricAggregationWithAccessor::from_metric(metric, reader)?,
            )),
        }
    }
    Ok(AggregationsWithAccessor::from_data(
        VecWithNames::from_entries(metrics),
        VecWithNames::from_entries(buckets),
    ))
}

fn get_ff_reader_and_validate(
    reader: &SegmentReader,
    field_name: &str,
) -> crate::Result<(DynamicFastFieldReader<u64>, Type)> {
    let field = reader
        .schema()
        .get_field(field_name)
        .ok_or_else(|| TantivyError::FieldNotFound(field_name.to_string()))?;
    let field_type = reader.schema().get_field_entry(field).field_type();
    if field_type.value_type() != Type::I64
        && field_type.value_type() != Type::U64
        && field_type.value_type() != Type::F64
    {
        return Err(TantivyError::InvalidArgument(format!(
            "Invalid field type in aggregation {:?}, only f64, u64, i64 is supported",
            field_type.value_type()
        )));
    }
    let ff_fields = reader.fast_fields();
    ff_fields
        .u64_lenient(field)
        .map(|field| (field, field_type.value_type()))
}
src/aggregation/agg_result.rs (new file, 142 lines)
@@ -0,0 +1,142 @@
//! Contains the final aggregation tree.
//! This tree can be converted via the `into()` method from `IntermediateAggregationResults`.
//! This conversion computes the final result. For example: The intermediate result contains
//! intermediate average results, which is the sum and the number of values. The actual average is
//! calculated on the step from intermediate to final aggregation result tree.

use std::cmp::Ordering;
use std::collections::HashMap;

use itertools::Itertools;
use serde::{Deserialize, Serialize};

use super::intermediate_agg_result::{
    IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
    IntermediateMetricResult, IntermediateRangeBucketEntry,
};
use super::metric::{SingleMetricResult, Stats};
use super::Key;

#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
/// The final aggregation result.
pub struct AggregationResults(pub HashMap<String, AggregationResult>);

impl From<IntermediateAggregationResults> for AggregationResults {
    fn from(tree: IntermediateAggregationResults) -> Self {
        Self(
            tree.0
                .into_iter()
                .map(|(key, agg)| (key, agg.into()))
                .collect(),
        )
    }
}

#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
/// An aggregation is either a bucket or a metric.
pub enum AggregationResult {
    /// Bucket result variant.
    BucketResult(BucketResult),
    /// Metric result variant.
    MetricResult(MetricResult),
}
impl From<IntermediateAggregationResult> for AggregationResult {
    fn from(tree: IntermediateAggregationResult) -> Self {
        match tree {
            IntermediateAggregationResult::Bucket(bucket) => {
                AggregationResult::BucketResult(bucket.into())
            }
            IntermediateAggregationResult::Metric(metric) => {
                AggregationResult::MetricResult(metric.into())
            }
        }
    }
}

#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
/// MetricResult
pub enum MetricResult {
    /// Average metric result.
    Average(SingleMetricResult),
    /// Stats metric result.
    Stats(Stats),
}

impl From<IntermediateMetricResult> for MetricResult {
    fn from(metric: IntermediateMetricResult) -> Self {
        match metric {
            IntermediateMetricResult::Average(avg_data) => {
                MetricResult::Average(avg_data.finalize().into())
            }
            IntermediateMetricResult::Stats(intermediate_stats) => {
                MetricResult::Stats(intermediate_stats.finalize())
            }
        }
    }
}

/// BucketEntry holds bucket aggregation result types.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
pub enum BucketResult {
    /// This is the default entry for a bucket, which contains a key, count, and optionally
    /// sub_aggregations.
    Range {
        /// The range buckets sorted by range.
        buckets: Vec<RangeBucketEntry>,
    },
}

impl From<IntermediateBucketResult> for BucketResult {
    fn from(result: IntermediateBucketResult) -> Self {
        match result {
            IntermediateBucketResult::Range(range_map) => {
                let mut buckets: Vec<RangeBucketEntry> = range_map
                    .into_iter()
                    .map(|(_, bucket)| bucket.into())
                    .collect_vec();

                buckets.sort_by(|a, b| {
                    a.from
                        .unwrap_or(f64::MIN)
                        .partial_cmp(&b.from.unwrap_or(f64::MIN))
                        .unwrap_or(Ordering::Equal)
                });
                BucketResult::Range { buckets }
            }
        }
    }
}

/// This is the range entry for a bucket, which contains a key, count, and optionally
/// sub_aggregations.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct RangeBucketEntry {
    /// The identifier of the bucket.
    pub key: Key,
    /// Number of documents in the bucket.
    pub doc_count: u64,
    #[serde(flatten)]
    /// sub-aggregations in this bucket.
    pub sub_aggregation: AggregationResults,
    /// The from range of the bucket. Equals f64::MIN when None.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub from: Option<f64>,
    /// The to range of the bucket. Equals f64::MAX when None.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub to: Option<f64>,
}

impl From<IntermediateRangeBucketEntry> for RangeBucketEntry {
    fn from(entry: IntermediateRangeBucketEntry) -> Self {
        RangeBucketEntry {
            key: entry.key,
            doc_count: entry.doc_count,
            sub_aggregation: entry.sub_aggregation.into(),
            to: entry.to,
            from: entry.from,
        }
    }
}
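The module docs above describe the intermediate-to-final step for the average; the following toy types (not tantivy's) show why carrying the sum and the count makes the merge exact.

```rust
// The intermediate average only stores a sum and a count, so averages from
// several segments can be merged without loss before the final value is
// computed once at the end.
#[derive(Default, Clone, Copy)]
struct IntermediateAvg {
    sum: f64,
    count: u64,
}

impl IntermediateAvg {
    fn merge(&mut self, other: &IntermediateAvg) {
        self.sum += other.sum;
        self.count += other.count;
    }
    fn finalize(&self) -> Option<f64> {
        (self.count > 0).then(|| self.sum / self.count as f64)
    }
}

fn main() {
    let segment_a = IntermediateAvg { sum: 9.0, count: 3 }; // average 3.0
    let segment_b = IntermediateAvg { sum: 11.0, count: 2 }; // average 5.5
    let mut merged = segment_a;
    merged.merge(&segment_b);
    // (9.0 + 11.0) / (3 + 2) = 4.0, which differs from averaging the averages.
    assert_eq!(merged.finalize(), Some(4.0));
}
```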
src/aggregation/bucket/mod.rs (new file, 10 lines)
@@ -0,0 +1,10 @@
//! Module for all bucket aggregations.
//!
//! Results of final buckets are [BucketEntry](super::agg_result::BucketEntry).
//! Results of intermediate buckets are
//! [IntermediateBucketEntry](super::intermediate_agg_result::IntermediateBucketEntry)

mod range;

pub use range::RangeAggregation;
pub(crate) use range::SegmentRangeCollector;
src/aggregation/bucket/range.rs (new file, 536 lines)
@@ -0,0 +1,536 @@
use std::ops::Range;

use itertools::Itertools;
use serde::{Deserialize, Serialize};

use crate::aggregation::agg_req_with_accessor::{
    AggregationsWithAccessor, BucketAggregationWithAccessor,
};
use crate::aggregation::intermediate_agg_result::IntermediateBucketResult;
use crate::aggregation::segment_agg_result::{
    SegmentAggregationResultsCollector, SegmentRangeBucketEntry,
};
use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64, Key};
use crate::fastfield::FastFieldReader;
use crate::schema::Type;
use crate::{DocId, TantivyError};

/// Provide user-defined buckets to aggregate on.
/// Two special buckets will automatically be created to cover the whole range of values.
/// The provided buckets have to be continuous.
/// During the aggregation, the values extracted from the fast_field `field_name` will be checked
/// against each bucket range. Note that this aggregation includes the from value and excludes the
/// to value for each range.
///
/// Result type is [BucketResult](crate::aggregation::agg_result::BucketResult) with
/// [BucketEntryKeyCount](crate::aggregation::agg_result::BucketEntryKeyCount) on the
/// AggregationCollector.
///
/// Result type is
/// [crate::aggregation::intermediate_agg_result::IntermediateBucketResult] with
/// [crate::aggregation::intermediate_agg_result::IntermediateBucketEntryKeyCount] on the
/// DistributedAggregationCollector.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct RangeAggregation {
    /// The field to aggregate on.
    pub field: String,
    /// Note that this aggregation includes the from value and excludes the to value for each
    /// range. Extra buckets will be created until the first to, and last from, if necessary.
    pub ranges: Vec<RangeAggregationRange>,
}

#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct RangeAggregationRange {
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub from: Option<f64>,
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub to: Option<f64>,
}

impl From<Range<f64>> for RangeAggregationRange {
    fn from(range: Range<f64>) -> Self {
        let from = if range.start == f64::MIN {
            None
        } else {
            Some(range.start)
        };
        let to = if range.end == f64::MAX {
            None
        } else {
            Some(range.end)
        };
        RangeAggregationRange { from, to }
    }
}

#[derive(Clone, Debug, PartialEq)]
pub struct SegmentRangeAndBucketEntry {
    range: Range<u64>,
    bucket: SegmentRangeBucketEntry,
}

/// The collector puts values from the fast field into the correct buckets and does a conversion to
/// the correct datatype.
#[derive(Clone, Debug, PartialEq)]
pub struct SegmentRangeCollector {
    /// The buckets containing the aggregation data.
    buckets: Vec<SegmentRangeAndBucketEntry>,
    field_type: Type,
}

impl SegmentRangeCollector {
    pub fn into_intermediate_bucket_result(self) -> IntermediateBucketResult {
        let field_type = self.field_type;

        let buckets = self
            .buckets
            .into_iter()
            .map(move |range_bucket| {
                (
                    range_to_key(&range_bucket.range, &field_type),
                    range_bucket.bucket.into(),
                )
            })
            .collect();

        IntermediateBucketResult::Range(buckets)
    }

    pub(crate) fn from_req(
        req: &RangeAggregation,
        sub_aggregation: &AggregationsWithAccessor,
        field_type: Type,
    ) -> crate::Result<Self> {
        // The range input on the request is f64.
        // We need to convert to u64 ranges, because we read the values as u64.
        // The mapping from the conversion is monotonic so ordering is preserved.
        let buckets = extend_validate_ranges(&req.ranges, &field_type)?
            .iter()
            .map(|range| {
                let to = if range.end == u64::MAX {
                    None
                } else {
                    Some(f64_from_fastfield_u64(range.end, &field_type))
                };
                let from = if range.start == u64::MIN {
                    None
                } else {
                    Some(f64_from_fastfield_u64(range.start, &field_type))
                };
                let sub_aggregation = if sub_aggregation.is_empty() {
                    None
                } else {
                    Some(SegmentAggregationResultsCollector::from_req(
                        sub_aggregation,
                    )?)
                };
                Ok(SegmentRangeAndBucketEntry {
                    range: range.clone(),
                    bucket: SegmentRangeBucketEntry {
                        key: range_to_key(range, &field_type),
                        doc_count: 0,
                        sub_aggregation,
                        from,
                        to,
                    },
                })
            })
            .collect::<crate::Result<_>>()?;

        Ok(SegmentRangeCollector {
            buckets,
            field_type,
        })
    }

    #[inline]
    pub(crate) fn collect_block(
        &mut self,
        doc: &[DocId],
        bucket_with_accessor: &BucketAggregationWithAccessor,
        force_flush: bool,
    ) {
        let mut iter = doc.chunks_exact(4);
        for docs in iter.by_ref() {
            let val1 = bucket_with_accessor.accessor.get(docs[0]);
            let val2 = bucket_with_accessor.accessor.get(docs[1]);
            let val3 = bucket_with_accessor.accessor.get(docs[2]);
            let val4 = bucket_with_accessor.accessor.get(docs[3]);
            let bucket_pos1 = self.get_bucket_pos(val1);
            let bucket_pos2 = self.get_bucket_pos(val2);
            let bucket_pos3 = self.get_bucket_pos(val3);
            let bucket_pos4 = self.get_bucket_pos(val4);

            self.increment_bucket(bucket_pos1, docs[0], &bucket_with_accessor.sub_aggregation);
            self.increment_bucket(bucket_pos2, docs[1], &bucket_with_accessor.sub_aggregation);
            self.increment_bucket(bucket_pos3, docs[2], &bucket_with_accessor.sub_aggregation);
            self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation);
        }
        for doc in iter.remainder() {
            let val = bucket_with_accessor.accessor.get(*doc);
            let bucket_pos = self.get_bucket_pos(val);
            self.increment_bucket(bucket_pos, *doc, &bucket_with_accessor.sub_aggregation);
        }
        if force_flush {
            for bucket in &mut self.buckets {
                if let Some(sub_aggregation) = &mut bucket.bucket.sub_aggregation {
                    sub_aggregation
                        .flush_staged_docs(&bucket_with_accessor.sub_aggregation, force_flush);
                }
            }
        }
    }

    #[inline]
    fn increment_bucket(
        &mut self,
        bucket_pos: usize,
        doc: DocId,
        bucket_with_accessor: &AggregationsWithAccessor,
    ) {
        let bucket = &mut self.buckets[bucket_pos];

        bucket.bucket.doc_count += 1;
        if let Some(sub_aggregation) = &mut bucket.bucket.sub_aggregation {
            sub_aggregation.collect(doc, bucket_with_accessor);
        }
    }

    #[inline]
    fn get_bucket_pos(&self, val: u64) -> usize {
        let pos = self
            .buckets
            .binary_search_by_key(&val, |probe| probe.range.start)
            .unwrap_or_else(|pos| pos - 1);
        debug_assert!(self.buckets[pos].range.contains(&val));
        pos
    }
}

/// Converts the user provided f64 range value to fast field value space.
///
/// Internally fast field values are always stored as u64.
/// If the fast field has u64 [1,2,5], these values are stored as is in the fast field.
/// A fast field with f64 [1.0, 2.0, 5.0] is converted to u64 space, using a
/// monotonic mapping function, so the order is preserved.
///
/// Consequently, a f64 user range 1.0..3.0 needs to be converted to fast field value space using
/// the same monotonic mapping function, so that the provided ranges contain the u64 values in the
/// fast field.
/// The alternative would be that every value read would be converted to the f64 range, but that is
/// more computationally expensive when many documents are hit.
fn to_u64_range(range: &RangeAggregationRange, field_type: &Type) -> Range<u64> {
    range
        .from
        .map(|from| f64_to_fastfield_u64(from, field_type))
        .unwrap_or(u64::MIN)
        ..range
            .to
            .map(|to| f64_to_fastfield_u64(to, field_type))
            .unwrap_or(u64::MAX)
}

/// Extends the provided buckets to contain the whole value range, by inserting buckets at the
/// beginning and end.
fn extend_validate_ranges(
    buckets: &[RangeAggregationRange],
    field_type: &Type,
) -> crate::Result<Vec<Range<u64>>> {
    let mut converted_buckets = buckets
        .iter()
        .map(|range| to_u64_range(range, field_type))
        .collect_vec();

    converted_buckets.sort_by_key(|bucket| bucket.start);
    if converted_buckets[0].start != u64::MIN {
        converted_buckets.insert(0, u64::MIN..converted_buckets[0].start);
    }

    if converted_buckets[converted_buckets.len() - 1].end != u64::MAX {
        converted_buckets.push(converted_buckets[converted_buckets.len() - 1].end..u64::MAX);
    }

    // fill up holes in the ranges
    let find_hole = |converted_buckets: &[Range<u64>]| {
        for (pos, ranges) in converted_buckets.windows(2).enumerate() {
            if ranges[0].end > ranges[1].start {
                return Err(TantivyError::InvalidArgument(format!(
                    "Overlapping ranges not supported range {:?}, range+1 {:?}",
                    ranges[0], ranges[1]
                )));
            }
            if ranges[0].end != ranges[1].start {
                return Ok(Some(pos));
            }
        }
        Ok(None)
    };

    while let Some(hole_pos) = find_hole(&converted_buckets)? {
        let new_range = converted_buckets[hole_pos].end..converted_buckets[hole_pos + 1].start;
        converted_buckets.insert(hole_pos + 1, new_range);
    }

    Ok(converted_buckets)
}

pub fn range_to_string(range: &Range<u64>, field_type: &Type) -> String {
    // is_start is there for malformed requests, e.g. if the user passes the range u64::MIN..0.0,
    // it should be rendered as "*-0" and not "*-*"
    let to_str = |val: u64, is_start: bool| {
        if (is_start && val == u64::MIN) || (!is_start && val == u64::MAX) {
            "*".to_string()
        } else {
            f64_from_fastfield_u64(val, field_type).to_string()
        }
    };

    format!("{}-{}", to_str(range.start, true), to_str(range.end, false))
}

pub fn range_to_key(range: &Range<u64>, field_type: &Type) -> Key {
    Key::Str(range_to_string(range, field_type))
}

#[cfg(test)]
mod tests {

    use serde_json::Value;

    use super::*;
    use crate::aggregation::agg_req::{
        Aggregation, Aggregations, BucketAggregation, BucketAggregationType,
    };
    use crate::aggregation::tests::get_test_index_with_num_docs;
    use crate::aggregation::AggregationCollector;
    use crate::fastfield::FastValue;
    use crate::query::AllQuery;

    pub fn get_collector_from_ranges(
        ranges: Vec<RangeAggregationRange>,
        field_type: Type,
    ) -> SegmentRangeCollector {
        let req = RangeAggregation {
            field: "dummy".to_string(),
            ranges,
        };

        SegmentRangeCollector::from_req(&req, &Default::default(), field_type).unwrap()
    }

    #[test]
    fn range_fraction_test() -> crate::Result<()> {
        let index = get_test_index_with_num_docs(false, 100)?;

        let agg_req: Aggregations = vec![(
            "range".to_string(),
            Aggregation::Bucket(BucketAggregation {
                bucket_agg: BucketAggregationType::Range(RangeAggregation {
                    field: "fraction_f64".to_string(),
                    ranges: vec![(0f64..0.1f64).into(), (0.1f64..0.2f64).into()],
                }),
                sub_aggregation: Default::default(),
            }),
        )]
        .into_iter()
        .collect();

        let collector = AggregationCollector::from_aggs(agg_req);

        let reader = index.reader()?;
        let searcher = reader.searcher();
        let agg_res = searcher.search(&AllQuery, &collector).unwrap();

        let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;

        assert_eq!(res["range"]["buckets"][0]["key"], "*-0");
        assert_eq!(res["range"]["buckets"][0]["doc_count"], 0);
        assert_eq!(res["range"]["buckets"][1]["key"], "0-0.1");
        assert_eq!(res["range"]["buckets"][1]["doc_count"], 10);
        assert_eq!(res["range"]["buckets"][2]["key"], "0.1-0.2");
        assert_eq!(res["range"]["buckets"][2]["doc_count"], 10);
        assert_eq!(res["range"]["buckets"][3]["key"], "0.2-*");
        assert_eq!(res["range"]["buckets"][3]["doc_count"], 80);

        Ok(())
    }

    #[test]
    fn bucket_test_extend_range_hole() {
        let buckets = vec![(10f64..20f64).into(), (30f64..40f64).into()];
        let collector = get_collector_from_ranges(buckets, Type::F64);

        let buckets = collector.buckets;
        assert_eq!(buckets[0].range.start, u64::MIN);
        assert_eq!(buckets[0].range.end, 10f64.to_u64());
        assert_eq!(buckets[1].range.start, 10f64.to_u64());
        assert_eq!(buckets[1].range.end, 20f64.to_u64());
        // Added bucket to fill hole
        assert_eq!(buckets[2].range.start, 20f64.to_u64());
        assert_eq!(buckets[2].range.end, 30f64.to_u64());
        assert_eq!(buckets[3].range.start, 30f64.to_u64());
        assert_eq!(buckets[3].range.end, 40f64.to_u64());
    }

    #[test]
    fn bucket_test_range_conversion_special_case() {
        // the monotonic conversion between f64 and u64, does not map f64::MIN.to_u64() ==
        // u64::MIN, but the into trait converts f64::MIN/MAX to None
        let buckets = vec![
            (f64::MIN..10f64).into(),
            (10f64..20f64).into(),
            (20f64..f64::MAX).into(),
        ];
        let collector = get_collector_from_ranges(buckets, Type::F64);

        let buckets = collector.buckets;
        assert_eq!(buckets[0].range.start, u64::MIN);
        assert_eq!(buckets[0].range.end, 10f64.to_u64());
        assert_eq!(buckets[1].range.start, 10f64.to_u64());
        assert_eq!(buckets[1].range.end, 20f64.to_u64());
        assert_eq!(buckets[2].range.start, 20f64.to_u64());
        assert_eq!(buckets[2].range.end, u64::MAX);
        assert_eq!(buckets.len(), 3);
    }

    #[test]
    fn bucket_range_test_negative_vals() {
        let buckets = vec![(-10f64..-1f64).into()];
        let collector = get_collector_from_ranges(buckets, Type::F64);

        let buckets = collector.buckets;
        assert_eq!(&buckets[0].bucket.key.to_string(), "*--10");
        assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "-1-*");
    }
    #[test]
    fn bucket_range_test_positive_vals() {
        let buckets = vec![(0f64..10f64).into()];
        let collector = get_collector_from_ranges(buckets, Type::F64);

        let buckets = collector.buckets;
        assert_eq!(&buckets[0].bucket.key.to_string(), "*-0");
        assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "10-*");
    }

    #[test]
    fn range_binary_search_test_u64() {
        let check_ranges = |ranges: Vec<RangeAggregationRange>| {
            let collector = get_collector_from_ranges(ranges, Type::U64);
            let search = |val: u64| collector.get_bucket_pos(val);

            assert_eq!(search(u64::MIN), 0);
            assert_eq!(search(9), 0);
            assert_eq!(search(10), 1);
            assert_eq!(search(11), 1);
            assert_eq!(search(99), 1);
            assert_eq!(search(100), 2);
            assert_eq!(search(u64::MAX - 1), 2); // Since the end range is never included, the max value
        };

        let ranges = vec![(10.0..100.0).into()];
        check_ranges(ranges);

        let ranges = vec![
            RangeAggregationRange {
                to: Some(10.0),
                from: None,
            },
            (10.0..100.0).into(),
        ];
        check_ranges(ranges);

        let ranges = vec![
            RangeAggregationRange {
                to: Some(10.0),
                from: None,
            },
            (10.0..100.0).into(),
            RangeAggregationRange {
                to: None,
                from: Some(100.0),
            },
        ];
        check_ranges(ranges);
    }

    #[test]
    fn range_binary_search_test_f64() {
        let ranges = vec![
            //(f64::MIN..10.0).into(),
            (10.0..100.0).into(),
            //(100.0..f64::MAX).into(),
        ];

        let collector = get_collector_from_ranges(ranges, Type::F64);
        let search = |val: u64| collector.get_bucket_pos(val);

        assert_eq!(search(u64::MIN), 0);
        assert_eq!(search(9f64.to_u64()), 0);
        assert_eq!(search(10f64.to_u64()), 1);
        assert_eq!(search(11f64.to_u64()), 1);
        assert_eq!(search(99f64.to_u64()), 1);
        assert_eq!(search(100f64.to_u64()), 2);
        assert_eq!(search(u64::MAX - 1), 2); // Since the end range is never included, the max value
    }
}

#[cfg(all(test, feature = "unstable"))]
mod bench {

    use rand::seq::SliceRandom;
    use rand::thread_rng;

    use super::*;
    use crate::aggregation::bucket::range::tests::get_collector_from_ranges;

    const TOTAL_DOCS: u64 = 1_000_000u64;
    const NUM_DOCS: u64 = 50_000u64;

    fn get_collector_with_buckets(num_buckets: u64, num_docs: u64) -> SegmentRangeCollector {
        let bucket_size = num_docs / num_buckets;
        let mut buckets: Vec<RangeAggregationRange> = vec![];
        for i in 0..num_buckets {
            let bucket_start = (i * bucket_size) as f64;
            buckets.push((bucket_start..bucket_start + bucket_size as f64).into())
        }

        get_collector_from_ranges(buckets, Type::U64)
    }

    fn get_rand_docs(total_docs: u64, num_docs_returned: u64) -> Vec<u64> {
        let mut rng = thread_rng();

        let all_docs = (0..total_docs - 1).collect_vec();
        let mut vals = all_docs
            .as_slice()
            .choose_multiple(&mut rng, num_docs_returned as usize)
            .cloned()
            .collect_vec();
        vals.sort();
        vals
    }

    fn bench_range_binary_search(b: &mut test::Bencher, num_buckets: u64) {
        let collector = get_collector_with_buckets(num_buckets, TOTAL_DOCS);
        let vals = get_rand_docs(TOTAL_DOCS, NUM_DOCS);
        b.iter(|| {
            let mut bucket_pos = 0;
            for val in &vals {
                bucket_pos = collector.get_bucket_pos(*val);
            }
            bucket_pos
        })
    }

    #[bench]
    fn bench_range_100_buckets(b: &mut test::Bencher) {
        bench_range_binary_search(b, 100)
    }

    #[bench]
    fn bench_range_10_buckets(b: &mut test::Bencher) {
        bench_range_binary_search(b, 10)
    }
}
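The property that `to_u64_range` relies on can be spot-checked with the same `FastValue::to_u64` conversion the tests above use; this is only a sanity sketch, not part of the crate.

```rust
use tantivy::fastfield::FastValue;

fn main() {
    // The f64 -> u64 fast field mapping is monotonic, so comparing mapped
    // range bounds is equivalent to comparing the original f64 values,
    // including for negative numbers.
    let values = [-10.5f64, -1.0, 0.0, 0.5, 10.0, 1e9];
    for pair in values.windows(2) {
        assert!(pair[0].to_u64() < pair[1].to_u64());
    }
    println!("monotonic mapping holds for the sampled values");
}
```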
src/aggregation/collector.rs (new file, 135 lines)
@@ -0,0 +1,135 @@
use super::agg_req::Aggregations;
use super::agg_req_with_accessor::AggregationsWithAccessor;
use super::agg_result::AggregationResults;
use super::intermediate_agg_result::IntermediateAggregationResults;
use super::segment_agg_result::SegmentAggregationResultsCollector;
use crate::aggregation::agg_req_with_accessor::get_aggregations_with_accessor;
use crate::collector::{Collector, SegmentCollector};
use crate::TantivyError;

/// Collector for aggregations.
///
/// The collector collects all aggregations by the underlying aggregation request.
pub struct AggregationCollector {
    agg: Aggregations,
}

impl AggregationCollector {
    /// Create collector from aggregation request.
    pub fn from_aggs(agg: Aggregations) -> Self {
        Self { agg }
    }
}

/// Collector for distributed aggregations.
///
/// The collector collects all aggregations by the underlying aggregation request.
///
/// # Purpose
/// DistributedAggregationCollector returns `IntermediateAggregationResults` and not the final
/// `AggregationResults`, so that results from different indices can be merged and then converted
/// into the final `AggregationResults` via the `into()` method.
pub struct DistributedAggregationCollector {
    agg: Aggregations,
}

impl DistributedAggregationCollector {
    /// Create collector from aggregation request.
    pub fn from_aggs(agg: Aggregations) -> Self {
        Self { agg }
    }
}

impl Collector for DistributedAggregationCollector {
    type Fruit = IntermediateAggregationResults;

    type Child = AggregationSegmentCollector;

    fn for_segment(
        &self,
        _segment_local_id: crate::SegmentOrdinal,
        reader: &crate::SegmentReader,
    ) -> crate::Result<Self::Child> {
        let aggs_with_accessor = get_aggregations_with_accessor(&self.agg, reader)?;
        let result = SegmentAggregationResultsCollector::from_req(&aggs_with_accessor)?;
        Ok(AggregationSegmentCollector {
            aggs: aggs_with_accessor,
            result,
        })
    }

    fn requires_scoring(&self) -> bool {
        false
    }

    fn merge_fruits(
        &self,
        segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
    ) -> crate::Result<Self::Fruit> {
        merge_fruits(segment_fruits)
    }
}

impl Collector for AggregationCollector {
    type Fruit = AggregationResults;

    type Child = AggregationSegmentCollector;

    fn for_segment(
        &self,
        _segment_local_id: crate::SegmentOrdinal,
        reader: &crate::SegmentReader,
    ) -> crate::Result<Self::Child> {
        let aggs_with_accessor = get_aggregations_with_accessor(&self.agg, reader)?;
        let result = SegmentAggregationResultsCollector::from_req(&aggs_with_accessor)?;
        Ok(AggregationSegmentCollector {
            aggs: aggs_with_accessor,
            result,
        })
    }

    fn requires_scoring(&self) -> bool {
        false
    }

    fn merge_fruits(
        &self,
        segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
    ) -> crate::Result<Self::Fruit> {
        merge_fruits(segment_fruits).map(|res| res.into())
    }
}

fn merge_fruits(
    mut segment_fruits: Vec<IntermediateAggregationResults>,
) -> crate::Result<IntermediateAggregationResults> {
    if let Some(mut fruit) = segment_fruits.pop() {
        for next_fruit in segment_fruits {
            fruit.merge_fruits(&next_fruit);
        }
        Ok(fruit)
    } else {
        Err(TantivyError::InvalidArgument(
            "no fruits provided in merge_fruits".to_string(),
        ))
    }
}

pub struct AggregationSegmentCollector {
    aggs: AggregationsWithAccessor,
    result: SegmentAggregationResultsCollector,
}

impl SegmentCollector for AggregationSegmentCollector {
    type Fruit = IntermediateAggregationResults;

    #[inline]
    fn collect(&mut self, doc: crate::DocId, _score: crate::Score) {
        self.result.collect(doc, &self.aggs);
    }

    fn harvest(mut self) -> Self::Fruit {
        self.result.flush_staged_docs(&self.aggs, true);
        self.result.into()
    }
}
304 src/aggregation/intermediate_agg_result.rs Normal file
@@ -0,0 +1,304 @@
|
||||
//! Contains the intermediate aggregation tree, that can be merged.
|
||||
//! Intermediate aggregation results can be used to merge results between segments or between
|
||||
//! indices.
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::metric::{IntermediateAverage, IntermediateStats};
|
||||
use super::segment_agg_result::{
|
||||
SegmentAggregationResultsCollector, SegmentBucketResultCollector, SegmentMetricResultCollector,
|
||||
SegmentRangeBucketEntry,
|
||||
};
|
||||
use super::{Key, VecWithNames};
|
||||
|
||||
/// Contains the intermediate aggregation result, which is optimized to be merged with other
|
||||
/// intermediate results.
|
||||
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateAggregationResults(pub(crate) VecWithNames<IntermediateAggregationResult>);
|
||||
|
||||
impl From<SegmentAggregationResultsCollector> for IntermediateAggregationResults {
|
||||
fn from(tree: SegmentAggregationResultsCollector) -> Self {
|
||||
let mut data = vec![];
|
||||
for (key, bucket) in tree.buckets.into_iter() {
|
||||
data.push((key, IntermediateAggregationResult::Bucket(bucket.into())));
|
||||
}
|
||||
for (key, metric) in tree.metrics.into_iter() {
|
||||
data.push((key, IntermediateAggregationResult::Metric(metric.into())));
|
||||
}
|
||||
Self(VecWithNames::from_entries(data))
|
||||
}
|
||||
}
|
||||
|
||||
impl IntermediateAggregationResults {
|
||||
/// Merges another intermediate aggregation result into this result.
///
/// The order of the values needs to be the same on both results. This is ensured when the same
/// (key, value) entries are present in the underlying VecWithNames struct.
|
||||
pub fn merge_fruits(&mut self, other: &IntermediateAggregationResults) {
|
||||
for (tree_left, tree_right) in self.0.values_mut().zip(other.0.values()) {
|
||||
tree_left.merge_fruits(tree_right);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An aggregation is either a bucket or a metric.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub enum IntermediateAggregationResult {
|
||||
/// Bucket variant
|
||||
Bucket(IntermediateBucketResult),
|
||||
/// Metric variant
|
||||
Metric(IntermediateMetricResult),
|
||||
}
|
||||
|
||||
impl IntermediateAggregationResult {
|
||||
fn merge_fruits(&mut self, other: &IntermediateAggregationResult) {
|
||||
match (self, other) {
|
||||
(
|
||||
IntermediateAggregationResult::Bucket(res_left),
|
||||
IntermediateAggregationResult::Bucket(res_right),
|
||||
) => {
|
||||
res_left.merge_fruits(res_right);
|
||||
}
|
||||
(
|
||||
IntermediateAggregationResult::Metric(res_left),
|
||||
IntermediateAggregationResult::Metric(res_right),
|
||||
) => {
|
||||
res_left.merge_fruits(res_right);
|
||||
}
|
||||
_ => {
|
||||
panic!("incompatible types in aggregation tree on merge fruits");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Holds the intermediate data for metric results
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub enum IntermediateMetricResult {
|
||||
/// Average containing intermediate average data result
|
||||
Average(IntermediateAverage),
|
||||
/// Stats variant containing intermediate stats data
|
||||
Stats(IntermediateStats),
|
||||
}
|
||||
|
||||
impl From<SegmentMetricResultCollector> for IntermediateMetricResult {
|
||||
fn from(tree: SegmentMetricResultCollector) -> Self {
|
||||
match tree {
|
||||
SegmentMetricResultCollector::Average(collector) => {
|
||||
IntermediateMetricResult::Average(IntermediateAverage::from_collector(collector))
|
||||
}
|
||||
SegmentMetricResultCollector::Stats(collector) => {
|
||||
IntermediateMetricResult::Stats(collector.stats)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IntermediateMetricResult {
|
||||
fn merge_fruits(&mut self, other: &IntermediateMetricResult) {
|
||||
match (self, other) {
|
||||
(
|
||||
IntermediateMetricResult::Average(avg_data_left),
|
||||
IntermediateMetricResult::Average(avg_data_right),
|
||||
) => {
|
||||
avg_data_left.merge_fruits(avg_data_right);
|
||||
}
|
||||
(
|
||||
IntermediateMetricResult::Stats(stats_left),
|
||||
IntermediateMetricResult::Stats(stats_right),
|
||||
) => {
|
||||
stats_left.merge_fruits(stats_right);
|
||||
}
|
||||
_ => {
|
||||
panic!("incompatible fruit types in tree {:?}", other);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The intermediate bucket results. Internally they can be easily merged via the keys of the
|
||||
/// buckets.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub enum IntermediateBucketResult {
|
||||
/// This is the range entry for a bucket, which contains a key, count, from, to, and optionally
|
||||
/// sub_aggregations.
|
||||
Range(HashMap<Key, IntermediateRangeBucketEntry>),
|
||||
}
|
||||
|
||||
impl From<SegmentBucketResultCollector> for IntermediateBucketResult {
|
||||
fn from(collector: SegmentBucketResultCollector) -> Self {
|
||||
match collector {
|
||||
SegmentBucketResultCollector::Range(range) => range.into_intermediate_bucket_result(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IntermediateBucketResult {
|
||||
fn merge_fruits(&mut self, other: &IntermediateBucketResult) {
|
||||
match (self, other) {
|
||||
(
|
||||
IntermediateBucketResult::Range(entries_left),
|
||||
IntermediateBucketResult::Range(entries_right),
|
||||
) => {
|
||||
for (name, entry_left) in entries_left.iter_mut() {
|
||||
if let Some(entry_right) = entries_right.get(name) {
|
||||
entry_left.merge_fruits(entry_right);
|
||||
}
|
||||
}
|
||||
|
||||
for (key, res) in entries_right.iter() {
|
||||
if !entries_left.contains_key(key) {
|
||||
entries_left.insert(key.clone(), res.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This is the range entry for a bucket, which contains a key, count, and optionally
|
||||
/// sub_aggregations.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateRangeBucketEntry {
|
||||
/// The unique key by which the bucket is identified.
|
||||
pub key: Key,
|
||||
/// The number of documents in the bucket.
|
||||
pub doc_count: u64,
|
||||
pub(crate) values: Option<Vec<u64>>,
|
||||
/// The sub_aggregation in this bucket.
|
||||
pub sub_aggregation: IntermediateAggregationResults,
|
||||
/// The from range of the bucket. Equals f64::MIN when None.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub from: Option<f64>,
|
||||
/// The to range of the bucket. Equals f64::MAX when None.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub to: Option<f64>,
|
||||
}
|
||||
|
||||
impl From<SegmentRangeBucketEntry> for IntermediateRangeBucketEntry {
|
||||
fn from(entry: SegmentRangeBucketEntry) -> Self {
|
||||
let sub_aggregation = if let Some(sub_aggregation) = entry.sub_aggregation {
|
||||
sub_aggregation.into()
|
||||
} else {
|
||||
Default::default()
|
||||
};
|
||||
|
||||
|
||||
IntermediateRangeBucketEntry {
|
||||
key: entry.key,
|
||||
doc_count: entry.doc_count,
|
||||
values: None,
|
||||
sub_aggregation,
|
||||
to: entry.to,
|
||||
from: entry.from,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IntermediateRangeBucketEntry {
|
||||
fn merge_fruits(&mut self, other: &IntermediateRangeBucketEntry) {
|
||||
self.doc_count += other.doc_count;
|
||||
self.sub_aggregation.merge_fruits(&other.sub_aggregation);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
use super::*;
|
||||
|
||||
fn get_sub_test_tree(data: &[(String, u64)]) -> IntermediateAggregationResults {
|
||||
let mut map = HashMap::new();
|
||||
let mut buckets = HashMap::new();
|
||||
for (key, doc_count) in data {
|
||||
buckets.insert(
|
||||
Key::Str(key.to_string()),
|
||||
IntermediateRangeBucketEntry {
|
||||
key: Key::Str(key.to_string()),
|
||||
doc_count: *doc_count,
|
||||
values: None,
|
||||
sub_aggregation: Default::default(),
|
||||
from: None,
|
||||
to: None,
|
||||
},
|
||||
);
|
||||
}
|
||||
map.insert(
|
||||
"my_agg_level2".to_string(),
|
||||
IntermediateAggregationResult::Bucket(IntermediateBucketResult::Range(buckets)),
|
||||
);
|
||||
IntermediateAggregationResults(VecWithNames::from_entries(map.into_iter().collect()))
|
||||
}
|
||||
|
||||
fn get_test_tree(data: &[(String, u64, String, u64)]) -> IntermediateAggregationResults {
|
||||
let mut map = HashMap::new();
|
||||
let mut buckets = HashMap::new();
|
||||
for (key, doc_count, sub_aggregation_key, sub_aggregation_count) in data {
|
||||
buckets.insert(
|
||||
Key::Str(key.to_string()),
|
||||
IntermediateRangeBucketEntry {
|
||||
key: Key::Str(key.to_string()),
|
||||
doc_count: *doc_count,
|
||||
values: None,
|
||||
from: None,
|
||||
to: None,
|
||||
sub_aggregation: get_sub_test_tree(&[(
|
||||
sub_aggregation_key.to_string(),
|
||||
*sub_aggregation_count,
|
||||
)]),
|
||||
},
|
||||
);
|
||||
}
|
||||
map.insert(
|
||||
"my_agg_level1".to_string(),
|
||||
IntermediateAggregationResult::Bucket(IntermediateBucketResult::Range(buckets)),
|
||||
);
|
||||
IntermediateAggregationResults(VecWithNames::from_entries(map.into_iter().collect()))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_fruits_tree_1() {
|
||||
let mut tree_left = get_test_tree(&[
|
||||
("red".to_string(), 50, "1900".to_string(), 25),
|
||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||
]);
|
||||
let tree_right = get_test_tree(&[
|
||||
("red".to_string(), 60, "1900".to_string(), 30),
|
||||
("blue".to_string(), 25, "1900".to_string(), 50),
|
||||
]);
|
||||
|
||||
tree_left.merge_fruits(&tree_right);
|
||||
|
||||
let tree_expected = get_test_tree(&[
|
||||
("red".to_string(), 110, "1900".to_string(), 55),
|
||||
("blue".to_string(), 55, "1900".to_string(), 80),
|
||||
]);
|
||||
|
||||
assert_eq!(tree_left, tree_expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_fruits_tree_2() {
|
||||
let mut tree_left = get_test_tree(&[
|
||||
("red".to_string(), 50, "1900".to_string(), 25),
|
||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||
]);
|
||||
let tree_right = get_test_tree(&[
|
||||
("red".to_string(), 60, "1900".to_string(), 30),
|
||||
("green".to_string(), 25, "1900".to_string(), 50),
|
||||
]);
|
||||
|
||||
tree_left.merge_fruits(&tree_right);
|
||||
|
||||
let tree_expected = get_test_tree(&[
|
||||
("red".to_string(), 110, "1900".to_string(), 55),
|
||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||
("green".to_string(), 25, "1900".to_string(), 50),
|
||||
]);
|
||||
|
||||
assert_eq!(tree_left, tree_expected);
|
||||
}
|
||||
}
|
||||
101 src/aggregation/metric/average.rs Normal file
@@ -0,0 +1,101 @@
|
||||
use std::fmt::Debug;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::aggregation::f64_from_fastfield_u64;
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
|
||||
use crate::schema::Type;
|
||||
use crate::DocId;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
/// A single-value metric aggregation that computes the average of numeric values that are
|
||||
/// extracted from the aggregated documents.
|
||||
/// Supported field types are u64, i64, and f64.
|
||||
/// See [super::SingleMetricResult] for return value.
|
||||
pub struct AverageAggregation {
|
||||
/// The field name to compute the stats on.
|
||||
pub field: String,
|
||||
}
|
||||
impl AverageAggregation {
|
||||
/// Create new AverageAggregation from a field.
|
||||
pub fn from_field_name(field_name: String) -> Self {
|
||||
AverageAggregation { field: field_name }
|
||||
}
|
||||
/// Return the field name.
|
||||
pub fn field_name(&self) -> &str {
|
||||
&self.field
|
||||
}
|
||||
}
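As a hedged illustration (the field name "score" and the request key "avg_score" are just example names), an average aggregation request can be built and handed to an AggregationCollector like this:

let agg_req: Aggregations = std::iter::once((
    "avg_score".to_string(),
    Aggregation::Metric(MetricAggregation::Average(
        AverageAggregation::from_field_name("score".to_string()),
    )),
))
.collect();
let collector = AggregationCollector::from_aggs(agg_req);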
|
||||
|
||||
#[derive(Clone, PartialEq)]
|
||||
pub(crate) struct SegmentAverageCollector {
|
||||
pub data: IntermediateAverage,
|
||||
field_type: Type,
|
||||
}
|
||||
|
||||
impl Debug for SegmentAverageCollector {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("AverageCollector")
|
||||
.field("data", &self.data)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentAverageCollector {
|
||||
pub fn from_req(field_type: Type) -> Self {
|
||||
Self {
|
||||
field_type,
|
||||
data: Default::default(),
|
||||
}
|
||||
}
|
||||
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader<u64>) {
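        // Fetch fast field values in unrolled chunks of 4 documents; the
        // remainder of the block is handled after the loop.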
|
||||
let mut iter = doc.chunks_exact(4);
|
||||
for docs in iter.by_ref() {
|
||||
let val1 = field.get(docs[0]);
|
||||
let val2 = field.get(docs[1]);
|
||||
let val3 = field.get(docs[2]);
|
||||
let val4 = field.get(docs[3]);
|
||||
let val1 = f64_from_fastfield_u64(val1, &self.field_type);
|
||||
let val2 = f64_from_fastfield_u64(val2, &self.field_type);
|
||||
let val3 = f64_from_fastfield_u64(val3, &self.field_type);
|
||||
let val4 = f64_from_fastfield_u64(val4, &self.field_type);
|
||||
self.data.collect(val1);
|
||||
self.data.collect(val2);
|
||||
self.data.collect(val3);
|
||||
self.data.collect(val4);
|
||||
}
|
||||
for doc in iter.remainder() {
|
||||
let val = field.get(*doc);
|
||||
let val = f64_from_fastfield_u64(val, &self.field_type);
|
||||
self.data.collect(val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Contains the mergeable version of the average data.
|
||||
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateAverage {
|
||||
pub(crate) sum: f64,
|
||||
pub(crate) doc_count: u64,
|
||||
}
|
||||
|
||||
impl IntermediateAverage {
|
||||
pub(crate) fn from_collector(collector: SegmentAverageCollector) -> Self {
|
||||
collector.data
|
||||
}
|
||||
|
||||
/// Merge average data into this instance.
|
||||
pub fn merge_fruits(&mut self, other: &IntermediateAverage) {
|
||||
self.sum += other.sum;
|
||||
self.doc_count += other.doc_count;
|
||||
}
|
||||
/// Computes the final result, i.e. the mean of all collected values.
|
||||
pub fn finalize(&self) -> f64 {
|
||||
self.sum / self.doc_count as f64
|
||||
}
|
||||
#[inline]
|
||||
fn collect(&mut self, val: f64) {
|
||||
self.doc_count += 1;
|
||||
self.sum += val;
|
||||
}
|
||||
}
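Because only the sum and the document count are carried over, merging two partial averages yields exactly the global mean. A small sketch (constructing the struct directly is only possible inside the crate, since its fields are pub(crate)):

let mut left = IntermediateAverage { sum: 10.0, doc_count: 4 }; // local mean 2.5
let right = IntermediateAverage { sum: 20.0, doc_count: 6 };    // local mean ~3.33
left.merge_fruits(&right);
assert_eq!(left.finalize(), 3.0); // (10.0 + 20.0) / (4 + 6)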
|
||||
22 src/aggregation/metric/mod.rs Normal file
@@ -0,0 +1,22 @@
|
||||
//! Module for all metric aggregations.
|
||||
|
||||
mod average;
|
||||
mod stats;
|
||||
pub use average::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
pub use stats::*;
|
||||
|
||||
/// Single-metric aggregations use this common result structure.
|
||||
///
|
||||
/// The main reason for wrapping it in `value` is to match the Elasticsearch output structure.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct SingleMetricResult {
|
||||
/// The value of the single value metric.
|
||||
pub value: f64,
|
||||
}
|
||||
|
||||
impl From<f64> for SingleMetricResult {
|
||||
fn from(value: f64) -> Self {
|
||||
Self { value }
|
||||
}
|
||||
}
|
||||
273 src/aggregation/metric/stats.rs Normal file
@@ -0,0 +1,273 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::aggregation::f64_from_fastfield_u64;
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
|
||||
use crate::schema::Type;
|
||||
use crate::DocId;
|
||||
|
||||
/// A multi-value metric aggregation that computes stats of numeric values that are
|
||||
/// extracted from the aggregated documents.
|
||||
/// Supported field types are u64, i64, and f64.
|
||||
/// See [Stats] for returned statistics.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct StatsAggregation {
|
||||
/// The field name to compute the stats on.
|
||||
pub field: String,
|
||||
}
|
||||
impl StatsAggregation {
|
||||
/// Create new StatsAggregation from a field.
|
||||
pub fn from_field_name(field_name: String) -> Self {
|
||||
StatsAggregation { field: field_name }
|
||||
}
|
||||
/// Return the field name.
|
||||
pub fn field_name(&self) -> &str {
|
||||
&self.field
|
||||
}
|
||||
}
|
||||
|
||||
/// Stats contains a collection of statistics.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct Stats {
|
||||
/// The number of documents.
|
||||
pub count: usize,
|
||||
/// The sum of the fast field values.
|
||||
pub sum: f64,
|
||||
/// The standard deviation of the fast field values.
|
||||
pub standard_deviation: f64,
|
||||
/// The min value of the fast field values.
|
||||
pub min: f64,
|
||||
/// The max value of the fast field values.
|
||||
pub max: f64,
|
||||
/// The average of the values.
|
||||
pub avg: f64,
|
||||
}
|
||||
|
||||
/// IntermediateStats contains the mergeable version of the stats.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateStats {
|
||||
count: usize,
|
||||
sum: f64,
|
||||
squared_sum: f64,
|
||||
min: f64,
|
||||
max: f64,
|
||||
}
|
||||
|
||||
impl IntermediateStats {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
count: 0,
|
||||
sum: 0.0,
|
||||
squared_sum: 0.0,
|
||||
min: f64::MAX,
|
||||
max: f64::MIN,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn avg(&self) -> f64 {
|
||||
self.sum / (self.count as f64)
|
||||
}
|
||||
|
||||
fn square_mean(&self) -> f64 {
|
||||
self.squared_sum / (self.count as f64)
|
||||
}
|
||||
|
||||
pub(crate) fn standard_deviation(&self) -> f64 {
|
||||
let average = self.avg();
|
||||
(self.square_mean() - average * average).sqrt()
|
||||
}
|
||||
|
||||
/// Merge data from other stats into this instance.
|
||||
pub fn merge_fruits(&mut self, other: &IntermediateStats) {
|
||||
self.count += other.count;
|
||||
self.sum += other.sum;
|
||||
self.squared_sum += other.squared_sum;
|
||||
self.min = self.min.min(other.min);
|
||||
self.max = self.max.max(other.max);
|
||||
}
|
||||
|
||||
/// Computes the final result.
|
||||
pub fn finalize(&self) -> Stats {
|
||||
Stats {
|
||||
count: self.count,
|
||||
sum: self.sum,
|
||||
standard_deviation: self.standard_deviation(),
|
||||
min: self.min,
|
||||
max: self.max,
|
||||
avg: self.avg(),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn collect(&mut self, value: f64) {
|
||||
self.count += 1;
|
||||
self.sum += value;
|
||||
self.squared_sum += value * value;
|
||||
self.min = self.min.min(value);
|
||||
self.max = self.max.max(value);
|
||||
}
|
||||
}
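Since count, sum, squared sum, min and max all merge exactly, stats merged across segments equal the stats computed over all values in one pass. A sketch (it relies on the private constructor and collect method, so it is module-local only):

let mut a = IntermediateStats::new();
for &v in &[1.0f64, 2.0] { a.collect(v); }
let mut b = IntermediateStats::new();
for &v in &[3.0f64, 4.0] { b.collect(v); }
a.merge_fruits(&b);
let stats = a.finalize();
assert_eq!(stats.count, 4);
assert_eq!(stats.avg, 2.5);
assert_eq!(stats.min, 1.0);
assert_eq!(stats.max, 4.0);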
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub(crate) struct SegmentStatsCollector {
|
||||
pub(crate) stats: IntermediateStats,
|
||||
field_type: Type,
|
||||
}
|
||||
|
||||
impl SegmentStatsCollector {
|
||||
pub fn from_req(field_type: Type) -> Self {
|
||||
Self {
|
||||
field_type,
|
||||
stats: IntermediateStats::new(),
|
||||
}
|
||||
}
|
||||
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader<u64>) {
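        // Same block-wise pattern as the average collector: read fast field
        // values in unrolled chunks of 4 documents, then handle the remainder.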
|
||||
let mut iter = doc.chunks_exact(4);
|
||||
for docs in iter.by_ref() {
|
||||
let val1 = field.get(docs[0]);
|
||||
let val2 = field.get(docs[1]);
|
||||
let val3 = field.get(docs[2]);
|
||||
let val4 = field.get(docs[3]);
|
||||
let val1 = f64_from_fastfield_u64(val1, &self.field_type);
|
||||
let val2 = f64_from_fastfield_u64(val2, &self.field_type);
|
||||
let val3 = f64_from_fastfield_u64(val3, &self.field_type);
|
||||
let val4 = f64_from_fastfield_u64(val4, &self.field_type);
|
||||
self.stats.collect(val1);
|
||||
self.stats.collect(val2);
|
||||
self.stats.collect(val3);
|
||||
self.stats.collect(val4);
|
||||
}
|
||||
for doc in iter.remainder() {
|
||||
let val = field.get(*doc);
|
||||
let val = f64_from_fastfield_u64(val, &self.field_type);
|
||||
self.stats.collect(val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use std::iter;
|
||||
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::aggregation::agg_req::{
|
||||
Aggregation, Aggregations, BucketAggregation, BucketAggregationType, MetricAggregation,
|
||||
RangeAggregation,
|
||||
};
|
||||
use crate::aggregation::agg_result::AggregationResults;
|
||||
use crate::aggregation::metric::StatsAggregation;
|
||||
use crate::aggregation::tests::get_test_index_2_segments;
|
||||
use crate::aggregation::AggregationCollector;
|
||||
use crate::query::TermQuery;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::Term;
|
||||
|
||||
#[test]
|
||||
fn test_aggregation_stats() -> crate::Result<()> {
|
||||
let index = get_test_index_2_segments(false)?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let text_field = reader.searcher().schema().get_field("text").unwrap();
|
||||
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, "cool"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
let agg_req_1: Aggregations = vec![
|
||||
(
|
||||
"stats_i64".to_string(),
|
||||
Aggregation::Metric(MetricAggregation::Stats(StatsAggregation::from_field_name(
|
||||
"score_i64".to_string(),
|
||||
))),
|
||||
),
|
||||
(
|
||||
"stats_f64".to_string(),
|
||||
Aggregation::Metric(MetricAggregation::Stats(StatsAggregation::from_field_name(
|
||||
"score_f64".to_string(),
|
||||
))),
|
||||
),
|
||||
(
|
||||
"stats".to_string(),
|
||||
Aggregation::Metric(MetricAggregation::Stats(StatsAggregation::from_field_name(
|
||||
"score".to_string(),
|
||||
))),
|
||||
),
|
||||
(
|
||||
"range".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Range(RangeAggregation {
|
||||
field: "score".to_string(),
|
||||
ranges: vec![(3f64..7f64).into(), (7f64..20f64).into()],
|
||||
}),
|
||||
sub_aggregation: iter::once((
|
||||
"stats".to_string(),
|
||||
Aggregation::Metric(MetricAggregation::Stats(
|
||||
StatsAggregation::from_field_name("score".to_string()),
|
||||
)),
|
||||
))
|
||||
.collect(),
|
||||
}),
|
||||
),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
|
||||
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
assert_eq!(
|
||||
res["stats"],
|
||||
json!({
|
||||
"avg": 12.142857142857142,
|
||||
"count": 7,
|
||||
"max": 44.0,
|
||||
"min": 1.0,
|
||||
"standard_deviation": 13.65313748796613,
|
||||
"sum": 85.0
|
||||
})
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
res["stats_i64"],
|
||||
json!({
|
||||
"avg": 12.142857142857142,
|
||||
"count": 7,
|
||||
"max": 44.0,
|
||||
"min": 1.0,
|
||||
"standard_deviation": 13.65313748796613,
|
||||
"sum": 85.0
|
||||
})
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
res["stats_f64"],
|
||||
json!({
|
||||
"avg": 12.214285714285714,
|
||||
"count": 7,
|
||||
"max": 44.5,
|
||||
"min": 1.0,
|
||||
"standard_deviation": 13.819905785437443,
|
||||
"sum": 85.5
|
||||
})
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
res["range"]["buckets"][2]["stats"],
|
||||
json!({
|
||||
"avg": 10.666666666666666,
|
||||
"count": 3,
|
||||
"max": 14.0,
|
||||
"min": 7.0,
|
||||
"standard_deviation": 2.867441755680877,
|
||||
"sum": 32.0
|
||||
})
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
1148 src/aggregation/mod.rs Normal file
File diff suppressed because it is too large
195 src/aggregation/segment_agg_result.rs Normal file
@@ -0,0 +1,195 @@
|
||||
//! Contains the aggregation tree which is used during collection in a segment.
//! This tree contains data structures optimized for fast collection.
//! The tree can be converted to an intermediate tree, which contains data structures optimized for
//! merging.
|
||||
|
||||
use std::fmt::Debug;
|
||||
|
||||
use itertools::Itertools;
|
||||
|
||||
use super::agg_req::MetricAggregation;
|
||||
use super::agg_req_with_accessor::{
|
||||
AggregationsWithAccessor, BucketAggregationWithAccessor, MetricAggregationWithAccessor,
|
||||
};
|
||||
use super::bucket::SegmentRangeCollector;
|
||||
use super::metric::{
|
||||
AverageAggregation, SegmentAverageCollector, SegmentStatsCollector, StatsAggregation,
|
||||
};
|
||||
use super::{Key, VecWithNames};
|
||||
use crate::aggregation::agg_req::BucketAggregationType;
|
||||
use crate::DocId;
|
||||
|
||||
pub(crate) const DOC_BLOCK_SIZE: usize = 256;
|
||||
pub(crate) type DocBlock = [DocId; DOC_BLOCK_SIZE];
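// Collected doc ids are staged in fixed-size `DocBlock`s and handed to the
// per-aggregation collectors in `flush_staged_docs`, so fast field values can
// be fetched block-wise (see `collect_block`) instead of one document at a time.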
|
||||
|
||||
#[derive(Clone, PartialEq)]
|
||||
pub(crate) struct SegmentAggregationResultsCollector {
|
||||
pub(crate) metrics: VecWithNames<SegmentMetricResultCollector>,
|
||||
pub(crate) buckets: VecWithNames<SegmentBucketResultCollector>,
|
||||
staged_docs: DocBlock,
|
||||
num_staged_docs: usize,
|
||||
}
|
||||
|
||||
impl Debug for SegmentAggregationResultsCollector {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("SegmentAggregationResultsCollector")
|
||||
.field("metrics", &self.metrics)
|
||||
.field("buckets", &self.buckets)
|
||||
.field("staged_docs", &&self.staged_docs[..self.num_staged_docs])
|
||||
.field("num_staged_docs", &self.num_staged_docs)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentAggregationResultsCollector {
|
||||
pub(crate) fn from_req(req: &AggregationsWithAccessor) -> crate::Result<Self> {
|
||||
let buckets = req
|
||||
.buckets
|
||||
.entries()
|
||||
.map(|(key, req)| {
|
||||
Ok((
|
||||
key.to_string(),
|
||||
SegmentBucketResultCollector::from_req(req)?,
|
||||
))
|
||||
})
|
||||
.collect::<crate::Result<_>>()?;
|
||||
let metrics = req
|
||||
.metrics
|
||||
.entries()
|
||||
.map(|(key, req)| (key.to_string(), SegmentMetricResultCollector::from_req(req)))
|
||||
.collect_vec();
|
||||
Ok(SegmentAggregationResultsCollector {
|
||||
metrics: VecWithNames::from_entries(metrics),
|
||||
buckets: VecWithNames::from_entries(buckets),
|
||||
staged_docs: [0; DOC_BLOCK_SIZE],
|
||||
num_staged_docs: 0,
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn collect(
|
||||
&mut self,
|
||||
doc: crate::DocId,
|
||||
agg_with_accessor: &AggregationsWithAccessor,
|
||||
) {
|
||||
self.staged_docs[self.num_staged_docs] = doc;
|
||||
self.num_staged_docs += 1;
|
||||
if self.num_staged_docs == self.staged_docs.len() {
|
||||
self.flush_staged_docs(agg_with_accessor, false);
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
pub(crate) fn flush_staged_docs(
|
||||
&mut self,
|
||||
agg_with_accessor: &AggregationsWithAccessor,
|
||||
force_flush: bool,
|
||||
) {
|
||||
for (agg_with_accessor, collector) in agg_with_accessor
|
||||
.metrics
|
||||
.values()
|
||||
.zip(self.metrics.values_mut())
|
||||
{
|
||||
collector.collect_block(&self.staged_docs[..self.num_staged_docs], agg_with_accessor);
|
||||
}
|
||||
for (agg_with_accessor, collector) in agg_with_accessor
|
||||
.buckets
|
||||
.values()
|
||||
.zip(self.buckets.values_mut())
|
||||
{
|
||||
collector.collect_block(
|
||||
&self.staged_docs[..self.num_staged_docs],
|
||||
agg_with_accessor,
|
||||
force_flush,
|
||||
);
|
||||
}
|
||||
|
||||
self.num_staged_docs = 0;
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub(crate) enum SegmentMetricResultCollector {
|
||||
Average(SegmentAverageCollector),
|
||||
Stats(SegmentStatsCollector),
|
||||
}
|
||||
|
||||
impl SegmentMetricResultCollector {
|
||||
pub fn from_req(req: &MetricAggregationWithAccessor) -> Self {
|
||||
match &req.metric {
|
||||
MetricAggregation::Average(AverageAggregation { field: _ }) => {
|
||||
SegmentMetricResultCollector::Average(SegmentAverageCollector::from_req(
|
||||
req.field_type,
|
||||
))
|
||||
}
|
||||
MetricAggregation::Stats(StatsAggregation { field: _ }) => {
|
||||
SegmentMetricResultCollector::Stats(SegmentStatsCollector::from_req(req.field_type))
|
||||
}
|
||||
}
|
||||
}
|
||||
pub(crate) fn collect_block(&mut self, doc: &[DocId], metric: &MetricAggregationWithAccessor) {
|
||||
match self {
|
||||
SegmentMetricResultCollector::Average(avg_collector) => {
|
||||
avg_collector.collect_block(doc, &metric.accessor);
|
||||
}
|
||||
SegmentMetricResultCollector::Stats(stats_collector) => {
|
||||
stats_collector.collect_block(doc, &metric.accessor);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// `SegmentBucketResultCollector` holds specialized buckets for collection inside
/// segments.
/// The typical `Map<Key, Bucket>` structure is not suitable during collection for performance
/// reasons.
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub(crate) enum SegmentBucketResultCollector {
|
||||
Range(SegmentRangeCollector),
|
||||
}
|
||||
|
||||
impl SegmentBucketResultCollector {
|
||||
pub fn from_req(req: &BucketAggregationWithAccessor) -> crate::Result<Self> {
|
||||
match &req.bucket_agg {
|
||||
BucketAggregationType::Range(range_req) => Ok(Self::Range(
|
||||
SegmentRangeCollector::from_req(range_req, &req.sub_aggregation, req.field_type)?,
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn collect_block(
|
||||
&mut self,
|
||||
doc: &[DocId],
|
||||
bucket_with_accessor: &BucketAggregationWithAccessor,
|
||||
force_flush: bool,
|
||||
) {
|
||||
match self {
|
||||
SegmentBucketResultCollector::Range(range) => {
|
||||
range.collect_block(doc, bucket_with_accessor, force_flush);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq)]
|
||||
pub(crate) struct SegmentRangeBucketEntry {
|
||||
pub key: Key,
|
||||
pub doc_count: u64,
|
||||
pub sub_aggregation: Option<SegmentAggregationResultsCollector>,
|
||||
/// The from range of the bucket. Equals f64::MIN when None.
|
||||
pub from: Option<f64>,
|
||||
/// The to range of the bucket. Equals f64::MAX when None.
|
||||
pub to: Option<f64>,
|
||||
}
|
||||
|
||||
impl Debug for SegmentRangeBucketEntry {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("SegmentRangeBucketEntry")
|
||||
.field("key", &self.key)
|
||||
.field("doc_count", &self.doc_count)
|
||||
.field("from", &self.from)
|
||||
.field("to", &self.to)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
@@ -19,7 +19,7 @@ use crate::{DocId, Score};
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// f64 field. are not supported.
|
||||
/// f64 fields are not supported.
|
||||
#[derive(Clone)]
|
||||
pub struct HistogramCollector {
|
||||
min_value: u64,
|
||||
|
||||
@@ -74,6 +74,9 @@ pub enum TantivyError {
|
||||
/// A thread holding the lock panicked and poisoned the lock.
#[error("A thread holding the lock panicked and poisoned the lock")]
Poisoned,
|
||||
/// The provided field name does not exist.
|
||||
#[error("The field does not exist: '{0}'")]
|
||||
FieldNotFound(String),
|
||||
/// Invalid argument was passed by the user.
|
||||
#[error("An invalid argument was passed: '{0}'")]
|
||||
InvalidArgument(String),
|
||||
|
||||
@@ -86,7 +86,7 @@ mod tests {
|
||||
let field = searcher.schema().get_field("string_bytes").unwrap();
|
||||
let term = Term::from_field_bytes(field, b"lucene".as_ref());
|
||||
let term_query = TermQuery::new(term, IndexRecordOption::Basic);
|
||||
let term_weight = term_query.specialized_weight(&searcher, true)?;
|
||||
let term_weight = term_query.specialized_weight(&*searcher, true)?;
|
||||
let term_scorer = term_weight.specialized_scorer(searcher.segment_reader(0), 1.0)?;
|
||||
assert_eq!(term_scorer.doc(), 0u32);
|
||||
Ok(())
|
||||
@@ -99,7 +99,7 @@ mod tests {
|
||||
let field = searcher.schema().get_field("string_bytes").unwrap();
|
||||
let term = Term::from_field_bytes(field, b"lucene".as_ref());
|
||||
let term_query = TermQuery::new(term, IndexRecordOption::Basic);
|
||||
let term_weight_err = term_query.specialized_weight(&searcher, false);
|
||||
let term_weight_err = term_query.specialized_weight(&*searcher, false);
|
||||
assert!(matches!(
|
||||
term_weight_err,
|
||||
Err(crate::TantivyError::SchemaError(_))
|
||||
|
||||
@@ -112,6 +112,7 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
|
||||
}
|
||||
|
||||
impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
#[inline]
|
||||
fn get(&self, doc: DocId) -> Item {
|
||||
match self {
|
||||
Self::Bitpacked(reader) => reader.get(doc),
|
||||
@@ -119,6 +120,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
Self::MultiLinearInterpol(reader) => reader.get(doc),
|
||||
}
|
||||
}
|
||||
#[inline]
|
||||
fn get_range(&self, start: u64, output: &mut [Item]) {
|
||||
match self {
|
||||
Self::Bitpacked(reader) => reader.get_range(start, output),
|
||||
@@ -174,6 +176,7 @@ impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item,
|
||||
_phantom: PhantomData,
|
||||
})
|
||||
}
|
||||
#[inline]
|
||||
pub(crate) fn get_u64(&self, doc: u64) -> Item {
|
||||
Item::from_u64(self.reader.get_u64(doc, self.bytes.as_slice()))
|
||||
}
|
||||
|
||||
@@ -35,8 +35,7 @@ fn test_functional_store() -> crate::Result<()> {
|
||||
let mut doc_set: Vec<u64> = Vec::new();
|
||||
|
||||
let mut doc_id = 0u64;
|
||||
for iteration in 0..get_num_iterations() {
|
||||
dbg!(iteration);
|
||||
for _iteration in 0..get_num_iterations() {
|
||||
let num_docs: usize = rng.gen_range(0..4);
|
||||
if !doc_set.is_empty() {
|
||||
let doc_to_remove_id = rng.gen_range(0..doc_set.len());
|
||||
|
||||
@@ -307,16 +307,16 @@ impl IndexMerger {
|
||||
}
|
||||
None => {}
|
||||
},
|
||||
FieldType::Str(_) => {
|
||||
// We don't handle str fast field for the moment
|
||||
// They can be implemented using what is done
|
||||
// for facets in the future.
|
||||
}
|
||||
FieldType::Bytes(byte_options) => {
|
||||
if byte_options.is_fast() {
|
||||
self.write_bytes_fast_field(field, fast_field_serializer, doc_id_mapping)?;
|
||||
}
|
||||
}
|
||||
FieldType::Str(_) | FieldType::JsonObject(_) => {
|
||||
// We don't handle json / string fast field for the moment
|
||||
// They can be implemented using what is done
|
||||
// for facets in the future
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -5,12 +5,13 @@ use crate::fastfield::FastFieldsWriter;
|
||||
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
|
||||
use crate::indexer::segment_serializer::SegmentSerializer;
|
||||
use crate::postings::{
|
||||
compute_table_size, serialize_postings, IndexingContext, PerFieldPostingsWriter, PostingsWriter,
|
||||
compute_table_size, serialize_postings, IndexingContext, IndexingPosition,
|
||||
PerFieldPostingsWriter, PostingsWriter,
|
||||
};
|
||||
use crate::schema::{Field, FieldEntry, FieldType, FieldValue, Schema, Term, Type, Value};
|
||||
use crate::store::{StoreReader, StoreWriter};
|
||||
use crate::tokenizer::{
|
||||
BoxTokenStream, FacetTokenizer, PreTokenizedStream, TextAnalyzer, TokenStreamChain, Tokenizer,
|
||||
BoxTokenStream, FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer,
|
||||
};
|
||||
use crate::{DocId, Document, Opstamp, SegmentComponent};
|
||||
|
||||
@@ -221,19 +222,19 @@ impl SegmentWriter {
|
||||
}
|
||||
}
|
||||
|
||||
let num_tokens = if token_streams.is_empty() {
|
||||
0
|
||||
} else {
|
||||
let mut token_stream = TokenStreamChain::new(offsets, token_streams);
|
||||
let mut indexing_position = IndexingPosition::default();
|
||||
for mut token_stream in token_streams {
|
||||
postings_writer.index_text(
|
||||
doc_id,
|
||||
field,
|
||||
&mut token_stream,
|
||||
&mut *token_stream,
|
||||
term_buffer,
|
||||
indexing_context,
|
||||
)
|
||||
};
|
||||
self.fieldnorms_writer.record(doc_id, field, num_tokens);
|
||||
&mut indexing_position,
|
||||
);
|
||||
}
|
||||
self.fieldnorms_writer
|
||||
.record(doc_id, field, indexing_position.num_tokens);
|
||||
}
|
||||
FieldType::U64(_) => {
|
||||
for value in values {
|
||||
@@ -275,6 +276,9 @@ impl SegmentWriter {
|
||||
postings_writer.subscribe(doc_id, 0u32, term_buffer, indexing_context);
|
||||
}
|
||||
}
|
||||
FieldType::JsonObject(_) => {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -144,6 +144,7 @@ mod indexer;
|
||||
pub mod error;
|
||||
pub mod tokenizer;
|
||||
|
||||
pub mod aggregation;
|
||||
pub mod collector;
|
||||
pub mod directory;
|
||||
pub mod fastfield;
|
||||
|
||||
52 src/postings/json_postings_writer.rs Normal file
@@ -0,0 +1,52 @@
|
||||
use std::io;

use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::postings_writer::SpecializedPostingsWriter;
use crate::postings::recorder::{NothingRecorder, Recorder};
use crate::postings::stacker::Addr;
use crate::postings::{FieldSerializer, IndexingContext, PostingsWriter, UnorderedTermId};
use crate::Term;
|
||||
|
||||
pub struct JsonPostingsWriter {
|
||||
str_posting_writer: Box<dyn PostingsWriter>,
|
||||
non_str_posting_writer: Box<dyn PostingsWriter>,
|
||||
}
|
||||
|
||||
impl JsonPostingsWriter {
|
||||
pub(crate) fn new<R: Recorder>() -> Self {
|
||||
JsonPostingsWriter {
|
||||
str_posting_writer: SpecializedPostingsWriter::<R>::new_boxed(),
|
||||
non_str_posting_writer: SpecializedPostingsWriter::<NothingRecorder>::new_boxed(),
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
impl PostingsWriter for JsonPostingsWriter {
|
||||
fn subscribe(
|
||||
&mut self,
|
||||
doc: crate::DocId,
|
||||
pos: u32,
|
||||
term: &crate::Term,
|
||||
ctx: &mut IndexingContext,
|
||||
) -> UnorderedTermId {
|
||||
let term_type = term.typ();
|
||||
todo!()
|
||||
}
|
||||
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
indexing_context: &IndexingContext,
|
||||
serializer: &mut FieldSerializer,
|
||||
) -> io::Result<()> {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn total_num_tokens(&self) -> u64 {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -15,13 +15,14 @@ mod segment_postings;
|
||||
mod serializer;
|
||||
mod skip;
|
||||
mod stacker;
|
||||
mod json_postings_writer;
|
||||
mod term_info;
|
||||
|
||||
pub use self::block_segment_postings::BlockSegmentPostings;
|
||||
pub(crate) use self::indexing_context::IndexingContext;
|
||||
pub(crate) use self::per_field_postings_writer::PerFieldPostingsWriter;
|
||||
pub use self::postings::Postings;
|
||||
pub(crate) use self::postings_writer::{serialize_postings, PostingsWriter};
|
||||
pub(crate) use self::postings_writer::{serialize_postings, IndexingPosition, PostingsWriter};
|
||||
pub use self::segment_postings::SegmentPostings;
|
||||
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
|
||||
pub(crate) use self::skip::{BlockInfo, SkipReader};
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use crate::postings::json_postings_writer::JsonPostingsWriter;
|
||||
use crate::postings::postings_writer::SpecializedPostingsWriter;
|
||||
use crate::postings::recorder::{NothingRecorder, TermFrequencyRecorder, TfAndPositionRecorder};
|
||||
use crate::postings::PostingsWriter;
|
||||
@@ -49,5 +50,20 @@ fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box<dyn Postings
|
||||
| FieldType::Date(_)
|
||||
| FieldType::Bytes(_)
|
||||
| FieldType::Facet(_) => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(),
|
||||
FieldType::JsonObject(ref json_object_options) => {
|
||||
Box::new(if let Some(text_indexing_option) = json_object_options.get_text_indexing_option() {
|
||||
match text_indexing_option.index_option() {
|
||||
IndexRecordOption::Basic => JsonPostingsWriter::new::<NothingRecorder>(),
|
||||
IndexRecordOption::WithFreqs => {
|
||||
JsonPostingsWriter::new::<TermFrequencyRecorder>()
|
||||
}
|
||||
IndexRecordOption::WithFreqsAndPositions => {
|
||||
JsonPostingsWriter::new::<TfAndPositionRecorder>()
|
||||
}
|
||||
}
|
||||
} else {
|
||||
JsonPostingsWriter::new::<NothingRecorder>()
|
||||
})
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,13 +2,12 @@ use std::collections::HashMap;
|
||||
use std::io;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::Range;
|
||||
|
||||
use fnv::FnvHashMap;
|
||||
|
||||
use super::stacker::Addr;
|
||||
use crate::fieldnorm::FieldNormReaders;
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::recorder::{BufferLender, Recorder};
|
||||
use crate::postings::recorder::{BufferLender, Recorder, NothingRecorder};
|
||||
use crate::postings::{
|
||||
FieldSerializer, IndexingContext, InvertedIndexSerializer, PerFieldPostingsWriter,
|
||||
UnorderedTermId,
|
||||
@@ -18,6 +17,8 @@ use crate::termdict::TermOrdinal;
|
||||
use crate::tokenizer::{Token, TokenStream, MAX_TOKEN_LEN};
|
||||
use crate::DocId;
|
||||
|
||||
const POSITION_GAP: u32 = 1;
|
||||
|
||||
fn make_field_partition(
|
||||
term_offsets: &[(Term<&[u8]>, Addr, UnorderedTermId)],
|
||||
) -> Vec<(Field, Range<usize>)> {
|
||||
@@ -83,6 +84,7 @@ pub(crate) fn serialize_postings(
|
||||
}
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) | FieldType::Date(_) => {}
|
||||
FieldType::Bytes(_) => {}
|
||||
FieldType::JsonObject(_) => {}
|
||||
}
|
||||
|
||||
let postings_writer = per_field_postings_writers.get_for_field(field);
|
||||
@@ -100,6 +102,12 @@ pub(crate) fn serialize_postings(
|
||||
Ok(unordered_term_mappings)
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub(crate) struct IndexingPosition {
|
||||
pub num_tokens: u32,
|
||||
pub end_position: u32,
|
||||
}
|
||||
|
||||
/// The `PostingsWriter` is in charge of receiving documents
|
||||
/// and building a `Segment` in anonymous memory.
|
||||
///
|
||||
@@ -138,51 +146,150 @@ pub(crate) trait PostingsWriter {
|
||||
token_stream: &mut dyn TokenStream,
|
||||
term_buffer: &mut Term,
|
||||
indexing_context: &mut IndexingContext,
|
||||
) -> u32 {
|
||||
indexing_position: &mut IndexingPosition,
|
||||
) {
|
||||
term_buffer.set_field(Type::Str, field);
|
||||
let mut sink = |token: &Token| {
|
||||
let mut num_tokens = 0;
|
||||
let mut end_position = 0;
|
||||
token_stream.process(&mut |token: &Token| {
|
||||
// We skip all tokens with a len greater than u16.
|
||||
if token.text.len() <= MAX_TOKEN_LEN {
|
||||
term_buffer.set_text(token.text.as_str());
|
||||
self.subscribe(doc_id, token.position as u32, term_buffer, indexing_context);
|
||||
} else {
|
||||
if token.text.len() > MAX_TOKEN_LEN {
|
||||
warn!(
|
||||
"A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \
|
||||
MAX_TOKEN_LEN in the documentation for more information.",
|
||||
token.text.len(),
|
||||
MAX_TOKEN_LEN
|
||||
);
|
||||
return;
|
||||
}
|
||||
};
|
||||
token_stream.process(&mut sink)
|
||||
term_buffer.set_text(token.text.as_str());
|
||||
let start_position = indexing_position.end_position + token.position as u32;
|
||||
end_position = start_position + token.position_length as u32;
|
||||
self.subscribe(doc_id, start_position, term_buffer, indexing_context);
|
||||
num_tokens += 1;
|
||||
});
|
||||
indexing_position.end_position = end_position + POSITION_GAP;
|
||||
indexing_position.num_tokens += num_tokens;
|
||||
}
|
||||
|
||||
fn total_num_tokens(&self) -> u64;
|
||||
}
|
||||
|
||||
pub(crate) struct JsonPostingsWriter<Rec: Recorder> {
|
||||
text_postings_writer: SpecializedPostingsWriter<Rec>,
|
||||
other_postings_writer: SpecializedPostingsWriter<NothingRecorder>,
|
||||
}
|
||||
|
||||
impl<Rec: Recorder> JsonPostingsWriter<Rec> {
|
||||
pub fn new_boxed() -> Box<dyn PostingsWriter> {
|
||||
let text_postings_writer: SpecializedPostingsWriter<Rec> = SpecializedPostingsWriter {
|
||||
total_num_tokens: 0u64,
|
||||
_recorder_type: PhantomData,
|
||||
};
|
||||
let other_postings_writer: SpecializedPostingsWriter<NothingRecorder> =
|
||||
SpecializedPostingsWriter {
|
||||
total_num_tokens: 0u64,
|
||||
_recorder_type: PhantomData,
|
||||
};
|
||||
Box::new(JsonPostingsWriter {
|
||||
text_postings_writer,
|
||||
other_postings_writer,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
||||
fn subscribe(
|
||||
&mut self,
|
||||
doc: DocId,
|
||||
pos: u32,
|
||||
term: &Term,
|
||||
ctx: &mut IndexingContext
|
||||
) -> UnorderedTermId {
|
||||
// TODO will the unordered term id be correct!?
|
||||
debug_assert!(term.is_json());
|
||||
if term.typ() == Type::Str {
|
||||
self.text_postings_writer
|
||||
.subscribe(doc, pos, term, ctx)
|
||||
} else {
|
||||
self.other_postings_writer
|
||||
.subscribe(doc, pos, term, ctx)
|
||||
}
|
||||
}
|
||||
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
ctx: &IndexingContext,
|
||||
serializer: &mut FieldSerializer,
|
||||
) -> io::Result<()> {
|
||||
let mut buffer_lender = BufferLender::default();
|
||||
for (term, addr, _) in term_addrs {
|
||||
if term.typ() == Type::Str {
|
||||
SpecializedPostingsWriter::<Rec>::serialize_one_term(
|
||||
term,
|
||||
*addr,
|
||||
doc_id_map,
|
||||
&mut buffer_lender,
|
||||
ctx,
|
||||
serializer,
|
||||
)?;
|
||||
} else {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::serialize_one_term(
|
||||
term,
|
||||
*addr,
|
||||
doc_id_map,
|
||||
&mut buffer_lender,
|
||||
ctx,
|
||||
serializer,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn total_num_tokens(&self) -> u64 {
|
||||
self.text_postings_writer.total_num_tokens() + self.other_postings_writer.total_num_tokens()
|
||||
}
|
||||
}
|
||||
|
||||
/// The `SpecializedPostingsWriter` is just here to remove dynamic
|
||||
/// dispatch to the recorder information.
|
||||
pub(crate) struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
|
||||
#[derive(Default)]
|
||||
pub(crate) struct SpecializedPostingsWriter<Rec: Recorder> {
|
||||
total_num_tokens: u64,
|
||||
_recorder_type: PhantomData<Rec>,
|
||||
}
|
||||
|
||||
impl<Rec: Recorder + 'static> SpecializedPostingsWriter<Rec> {
|
||||
/// constructor
|
||||
pub fn new() -> SpecializedPostingsWriter<Rec> {
|
||||
SpecializedPostingsWriter {
|
||||
impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
|
||||
pub fn new_boxed() -> Box<dyn PostingsWriter> {
|
||||
let new_specialized_posting_writer: Self = Self {
|
||||
total_num_tokens: 0u64,
|
||||
_recorder_type: PhantomData,
|
||||
}
|
||||
};
|
||||
Box::new(new_specialized_posting_writer)
|
||||
}
|
||||
|
||||
/// Builds a `SpecializedPostingsWriter` storing its data in a memory arena.
|
||||
pub fn new_boxed() -> Box<dyn PostingsWriter> {
|
||||
Box::new(SpecializedPostingsWriter::<Rec>::new())
|
||||
#[inline]
|
||||
fn serialize_one_term(
|
||||
term: &Term<&[u8]>,
|
||||
addr: Addr,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
buffer_lender: &mut BufferLender,
|
||||
ctx: &IndexingContext,
|
||||
serializer: &mut FieldSerializer,
|
||||
) -> io::Result<()> {
|
||||
let recorder: Rec = ctx.arena.read(addr);
|
||||
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
|
||||
serializer.new_term(term.value_bytes(), term_doc_freq)?;
|
||||
recorder.serialize(&ctx.arena, doc_id_map, serializer, buffer_lender);
|
||||
serializer.close_term()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec> {
|
||||
impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
|
||||
fn subscribe(
|
||||
&mut self,
|
||||
doc: DocId,
|
||||
@@ -218,21 +325,19 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
|
||||
&self,
|
||||
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
indexing_context: &IndexingContext,
|
||||
ctx: &IndexingContext,
|
||||
serializer: &mut FieldSerializer,
|
||||
) -> io::Result<()> {
|
||||
let mut buffer_lender = BufferLender::default();
|
||||
for (term, addr, _) in term_addrs {
|
||||
let recorder: Rec = indexing_context.term_index.read(*addr);
|
||||
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
|
||||
serializer.new_term(term.value_bytes(), term_doc_freq)?;
|
||||
recorder.serialize(
|
||||
&indexing_context.arena,
|
||||
Self::serialize_one_term(
|
||||
term,
|
||||
*addr,
|
||||
doc_id_map,
|
||||
serializer,
|
||||
&mut buffer_lender,
|
||||
);
|
||||
serializer.close_term()?;
|
||||
ctx,
|
||||
serializer,
|
||||
)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use common::{read_u32_vint, write_u32_vint};
|
||||
use common::read_u32_vint;
|
||||
|
||||
use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
@@ -104,7 +104,7 @@ impl Recorder for NothingRecorder {
|
||||
|
||||
fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
|
||||
self.current_doc = doc;
|
||||
let _ = write_u32_vint(doc, &mut self.stack.writer(arena));
|
||||
self.stack.writer(arena).write_u32_vint(doc);
|
||||
}
|
||||
|
||||
fn record_position(&mut self, _position: u32, _arena: &mut MemoryArena) {}
|
||||
@@ -169,7 +169,7 @@ impl Recorder for TermFrequencyRecorder {
|
||||
fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
|
||||
self.term_doc_freq += 1;
|
||||
self.current_doc = doc;
|
||||
let _ = write_u32_vint(doc, &mut self.stack.writer(arena));
|
||||
self.stack.writer(arena).write_u32_vint(doc);
|
||||
}
|
||||
|
||||
fn record_position(&mut self, _position: u32, _arena: &mut MemoryArena) {
|
||||
@@ -178,7 +178,7 @@ impl Recorder for TermFrequencyRecorder {
|
||||
|
||||
fn close_doc(&mut self, arena: &mut MemoryArena) {
|
||||
debug_assert!(self.current_tf > 0);
|
||||
let _ = write_u32_vint(self.current_tf, &mut self.stack.writer(arena));
|
||||
self.stack.writer(arena).write_u32_vint(self.current_tf);
|
||||
self.current_tf = 0;
|
||||
}
|
||||
|
||||
@@ -239,15 +239,17 @@ impl Recorder for TfAndPositionRecorder {
|
||||
fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
|
||||
self.current_doc = doc;
|
||||
self.term_doc_freq += 1u32;
|
||||
let _ = write_u32_vint(doc, &mut self.stack.writer(arena));
|
||||
self.stack.writer(arena).write_u32_vint(doc);
|
||||
}
|
||||
|
||||
fn record_position(&mut self, position: u32, arena: &mut MemoryArena) {
|
||||
let _ = write_u32_vint(position + 1u32, &mut self.stack.writer(arena));
|
||||
self.stack
|
||||
.writer(arena)
|
||||
.write_u32_vint(position.wrapping_add(1u32));
|
||||
}
|
||||
|
||||
fn close_doc(&mut self, arena: &mut MemoryArena) {
|
||||
let _ = write_u32_vint(POSITION_END, &mut self.stack.writer(arena));
|
||||
self.stack.writer(arena).write_u32_vint(POSITION_END);
|
||||
}
|
||||
|
||||
fn serialize(
|
||||
@@ -300,7 +302,9 @@ impl Recorder for TfAndPositionRecorder {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::{write_u32_vint, BufferLender, VInt32Reader};
|
||||
use common::write_u32_vint;
|
||||
|
||||
use super::{BufferLender, VInt32Reader};
|
||||
|
||||
#[test]
|
||||
fn test_buffer_lender() {
|
||||
|
||||
@@ -76,7 +76,7 @@ impl InvertedIndexSerializer {
|
||||
field: Field,
|
||||
total_num_tokens: u64,
|
||||
fieldnorm_reader: Option<FieldNormReader>,
|
||||
) -> io::Result<FieldSerializer<'_>> {
|
||||
) -> io::Result<FieldSerializer> {
|
||||
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
|
||||
let term_dictionary_write = self.terms_write.for_field(field);
|
||||
let postings_write = self.postings_write.for_field(field);
|
||||
@@ -203,6 +203,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
self.current_term_info.doc_freq += 1;
|
||||
self.postings_serializer.write_doc(doc_id, term_freq);
|
||||
if let Some(ref mut positions_serializer) = self.positions_serializer_opt.as_mut() {
|
||||
assert_eq!(term_freq as usize, position_deltas.len());
|
||||
positions_serializer.write_positions_delta(position_deltas);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
use std::{io, mem};
|
||||
use std::mem;
|
||||
|
||||
use common::serialize_vint_u32;
|
||||
|
||||
use super::{Addr, MemoryArena};
|
||||
use crate::postings::stacker::memory_arena::{load, store};
|
||||
@@ -97,12 +99,13 @@ fn ensure_capacity<'a>(
|
||||
}
|
||||
|
||||
impl<'a> ExpUnrolledLinkedListWriter<'a> {
|
||||
pub fn write_u32_vint(&mut self, val: u32) {
|
||||
let mut buf = [0u8; 8];
|
||||
let data = serialize_vint_u32(val, &mut buf);
|
||||
self.extend_from_slice(data);
|
||||
}
|
||||
|
||||
pub fn extend_from_slice(&mut self, mut buf: &[u8]) {
|
||||
if buf.is_empty() {
|
||||
// we need to cut early, because `ensure_capacity`
|
||||
// allocates if there is no capacity at all right now.
|
||||
return;
|
||||
}
|
||||
while !buf.is_empty() {
|
||||
let add_len: usize;
|
||||
{
|
||||
@@ -117,25 +120,6 @@ impl<'a> ExpUnrolledLinkedListWriter<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> io::Write for ExpUnrolledLinkedListWriter<'a> {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
// There is no use case to only write the capacity.
|
||||
// This is not IO after all, so we write the whole
|
||||
// buffer even if the contract of `.write` is looser.
|
||||
self.extend_from_slice(buf);
|
||||
Ok(buf.len())
|
||||
}
|
||||
|
||||
fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
|
||||
self.extend_from_slice(buf);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl ExpUnrolledLinkedList {
|
||||
pub fn new() -> ExpUnrolledLinkedList {
|
||||
ExpUnrolledLinkedList {
|
||||
@@ -178,8 +162,7 @@ impl ExpUnrolledLinkedList {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
|
||||
use common::{read_u32_vint, write_u32_vint};
|
||||
|
||||
use super::super::MemoryArena;
|
||||
use super::{len_to_capacity, *};
|
||||
@@ -205,18 +188,14 @@ mod tests {
|
||||
let mut eull = ExpUnrolledLinkedList::new();
|
||||
let data: Vec<u32> = (0..100).collect();
|
||||
for &el in &data {
|
||||
assert!(eull
|
||||
.writer(&mut arena)
|
||||
.write_u32::<LittleEndian>(el)
|
||||
.is_ok());
|
||||
eull.writer(&mut arena).write_u32_vint(el);
|
||||
}
|
||||
let mut buffer = Vec::new();
|
||||
eull.read_to_end(&arena, &mut buffer);
|
||||
let mut result = vec![];
|
||||
let mut remaining = &buffer[..];
|
||||
while !remaining.is_empty() {
|
||||
result.push(LittleEndian::read_u32(&remaining[..4]));
|
||||
remaining = &remaining[4..];
|
||||
result.push(read_u32_vint(&mut remaining));
|
||||
}
|
||||
assert_eq!(&result[..], &data[..]);
|
||||
}
|
||||
@@ -231,14 +210,11 @@ mod tests {
|
||||
let mut vec2: Vec<u8> = vec![];
|
||||
|
||||
for i in 0..9 {
|
||||
assert!(stack.writer(&mut eull).write_u32::<LittleEndian>(i).is_ok());
|
||||
assert!(vec1.write_u32::<LittleEndian>(i).is_ok());
|
||||
stack.writer(&mut eull).write_u32_vint(i);
|
||||
assert!(write_u32_vint(i, &mut vec1).is_ok());
|
||||
if i % 2 == 0 {
|
||||
assert!(stack2
|
||||
.writer(&mut eull)
|
||||
.write_u32::<LittleEndian>(i)
|
||||
.is_ok());
|
||||
assert!(vec2.write_u32::<LittleEndian>(i).is_ok());
|
||||
stack2.writer(&mut eull).write_u32_vint(i);
|
||||
assert!(write_u32_vint(i, &mut vec2).is_ok());
|
||||
}
|
||||
}
|
||||
let mut res1 = vec![];
|
||||
@@ -303,7 +279,6 @@ mod tests {
|
||||
mod bench {
|
||||
use std::iter;
|
||||
|
||||
use byteorder::{NativeEndian, WriteBytesExt};
|
||||
use test::Bencher;
|
||||
|
||||
use super::super::MemoryArena;
|
||||
@@ -339,7 +314,9 @@ mod bench {
|
||||
for s in 0..NUM_STACK {
|
||||
for i in 0u32..STACK_SIZE {
|
||||
let t = s * 392017 % NUM_STACK;
|
||||
let _ = stacks[t].writer(&mut arena).write_u32::<NativeEndian>(i);
|
||||
stacks[t]
|
||||
.writer(&mut arena)
|
||||
.extend_from_slice(&i.to_ne_bytes());
|
||||
}
|
||||
}
|
||||
});
@@ -372,6 +372,9 @@ impl QueryParser {
let term = Term::from_field_bytes(field, &bytes);
Ok(vec![(0, term)])
}
FieldType::JsonObject(_) => {
unimplemented!()
}
}
}

@@ -661,7 +664,7 @@ mod test {
let query = query_parser.parse_query("facet:/root/branch/leaf").unwrap();
assert_eq!(
format!("{:?}", query),
r#"TermQuery(Term(type=Facet, field=11, val="/root/branch/leaf"))"#
r#"TermQuery(Term(type=Facet, field=11, "/root/branch/leaf"))"#
);
}

@@ -674,7 +677,7 @@ mod test {
let query = query_parser.parse_query("text:hello").unwrap();
assert_eq!(
format!("{:?}", query),
r#"Boost(query=TermQuery(Term(type=Str, field=1, val="hello")), boost=2)"#
r#"Boost(query=TermQuery(Term(type=Str, field=1, "hello")), boost=2)"#
);
}

@@ -701,7 +704,7 @@ mod test {
let query = query_parser.parse_query("text:hello^2").unwrap();
assert_eq!(
format!("{:?}", query),
r#"Boost(query=Boost(query=TermQuery(Term(type=Str, field=1, val="hello")), boost=2), boost=2)"#
r#"Boost(query=Boost(query=TermQuery(Term(type=Str, field=1, "hello")), boost=2), boost=2)"#
);
}

@@ -736,7 +739,7 @@ mod test {
pub fn test_parse_query_untokenized() {
test_parse_query_to_logical_ast_helper(
"nottokenized:\"wordone wordtwo\"",
r#"Term(type=Str, field=7, val="wordone wordtwo")"#,
r#"Term(type=Str, field=7, "wordone wordtwo")"#,
false,
);
}

@@ -779,7 +782,7 @@ mod test {
.is_ok());
test_parse_query_to_logical_ast_helper(
"unsigned:2324",
"Term(type=U64, field=3, val=2324)",
"Term(type=U64, field=3, 2324)",
false,
);

@@ -806,7 +809,7 @@ mod test {
fn test_parse_bytes() {
test_parse_query_to_logical_ast_helper(
"bytes:YnVidQ==",
"Term(type=Bytes, field=12, val=[98, 117, 98, 117])",
"Term(type=Bytes, field=12, [98, 117, 98, 117])",
false,
);
}

@@ -821,7 +824,7 @@ mod test {
fn test_parse_bytes_phrase() {
test_parse_query_to_logical_ast_helper(
"bytes:\"YnVidQ==\"",
"Term(type=Bytes, field=12, val=[98, 117, 98, 117])",
"Term(type=Bytes, field=12, [98, 117, 98, 117])",
false,
);
}

@@ -837,12 +840,12 @@ mod test {
fn test_parse_query_to_ast_ab_c() {
test_parse_query_to_logical_ast_helper(
"(+title:a +title:b) title:c",
r#"((+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b")) Term(type=Str, field=0, val="c"))"#,
r#"((+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b")) Term(type=Str, field=0, "c"))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"(+title:a +title:b) title:c",
r#"(+(+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b")) +Term(type=Str, field=0, val="c"))"#,
r#"(+(+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b")) +Term(type=Str, field=0, "c"))"#,
true,
);
}

@@ -851,17 +854,17 @@ mod test {
pub fn test_parse_query_to_ast_single_term() {
test_parse_query_to_logical_ast_helper(
"title:toto",
r#"Term(type=Str, field=0, val="toto")"#,
r#"Term(type=Str, field=0, "toto")"#,
false,
);
test_parse_query_to_logical_ast_helper(
"+title:toto",
r#"Term(type=Str, field=0, val="toto")"#,
r#"Term(type=Str, field=0, "toto")"#,
false,
);
test_parse_query_to_logical_ast_helper(
"+title:toto -titi",
r#"(+Term(type=Str, field=0, val="toto") -(Term(type=Str, field=0, val="titi") Term(type=Str, field=1, val="titi")))"#,
r#"(+Term(type=Str, field=0, "toto") -(Term(type=Str, field=0, "titi") Term(type=Str, field=1, "titi")))"#,
false,
);
}

@@ -878,12 +881,12 @@ mod test {
pub fn test_parse_query_to_ast_two_terms() {
test_parse_query_to_logical_ast_helper(
"title:a b",
r#"(Term(type=Str, field=0, val="a") (Term(type=Str, field=0, val="b") Term(type=Str, field=1, val="b")))"#,
r#"(Term(type=Str, field=0, "a") (Term(type=Str, field=0, "b") Term(type=Str, field=1, "b")))"#,
false,
);
test_parse_query_to_logical_ast_helper(
r#"title:"a b""#,
r#""[(0, Term(type=Str, field=0, val="a")), (1, Term(type=Str, field=0, val="b"))]""#,
r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]""#,
false,
);
}

@@ -892,37 +895,37 @@ mod test {
pub fn test_parse_query_to_ast_ranges() {
test_parse_query_to_logical_ast_helper(
"title:[a TO b]",
r#"(Included(Term(type=Str, field=0, val="a")) TO Included(Term(type=Str, field=0, val="b")))"#,
r#"(Included(Term(type=Str, field=0, "a")) TO Included(Term(type=Str, field=0, "b")))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"[a TO b]",
r#"((Included(Term(type=Str, field=0, val="a")) TO Included(Term(type=Str, field=0, val="b"))) (Included(Term(type=Str, field=1, val="a")) TO Included(Term(type=Str, field=1, val="b"))))"#,
r#"((Included(Term(type=Str, field=0, "a")) TO Included(Term(type=Str, field=0, "b"))) (Included(Term(type=Str, field=1, "a")) TO Included(Term(type=Str, field=1, "b"))))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"title:{titi TO toto}",
r#"(Excluded(Term(type=Str, field=0, val="titi")) TO Excluded(Term(type=Str, field=0, val="toto")))"#,
r#"(Excluded(Term(type=Str, field=0, "titi")) TO Excluded(Term(type=Str, field=0, "toto")))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"title:{* TO toto}",
r#"(Unbounded TO Excluded(Term(type=Str, field=0, val="toto")))"#,
r#"(Unbounded TO Excluded(Term(type=Str, field=0, "toto")))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"title:{titi TO *}",
r#"(Excluded(Term(type=Str, field=0, val="titi")) TO Unbounded)"#,
r#"(Excluded(Term(type=Str, field=0, "titi")) TO Unbounded)"#,
false,
);
test_parse_query_to_logical_ast_helper(
"signed:{-5 TO 3}",
r#"(Excluded(Term(type=I64, field=2, val=-5)) TO Excluded(Term(type=I64, field=2, val=3)))"#,
r#"(Excluded(Term(type=I64, field=2, -5)) TO Excluded(Term(type=I64, field=2, 3)))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"float:{-1.5 TO 1.5}",
r#"(Excluded(Term(type=F64, field=10, val=-1.5)) TO Excluded(Term(type=F64, field=10, val=1.5)))"#,
r#"(Excluded(Term(type=F64, field=10, -1.5)) TO Excluded(Term(type=F64, field=10, 1.5)))"#,
false,
);
test_parse_query_to_logical_ast_helper("*", "*", false);

@@ -1051,27 +1054,27 @@ mod test {
pub fn test_parse_query_to_ast_conjunction() {
test_parse_query_to_logical_ast_helper(
"title:toto",
r#"Term(type=Str, field=0, val="toto")"#,
r#"Term(type=Str, field=0, "toto")"#,
true,
);
test_parse_query_to_logical_ast_helper(
"+title:toto",
r#"Term(type=Str, field=0, val="toto")"#,
r#"Term(type=Str, field=0, "toto")"#,
true,
);
test_parse_query_to_logical_ast_helper(
"+title:toto -titi",
r#"(+Term(type=Str, field=0, val="toto") -(Term(type=Str, field=0, val="titi") Term(type=Str, field=1, val="titi")))"#,
r#"(+Term(type=Str, field=0, "toto") -(Term(type=Str, field=0, "titi") Term(type=Str, field=1, "titi")))"#,
true,
);
test_parse_query_to_logical_ast_helper(
"title:a b",
r#"(+Term(type=Str, field=0, val="a") +(Term(type=Str, field=0, val="b") Term(type=Str, field=1, val="b")))"#,
r#"(+Term(type=Str, field=0, "a") +(Term(type=Str, field=0, "b") Term(type=Str, field=1, "b")))"#,
true,
);
test_parse_query_to_logical_ast_helper(
"title:\"a b\"",
r#""[(0, Term(type=Str, field=0, val="a")), (1, Term(type=Str, field=0, val="b"))]""#,
r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]""#,
true,
);
}

@@ -1080,7 +1083,7 @@ mod test {
pub fn test_query_parser_hyphen() {
test_parse_query_to_logical_ast_helper(
"title:www-form-encoded",
r#""[(0, Term(type=Str, field=0, val="www")), (1, Term(type=Str, field=0, val="form")), (2, Term(type=Str, field=0, val="encoded"))]""#,
r#""[(0, Term(type=Str, field=0, "www")), (1, Term(type=Str, field=0, "form")), (2, Term(type=Str, field=0, "encoded"))]""#,
false,
);
}

@@ -1090,7 +1093,7 @@ mod test {
for &default_conjunction in &[false, true] {
test_parse_query_to_logical_ast_helper(
"title:a AND title:b",
r#"(+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b"))"#,
r#"(+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b"))"#,
default_conjunction,
);
}

@@ -1101,7 +1104,7 @@ mod test {
for &default_conjunction in &[false, true] {
test_parse_query_to_logical_ast_helper(
"title:a OR title:b",
r#"(Term(type=Str, field=0, val="a") Term(type=Str, field=0, val="b"))"#,
r#"(Term(type=Str, field=0, "a") Term(type=Str, field=0, "b"))"#,
default_conjunction,
);
}

@@ -174,7 +174,7 @@ mod tests {
);
assert_eq!(
format!("{:?}", term_query),
r#"TermQuery(Term(type=Str, field=1, val="hello"))"#
r#"TermQuery(Term(type=Str, field=1, "hello"))"#
);
}

@@ -137,6 +137,7 @@ impl FieldEntry {
FieldType::Str(ref options) => options.is_stored(),
FieldType::Facet(ref options) => options.is_stored(),
FieldType::Bytes(ref options) => options.is_stored(),
FieldType::JsonObject(ref options) => options.is_stored(),
}
}
}

@@ -1,25 +1,33 @@
use crate::schema::bytes_options::BytesOptions;
use crate::schema::facet_options::FacetOptions;
use crate::schema::Facet;
use crate::schema::IndexRecordOption;
use crate::schema::JsonObjectOptions;
use crate::schema::TextFieldIndexing;
use crate::schema::Value;
use crate::schema::{IntOptions, TextOptions};
use crate::tokenizer::PreTokenizedString;
use chrono::{FixedOffset, Utc};
use serde::{Deserialize, Serialize};
use serde_json::Value as JsonValue;

use crate::schema::bytes_options::BytesOptions;
use crate::schema::facet_options::FacetOptions;
use crate::schema::{Facet, IndexRecordOption, IntOptions, TextFieldIndexing, TextOptions, Value};
use crate::tokenizer::PreTokenizedString;
use thiserror::Error;

/// Possible error that may occur while parsing a field value
/// At this point the JSON is known to be valid.
#[derive(Debug, PartialEq)]
#[derive(Debug, PartialEq, Error)]
pub enum ValueParsingError {
/// Encountered a numerical value that overflows or underflow its integer type.
OverflowError(String),
/// The json node is not of the correct type.
/// (e.g. 3 for a `Str` type or `"abc"` for a u64 type)
/// Tantivy will try to autocast values.
TypeError(String),
/// The json node is a string but contains json that is
/// not valid base64.
InvalidBase64(String),
#[error("Overflow error. Expected {expected}, got {json}")]
OverflowError {
expected: &'static str,
json: serde_json::Value,
},
#[error("Type error. Expected {expected}, got {json}")]
TypeError {
expected: &'static str,
json: serde_json::Value,
},
#[error("Invalid base64: {base64}")]
InvalidBase64 { base64: String },
}
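A quick sketch (not from the diff) of how the `thiserror`-derived `Display` messages read for the new structured variants; it assumes the enum is reachable from user code:

    use serde_json::json;

    fn demo_error_message() -> String {
        // `expected` and `json` are the fields declared on the variant above.
        let err = ValueParsingError::TypeError {
            expected: "an integer",
            json: json!("abc"),
        };
        // Renders as: Type error. Expected an integer, got "abc"
        format!("{}", err)
    }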

/// Type of the value that a field can take.

@@ -67,6 +75,18 @@ impl Type {
*self as u8
}

pub fn name(&self) -> &'static str {
match self {
Type::Str => "Str",
Type::U64 => "U64",
Type::I64 => "I64",
Type::F64 => "F64",
Type::Date => "Date",
Type::Facet => "Facet",
Type::Bytes => "Bytes",
}
}

/// Interprets a 1byte code as a type.
/// Returns None if the code is invalid.
pub fn from_code(code: u8) -> Option<Self> {

@@ -104,6 +124,8 @@ pub enum FieldType {
Facet(FacetOptions),
/// Bytes (one per document)
Bytes(BytesOptions),
/// Json object
JsonObject(JsonObjectOptions),
}

impl FieldType {

@@ -117,6 +139,9 @@ impl FieldType {
FieldType::Date(_) => Type::Date,
FieldType::Facet(_) => Type::Facet,
FieldType::Bytes(_) => Type::Bytes,
FieldType::JsonObject(_) => {
unimplemented!()
}
}
}

@@ -130,6 +155,7 @@ impl FieldType {
FieldType::Date(ref date_options) => date_options.is_indexed(),
FieldType::Facet(ref _facet_options) => true,
FieldType::Bytes(ref bytes_options) => bytes_options.is_indexed(),
FieldType::JsonObject(ref json_object_options) => json_object_options.is_indexed(),
}
}

@@ -146,12 +172,17 @@ impl FieldType {
| FieldType::Date(ref int_options) => int_options.fieldnorms(),
FieldType::Facet(_) => false,
FieldType::Bytes(ref bytes_options) => bytes_options.fieldnorms(),
FieldType::JsonObject(ref json_object_options) => false,
}
}

/// Given a field configuration, return the maximal possible
/// `IndexRecordOption` available.
///
/// For the Json object, this does not necessarily mean it is the index record
/// option level is available for all terms.
/// (Non string terms have the Basic indexing option at most.)
///
/// If the field is not indexed, then returns `None`.
pub fn get_index_record_option(&self) -> Option<IndexRecordOption> {
match *self {

@@ -176,6 +207,10 @@ impl FieldType {
None
}
}
FieldType::JsonObject(ref json_obj_options) => json_obj_options
.indexing
.as_ref()
.map(TextFieldIndexing::index_option),
}
}

@@ -189,25 +224,30 @@ impl FieldType {
JsonValue::String(ref field_text) => match *self {
FieldType::Date(_) => {
let dt_with_fixed_tz: chrono::DateTime<FixedOffset> =
chrono::DateTime::parse_from_rfc3339(field_text).map_err(|err| {
ValueParsingError::TypeError(format!(
"Failed to parse date from JSON. Expected rfc3339 format, got {}. \
{:?}",
field_text, err
))
chrono::DateTime::parse_from_rfc3339(field_text).map_err(|_err| {
ValueParsingError::TypeError {
expected: "rfc3339 format",
json: json.clone(),
}
})?;
Ok(Value::Date(dt_with_fixed_tz.with_timezone(&Utc)))
}
FieldType::Str(_) => Ok(Value::Str(field_text.clone())),
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => Err(
ValueParsingError::TypeError(format!("Expected an integer, got {:?}", json)),
),
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => {
Err(ValueParsingError::TypeError {
expected: "an integer",
json: json.clone(),
})
}
FieldType::Facet(_) => Ok(Value::Facet(Facet::from(field_text))),
FieldType::Bytes(_) => base64::decode(field_text).map(Value::Bytes).map_err(|_| {
ValueParsingError::InvalidBase64(format!(
"Expected base64 string, got {:?}",
field_text
))
ValueParsingError::InvalidBase64 {
base64: field_text.clone(),
}
}),
FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
expected: "a json object",
json: json.clone(),
}),
},
JsonValue::Number(ref field_val_num) => match *self {

@@ -215,30 +255,42 @@ impl FieldType {
if let Some(field_val_i64) = field_val_num.as_i64() {
Ok(Value::I64(field_val_i64))
} else {
let msg = format!("Expected an i64 int, got {:?}", json);
Err(ValueParsingError::OverflowError(msg))
Err(ValueParsingError::OverflowError {
expected: "an i64 int",
json: json.clone(),
})
}
}
FieldType::U64(_) => {
if let Some(field_val_u64) = field_val_num.as_u64() {
Ok(Value::U64(field_val_u64))
} else {
let msg = format!("Expected a u64 int, got {:?}", json);
Err(ValueParsingError::OverflowError(msg))
Err(ValueParsingError::OverflowError {
expected: "u64",
json: json.clone(),
})
}
}
FieldType::F64(_) => {
if let Some(field_val_f64) = field_val_num.as_f64() {
Ok(Value::F64(field_val_f64))
} else {
let msg = format!("Expected a f64 int, got {:?}", json);
Err(ValueParsingError::OverflowError(msg))
Err(ValueParsingError::OverflowError {
expected: "a f64",
json: json.clone(),
})
}
}
FieldType::Str(_) | FieldType::Facet(_) | FieldType::Bytes(_) => {
let msg = format!("Expected a string, got {:?}", json);
Err(ValueParsingError::TypeError(msg))
Err(ValueParsingError::TypeError {
expected: "a string",
json: json.clone(),
})
}
FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
expected: "a json object",
json: json.clone(),
}),
},
JsonValue::Object(_) => match *self {
FieldType::Str(_) => {

@@ -247,28 +299,21 @@ impl FieldType {
{
Ok(Value::PreTokStr(tok_str_val))
} else {
let msg = format!(
"Json value {:?} cannot be translated to PreTokenizedString.",
json
);
Err(ValueParsingError::TypeError(msg))
Err(ValueParsingError::TypeError {
expected: "a string or an pretokenized string",
json: json.clone(),
})
}
}
_ => {
let msg = format!(
"Json value not supported error {:?}. Expected {:?}",
json, self
);
Err(ValueParsingError::TypeError(msg))
}
_ => Err(ValueParsingError::TypeError {
expected: self.value_type().name(),
json: json.clone(),
}),
},
_ => {
let msg = format!(
"Json value not supported error {:?}. Expected {:?}",
json, self
);
Err(ValueParsingError::TypeError(msg))
}
_ => Err(ValueParsingError::TypeError {
expected: self.value_type().name(),
json: json.clone(),
}),
}
}
}

@@ -317,13 +362,13 @@ mod tests {

let result = FieldType::Bytes(Default::default()).value_from_json(&json!(521));
match result {
Err(ValueParsingError::TypeError(_)) => {}
Err(ValueParsingError::TypeError { .. }) => {}
_ => panic!("Expected parse failure for wrong type"),
}

let result = FieldType::Bytes(Default::default()).value_from_json(&json!("-"));
match result {
Err(ValueParsingError::InvalidBase64(_)) => {}
Err(ValueParsingError::InvalidBase64 { .. }) => {}
_ => panic!("Expected parse failure for invalid base64"),
}
}

25
src/schema/json_object_options.rs
Normal file

@@ -0,0 +1,25 @@
use serde::{Deserialize, Serialize};

use crate::schema::TextFieldIndexing;

#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct JsonObjectOptions {
stored: bool,
// If set to some, int, date, f64 and text will be indexed.
// Text will use the TextFieldIndexing setting for indexing.
pub(crate) indexing: Option<TextFieldIndexing>,
}

impl JsonObjectOptions {
pub fn is_stored(&self) -> bool {
self.stored
}

pub fn is_indexed(&self) -> bool {
self.indexing.is_some()
}

pub fn get_text_indexing_option(&self) -> Option<&TextFieldIndexing> {
self.indexing.as_ref()
}
}
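As a small illustration (not part of the diff): with the derived `Default`, a JSON field starts out neither stored nor indexed, and `is_indexed()` only flips once an indexing configuration is set.

    fn json_options_defaults() {
        let opts = JsonObjectOptions::default();
        assert!(!opts.is_stored());
        assert!(!opts.is_indexed());
        assert!(opts.get_text_indexing_option().is_none());
    }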
25
src/schema/json_object_utils.rs
Normal file

@@ -0,0 +1,25 @@
use serde_json::Number;

use crate::schema::Value;

pub fn infer_type_from_string(string: &str) -> Value {
// TODO can we avoid the copy?
Value::Str(string.to_string())
}

pub fn infer_type_from_number(number: &Number) -> Value {
if let Some(num_val) = number.as_u64() {
return Value::U64(num_val);
}
if let Some(num_val) = number.as_i64() {
return Value::I64(num_val);
}
if let Some(num_val) = number.as_f64() {
return Value::F64(num_val);
}
// This should never happen as long as we
// don't use the serde_json feature = "arbitrary_precision".
Value::Str(number.to_string())
}

// TODO add unit tests
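A sketch of the kind of unit test the TODO above is asking for (not part of the diff); it only relies on the two helpers defined in this file:

    #[cfg(test)]
    mod tests {
        use serde_json::Number;

        use super::{infer_type_from_number, infer_type_from_string};
        use crate::schema::Value;

        #[test]
        fn test_infer_type_from_number() {
            // u64 is tried first, then i64, then f64.
            assert_eq!(infer_type_from_number(&Number::from(3u64)), Value::U64(3));
            assert_eq!(infer_type_from_number(&Number::from(-2i64)), Value::I64(-2));
            assert_eq!(
                infer_type_from_number(&Number::from_f64(1.5).unwrap()),
                Value::F64(1.5)
            );
        }

        #[test]
        fn test_infer_type_from_string() {
            assert_eq!(
                infer_type_from_string("hello"),
                Value::Str("hello".to_string())
            );
        }
    }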
@@ -104,7 +104,7 @@ mod document;
mod facet;
mod facet_options;
mod schema;
mod term;
pub(crate) mod term;

mod field_entry;
mod field_type;

@@ -114,11 +114,14 @@ mod bytes_options;
mod field;
mod index_record_option;
mod int_options;
mod json_object_options;
mod named_field_document;
mod text_options;
mod value;

mod flags;
mod json_object_utils;
mod term_writer;

pub use self::bytes_options::BytesOptions;
pub use self::document::Document;

@@ -138,6 +141,8 @@ pub use self::term::Term;
pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};
pub use self::value::Value;

pub use self::json_object_options::JsonObjectOptions;

/// Validator for a potential `field_name`.
/// Returns true if the name can be use for a field name.
///

@@ -666,7 +666,7 @@ mod tests {
json_err,
Err(DocParsingError::ValueError(
_,
ValueParsingError::TypeError(_)
ValueParsingError::TypeError { .. }
))
);
}

@@ -684,7 +684,7 @@ mod tests {
json_err,
Err(DocParsingError::ValueError(
_,
ValueParsingError::OverflowError(_)
ValueParsingError::OverflowError { .. }
))
);
}

@@ -702,7 +702,7 @@ mod tests {
json_err,
Err(DocParsingError::ValueError(
_,
ValueParsingError::OverflowError(_)
ValueParsingError::OverflowError { .. }
))
));
}

@@ -720,7 +720,7 @@ mod tests {
json_err,
Err(DocParsingError::ValueError(
_,
ValueParsingError::OverflowError(_)
ValueParsingError::OverflowError { .. }
))
);
}

@@ -8,8 +8,27 @@ use crate::DateTime;

/// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term.
/// <field> + <type byte> + <value len>
///
/// - <field> is a big endian encoded u32 field id
/// - <type_byte>'s most significant bit expresses whether the term is a json term or not
/// The remaining 7 bits are used to encode the type of the value.
/// If this is a JSON term, the type is the type of the leaf of the json.
///
/// - <value> is, if this is not the json term, a binary representation specific to the type.
/// If it is a JSON Term, then it is preprended with the path that leads to this leaf value.
const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8;

/// The first bit of the `type_byte`
/// is used to encode whether a Term is for
/// json or not.
pub const JSON_MARKER_BIT: u8 = 128u8;
/// Separates the different segments of
/// the json path.
pub const JSON_PATH_SEGMENT_SEP: u8 = 0u8;
/// Separates the json path and the value in
/// a JSON term binary representation.
pub const JSON_END_OF_PATH: u8 = 30u8;
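To make the layout described above concrete, here is a sketch (not from the diff, and it assumes crate-internal access to these constants and to `Type`) of the bytes of a JSON `Str` term for field 1 with path `attributes.color` and value `red`; it mirrors what `JsonTermWriter` in `src/schema/term_writer.rs` produces:

    fn json_str_term_layout() -> Vec<u8> {
        // Sketch: assumes JSON_MARKER_BIT, JSON_PATH_SEGMENT_SEP, JSON_END_OF_PATH
        // and `Type` from this module are in scope.
        let mut buffer = Vec::new();
        buffer.extend_from_slice(&1u32.to_be_bytes()); // <field>: big endian u32 field id
        buffer.push(JSON_MARKER_BIT | Type::Str.to_code()); // <type byte>: JSON bit + leaf type
        buffer.extend_from_slice(b"attributes");
        buffer.push(JSON_PATH_SEGMENT_SEP); // 0u8 between path segments
        buffer.extend_from_slice(b"color");
        buffer.push(JSON_END_OF_PATH); // 30u8 closes the path
        buffer.extend_from_slice(b"red"); // the value itself
        buffer
    }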

/// Term represents the value that the token can take.
///
/// It actually wraps a `Vec<u8>`.

@@ -164,13 +183,23 @@ where B: AsRef<[u8]>
Term(data)
}

/// Return the type of the term.
pub fn typ(&self) -> Type {
/// Returns true iff the term is a JSON term.
pub fn is_json(&self) -> bool {
self.typ_code() & JSON_MARKER_BIT == JSON_MARKER_BIT
}

fn typ_code(&self) -> u8 {
assert!(
self.as_slice().len() >= 5,
"the type does byte representation is too short"
"the byte representation is too short"
);
Type::from_code(self.as_slice()[4]).expect("The term has an invalid type code")
self.as_slice()[4]
}

/// Return the type of the term.
pub fn typ(&self) -> Type {
Type::from_code(self.typ_code() & (JSON_MARKER_BIT - 1))
.expect("The term has an invalid type code")
}

/// Returns the field.

@@ -189,11 +218,15 @@ where B: AsRef<[u8]>
}

fn get_fast_type<T: FastValue>(&self) -> Option<T> {
if self.typ() != T::to_type() || self.as_slice().len() != FAST_VALUE_TERM_LEN {
if self.typ() != T::to_type() {
return None;
}
let mut value_bytes = [0u8; 8];
value_bytes.copy_from_slice(self.value_bytes());
let bytes = self.value_bytes_without_path();
if bytes.len() != 8 {
return None;
}
value_bytes.copy_from_slice(self.value_bytes_without_path());
let value_u64 = u64::from_be_bytes(value_bytes);
Some(FastValue::from_u64(value_u64))
}

@@ -206,6 +239,35 @@ where B: AsRef<[u8]>
self.get_fast_type::<i64>()
}

fn json_path_value(&self) -> Option<&str> {
if !self.is_json() {
return None;
}
let value_bytes = self.value_bytes();
let pos = value_bytes
.iter()
.cloned()
.position(|b| b == JSON_END_OF_PATH)
.expect("Could not find end of path");
let json_path =
str::from_utf8(&value_bytes[..pos]).expect("JSON value path is Invalid utf-8");
Some(json_path)
}

fn value_bytes_without_path(&self) -> &[u8] {
let value_bytes = self.value_bytes();
if self.is_json() {
let pos = value_bytes
.iter()
.cloned()
.position(|b| b == JSON_END_OF_PATH)
.expect("Could not find end of path");
&value_bytes[pos + 1..]
} else {
value_bytes
}
}

/// Returns the `f64` value stored in a term.
///
/// Returns None if the term is not of the f64 type, or if the term byte representation

@@ -233,7 +295,7 @@ where B: AsRef<[u8]>
if self.typ() != Type::Str {
return None;
}
str::from_utf8(self.value_bytes()).ok()
str::from_utf8(self.value_bytes_without_path()).ok()
}

/// Returns the facet associated with the term.

@@ -247,7 +309,7 @@ where B: AsRef<[u8]>
if self.typ() != Type::Facet {
return None;
}
let facet_encode_str = str::from_utf8(self.value_bytes()).ok()?;
let facet_encode_str = str::from_utf8(self.value_bytes_without_path()).ok()?;
Some(Facet::from_encoded_string(facet_encode_str.to_string()))
}

@@ -261,7 +323,7 @@ where B: AsRef<[u8]>
if self.typ() != Type::Bytes {
return None;
}
Some(self.value_bytes())
Some(self.value_bytes_without_path())
}

/// Returns the serialized value of the term.

@@ -290,14 +352,24 @@ fn write_opt<T: std::fmt::Debug>(f: &mut fmt::Formatter, val_opt: Option<T>) ->
Ok(())
}

impl fmt::Debug for Term {
impl<B> fmt::Debug for Term<B>
where
B: AsRef<[u8]>,
{
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let field_id = self.field().field_id();
let typ = self.typ();
write!(f, "Term(type={:?}, field={}, val=", typ, field_id,)?;
write!(f, "Term(type={:?}, field={}, ", typ, field_id)?;
if let Some(path) = self.json_path_value() {
write!(
f,
"path={}, ",
path.replace(std::str::from_utf8(&[JSON_PATH_SEGMENT_SEP]).unwrap(), ".")
)?;
}
match typ {
Type::Str => {
let s = str::from_utf8(self.value_bytes()).ok();
let s = self.as_str();
write_opt(f, s)?;
}
Type::U64 => {

131
src/schema/term_writer.rs
Normal file

@@ -0,0 +1,131 @@
use crate::fastfield::FastValue;
use crate::schema::{Field, Type};
use crate::Term;

// Copyright (C) 2021 Quickwit, Inc.
//
// Quickwit is offered under the AGPL v3.0 and as commercial software.
// For commercial licensing, contact us at hello@quickwit.io.
//
// AGPL:
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//

use super::term::JSON_MARKER_BIT;
pub struct JsonTermWriter {
buffer: Vec<u8>,
path_stack: Vec<usize>,
}

impl JsonTermWriter {
pub fn new() -> Self {
let mut buffer = Vec::with_capacity(100);
buffer.resize(5, 0);
let mut path_stack = Vec::with_capacity(10);
path_stack.push(5);
Self { buffer, path_stack }
}

pub fn with_field(field: Field) -> Self {
let mut json_term_writer: JsonTermWriter = Self::new();
json_term_writer.set_field(field);
json_term_writer
}

fn trim_to_end_of_path(&mut self) {
let end_of_path = *self.path_stack.last().unwrap();
self.buffer.resize(end_of_path, 0u8);
self.buffer[end_of_path - 1] = super::term::JSON_PATH_SEGMENT_SEP;
}

fn close_path_and_set_type(&mut self, typ: Type) {
self.trim_to_end_of_path();
self.buffer[4] = JSON_MARKER_BIT | typ.to_code();
let end_of_path = self.buffer.len();
self.buffer[end_of_path - 1] = super::term::JSON_END_OF_PATH;
}

pub fn push_path_segment(&mut self, segment: &str) {
// the path stack should never be empty.
self.trim_to_end_of_path();
self.buffer.extend_from_slice(segment.as_bytes());
self.buffer.push(super::term::JSON_PATH_SEGMENT_SEP);
self.path_stack.push(self.buffer.len());
}

pub fn pop_path_segment(&mut self) {
self.path_stack.pop();
assert!(self.path_stack.len() > 0);
self.trim_to_end_of_path();
}

pub fn set_text(&mut self, text: &str) {
self.close_path_and_set_type(Type::Str);
self.buffer.extend_from_slice(text.as_bytes());
}

pub fn set_i64(&mut self, val: i64) {
self.close_path_and_set_type(Type::I64);
self.buffer
.extend_from_slice(val.to_u64().to_be_bytes().as_slice());
}

pub fn set_field(&mut self, field: Field) {
self.buffer[0..4].copy_from_slice(field.field_id().to_be_bytes().as_ref());
}

pub fn term(&self) -> Term<&[u8]> {
Term::wrap(&self.buffer[..])
}
}

#[cfg(test)]
mod tests {
use crate::schema::Field;

use super::JsonTermWriter;

#[test]
fn test_json_writer() {
let field = Field::from_field_id(1);
let mut json_writer = JsonTermWriter::with_field(field);
json_writer.push_path_segment("attributes");
json_writer.push_path_segment("color");
json_writer.set_text("red");
assert_eq!(
format!("{:?}", json_writer.term()),
"Term(type=Str, field=1, path=attributes.color, \"red\")"
);
json_writer.set_text("blue");
assert_eq!(
format!("{:?}", json_writer.term()),
"Term(type=Str, field=1, path=attributes.color, \"blue\")"
);
json_writer.pop_path_segment();
json_writer.push_path_segment("dimensions");
json_writer.push_path_segment("width");
json_writer.set_i64(400);
assert_eq!(
format!("{:?}", json_writer.term()),
"Term(type=I64, field=1, path=attributes.dimensions.width, 400)"
);
json_writer.pop_path_segment();
json_writer.push_path_segment("height");
json_writer.set_i64(300);
assert_eq!(
format!("{:?}", json_writer.term()),
"Term(type=I64, field=1, path=attributes.dimensions.height, 300)"
);
}
}
@@ -46,7 +46,7 @@ impl TextOptions {
/// Essentially, should we store the term frequency and/or the positions (See
/// [`IndexRecordOption`](./enum.IndexRecordOption.html)).
/// - the name of the `Tokenizer` that should be used to process the field.
#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)]
pub struct TextFieldIndexing {
record: IndexRecordOption,
fieldnorms: bool,

@@ -27,6 +27,7 @@ pub enum Value {
Facet(Facet),
/// Arbitrarily sized byte array
Bytes(Vec<u8>),
JsonObject(serde_json::Map<String, serde_json::Value>),
}

impl Eq for Value {}

@@ -43,6 +44,7 @@ impl Serialize for Value {
Value::Date(ref date) => serializer.serialize_str(&date.to_rfc3339()),
Value::Facet(ref facet) => facet.serialize(serializer),
Value::Bytes(ref bytes) => serializer.serialize_bytes(bytes),
Value::JsonObject(ref obj) => obj.serialize(serializer),
}
}
}

@@ -248,6 +250,7 @@ mod binary_serialize {
const DATE_CODE: u8 = 5;
const F64_CODE: u8 = 6;
const EXT_CODE: u8 = 7;
const JSON_OBJ_CODE: u8 = 8;

// extended types

@@ -296,8 +299,14 @@ mod binary_serialize {
BYTES_CODE.serialize(writer)?;
bytes.serialize(writer)
}
Value::JsonObject(ref map) => {
JSON_OBJ_CODE.serialize(writer)?;
serde_json::to_writer(writer, &map)?;
Ok(())
}
}
}

fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let type_code = u8::deserialize(reader)?;
match type_code {

@@ -347,6 +356,10 @@ mod binary_serialize {
)),
}
}
JSON_OBJ_CODE => {
let map = serde_json::from_reader(reader)?;
Ok(Value::JsonObject(map))
}
_ => Err(io::Error::new(
io::ErrorKind::InvalidData,
format!("No field type is associated with code {:?}", type_code),

@@ -127,7 +127,6 @@ mod remove_long;
mod simple_tokenizer;
mod stemmer;
mod stop_word_filter;
mod token_stream_chain;
mod tokenized_string;
mod tokenizer;
mod tokenizer_manager;

@@ -143,7 +142,6 @@ pub use self::remove_long::RemoveLongFilter;
pub use self::simple_tokenizer::SimpleTokenizer;
pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter;
pub(crate) use self::token_stream_chain::TokenStreamChain;
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::{
BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer,

@@ -1,95 +0,0 @@
use std::ops::DerefMut;

use crate::tokenizer::{BoxTokenStream, Token, TokenStream};

const POSITION_GAP: usize = 2;

pub(crate) struct TokenStreamChain<'a> {
offsets: Vec<usize>,
token_streams: Vec<BoxTokenStream<'a>>,
position_shift: usize,
stream_idx: usize,
token: Token,
}

impl<'a> TokenStreamChain<'a> {
pub fn new(
offsets: Vec<usize>,
token_streams: Vec<BoxTokenStream<'a>>,
) -> TokenStreamChain<'a> {
TokenStreamChain {
offsets,
stream_idx: 0,
token_streams,
position_shift: 0,
token: Token::default(),
}
}
}

impl<'a> TokenStream for TokenStreamChain<'a> {
fn advance(&mut self) -> bool {
while self.stream_idx < self.token_streams.len() {
let token_stream = self.token_streams[self.stream_idx].deref_mut();
if token_stream.advance() {
let token = token_stream.token();
let offset_offset = self.offsets[self.stream_idx];
self.token.offset_from = token.offset_from + offset_offset;
self.token.offset_to = token.offset_to + offset_offset;
self.token.position = token.position + self.position_shift;
self.token.text.clear();
self.token.text.push_str(token.text.as_str());
return true;
} else {
self.stream_idx += 1;
self.position_shift = self.token.position.wrapping_add(POSITION_GAP);
}
}
false
}

fn token(&self) -> &Token {
assert!(
self.stream_idx <= self.token_streams.len(),
"You called .token(), after the end of the token stream has been reached"
);
&self.token
}

fn token_mut(&mut self) -> &mut Token {
assert!(
self.stream_idx <= self.token_streams.len(),
"You called .token(), after the end of the token stream has been reached"
);
&mut self.token
}
}

#[cfg(test)]
mod tests {
use super::super::{SimpleTokenizer, TokenStream, Tokenizer};
use super::{TokenStreamChain, POSITION_GAP};

#[test]
fn test_chain_first_emits_no_tokens() {
let token_streams = vec![
SimpleTokenizer.token_stream(""),
SimpleTokenizer.token_stream("hello world"),
];
let mut token_chain = TokenStreamChain::new(vec![0, 0], token_streams);

assert!(token_chain.advance());
assert_eq!(token_chain.token().text, "hello");
assert_eq!(token_chain.token().offset_from, 0);
assert_eq!(token_chain.token().offset_to, 5);
assert_eq!(token_chain.token().position, POSITION_GAP - 1);

assert!(token_chain.advance());
assert_eq!(token_chain.token().text, "world");
assert_eq!(token_chain.token().offset_from, 6);
assert_eq!(token_chain.token().offset_to, 11);
assert_eq!(token_chain.token().position, POSITION_GAP);

assert!(!token_chain.advance());
}
}
@@ -2,7 +2,7 @@ use std::cmp::Ordering;

use serde::{Deserialize, Serialize};

use crate::tokenizer::{BoxTokenStream, Token, TokenStream, TokenStreamChain};
use crate::tokenizer::{Token, TokenStream};

/// Struct representing pre-tokenized text
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]

@@ -40,32 +40,6 @@ impl From<PreTokenizedString> for PreTokenizedStream {
}
}

impl PreTokenizedStream {
/// Creates a TokenStream from PreTokenizedString array
pub fn chain_tokenized_strings<'a>(
tok_strings: &'a [&'a PreTokenizedString],
) -> BoxTokenStream {
if tok_strings.len() == 1 {
PreTokenizedStream::from((*tok_strings[0]).clone()).into()
} else {
let mut offsets = vec![];
let mut total_offset = 0;
for &tok_string in tok_strings {
offsets.push(total_offset);
if let Some(last_token) = tok_string.tokens.last() {
total_offset += last_token.offset_to;
}
}
// TODO remove the string cloning.
let token_streams: Vec<BoxTokenStream<'static>> = tok_strings
.iter()
.map(|&tok_string| PreTokenizedStream::from((*tok_string).clone()).into())
.collect();
TokenStreamChain::new(offsets, token_streams).into()
}
}
}

impl TokenStream for PreTokenizedStream {
fn advance(&mut self) -> bool {
self.current_token += 1;

@@ -125,68 +99,4 @@ mod tests {
}
assert!(!token_stream.advance());
}

#[test]
fn test_chain_tokenized_strings() {
let tok_text = PreTokenizedString {
text: String::from("A a"),
tokens: vec![
Token {
offset_from: 0,
offset_to: 1,
position: 0,
text: String::from("A"),
position_length: 1,
},
Token {
offset_from: 2,
offset_to: 3,
position: 1,
text: String::from("a"),
position_length: 1,
},
],
};

let chain_parts = vec![&tok_text, &tok_text];

let mut token_stream = PreTokenizedStream::chain_tokenized_strings(&chain_parts[..]);

let expected_tokens = vec![
Token {
offset_from: 0,
offset_to: 1,
position: 0,
text: String::from("A"),
position_length: 1,
},
Token {
offset_from: 2,
offset_to: 3,
position: 1,
text: String::from("a"),
position_length: 1,
},
Token {
offset_from: 3,
offset_to: 4,
position: 3,
text: String::from("A"),
position_length: 1,
},
Token {
offset_from: 5,
offset_to: 6,
position: 4,
text: String::from("a"),
position_length: 1,
},
];

for expected_token in expected_tokens {
assert!(token_stream.advance());
assert_eq!(token_stream.token(), &expected_token);
}
assert!(!token_stream.advance());
}
}

@@ -5,8 +5,6 @@ use std::ops::{Deref, DerefMut};

use serde::{Deserialize, Serialize};

use crate::tokenizer::TokenStreamChain;

/// Token
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
pub struct Token {

@@ -84,31 +82,6 @@ impl TextAnalyzer {
self
}

/// Tokenize an array`&str`
///
/// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were
/// one concatenated `&str`, with an artificial position gap of `2` between the different fields
/// to prevent accidental `PhraseQuery` to match accross two terms.
pub fn token_stream_texts<'a>(&self, texts: &'a [&'a str]) -> BoxTokenStream<'a> {
assert!(!texts.is_empty());
if texts.len() == 1 {
self.token_stream(texts[0])
} else {
let mut offsets = vec![];
let mut total_offset = 0;
for &text in texts {
offsets.push(total_offset);
total_offset += text.len();
}
let token_streams: Vec<BoxTokenStream<'a>> = texts
.iter()
.cloned()
.map(|text| self.token_stream(text))
.collect();
From::from(TokenStreamChain::new(offsets, token_streams))
}
}

/// Creates a token stream for a given `str`.
pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
let mut token_stream = self.tokenizer.token_stream(text);

@@ -284,13 +257,10 @@ pub trait TokenStream {
/// and push the tokens to a sink function.
///
/// Remove this.
fn process(&mut self, sink: &mut dyn FnMut(&Token)) -> u32 {
let mut num_tokens_pushed = 0u32;
fn process(&mut self, sink: &mut dyn FnMut(&Token)) {
while self.advance() {
sink(self.token());
num_tokens_pushed += 1u32;
}
num_tokens_pushed
}
}