Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2025-12-30 05:52:54 +00:00)

Compare commits: fix_open_b...0.17 (38 commits)

| SHA1 |
|---|
| 2e255c4bef |
| 387592809f |
| cedced5bb0 |
| d31f045872 |
| 6656a70d1b |
| d36e0a9549 |
| 8771b2673f |
| a41d3d51a4 |
| cae34ffe47 |
| 4b62f7907d |
| 7fa6a0b665 |
| 458ed29a31 |
| e37775fe21 |
| 1cd2434a32 |
| de2cba6d1e |
| c0b1a58d27 |
| 848b795b9f |
| 091b668624 |
| 5004290daa |
| 5d2c2b804c |
| 1a92b588e0 |
| 010e92c118 |
| 2ead010c83 |
| c4f66eb185 |
| d7b46d2137 |
| d042ce74c7 |
| 7ba9e662b8 |
| fdd5ef85e5 |
| 704498a1ac |
| 1232af7928 |
| d37633e034 |
| 9815067171 |
| 972cb6c26d |
| 4dc80cfa25 |
| cef145790c |
| e05e2a0c51 |
| e028515caf |
| 850b9eaea4 |
.github/workflows/coverage.yml (vendored, 2 changes)

@@ -10,7 +10,7 @@ jobs:
  coverage:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/checkout@v3
      - name: Install Rust
        run: rustup toolchain install nightly --component llvm-tools-preview
      - name: Install cargo-llvm-cov
.github/workflows/long_running.yml (vendored, 4 changes)

@@ -12,13 +12,13 @@ jobs:
  functional_test_unsorted:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/checkout@v3
      - name: Run indexing_unsorted
        run: cargo test indexing_unsorted -- --ignored
  functional_test_sorted:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/checkout@v3
      - name: Run indexing_sorted
        run: cargo test indexing_sorted -- --ignored
.github/workflows/test.yml (vendored, 9 changes)

@@ -15,7 +15,7 @@ jobs:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v2
      - uses: actions/checkout@v3
      - name: Build
        run: cargo build --verbose --workspace
      - name: Install latest nightly to test also against unstable feature flag

@@ -24,16 +24,23 @@ jobs:
          toolchain: nightly
          override: true
          components: rustfmt

      - name: Install latest nightly to test also against unstable feature flag
        uses: actions-rs/toolchain@v1
        with:
          toolchain: stable
          override: true
          components: rustfmt, clippy

      - name: Run tests
        run: cargo +stable test --features mmap,brotli-compression,lz4-compression,snappy-compression,failpoints --verbose --workspace

      - name: Run tests quickwit feature
        run: cargo +stable test --features mmap,quickwit,failpoints --verbose --workspace

      - name: Check Formatting
        run: cargo +nightly fmt --all -- --check

      - uses: actions-rs/clippy-check@v1
        with:
          toolchain: stable
@@ -7,6 +7,10 @@ Tantivy 0.17
- Bugfix that could, in theory, impact durability on some filesystems [#1224](https://github.com/quickwit-oss/tantivy/issues/1224)
- Schema now offers the option to not index fieldnorms (@lpouget) [#922](https://github.com/quickwit-oss/tantivy/issues/922)
- Reduce the number of fsync calls [#1225](https://github.com/quickwit-oss/tantivy/issues/1225)
- Fix opening bytes index with dynamic codec (@PSeitz) [#1278](https://github.com/quickwit-oss/tantivy/issues/1278)
- Added an aggregation collector compatible with Elasticsearch (@PSeitz)
- Added a JSON schema type @fulmicoton [#1251](https://github.com/quickwit-oss/tantivy/issues/1251)
- Added support for slop in phrase queries @halvorboe [#1068](https://github.com/quickwit-oss/tantivy/issues/1068)

Tantivy 0.16.2
================================
Cargo.toml (15 changes)

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.17.0-dev"
version = "0.17.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]

@@ -35,7 +35,7 @@ crossbeam = "0.8.1"
futures = { version = "0.3.15", features = ["thread-pool"] }
tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
tantivy-bitpacker = { version="0.1", path="./bitpacker" }
common = { version = "0.1", path = "./common/", package = "tantivy-common" }
common = { version = "0.2", path = "./common/", package = "tantivy-common" }
fastfield_codecs = { version="0.1", path="./fastfield_codecs", default-features = false }
ownedbytes = { version="0.2", path="./ownedbytes" }
stable_deref_trait = "1.2"

@@ -55,6 +55,9 @@ lru = "0.7.0"
fastdivide = "0.4"
itertools = "0.10.0"
measure_time = "0.8.0"
pretty_assertions = "1.1.0"
serde_cbor = {version="0.11", optional=true}
async-trait = "0.1"

[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"

@@ -67,6 +70,7 @@ proptest = "1.0"
criterion = "0.3.5"
test-log = "0.2.8"
env_logger = "0.9.0"
pprof = {version= "0.6", features=["flamegraph", "criterion"]}

[dev-dependencies.fail]
version = "0.5"

@@ -92,6 +96,8 @@ snappy-compression = ["snap"]
failpoints = ["fail/failpoints"]
unstable = [] # useful for benches.

quickwit = ["serde_cbor"]

[workspace]
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes"]

@@ -110,3 +116,8 @@ required-features = ["fail/failpoints"]
[[bench]]
name = "analyzer"
harness = false

[[bench]]
name = "index-bench"
harness = false
@@ -1,4 +1,3 @@
[](https://docs.rs/crate/tantivy/)
[](https://github.com/quickwit-oss/tantivy/actions/workflows/test.yml)
[](https://codecov.io/gh/quickwit-oss/tantivy)
benches/hdfs.json (new file, 100000 lines)

File diff suppressed because it is too large.
benches/index-bench.rs (new file, 121 lines)

@@ -0,0 +1,121 @@
use criterion::{criterion_group, criterion_main, Criterion};
use pprof::criterion::{Output, PProfProfiler};
use tantivy::schema::{INDEXED, STORED, STRING, TEXT};
use tantivy::Index;

const HDFS_LOGS: &str = include_str!("hdfs.json");
const NUM_REPEATS: usize = 2;

pub fn hdfs_index_benchmark(c: &mut Criterion) {
    let schema = {
        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
        schema_builder.add_u64_field("timestamp", INDEXED);
        schema_builder.add_text_field("body", TEXT);
        schema_builder.add_text_field("severity", STRING);
        schema_builder.build()
    };
    let schema_with_store = {
        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
        schema_builder.add_u64_field("timestamp", INDEXED | STORED);
        schema_builder.add_text_field("body", TEXT | STORED);
        schema_builder.add_text_field("severity", STRING | STORED);
        schema_builder.build()
    };
    let dynamic_schema = {
        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
        schema_builder.add_json_field("json", TEXT);
        schema_builder.build()
    };

    let mut group = c.benchmark_group("index-hdfs");
    group.sample_size(20);
    group.bench_function("index-hdfs-no-commit", |b| {
        b.iter(|| {
            let index = Index::create_in_ram(schema.clone());
            let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
            for _ in 0..NUM_REPEATS {
                for doc_json in HDFS_LOGS.trim().split("\n") {
                    let doc = schema.parse_document(doc_json).unwrap();
                    index_writer.add_document(doc).unwrap();
                }
            }
        })
    });
    group.bench_function("index-hdfs-with-commit", |b| {
        b.iter(|| {
            let index = Index::create_in_ram(schema.clone());
            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
            for _ in 0..NUM_REPEATS {
                for doc_json in HDFS_LOGS.trim().split("\n") {
                    let doc = schema.parse_document(doc_json).unwrap();
                    index_writer.add_document(doc).unwrap();
                }
            }
            index_writer.commit().unwrap();
        })
    });
    group.bench_function("index-hdfs-no-commit-with-docstore", |b| {
        b.iter(|| {
            let index = Index::create_in_ram(schema_with_store.clone());
            let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
            for _ in 0..NUM_REPEATS {
                for doc_json in HDFS_LOGS.trim().split("\n") {
                    let doc = schema.parse_document(doc_json).unwrap();
                    index_writer.add_document(doc).unwrap();
                }
            }
        })
    });
    group.bench_function("index-hdfs-with-commit-with-docstore", |b| {
        b.iter(|| {
            let index = Index::create_in_ram(schema_with_store.clone());
            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
            for _ in 0..NUM_REPEATS {
                for doc_json in HDFS_LOGS.trim().split("\n") {
                    let doc = schema.parse_document(doc_json).unwrap();
                    index_writer.add_document(doc).unwrap();
                }
            }
            index_writer.commit().unwrap();
        })
    });
    group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
        b.iter(|| {
            let index = Index::create_in_ram(dynamic_schema.clone());
            let json_field = dynamic_schema.get_field("json").unwrap();
            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
            for _ in 0..NUM_REPEATS {
                for doc_json in HDFS_LOGS.trim().split("\n") {
                    let json_val: serde_json::Map<String, serde_json::Value> =
                        serde_json::from_str(doc_json).unwrap();
                    let doc = tantivy::doc!(json_field=>json_val);
                    index_writer.add_document(doc).unwrap();
                }
            }
            index_writer.commit().unwrap();
        })
    });
    group.bench_function("index-hdfs-with-commit-json-without-docstore", |b| {
        b.iter(|| {
            let index = Index::create_in_ram(dynamic_schema.clone());
            let json_field = dynamic_schema.get_field("json").unwrap();
            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
            for _ in 0..NUM_REPEATS {
                for doc_json in HDFS_LOGS.trim().split("\n") {
                    let json_val: serde_json::Map<String, serde_json::Value> =
                        serde_json::from_str(doc_json).unwrap();
                    let doc = tantivy::doc!(json_field=>json_val);
                    index_writer.add_document(doc).unwrap();
                }
            }
            index_writer.commit().unwrap();
        })
    });
}

criterion_group! {
    name = benches;
    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
    targets = hdfs_index_benchmark
}
criterion_main!(benches);
@@ -6,6 +6,7 @@ extern crate test;
mod tests {
    use tantivy_bitpacker::BlockedBitpacker;
    use test::Bencher;

    #[bench]
    fn bench_blockedbitp_read(b: &mut Bencher) {
        let mut blocked_bitpacker = BlockedBitpacker::new();

@@ -20,6 +21,7 @@ mod tests {
            out
        });
    }

    #[bench]
    fn bench_blockedbitp_create(b: &mut Bencher) {
        b.iter(|| {
@@ -1,6 +1,6 @@
[package]
name = "tantivy-common"
version = "0.1.0"
version = "0.2.0"
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2018"
doc/src/json.md (new file, 128 lines)

@@ -0,0 +1,128 @@
# Json

As of tantivy 0.17, tantivy supports a json object type.
This type can be used to allow for a schema-less search index.

When indexing a json object, we "flatten" the JSON. This operation emits terms that represent a triplet `(json_path, value_type, value)`.

For instance, if `user` is a json field, the following document:

```json
{
    "user": {
        "name": "Paul Masurel",
        "address": {
            "city": "Tokyo",
            "country": "Japan"
        },
        "created_at": "2018-11-12T23:20:50.52Z"
    }
}
```

emits the following tokens:
- ("name", Text, "Paul")
- ("name", Text, "Masurel")
- ("address.city", Text, "Tokyo")
- ("address.country", Text, "Japan")
- ("created_at", Date, 15420648505)


# Bytes-encoding and lexicographical sort.

Like any other terms, these triplets are encoded into a binary format as follows.
- `json_path`: the json path is a sequence of "segments". In the example above, `address.city`
  is just a debug representation of the json path `["address", "city"]`.
  Its representation is done by separating segments with the unicode char `\x01`, and ending the path with `\x00`.
- `value type`: one byte represents the `Value` type.
- `value`: the value representation is just the regular `Value` representation.

This representation is designed to align the natural sort of Terms with the lexicographical sort
of their binary representation (tantivy's dictionary, whether fst or sstable, is sorted and does prefix encoding).
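To make that layout concrete, here is a small illustrative sketch. The helper `encode_json_term`, the type byte, and the value encoding are made up for the illustration and do not mirror tantivy's internal `Term` code; only the `\x01` / `\x00` layout follows the description above.

```rust
/// Illustration only: `segment1 \x01 segment2 ... \x00 <type byte> <value bytes>`.
fn encode_json_term(path: &[&str], type_byte: u8, value: &[u8]) -> Vec<u8> {
    let mut bytes = Vec::new();
    for (i, segment) in path.iter().enumerate() {
        if i > 0 {
            bytes.push(0x01); // segment separator
        }
        bytes.extend_from_slice(segment.as_bytes());
    }
    bytes.push(0x00); // end of the json path
    bytes.push(type_byte); // one byte for the value type
    bytes.extend_from_slice(value); // regular value representation
    bytes
}

fn main() {
    // Terms sharing the "address" path prefix sort next to each other,
    // so byte-wise ordering matches the path ordering shown above.
    let city = encode_json_term(&["address", "city"], b's', b"Tokyo");
    let country = encode_json_term(&["address", "country"], b's', b"Japan");
    assert!(city < country);
}
```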
In the example above, the terms will be sorted as
- ("address.city", Text, "Tokyo")
- ("address.country", Text, "Japan")
- ("created_at", Date, 15420648505)
- ("name", Text, "Masurel")
- ("name", Text, "Paul")

As seen in "pitfalls", we may end up having to search for a value for the same path in several different fields. Putting the field code after the path maximizes compression opportunities but also increases the chances for the two terms to end up in the same term dictionary block.


# Pitfalls, limitations and corner cases.

Json gives very little information about the type of the literals it stores.
All numeric types end up mapped as a "Number" and there are no types for dates.

At indexing time, tantivy will try to interpret numbers and strings as different types, following a
priority order.

Numbers will be interpreted as u64, i64 and f64, in that order.
Strings will be interpreted as rfc3339 dates or simple strings.

The first working type is picked and is the only term that is emitted for indexing.
Note that this interpretation happens on a per-document basis, and there is no effort to try to sniff
a consistent field type at the scale of a segment.
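As a small illustration of that priority order, the sketch below mirrors the described fallback (u64, then i64, then f64); `infer_number_type` is a hypothetical helper, not tantivy's actual code.

```rust
use serde_json::json;

// Hypothetical helper mirroring the priority described above.
fn infer_number_type(n: &serde_json::Number) -> &'static str {
    if n.as_u64().is_some() {
        "u64"
    } else if n.as_i64().is_some() {
        "i64"
    } else {
        "f64"
    }
}

fn main() {
    for value in [json!(42), json!(-3), json!(0.5)] {
        if let serde_json::Value::Number(n) = value {
            println!("{} -> {}", n, infer_number_type(&n));
        }
    }
    // prints: 42 -> u64, -3 -> i64, 0.5 -> f64
}
```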
On the query parser side, on the other hand, we may end up emitting more than one type.
For instance, we do not even know whether the queried value is meant as a number or as a string.

So the query

```
my_path.my_segment:233
```

will be interpreted as
`(my_path.my_segment, String, 233) or (my_path.my_segment, u64, 233)`

Likewise, we need to emit two tokens if the query contains an rfc3339 date.
Indeed, the date could actually have been a single token inside the text of a document at ingestion time. Generally speaking, we will always emit at least a string token in query parsing, and sometimes more.

If one more json field is defined, things get even more complicated.


## Default json field

If the schema contains a text field called "text" and a json field that is set as a default field,
`text:hello` could be reasonably interpreted as targeting the text field or as targeting the json field called `json_dynamic` with the json_path "text".

If there is such an ambiguity, we decide to only search in the "text" field: `text:hello`.

In other words, the parser will not search in default json fields if there is a schema hit.
This is a product decision.

The user can still target the JSON field by specifying its name explicitly:
`json_dynamic.text:hello`.
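A minimal sketch of that disambiguation, reusing the field names from the example above (`text` and `json_dynamic`); the rest of the setup is illustrative.

```rust
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let json_dynamic = schema_builder.add_json_field("json_dynamic", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    // Both fields are registered as default fields.
    let query_parser = QueryParser::for_index(&index, vec![text, json_dynamic]);

    // "text" is a schema hit, so this only targets the text field.
    let _in_text_field = query_parser.parse_query("text:hello")?;

    // To reach the json path "text" inside the json field, spell the field out.
    let _in_json_field = query_parser.parse_query("json_dynamic.text:hello")?;
    Ok(())
}
```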
## Range queries are not supported.

Json fields do not support range queries.

## Arrays do not work like nested objects.

If a json object contains an array, a search query might return more documents
than what might be expected.

Let's take an example.

```json
{
    "cart_id": 3234234,
    "cart": [
        {"product_type": "sneakers", "attributes": {"color": "white"}},
        {"product_type": "t-shirt", "attributes": {"color": "red"}}
    ]
}
```

Despite the array structure, a document in tantivy is a bag of terms.
The query:

```
cart.product_type:sneakers AND cart.attributes.color:red
```

actually matches the document above.
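A sketch of that pitfall, following the same query-parser pattern as the json_field example further down in this changeset; the json field name `data` is an illustrative assumption.

```rust
use tantivy::collector::Count;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let data = schema_builder.add_json_field("data", TEXT);
    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema.clone());
    let mut index_writer = index.writer(50_000_000)?;
    // The cart document from above, nested under the "data" json field.
    let doc = schema.parse_document(
        r#"{
            "data": {
                "cart_id": 3234234,
                "cart": [
                    {"product_type": "sneakers", "attributes": {"color": "white"}},
                    {"product_type": "t-shirt", "attributes": {"color": "red"}}
                ]
            }
        }"#,
    )?;
    index_writer.add_document(doc)?;
    index_writer.commit()?;

    let searcher = index.reader()?.searcher();
    let query_parser = QueryParser::for_index(&index, vec![data]);
    // No single cart entry is a red sneaker, but the flattened bag of terms contains
    // both (cart.product_type, "sneakers") and (cart.attributes.color, "red"),
    // so the document matches anyway.
    let query =
        query_parser.parse_query("cart.product_type:sneakers AND cart.attributes.color:red")?;
    assert_eq!(searcher.search(&*query, &Count)?, 1);
    Ok(())
}
```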
examples/aggregation.rs (new file, 131 lines)

@@ -0,0 +1,131 @@
// # Aggregation example
//
// This example shows how you can use built-in aggregations.
// We will use range buckets and compute the average in each bucket.
//

use serde_json::Value;
use tantivy::aggregation::agg_req::{
    Aggregation, Aggregations, BucketAggregation, BucketAggregationType, MetricAggregation,
    RangeAggregation,
};
use tantivy::aggregation::agg_result::AggregationResults;
use tantivy::aggregation::metric::AverageAggregation;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::TermQuery;
use tantivy::schema::{self, Cardinality, IndexRecordOption, Schema, TextFieldIndexing};
use tantivy::{doc, Index, Term};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text_fieldtype = schema::TextOptions::default()
        .set_indexing_options(
            TextFieldIndexing::default()
                .set_tokenizer("default")
                .set_index_option(IndexRecordOption::WithFreqs),
        )
        .set_stored();
    let text_field = schema_builder.add_text_field("text", text_fieldtype);
    let score_fieldtype =
        crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
    let highscore_field = schema_builder.add_f64_field("highscore", score_fieldtype.clone());
    let price_field = schema_builder.add_f64_field("price", score_fieldtype.clone());

    let schema = schema_builder.build();

    // # Indexing documents
    //
    // Lets index a bunch of documents for this example.
    let index = Index::create_in_ram(schema);

    let mut index_writer = index.writer(50_000_000)?;
    // writing the segment
    index_writer.add_document(doc!(
        text_field => "cool",
        highscore_field => 1f64,
        price_field => 0f64,
    ))?;
    index_writer.add_document(doc!(
        text_field => "cool",
        highscore_field => 3f64,
        price_field => 1f64,
    ))?;
    index_writer.add_document(doc!(
        text_field => "cool",
        highscore_field => 5f64,
        price_field => 1f64,
    ))?;
    index_writer.add_document(doc!(
        text_field => "nohit",
        highscore_field => 6f64,
        price_field => 2f64,
    ))?;
    index_writer.add_document(doc!(
        text_field => "cool",
        highscore_field => 7f64,
        price_field => 2f64,
    ))?;
    index_writer.commit()?;
    index_writer.add_document(doc!(
        text_field => "cool",
        highscore_field => 11f64,
        price_field => 10f64,
    ))?;
    index_writer.add_document(doc!(
        text_field => "cool",
        highscore_field => 14f64,
        price_field => 15f64,
    ))?;

    index_writer.add_document(doc!(
        text_field => "cool",
        highscore_field => 15f64,
        price_field => 20f64,
    ))?;

    index_writer.commit()?;

    let reader = index.reader()?;
    let text_field = reader.searcher().schema().get_field("text").unwrap();

    let term_query = TermQuery::new(
        Term::from_field_text(text_field, "cool"),
        IndexRecordOption::Basic,
    );

    let sub_agg_req_1: Aggregations = vec![(
        "average_price".to_string(),
        Aggregation::Metric(MetricAggregation::Average(
            AverageAggregation::from_field_name("price".to_string()),
        )),
    )]
    .into_iter()
    .collect();

    let agg_req_1: Aggregations = vec![(
        "score_ranges".to_string(),
        Aggregation::Bucket(BucketAggregation {
            bucket_agg: BucketAggregationType::Range(RangeAggregation {
                field: "highscore".to_string(),
                ranges: vec![
                    (-1f64..9f64).into(),
                    (9f64..14f64).into(),
                    (14f64..20f64).into(),
                ],
            }),
            sub_aggregation: sub_agg_req_1.clone(),
        }),
    )]
    .into_iter()
    .collect();

    let collector = AggregationCollector::from_aggs(agg_req_1);

    let searcher = reader.searcher();
    let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();

    let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
    println!("{}", serde_json::to_string_pretty(&res)?);

    Ok(())
}
examples/json_field.rs (new file, 80 lines)

@@ -0,0 +1,80 @@
// # Json field example
//
// This example shows how the json field can be used
// to make tantivy partially schemaless.

use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, STORED, STRING, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    // # Defining the schema
    //
    // We need two fields:
    // - a timestamp
    // - a json object field
    let mut schema_builder = Schema::builder();
    schema_builder.add_date_field("timestamp", FAST | STORED);
    let event_type = schema_builder.add_text_field("event_type", STRING | STORED);
    let attributes = schema_builder.add_json_field("attributes", STORED | TEXT);
    let schema = schema_builder.build();

    // # Indexing documents
    let index = Index::create_in_ram(schema.clone());

    let mut index_writer = index.writer(50_000_000)?;
    let doc = schema.parse_document(
        r#"{
            "timestamp": "2022-02-22T23:20:50.53Z",
            "event_type": "click",
            "attributes": {
                "target": "submit-button",
                "cart": {"product_id": 103},
                "description": "the best vacuum cleaner ever"
            }
        }"#,
    )?;
    index_writer.add_document(doc)?;
    let doc = schema.parse_document(
        r#"{
            "timestamp": "2022-02-22T23:20:51.53Z",
            "event_type": "click",
            "attributes": {
                "target": "submit-button",
                "cart": {"product_id": 133},
                "description": "das keyboard"
            }
        }"#,
    )?;
    index_writer.add_document(doc)?;
    index_writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher();

    let query_parser = QueryParser::for_index(&index, vec![event_type, attributes]);
    {
        let query = query_parser.parse_query("target:submit-button")?;
        let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
        assert_eq!(count_docs.len(), 2);
    }
    {
        let query = query_parser.parse_query("target:submit")?;
        let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
        assert_eq!(count_docs.len(), 2);
    }
    {
        let query = query_parser.parse_query("cart.product_id:103")?;
        let count_docs = searcher.search(&*query, &Count)?;
        assert_eq!(count_docs, 1);
    }
    {
        let query = query_parser
            .parse_query("event_type:click AND cart.product_id:133")
            .unwrap();
        let hits = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap();
        assert_eq!(hits.len(), 1);
    }
    Ok(())
}
@@ -9,7 +9,7 @@ description = "Fast field codecs used by tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
common = { version = "0.1", path = "../common/", package = "tantivy-common" }
common = { version = "0.2", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version="0.1.1", path = "../bitpacker/" }
prettytable-rs = {version="0.8.0", optional= true}
rand = {version="0.8.3", optional= true}
@@ -63,6 +63,7 @@ pub trait FastFieldDataAccess {
}

#[derive(Debug, Clone)]
/// Statistics are used in codec detection and stored in the fast field footer.
pub struct FastFieldStats {
    pub min_value: u64,
    pub max_value: u64,
@@ -419,10 +419,7 @@ mod tests {
        let mut data = (5_000..20_000)
            .map(|_| rand::random::<u32>() as u64)
            .collect::<Vec<_>>();
        let (estimate, actual_compression) = create_and_validate(&data, "random");
        dbg!(estimate);
        dbg!(actual_compression);

        let _ = create_and_validate(&data, "random");
        data.reverse();
        create_and_validate(&data, "random");
    }
@@ -59,7 +59,7 @@ pub enum UserInputBound {
}

impl UserInputBound {
    fn display_lower(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
    fn display_lower(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
        match *self {
            UserInputBound::Inclusive(ref word) => write!(formatter, "[\"{}\"", word),
            UserInputBound::Exclusive(ref word) => write!(formatter, "{{\"{}\"", word),

@@ -67,7 +67,7 @@ impl UserInputBound {
        }
    }

    fn display_upper(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
    fn display_upper(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
        match *self {
            UserInputBound::Inclusive(ref word) => write!(formatter, "\"{}\"]", word),
            UserInputBound::Exclusive(ref word) => write!(formatter, "\"{}\"}}", word),
src/aggregation/README.md (new file, 36 lines)

@@ -0,0 +1,36 @@
# Contributing

When adding a new bucket aggregation, make sure to extend the "test_aggregation_flushing" test for at least 2 levels.



# Code Organization

Tantivy's aggregations have been designed to mimic the
[aggregations of elasticsearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations.html).

The code is organized in submodules:

## bucket
Contains all bucket aggregations, like the range aggregation. These bucket aggregations group documents into buckets and can contain sub-aggregations.

## metric
Contains all metric aggregations, like the average aggregation. Metric aggregations do not have sub-aggregations.

#### agg_req
agg_req contains the user's aggregation request. Deserialization from json is compatible with elasticsearch aggregation requests.

#### agg_req_with_accessor
agg_req_with_accessor contains the user's aggregation request enriched with fast field accessors etc., which are
used during collection.

#### segment_agg_result
segment_agg_result contains the aggregation result tree, which is used for collection of a segment.
The tree from agg_req_with_accessor is passed during collection.

#### intermediate_agg_result
intermediate_agg_result contains the aggregation tree for merging with other trees.

#### agg_result
agg_result contains the final aggregation tree.
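To tie these submodules together, here is a condensed sketch of the user-facing flow, based on the agg_req docs and the aggregation example in this changeset. It assumes an index whose schema has a single-value fast field named `score`; error handling and the serde_json dependency are left to the caller.

```rust
use tantivy::aggregation::agg_req::Aggregations;
use tantivy::aggregation::agg_result::AggregationResults;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::AllQuery;
use tantivy::Index;

fn run_range_aggregation(index: &Index) -> tantivy::Result<AggregationResults> {
    // agg_req: deserialize an elasticsearch-compatible request.
    let agg_req: Aggregations = serde_json::from_str(
        r#"{
            "score_ranges": {
                "range": {
                    "field": "score",
                    "ranges": [{ "from": 3.0, "to": 7.0 }, { "from": 7.0, "to": 20.0 }]
                }
            }
        }"#,
    )
    .expect("valid aggregation request");

    // Collection walks agg_req_with_accessor / segment_agg_result internally,
    // merges the per-segment trees (intermediate_agg_result) and hands back agg_result.
    let collector = AggregationCollector::from_aggs(agg_req);
    let searcher = index.reader()?.searcher();
    searcher.search(&AllQuery, &collector)
}
```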
src/aggregation/agg_req.rs (new file, 270 lines)

@@ -0,0 +1,270 @@
//! Contains the aggregation request tree. Used to build an
//! [AggregationCollector](super::AggregationCollector).
//!
//! [Aggregations] is the top level entry point to create a request, which is a `HashMap<String,
//! Aggregation>`.
//!
//! Requests are compatible with the json format of elasticsearch.
//!
//! # Example
//!
//! ```
//! use tantivy::aggregation::bucket::RangeAggregation;
//! use tantivy::aggregation::agg_req::BucketAggregationType;
//! use tantivy::aggregation::agg_req::{Aggregation, Aggregations};
//! use tantivy::aggregation::agg_req::BucketAggregation;
//! let agg_req1: Aggregations = vec![
//!     (
//!         "range".to_string(),
//!         Aggregation::Bucket(BucketAggregation {
//!             bucket_agg: BucketAggregationType::Range(RangeAggregation{
//!                 field: "score".to_string(),
//!                 ranges: vec![(3f64..7f64).into(), (7f64..20f64).into()],
//!             }),
//!             sub_aggregation: Default::default(),
//!         }),
//!     ),
//! ]
//! .into_iter()
//! .collect();
//!
//! let elasticsearch_compatible_json_req = r#"
//! {
//!   "range": {
//!     "range": {
//!       "field": "score",
//!       "ranges": [
//!         { "from": 3.0, "to": 7.0 },
//!         { "from": 7.0, "to": 20.0 }
//!       ]
//!     }
//!   }
//! }"#;
//! let agg_req2: Aggregations = serde_json::from_str(elasticsearch_compatible_json_req).unwrap();
//! assert_eq!(agg_req1, agg_req2);
//! ```

use std::collections::{HashMap, HashSet};

use serde::{Deserialize, Serialize};

pub use super::bucket::RangeAggregation;
use super::metric::{AverageAggregation, StatsAggregation};

/// The top-level aggregation request structure, which contains [Aggregation] and their user defined
/// names. It is also used in [buckets](BucketAggregation) to define sub-aggregations.
///
/// The key is the user defined name of the aggregation.
pub type Aggregations = HashMap<String, Aggregation>;

/// Extract all fast field names used in the tree.
pub fn get_fast_field_names(aggs: &Aggregations) -> HashSet<String> {
    let mut fast_field_names = Default::default();
    for el in aggs.values() {
        el.get_fast_field_names(&mut fast_field_names)
    }
    fast_field_names
}

/// Aggregation request of [BucketAggregation] or [MetricAggregation].
///
/// An aggregation is either a bucket or a metric.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
pub enum Aggregation {
    /// Bucket aggregation, see [BucketAggregation] for details.
    Bucket(BucketAggregation),
    /// Metric aggregation, see [MetricAggregation] for details.
    Metric(MetricAggregation),
}

impl Aggregation {
    fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
        match self {
            Aggregation::Bucket(bucket) => bucket.get_fast_field_names(fast_field_names),
            Aggregation::Metric(metric) => metric.get_fast_field_names(fast_field_names),
        }
    }
}

/// BucketAggregations create buckets of documents. Each bucket is associated with a rule which
/// determines whether or not a document falls into it. In other words, the buckets
/// effectively define document sets. Buckets are not necessarily disjunct, therefore a document can
/// fall into multiple buckets. In addition to the buckets themselves, the bucket aggregations also
/// compute and return the number of documents for each bucket. Bucket aggregations, as opposed to
/// metric aggregations, can hold sub-aggregations. These sub-aggregations will be aggregated for
/// the buckets created by their "parent" bucket aggregation. There are different bucket
/// aggregators, each with a different "bucketing" strategy. Some define a single bucket, some
/// define a fixed number of multiple buckets, and others dynamically create the buckets during the
/// aggregation process.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct BucketAggregation {
    /// Bucket aggregation strategy to group documents.
    #[serde(flatten)]
    pub bucket_agg: BucketAggregationType,
    /// The sub_aggregations in the buckets. Each bucket will aggregate on the document set in the
    /// bucket.
    #[serde(rename = "aggs")]
    #[serde(default)]
    #[serde(skip_serializing_if = "Aggregations::is_empty")]
    pub sub_aggregation: Aggregations,
}

impl BucketAggregation {
    fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
        self.bucket_agg.get_fast_field_names(fast_field_names);
        fast_field_names.extend(get_fast_field_names(&self.sub_aggregation));
    }
}

/// The bucket aggregation types.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum BucketAggregationType {
    /// Put data into buckets of user-defined ranges.
    #[serde(rename = "range")]
    Range(RangeAggregation),
}

impl BucketAggregationType {
    fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
        match self {
            BucketAggregationType::Range(range) => fast_field_names.insert(range.field.to_string()),
        };
    }
}

/// The aggregations in this family compute metrics based on values extracted
/// from the documents that are being aggregated. Values are extracted from the fast field of
/// the document.
///
/// Some aggregations output a single numeric metric (e.g. Average) and are called
/// single-value numeric metrics aggregation, others generate multiple metrics (e.g. Stats) and are
/// called multi-value numeric metrics aggregation.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum MetricAggregation {
    /// Calculates the average.
    #[serde(rename = "avg")]
    Average(AverageAggregation),
    /// Calculates stats sum, average, min, max, standard_deviation on a field.
    #[serde(rename = "stats")]
    Stats(StatsAggregation),
}

impl MetricAggregation {
    fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
        match self {
            MetricAggregation::Average(avg) => fast_field_names.insert(avg.field.to_string()),
            MetricAggregation::Stats(stats) => fast_field_names.insert(stats.field.to_string()),
        };
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn serialize_to_json_test() {
        let agg_req1: Aggregations = vec![(
            "range".to_string(),
            Aggregation::Bucket(BucketAggregation {
                bucket_agg: BucketAggregationType::Range(RangeAggregation {
                    field: "score".to_string(),
                    ranges: vec![
                        (f64::MIN..3f64).into(),
                        (3f64..7f64).into(),
                        (7f64..20f64).into(),
                        (20f64..f64::MAX).into(),
                    ],
                }),
                sub_aggregation: Default::default(),
            }),
        )]
        .into_iter()
        .collect();

        let elasticsearch_compatible_json_req = r#"{
  "range": {
    "range": {
      "field": "score",
      "ranges": [
        {
          "to": 3.0
        },
        {
          "from": 3.0,
          "to": 7.0
        },
        {
          "from": 7.0,
          "to": 20.0
        },
        {
          "from": 20.0
        }
      ]
    }
  }
}"#;
        let agg_req2: String = serde_json::to_string_pretty(&agg_req1).unwrap();
        assert_eq!(agg_req2, elasticsearch_compatible_json_req);
    }

    #[test]
    fn test_get_fast_field_names() {
        let agg_req2: Aggregations = vec![
            (
                "range".to_string(),
                Aggregation::Bucket(BucketAggregation {
                    bucket_agg: BucketAggregationType::Range(RangeAggregation {
                        field: "score2".to_string(),
                        ranges: vec![
                            (f64::MIN..3f64).into(),
                            (3f64..7f64).into(),
                            (7f64..20f64).into(),
                            (20f64..f64::MAX).into(),
                        ],
                    }),
                    sub_aggregation: Default::default(),
                }),
            ),
            (
                "metric".to_string(),
                Aggregation::Metric(MetricAggregation::Average(
                    AverageAggregation::from_field_name("field123".to_string()),
                )),
            ),
        ]
        .into_iter()
        .collect();

        let agg_req1: Aggregations = vec![(
            "range".to_string(),
            Aggregation::Bucket(BucketAggregation {
                bucket_agg: BucketAggregationType::Range(RangeAggregation {
                    field: "score".to_string(),
                    ranges: vec![
                        (f64::MIN..3f64).into(),
                        (3f64..7f64).into(),
                        (7f64..20f64).into(),
                        (20f64..f64::MAX).into(),
                    ],
                }),
                sub_aggregation: agg_req2,
            }),
        )]
        .into_iter()
        .collect();

        assert_eq!(
            get_fast_field_names(&agg_req1),
            vec![
                "score".to_string(),
                "score2".to_string(),
                "field123".to_string()
            ]
            .into_iter()
            .collect()
        )
    }
}
src/aggregation/agg_req_with_accessor.rs (new file, 146 lines)

@@ -0,0 +1,146 @@
//! This will enhance the request tree with access to the fastfield and metadata.

use super::agg_req::{Aggregation, Aggregations, BucketAggregationType, MetricAggregation};
use super::bucket::RangeAggregation;
use super::metric::{AverageAggregation, StatsAggregation};
use super::VecWithNames;
use crate::fastfield::{type_and_cardinality, DynamicFastFieldReader, FastType};
use crate::schema::{Cardinality, Type};
use crate::{SegmentReader, TantivyError};

#[derive(Clone, Default)]
pub(crate) struct AggregationsWithAccessor {
    pub metrics: VecWithNames<MetricAggregationWithAccessor>,
    pub buckets: VecWithNames<BucketAggregationWithAccessor>,
}

impl AggregationsWithAccessor {
    fn from_data(
        metrics: VecWithNames<MetricAggregationWithAccessor>,
        buckets: VecWithNames<BucketAggregationWithAccessor>,
    ) -> Self {
        Self { metrics, buckets }
    }

    pub fn is_empty(&self) -> bool {
        self.metrics.is_empty() && self.buckets.is_empty()
    }
}

#[derive(Clone)]
pub struct BucketAggregationWithAccessor {
    /// In general there can be buckets without fast field access, e.g. buckets that are created
    /// based on search terms. So eventually this needs to be Option or moved.
    pub(crate) accessor: DynamicFastFieldReader<u64>,
    pub(crate) field_type: Type,
    pub(crate) bucket_agg: BucketAggregationType,
    pub(crate) sub_aggregation: AggregationsWithAccessor,
}

impl BucketAggregationWithAccessor {
    fn try_from_bucket(
        bucket: &BucketAggregationType,
        sub_aggregation: &Aggregations,
        reader: &SegmentReader,
    ) -> crate::Result<BucketAggregationWithAccessor> {
        let (accessor, field_type) = match &bucket {
            BucketAggregationType::Range(RangeAggregation {
                field: field_name,
                ranges: _,
            }) => get_ff_reader_and_validate(reader, field_name)?,
        };
        let sub_aggregation = sub_aggregation.clone();
        Ok(BucketAggregationWithAccessor {
            accessor,
            field_type,
            sub_aggregation: get_aggs_with_accessor_and_validate(&sub_aggregation, reader)?,
            bucket_agg: bucket.clone(),
        })
    }
}

/// Contains the metric request and the fast field accessor.
#[derive(Clone)]
pub struct MetricAggregationWithAccessor {
    pub metric: MetricAggregation,
    pub field_type: Type,
    pub accessor: DynamicFastFieldReader<u64>,
}

impl MetricAggregationWithAccessor {
    fn try_from_metric(
        metric: &MetricAggregation,
        reader: &SegmentReader,
    ) -> crate::Result<MetricAggregationWithAccessor> {
        match &metric {
            MetricAggregation::Average(AverageAggregation { field: field_name })
            | MetricAggregation::Stats(StatsAggregation { field: field_name }) => {
                let (accessor, field_type) = get_ff_reader_and_validate(reader, field_name)?;

                Ok(MetricAggregationWithAccessor {
                    accessor,
                    field_type,
                    metric: metric.clone(),
                })
            }
        }
    }
}

pub(crate) fn get_aggs_with_accessor_and_validate(
    aggs: &Aggregations,
    reader: &SegmentReader,
) -> crate::Result<AggregationsWithAccessor> {
    let mut metrics = vec![];
    let mut buckets = vec![];
    for (key, agg) in aggs.iter() {
        match agg {
            Aggregation::Bucket(bucket) => buckets.push((
                key.to_string(),
                BucketAggregationWithAccessor::try_from_bucket(
                    &bucket.bucket_agg,
                    &bucket.sub_aggregation,
                    reader,
                )?,
            )),
            Aggregation::Metric(metric) => metrics.push((
                key.to_string(),
                MetricAggregationWithAccessor::try_from_metric(metric, reader)?,
            )),
        }
    }
    Ok(AggregationsWithAccessor::from_data(
        VecWithNames::from_entries(metrics),
        VecWithNames::from_entries(buckets),
    ))
}

fn get_ff_reader_and_validate(
    reader: &SegmentReader,
    field_name: &str,
) -> crate::Result<(DynamicFastFieldReader<u64>, Type)> {
    let field = reader
        .schema()
        .get_field(field_name)
        .ok_or_else(|| TantivyError::FieldNotFound(field_name.to_string()))?;
    let field_type = reader.schema().get_field_entry(field).field_type();

    if let Some((ff_type, cardinality)) = type_and_cardinality(field_type) {
        if cardinality == Cardinality::MultiValues || ff_type == FastType::Date {
            return Err(TantivyError::InvalidArgument(format!(
                "Invalid field type in aggregation {:?}, only Cardinality::SingleValue supported",
                field_type.value_type()
            )));
        }
    } else {
        return Err(TantivyError::InvalidArgument(format!(
            "Only single value fast fields of type f64, u64, i64 are supported, but got {:?} ",
            field_type.value_type()
        )));
    };

    let ff_fields = reader.fast_fields();
    ff_fields
        .u64_lenient(field)
        .map(|field| (field, field_type.value_type()))
}
src/aggregation/agg_result.rs (new file, 170 lines)

@@ -0,0 +1,170 @@
//! Contains the final aggregation tree.
//! This tree can be converted via the `into()` method from `IntermediateAggregationResults`.
//! This conversion computes the final result. For example: The intermediate result contains
//! intermediate average results, which is the sum and the number of values. The actual average is
//! calculated on the step from intermediate to final aggregation result tree.

use std::cmp::Ordering;
use std::collections::HashMap;

use itertools::Itertools;
use serde::{Deserialize, Serialize};

use super::intermediate_agg_result::{
    IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
    IntermediateMetricResult, IntermediateRangeBucketEntry,
};
use super::metric::{SingleMetricResult, Stats};
use super::Key;

#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
/// The final aggregation result.
pub struct AggregationResults(pub HashMap<String, AggregationResult>);

impl From<IntermediateAggregationResults> for AggregationResults {
    fn from(tree: IntermediateAggregationResults) -> Self {
        Self(
            tree.0
                .into_iter()
                .map(|(key, agg)| (key, agg.into()))
                .collect(),
        )
    }
}

#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
/// An aggregation is either a bucket or a metric.
pub enum AggregationResult {
    /// Bucket result variant.
    BucketResult(BucketResult),
    /// Metric result variant.
    MetricResult(MetricResult),
}
impl From<IntermediateAggregationResult> for AggregationResult {
    fn from(tree: IntermediateAggregationResult) -> Self {
        match tree {
            IntermediateAggregationResult::Bucket(bucket) => {
                AggregationResult::BucketResult(bucket.into())
            }
            IntermediateAggregationResult::Metric(metric) => {
                AggregationResult::MetricResult(metric.into())
            }
        }
    }
}

#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
/// MetricResult
pub enum MetricResult {
    /// Average metric result.
    Average(SingleMetricResult),
    /// Stats metric result.
    Stats(Stats),
}

impl From<IntermediateMetricResult> for MetricResult {
    fn from(metric: IntermediateMetricResult) -> Self {
        match metric {
            IntermediateMetricResult::Average(avg_data) => {
                MetricResult::Average(avg_data.finalize().into())
            }
            IntermediateMetricResult::Stats(intermediate_stats) => {
                MetricResult::Stats(intermediate_stats.finalize())
            }
        }
    }
}

/// BucketEntry holds bucket aggregation result types.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
pub enum BucketResult {
    /// This is the default entry for a bucket, which contains a key, count, and optionally
    /// sub_aggregations.
    Range {
        /// The range buckets sorted by range.
        buckets: Vec<RangeBucketEntry>,
    },
}

impl From<IntermediateBucketResult> for BucketResult {
    fn from(result: IntermediateBucketResult) -> Self {
        match result {
            IntermediateBucketResult::Range(range_map) => {
                let mut buckets: Vec<RangeBucketEntry> = range_map
                    .into_iter()
                    .map(|(_, bucket)| bucket.into())
                    .collect_vec();

                buckets.sort_by(|a, b| {
                    a.from
                        .unwrap_or(f64::MIN)
                        .partial_cmp(&b.from.unwrap_or(f64::MIN))
                        .unwrap_or(Ordering::Equal)
                });
                BucketResult::Range { buckets }
            }
        }
    }
}

/// This is the range entry for a bucket, which contains a key, count, and optionally
/// sub_aggregations.
///
/// # JSON Format
/// ```json
/// {
///   ...
///   "my_ranges": {
///     "buckets": [
///       {
///         "key": "*-10",
///         "to": 10,
///         "doc_count": 5
///       },
///       {
///         "key": "10-20",
///         "from": 10,
///         "to": 20,
///         "doc_count": 2
///       },
///       {
///         "key": "20-*",
///         "from": 20,
///         "doc_count": 3
///       }
///     ]
///   }
///   ...
/// }
/// ```
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct RangeBucketEntry {
    /// The identifier of the bucket.
    pub key: Key,
    /// Number of documents in the bucket.
    pub doc_count: u64,
    #[serde(flatten)]
    /// sub-aggregations in this bucket.
    pub sub_aggregation: AggregationResults,
    /// The from range of the bucket. Equals f64::MIN when None.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub from: Option<f64>,
    /// The to range of the bucket. Equals f64::MAX when None.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub to: Option<f64>,
}

impl From<IntermediateRangeBucketEntry> for RangeBucketEntry {
    fn from(entry: IntermediateRangeBucketEntry) -> Self {
        RangeBucketEntry {
            key: entry.key,
            doc_count: entry.doc_count,
            sub_aggregation: entry.sub_aggregation.into(),
            to: entry.to,
            from: entry.from,
        }
    }
}
src/aggregation/bucket/mod.rs (new file, 13 lines)

@@ -0,0 +1,13 @@
//! Module for all bucket aggregations.
//!
//! BucketAggregations create buckets of documents
//! [BucketAggregation](super::agg_req::BucketAggregation).
//!
//! Results of final buckets are [BucketResult](super::agg_result::BucketResult).
//! Results of intermediate buckets are
//! [IntermediateBucketResult](super::intermediate_agg_result::IntermediateBucketResult)

mod range;

pub(crate) use range::SegmentRangeCollector;
pub use range::*;
563
src/aggregation/bucket/range.rs
Normal file
563
src/aggregation/bucket/range.rs
Normal file
@@ -0,0 +1,563 @@
|
||||
use std::ops::Range;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::aggregation::agg_req_with_accessor::{
|
||||
AggregationsWithAccessor, BucketAggregationWithAccessor,
|
||||
};
|
||||
use crate::aggregation::intermediate_agg_result::IntermediateBucketResult;
|
||||
use crate::aggregation::segment_agg_result::{
|
||||
SegmentAggregationResultsCollector, SegmentRangeBucketEntry,
|
||||
};
|
||||
use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64, Key};
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::schema::Type;
|
||||
use crate::{DocId, TantivyError};
|
||||
|
||||
/// Provide user-defined buckets to aggregate on.
|
||||
/// Two special buckets will automatically be created to cover the whole range of values.
|
||||
/// The provided buckets have to be continous.
|
||||
/// During the aggregation, the values extracted from the fast_field `field` will be checked
|
||||
/// against each bucket range. Note that this aggregation includes the from value and excludes the
|
||||
/// to value for each range.
|
||||
///
|
||||
/// Result type is [BucketResult](crate::aggregation::agg_result::BucketResult) with
|
||||
/// [RangeBucketEntry](crate::aggregation::agg_result::RangeBucketEntry) on the
|
||||
/// AggregationCollector.
|
||||
///
|
||||
/// Result type is
|
||||
/// [crate::aggregation::intermediate_agg_result::IntermediateBucketResult] with
|
||||
/// [crate::aggregation::intermediate_agg_result::IntermediateRangeBucketEntry] on the
|
||||
/// DistributedAggregationCollector.
|
||||
///
|
||||
/// # Request JSON Format
|
||||
/// ```json
|
||||
/// {
|
||||
/// "range": {
|
||||
/// "field": "score",
|
||||
/// "ranges": [
|
||||
/// { "to": 3.0 },
|
||||
/// { "from": 3.0, "to": 7.0 },
|
||||
/// { "from": 7.0, "to": 20.0 }
|
||||
/// { "from": 20.0 }
|
||||
/// ]
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct RangeAggregation {
|
||||
/// The field to aggregate on.
|
||||
pub field: String,
|
||||
/// Note that this aggregation includes the from value and excludes the to value for each
|
||||
/// range. Extra buckets will be created until the first to, and last from, if necessary.
|
||||
pub ranges: Vec<RangeAggregationRange>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
/// The range for one range bucket.
|
||||
pub struct RangeAggregationRange {
|
||||
/// The from range value, which is inclusive in the range.
|
||||
/// None equals to an open ended interval.
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
pub from: Option<f64>,
|
||||
/// The to range value, which is not inclusive in the range.
|
||||
/// None equals to an open ended interval.
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
pub to: Option<f64>,
|
||||
}
|
||||
|
||||
impl From<Range<f64>> for RangeAggregationRange {
|
||||
fn from(range: Range<f64>) -> Self {
|
||||
let from = if range.start == f64::MIN {
|
||||
None
|
||||
} else {
|
||||
Some(range.start)
|
||||
};
|
||||
let to = if range.end == f64::MAX {
|
||||
None
|
||||
} else {
|
||||
Some(range.end)
|
||||
};
|
||||
RangeAggregationRange { from, to }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub(crate) struct SegmentRangeAndBucketEntry {
|
||||
range: Range<u64>,
|
||||
bucket: SegmentRangeBucketEntry,
|
||||
}
|
||||
|
||||
/// The collector puts values from the fast field into the correct buckets and does a conversion to
|
||||
/// the correct datatype.
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub struct SegmentRangeCollector {
|
||||
/// The buckets containing the aggregation data.
|
||||
buckets: Vec<SegmentRangeAndBucketEntry>,
|
||||
field_type: Type,
|
||||
}
|
||||
|
||||
impl SegmentRangeCollector {
|
||||
pub fn into_intermediate_bucket_result(self) -> IntermediateBucketResult {
|
||||
let field_type = self.field_type;
|
||||
|
||||
let buckets = self
|
||||
.buckets
|
||||
.into_iter()
|
||||
.map(move |range_bucket| {
|
||||
(
|
||||
range_to_string(&range_bucket.range, &field_type),
|
||||
range_bucket.bucket.into(),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
IntermediateBucketResult::Range(buckets)
|
||||
}
|
||||
|
||||
pub(crate) fn from_req_and_validate(
|
||||
req: &RangeAggregation,
|
||||
sub_aggregation: &AggregationsWithAccessor,
|
||||
field_type: Type,
|
||||
) -> crate::Result<Self> {
|
||||
// The range input on the request is f64.
|
||||
// We need to convert to u64 ranges, because we read the values as u64.
|
||||
// The mapping from the conversion is monotonic so ordering is preserved.
|
||||
let buckets = extend_validate_ranges(&req.ranges, &field_type)?
|
||||
.iter()
|
||||
.map(|range| {
|
||||
let to = if range.end == u64::MAX {
|
||||
None
|
||||
} else {
|
||||
Some(f64_from_fastfield_u64(range.end, &field_type))
|
||||
};
|
||||
let from = if range.start == u64::MIN {
|
||||
None
|
||||
} else {
|
||||
Some(f64_from_fastfield_u64(range.start, &field_type))
|
||||
};
|
||||
let sub_aggregation = if sub_aggregation.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(SegmentAggregationResultsCollector::from_req_and_validate(
|
||||
sub_aggregation,
|
||||
)?)
|
||||
};
|
||||
Ok(SegmentRangeAndBucketEntry {
|
||||
range: range.clone(),
|
||||
bucket: SegmentRangeBucketEntry {
|
||||
key: range_to_key(range, &field_type),
|
||||
doc_count: 0,
|
||||
sub_aggregation,
|
||||
from,
|
||||
to,
|
||||
},
|
||||
})
|
||||
})
|
||||
.collect::<crate::Result<_>>()?;
|
||||
|
||||
Ok(SegmentRangeCollector {
|
||||
buckets,
|
||||
field_type,
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn collect_block(
|
||||
&mut self,
|
||||
doc: &[DocId],
|
||||
bucket_with_accessor: &BucketAggregationWithAccessor,
|
||||
force_flush: bool,
|
||||
) {
|
||||
let mut iter = doc.chunks_exact(4);
|
||||
for docs in iter.by_ref() {
|
||||
let val1 = bucket_with_accessor.accessor.get(docs[0]);
|
||||
let val2 = bucket_with_accessor.accessor.get(docs[1]);
|
||||
let val3 = bucket_with_accessor.accessor.get(docs[2]);
|
||||
let val4 = bucket_with_accessor.accessor.get(docs[3]);
|
||||
let bucket_pos1 = self.get_bucket_pos(val1);
|
||||
let bucket_pos2 = self.get_bucket_pos(val2);
|
||||
let bucket_pos3 = self.get_bucket_pos(val3);
|
||||
let bucket_pos4 = self.get_bucket_pos(val4);
|
||||
|
||||
self.increment_bucket(bucket_pos1, docs[0], &bucket_with_accessor.sub_aggregation);
|
||||
self.increment_bucket(bucket_pos2, docs[1], &bucket_with_accessor.sub_aggregation);
|
||||
self.increment_bucket(bucket_pos3, docs[2], &bucket_with_accessor.sub_aggregation);
|
||||
self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation);
|
||||
}
|
||||
for doc in iter.remainder() {
|
||||
let val = bucket_with_accessor.accessor.get(*doc);
|
||||
let bucket_pos = self.get_bucket_pos(val);
|
||||
self.increment_bucket(bucket_pos, *doc, &bucket_with_accessor.sub_aggregation);
|
||||
}
|
||||
if force_flush {
|
||||
for bucket in &mut self.buckets {
|
||||
if let Some(sub_aggregation) = &mut bucket.bucket.sub_aggregation {
|
||||
sub_aggregation
|
||||
.flush_staged_docs(&bucket_with_accessor.sub_aggregation, force_flush);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn increment_bucket(
|
||||
&mut self,
|
||||
bucket_pos: usize,
|
||||
doc: DocId,
|
||||
bucket_with_accessor: &AggregationsWithAccessor,
|
||||
) {
|
||||
let bucket = &mut self.buckets[bucket_pos];
|
||||
|
||||
bucket.bucket.doc_count += 1;
|
||||
if let Some(sub_aggregation) = &mut bucket.bucket.sub_aggregation {
|
||||
sub_aggregation.collect(doc, bucket_with_accessor);
|
||||
}
|
||||
}
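// Illustrative example (taken from `range_binary_search_test_u64` below): with the extended
// buckets [u64::MIN..10, 10..100, 100..u64::MAX], get_bucket_pos(9) == 0,
// get_bucket_pos(10) == 1 and get_bucket_pos(100) == 2, because the binary search keys on
// each bucket's inclusive start.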
|
||||
|
||||
#[inline]
|
||||
fn get_bucket_pos(&self, val: u64) -> usize {
|
||||
let pos = self
|
||||
.buckets
|
||||
.binary_search_by_key(&val, |probe| probe.range.start)
|
||||
.unwrap_or_else(|pos| pos - 1);
|
||||
debug_assert!(self.buckets[pos].range.contains(&val));
|
||||
pos
|
||||
}
|
||||
}
|
||||
|
||||
/// Converts the user provided f64 range value to fast field value space.
|
||||
///
|
||||
/// Internally fast field values are always stored as u64.
|
||||
/// If the fast field has u64 [1,2,5], these values are stored as is in the fast field.
|
||||
/// A fast field with f64 [1.0, 2.0, 5.0] is converted to u64 space, using a
|
||||
/// monotonic mapping function, so the order is preserved.
|
||||
///
|
||||
/// Consequently, a f64 user range 1.0..3.0 needs to be converted to fast field value space using
|
||||
/// the same monotonic mapping function, so that the provided ranges contain the u64 values in the
|
||||
/// fast field.
|
||||
/// The alternative would be that every value read would be converted to the f64 range, but that is
|
||||
/// more computationally expensive when many documents are hit.
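///
/// # Example (illustrative sketch)
///
/// `to_u64` below comes from `crate::fastfield::FastValue` (also used in the tests of this
/// module); the concrete values are only for illustration.
///
/// ```ignore
/// let from = 1.0f64.to_u64();
/// let to = 3.0f64.to_u64();
/// // The mapping is monotonic, so 2.0 still falls inside the converted range.
/// assert!((from..to).contains(&2.0f64.to_u64()));
/// ```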
|
||||
fn to_u64_range(range: &RangeAggregationRange, field_type: &Type) -> crate::Result<Range<u64>> {
|
||||
let start = if let Some(from) = range.from {
|
||||
f64_to_fastfield_u64(from, field_type)
|
||||
.ok_or_else(|| TantivyError::InvalidArgument("invalid field type".to_string()))?
|
||||
} else {
|
||||
u64::MIN
|
||||
};
|
||||
|
||||
let end = if let Some(to) = range.to {
|
||||
f64_to_fastfield_u64(to, field_type)
|
||||
.ok_or_else(|| TantivyError::InvalidArgument("invalid field type".to_string()))?
|
||||
} else {
|
||||
u64::MAX
|
||||
};
|
||||
|
||||
Ok(start..end)
|
||||
}
|
||||
|
||||
/// Extends the provided buckets to contain the whole value range, by inserting buckets at the
|
||||
/// beginning and end.
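///
/// For example (sketch, values illustrative): the user ranges `[10..20, 30..40]` are extended to
/// `[u64::MIN..10, 10..20, 20..30, 30..40, 40..u64::MAX]`, so that every fast field value falls
/// into exactly one bucket (see `bucket_test_extend_range_hole` in the tests below).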
|
||||
fn extend_validate_ranges(
|
||||
buckets: &[RangeAggregationRange],
|
||||
field_type: &Type,
|
||||
) -> crate::Result<Vec<Range<u64>>> {
|
||||
let mut converted_buckets = buckets
|
||||
.iter()
|
||||
.map(|range| to_u64_range(range, field_type))
|
||||
.collect::<crate::Result<Vec<_>>>()?;
|
||||
|
||||
converted_buckets.sort_by_key(|bucket| bucket.start);
|
||||
if converted_buckets[0].start != u64::MIN {
|
||||
converted_buckets.insert(0, u64::MIN..converted_buckets[0].start);
|
||||
}
|
||||
|
||||
if converted_buckets[converted_buckets.len() - 1].end != u64::MAX {
|
||||
converted_buckets.push(converted_buckets[converted_buckets.len() - 1].end..u64::MAX);
|
||||
}
|
||||
|
||||
// fill up holes in the ranges
|
||||
let find_hole = |converted_buckets: &[Range<u64>]| {
|
||||
for (pos, ranges) in converted_buckets.windows(2).enumerate() {
|
||||
if ranges[0].end > ranges[1].start {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"Overlapping ranges not supported range {:?}, range+1 {:?}",
|
||||
ranges[0], ranges[1]
|
||||
)));
|
||||
}
|
||||
if ranges[0].end != ranges[1].start {
|
||||
return Ok(Some(pos));
|
||||
}
|
||||
}
|
||||
Ok(None)
|
||||
};
|
||||
|
||||
while let Some(hole_pos) = find_hole(&converted_buckets)? {
|
||||
let new_range = converted_buckets[hole_pos].end..converted_buckets[hole_pos + 1].start;
|
||||
converted_buckets.insert(hole_pos + 1, new_range);
|
||||
}
|
||||
|
||||
Ok(converted_buckets)
|
||||
}
|
||||
|
||||
pub(crate) fn range_to_string(range: &Range<u64>, field_type: &Type) -> String {
|
||||
// is_start is there for malformed requests, e.g. if the user passes the range u64::MIN..0.0,
|
||||
// it should be rendered as "*-0" and not "*-*"
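// Sketch of the resulting keys (values illustrative, assuming a u64 fast field):
//   u64::MIN..10  -> "*-10"
//   10..u64::MAX  -> "10-*"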
|
||||
let to_str = |val: u64, is_start: bool| {
|
||||
if (is_start && val == u64::MIN) || (!is_start && val == u64::MAX) {
|
||||
"*".to_string()
|
||||
} else {
|
||||
f64_from_fastfield_u64(val, field_type).to_string()
|
||||
}
|
||||
};
|
||||
|
||||
format!("{}-{}", to_str(range.start, true), to_str(range.end, false))
|
||||
}
|
||||
|
||||
pub(crate) fn range_to_key(range: &Range<u64>, field_type: &Type) -> Key {
|
||||
Key::Str(range_to_string(range, field_type))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use serde_json::Value;
|
||||
|
||||
use super::*;
|
||||
use crate::aggregation::agg_req::{
|
||||
Aggregation, Aggregations, BucketAggregation, BucketAggregationType,
|
||||
};
|
||||
use crate::aggregation::tests::get_test_index_with_num_docs;
|
||||
use crate::aggregation::AggregationCollector;
|
||||
use crate::fastfield::FastValue;
|
||||
use crate::query::AllQuery;
|
||||
|
||||
pub fn get_collector_from_ranges(
|
||||
ranges: Vec<RangeAggregationRange>,
|
||||
field_type: Type,
|
||||
) -> SegmentRangeCollector {
|
||||
let req = RangeAggregation {
|
||||
field: "dummy".to_string(),
|
||||
ranges,
|
||||
};
|
||||
|
||||
SegmentRangeCollector::from_req_and_validate(&req, &Default::default(), field_type).unwrap()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn range_fraction_test() -> crate::Result<()> {
|
||||
let index = get_test_index_with_num_docs(false, 100)?;
|
||||
|
||||
let agg_req: Aggregations = vec![(
|
||||
"range".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Range(RangeAggregation {
|
||||
field: "fraction_f64".to_string(),
|
||||
ranges: vec![(0f64..0.1f64).into(), (0.1f64..0.2f64).into()],
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req);
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let agg_res = searcher.search(&AllQuery, &collector).unwrap();
|
||||
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
|
||||
assert_eq!(res["range"]["buckets"][0]["key"], "*-0");
|
||||
assert_eq!(res["range"]["buckets"][0]["doc_count"], 0);
|
||||
assert_eq!(res["range"]["buckets"][1]["key"], "0-0.1");
|
||||
assert_eq!(res["range"]["buckets"][1]["doc_count"], 10);
|
||||
assert_eq!(res["range"]["buckets"][2]["key"], "0.1-0.2");
|
||||
assert_eq!(res["range"]["buckets"][2]["doc_count"], 10);
|
||||
assert_eq!(res["range"]["buckets"][3]["key"], "0.2-*");
|
||||
assert_eq!(res["range"]["buckets"][3]["doc_count"], 80);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bucket_test_extend_range_hole() {
|
||||
let buckets = vec![(10f64..20f64).into(), (30f64..40f64).into()];
|
||||
let collector = get_collector_from_ranges(buckets, Type::F64);
|
||||
|
||||
let buckets = collector.buckets;
|
||||
assert_eq!(buckets[0].range.start, u64::MIN);
|
||||
assert_eq!(buckets[0].range.end, 10f64.to_u64());
|
||||
assert_eq!(buckets[1].range.start, 10f64.to_u64());
|
||||
assert_eq!(buckets[1].range.end, 20f64.to_u64());
|
||||
// Added bucket to fill hole
|
||||
assert_eq!(buckets[2].range.start, 20f64.to_u64());
|
||||
assert_eq!(buckets[2].range.end, 30f64.to_u64());
|
||||
assert_eq!(buckets[3].range.start, 30f64.to_u64());
|
||||
assert_eq!(buckets[3].range.end, 40f64.to_u64());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bucket_test_range_conversion_special_case() {
|
||||
// The monotonic conversion between f64 and u64 does not map f64::MIN.to_u64() to
|
||||
// u64::MIN, but the Into trait converts f64::MIN/MAX to None.
|
||||
let buckets = vec![
|
||||
(f64::MIN..10f64).into(),
|
||||
(10f64..20f64).into(),
|
||||
(20f64..f64::MAX).into(),
|
||||
];
|
||||
let collector = get_collector_from_ranges(buckets, Type::F64);
|
||||
|
||||
let buckets = collector.buckets;
|
||||
assert_eq!(buckets[0].range.start, u64::MIN);
|
||||
assert_eq!(buckets[0].range.end, 10f64.to_u64());
|
||||
assert_eq!(buckets[1].range.start, 10f64.to_u64());
|
||||
assert_eq!(buckets[1].range.end, 20f64.to_u64());
|
||||
assert_eq!(buckets[2].range.start, 20f64.to_u64());
|
||||
assert_eq!(buckets[2].range.end, u64::MAX);
|
||||
assert_eq!(buckets.len(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bucket_range_test_negative_vals() {
|
||||
let buckets = vec![(-10f64..-1f64).into()];
|
||||
let collector = get_collector_from_ranges(buckets, Type::F64);
|
||||
|
||||
let buckets = collector.buckets;
|
||||
assert_eq!(&buckets[0].bucket.key.to_string(), "*--10");
|
||||
assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "-1-*");
|
||||
}
|
||||
#[test]
|
||||
fn bucket_range_test_positive_vals() {
|
||||
let buckets = vec![(0f64..10f64).into()];
|
||||
let collector = get_collector_from_ranges(buckets, Type::F64);
|
||||
|
||||
let buckets = collector.buckets;
|
||||
assert_eq!(&buckets[0].bucket.key.to_string(), "*-0");
|
||||
assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "10-*");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn range_binary_search_test_u64() {
|
||||
let check_ranges = |ranges: Vec<RangeAggregationRange>| {
|
||||
let collector = get_collector_from_ranges(ranges, Type::U64);
|
||||
let search = |val: u64| collector.get_bucket_pos(val);
|
||||
|
||||
assert_eq!(search(u64::MIN), 0);
|
||||
assert_eq!(search(9), 0);
|
||||
assert_eq!(search(10), 1);
|
||||
assert_eq!(search(11), 1);
|
||||
assert_eq!(search(99), 1);
|
||||
assert_eq!(search(100), 2);
|
||||
assert_eq!(search(u64::MAX - 1), 2); // The range end is exclusive, so test with the max value minus one.
|
||||
};
|
||||
|
||||
let ranges = vec![(10.0..100.0).into()];
|
||||
check_ranges(ranges);
|
||||
|
||||
let ranges = vec![
|
||||
RangeAggregationRange {
|
||||
to: Some(10.0),
|
||||
from: None,
|
||||
},
|
||||
(10.0..100.0).into(),
|
||||
];
|
||||
check_ranges(ranges);
|
||||
|
||||
let ranges = vec![
|
||||
RangeAggregationRange {
|
||||
to: Some(10.0),
|
||||
from: None,
|
||||
},
|
||||
(10.0..100.0).into(),
|
||||
RangeAggregationRange {
|
||||
to: None,
|
||||
from: Some(100.0),
|
||||
},
|
||||
];
|
||||
check_ranges(ranges);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn range_binary_search_test_f64() {
|
||||
let ranges = vec![
|
||||
//(f64::MIN..10.0).into(),
|
||||
(10.0..100.0).into(),
|
||||
//(100.0..f64::MAX).into(),
|
||||
];
|
||||
|
||||
let collector = get_collector_from_ranges(ranges, Type::F64);
|
||||
let search = |val: u64| collector.get_bucket_pos(val);
|
||||
|
||||
assert_eq!(search(u64::MIN), 0);
|
||||
assert_eq!(search(9f64.to_u64()), 0);
|
||||
assert_eq!(search(10f64.to_u64()), 1);
|
||||
assert_eq!(search(11f64.to_u64()), 1);
|
||||
assert_eq!(search(99f64.to_u64()), 1);
|
||||
assert_eq!(search(100f64.to_u64()), 2);
|
||||
assert_eq!(search(u64::MAX - 1), 2); // The range end is exclusive, so test with the max value minus one.
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use itertools::Itertools;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
|
||||
use super::*;
|
||||
use crate::aggregation::bucket::range::tests::get_collector_from_ranges;
|
||||
|
||||
const TOTAL_DOCS: u64 = 1_000_000u64;
|
||||
const NUM_DOCS: u64 = 50_000u64;
|
||||
|
||||
fn get_collector_with_buckets(num_buckets: u64, num_docs: u64) -> SegmentRangeCollector {
|
||||
let bucket_size = num_docs / num_buckets;
|
||||
let mut buckets: Vec<RangeAggregationRange> = vec![];
|
||||
for i in 0..num_buckets {
|
||||
let bucket_start = (i * bucket_size) as f64;
|
||||
buckets.push((bucket_start..bucket_start + bucket_size as f64).into())
|
||||
}
|
||||
|
||||
get_collector_from_ranges(buckets, Type::U64)
|
||||
}
|
||||
|
||||
fn get_rand_docs(total_docs: u64, num_docs_returned: u64) -> Vec<u64> {
|
||||
let mut rng = thread_rng();
|
||||
|
||||
let all_docs = (0..total_docs - 1).collect_vec();
|
||||
let mut vals = all_docs
|
||||
.as_slice()
|
||||
.choose_multiple(&mut rng, num_docs_returned as usize)
|
||||
.cloned()
|
||||
.collect_vec();
|
||||
vals.sort();
|
||||
vals
|
||||
}
|
||||
|
||||
fn bench_range_binary_search(b: &mut test::Bencher, num_buckets: u64) {
|
||||
let collector = get_collector_with_buckets(num_buckets, TOTAL_DOCS);
|
||||
let vals = get_rand_docs(TOTAL_DOCS, NUM_DOCS);
|
||||
b.iter(|| {
|
||||
let mut bucket_pos = 0;
|
||||
for val in &vals {
|
||||
bucket_pos = collector.get_bucket_pos(*val);
|
||||
}
|
||||
bucket_pos
|
||||
})
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_range_100_buckets(b: &mut test::Bencher) {
|
||||
bench_range_binary_search(b, 100)
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_range_10_buckets(b: &mut test::Bencher) {
|
||||
bench_range_binary_search(b, 10)
|
||||
}
|
||||
}
|
||||
149
src/aggregation/collector.rs
Normal file
@@ -0,0 +1,149 @@
|
||||
use super::agg_req::Aggregations;
|
||||
use super::agg_req_with_accessor::AggregationsWithAccessor;
|
||||
use super::agg_result::AggregationResults;
|
||||
use super::intermediate_agg_result::IntermediateAggregationResults;
|
||||
use super::segment_agg_result::SegmentAggregationResultsCollector;
|
||||
use crate::aggregation::agg_req_with_accessor::get_aggs_with_accessor_and_validate;
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::{SegmentReader, TantivyError};
|
||||
|
||||
/// Collector for aggregations.
|
||||
///
|
||||
/// The collector collects all aggregations specified in the underlying aggregation request.
|
||||
pub struct AggregationCollector {
|
||||
agg: Aggregations,
|
||||
}
|
||||
|
||||
impl AggregationCollector {
|
||||
/// Create collector from aggregation request.
|
||||
pub fn from_aggs(agg: Aggregations) -> Self {
|
||||
Self { agg }
|
||||
}
|
||||
}
|
||||
|
||||
/// Collector for distributed aggregations.
|
||||
///
|
||||
/// The collector collects all aggregations specified in the underlying aggregation request.
|
||||
///
|
||||
/// # Purpose
|
||||
/// AggregationCollector returns `IntermediateAggregationResults` and not the final
|
||||
/// `AggregationResults`, so that results from different indices can be merged and then converted
|
||||
/// into the final `AggregationResults` via the `into()` method.
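///
/// # Example (illustrative sketch)
///
/// `searcher_a`, `searcher_b` and `agg_req` are assumed to exist; this only sketches how
/// intermediate results from two searches can be merged and converted.
///
/// ```ignore
/// let collector = DistributedAggregationCollector::from_aggs(agg_req);
/// let mut merged = searcher_a.search(&AllQuery, &collector)?;
/// let other = searcher_b.search(&AllQuery, &collector)?;
/// merged.merge_fruits(&other);
/// let final_result: AggregationResults = merged.into();
/// ```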
|
||||
pub struct DistributedAggregationCollector {
|
||||
agg: Aggregations,
|
||||
}
|
||||
|
||||
impl DistributedAggregationCollector {
|
||||
/// Create collector from aggregation request.
|
||||
pub fn from_aggs(agg: Aggregations) -> Self {
|
||||
Self { agg }
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for DistributedAggregationCollector {
|
||||
type Fruit = IntermediateAggregationResults;
|
||||
|
||||
type Child = AggregationSegmentCollector;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
_segment_local_id: crate::SegmentOrdinal,
|
||||
reader: &crate::SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
AggregationSegmentCollector::from_agg_req_and_reader(&self.agg, reader)
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn merge_fruits(
|
||||
&self,
|
||||
segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
|
||||
) -> crate::Result<Self::Fruit> {
|
||||
merge_fruits(segment_fruits)
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for AggregationCollector {
|
||||
type Fruit = AggregationResults;
|
||||
|
||||
type Child = AggregationSegmentCollector;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
_segment_local_id: crate::SegmentOrdinal,
|
||||
reader: &crate::SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let aggs_with_accessor = get_aggs_with_accessor_and_validate(&self.agg, reader)?;
|
||||
let result =
|
||||
SegmentAggregationResultsCollector::from_req_and_validate(&aggs_with_accessor)?;
|
||||
Ok(AggregationSegmentCollector {
|
||||
aggs: aggs_with_accessor,
|
||||
result,
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn merge_fruits(
|
||||
&self,
|
||||
segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
|
||||
) -> crate::Result<Self::Fruit> {
|
||||
merge_fruits(segment_fruits).map(|res| res.into())
|
||||
}
|
||||
}
|
||||
|
||||
fn merge_fruits(
|
||||
mut segment_fruits: Vec<IntermediateAggregationResults>,
|
||||
) -> crate::Result<IntermediateAggregationResults> {
|
||||
if let Some(mut fruit) = segment_fruits.pop() {
|
||||
for next_fruit in segment_fruits {
|
||||
fruit.merge_fruits(&next_fruit);
|
||||
}
|
||||
Ok(fruit)
|
||||
} else {
|
||||
Err(TantivyError::InvalidArgument(
|
||||
"no fruits provided in merge_fruits".to_string(),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// AggregationSegmentCollector does the aggregation collection on a segment.
|
||||
pub struct AggregationSegmentCollector {
|
||||
aggs: AggregationsWithAccessor,
|
||||
result: SegmentAggregationResultsCollector,
|
||||
}
|
||||
|
||||
impl AggregationSegmentCollector {
|
||||
/// Creates an AggregationSegmentCollector from an [Aggregations] request and a segment reader.
|
||||
/// Also includes validation, e.g. checking field types and existence.
|
||||
pub fn from_agg_req_and_reader(
|
||||
agg: &Aggregations,
|
||||
reader: &SegmentReader,
|
||||
) -> crate::Result<Self> {
|
||||
let aggs_with_accessor = get_aggs_with_accessor_and_validate(agg, reader)?;
|
||||
let result =
|
||||
SegmentAggregationResultsCollector::from_req_and_validate(&aggs_with_accessor)?;
|
||||
Ok(AggregationSegmentCollector {
|
||||
aggs: aggs_with_accessor,
|
||||
result,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for AggregationSegmentCollector {
|
||||
type Fruit = IntermediateAggregationResults;
|
||||
|
||||
#[inline]
|
||||
fn collect(&mut self, doc: crate::DocId, _score: crate::Score) {
|
||||
self.result.collect(doc, &self.aggs);
|
||||
}
|
||||
|
||||
fn harvest(mut self) -> Self::Fruit {
|
||||
self.result.flush_staged_docs(&self.aggs, true);
|
||||
self.result.into()
|
||||
}
|
||||
}
|
||||
304
src/aggregation/intermediate_agg_result.rs
Normal file
@@ -0,0 +1,304 @@
|
||||
//! Contains the intermediate aggregation tree, which can be merged.
|
||||
//! Intermediate aggregation results can be used to merge results between segments or between
|
||||
//! indices.
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::metric::{IntermediateAverage, IntermediateStats};
|
||||
use super::segment_agg_result::{
|
||||
SegmentAggregationResultsCollector, SegmentBucketResultCollector, SegmentMetricResultCollector,
|
||||
SegmentRangeBucketEntry,
|
||||
};
|
||||
use super::{Key, SerializedKey, VecWithNames};
|
||||
|
||||
/// Contains the intermediate aggregation result, which is optimized to be merged with other
|
||||
/// intermediate results.
|
||||
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateAggregationResults(pub(crate) VecWithNames<IntermediateAggregationResult>);
|
||||
|
||||
impl From<SegmentAggregationResultsCollector> for IntermediateAggregationResults {
|
||||
fn from(tree: SegmentAggregationResultsCollector) -> Self {
|
||||
let mut data = vec![];
|
||||
for (key, bucket) in tree.buckets.into_iter() {
|
||||
data.push((key, IntermediateAggregationResult::Bucket(bucket.into())));
|
||||
}
|
||||
for (key, metric) in tree.metrics.into_iter() {
|
||||
data.push((key, IntermediateAggregationResult::Metric(metric.into())));
|
||||
}
|
||||
Self(VecWithNames::from_entries(data))
|
||||
}
|
||||
}
|
||||
|
||||
impl IntermediateAggregationResults {
|
||||
/// Merge an other intermediate aggregation result into this result.
|
||||
///
|
||||
/// The order of the values needs to be the same in both results. This is ensured when the same
|
||||
/// (key, value) entries are present in the underlying VecWithNames struct.
|
||||
pub fn merge_fruits(&mut self, other: &IntermediateAggregationResults) {
|
||||
for (tree_left, tree_right) in self.0.values_mut().zip(other.0.values()) {
|
||||
tree_left.merge_fruits(tree_right);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An aggregation is either a bucket or a metric.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub enum IntermediateAggregationResult {
|
||||
/// Bucket variant
|
||||
Bucket(IntermediateBucketResult),
|
||||
/// Metric variant
|
||||
Metric(IntermediateMetricResult),
|
||||
}
|
||||
|
||||
impl IntermediateAggregationResult {
|
||||
fn merge_fruits(&mut self, other: &IntermediateAggregationResult) {
|
||||
match (self, other) {
|
||||
(
|
||||
IntermediateAggregationResult::Bucket(res_left),
|
||||
IntermediateAggregationResult::Bucket(res_right),
|
||||
) => {
|
||||
res_left.merge_fruits(res_right);
|
||||
}
|
||||
(
|
||||
IntermediateAggregationResult::Metric(res_left),
|
||||
IntermediateAggregationResult::Metric(res_right),
|
||||
) => {
|
||||
res_left.merge_fruits(res_right);
|
||||
}
|
||||
_ => {
|
||||
panic!("incompatible types in aggregation tree on merge fruits");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Holds the intermediate data for metric results
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub enum IntermediateMetricResult {
|
||||
/// Average containing intermediate average data result
|
||||
Average(IntermediateAverage),
|
||||
/// Stats variant containing intermediate stats data
|
||||
Stats(IntermediateStats),
|
||||
}
|
||||
|
||||
impl From<SegmentMetricResultCollector> for IntermediateMetricResult {
|
||||
fn from(tree: SegmentMetricResultCollector) -> Self {
|
||||
match tree {
|
||||
SegmentMetricResultCollector::Average(collector) => {
|
||||
IntermediateMetricResult::Average(IntermediateAverage::from_collector(collector))
|
||||
}
|
||||
SegmentMetricResultCollector::Stats(collector) => {
|
||||
IntermediateMetricResult::Stats(collector.stats)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IntermediateMetricResult {
|
||||
fn merge_fruits(&mut self, other: &IntermediateMetricResult) {
|
||||
match (self, other) {
|
||||
(
|
||||
IntermediateMetricResult::Average(avg_data_left),
|
||||
IntermediateMetricResult::Average(avg_data_right),
|
||||
) => {
|
||||
avg_data_left.merge_fruits(avg_data_right);
|
||||
}
|
||||
(
|
||||
IntermediateMetricResult::Stats(stats_left),
|
||||
IntermediateMetricResult::Stats(stats_right),
|
||||
) => {
|
||||
stats_left.merge_fruits(stats_right);
|
||||
}
|
||||
_ => {
|
||||
panic!("incompatible fruit types in tree {:?}", other);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The intermediate bucket results. Internally they can be easily merged via the keys of the
|
||||
/// buckets.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub enum IntermediateBucketResult {
|
||||
/// This is the range entry for a bucket, which contains a key, count, from, to, and optionally
|
||||
/// sub_aggregations.
|
||||
Range(HashMap<SerializedKey, IntermediateRangeBucketEntry>),
|
||||
}
|
||||
|
||||
impl From<SegmentBucketResultCollector> for IntermediateBucketResult {
|
||||
fn from(collector: SegmentBucketResultCollector) -> Self {
|
||||
match collector {
|
||||
SegmentBucketResultCollector::Range(range) => range.into_intermediate_bucket_result(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IntermediateBucketResult {
|
||||
fn merge_fruits(&mut self, other: &IntermediateBucketResult) {
|
||||
match (self, other) {
|
||||
(
|
||||
IntermediateBucketResult::Range(entries_left),
|
||||
IntermediateBucketResult::Range(entries_right),
|
||||
) => {
|
||||
for (name, entry_left) in entries_left.iter_mut() {
|
||||
if let Some(entry_right) = entries_right.get(name) {
|
||||
entry_left.merge_fruits(entry_right);
|
||||
}
|
||||
}
|
||||
|
||||
for (key, res) in entries_right.iter() {
|
||||
if !entries_left.contains_key(key) {
|
||||
entries_left.insert(key.clone(), res.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This is the range entry for a bucket, which contains a key, count, and optionally
|
||||
/// sub_aggregations.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateRangeBucketEntry {
|
||||
/// The unique key by which the bucket is identified.
|
||||
pub key: Key,
|
||||
/// The number of documents in the bucket.
|
||||
pub doc_count: u64,
|
||||
pub(crate) values: Option<Vec<u64>>,
|
||||
/// The sub_aggregation in this bucket.
|
||||
pub sub_aggregation: IntermediateAggregationResults,
|
||||
/// The from range of the bucket. Equals f64::MIN when None.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub from: Option<f64>,
|
||||
/// The to range of the bucket. Equals f64::MAX when None.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub to: Option<f64>,
|
||||
}
|
||||
|
||||
impl From<SegmentRangeBucketEntry> for IntermediateRangeBucketEntry {
|
||||
fn from(entry: SegmentRangeBucketEntry) -> Self {
|
||||
let sub_aggregation = if let Some(sub_aggregation) = entry.sub_aggregation {
|
||||
sub_aggregation.into()
|
||||
} else {
|
||||
Default::default()
|
||||
};
|
||||
// let sub_aggregation = entry.sub_aggregation.into();
|
||||
|
||||
IntermediateRangeBucketEntry {
|
||||
key: entry.key,
|
||||
doc_count: entry.doc_count,
|
||||
values: None,
|
||||
sub_aggregation,
|
||||
to: entry.to,
|
||||
from: entry.from,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IntermediateRangeBucketEntry {
|
||||
fn merge_fruits(&mut self, other: &IntermediateRangeBucketEntry) {
|
||||
self.doc_count += other.doc_count;
|
||||
self.sub_aggregation.merge_fruits(&other.sub_aggregation);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
use super::*;
|
||||
|
||||
fn get_sub_test_tree(data: &[(String, u64)]) -> IntermediateAggregationResults {
|
||||
let mut map = HashMap::new();
|
||||
let mut buckets = HashMap::new();
|
||||
for (key, doc_count) in data {
|
||||
buckets.insert(
|
||||
key.to_string(),
|
||||
IntermediateRangeBucketEntry {
|
||||
key: Key::Str(key.to_string()),
|
||||
doc_count: *doc_count,
|
||||
values: None,
|
||||
sub_aggregation: Default::default(),
|
||||
from: None,
|
||||
to: None,
|
||||
},
|
||||
);
|
||||
}
|
||||
map.insert(
|
||||
"my_agg_level2".to_string(),
|
||||
IntermediateAggregationResult::Bucket(IntermediateBucketResult::Range(buckets)),
|
||||
);
|
||||
IntermediateAggregationResults(VecWithNames::from_entries(map.into_iter().collect()))
|
||||
}
|
||||
|
||||
fn get_test_tree(data: &[(String, u64, String, u64)]) -> IntermediateAggregationResults {
|
||||
let mut map = HashMap::new();
|
||||
let mut buckets = HashMap::new();
|
||||
for (key, doc_count, sub_aggregation_key, sub_aggregation_count) in data {
|
||||
buckets.insert(
|
||||
key.to_string(),
|
||||
IntermediateRangeBucketEntry {
|
||||
key: Key::Str(key.to_string()),
|
||||
doc_count: *doc_count,
|
||||
values: None,
|
||||
from: None,
|
||||
to: None,
|
||||
sub_aggregation: get_sub_test_tree(&[(
|
||||
sub_aggregation_key.to_string(),
|
||||
*sub_aggregation_count,
|
||||
)]),
|
||||
},
|
||||
);
|
||||
}
|
||||
map.insert(
|
||||
"my_agg_level1".to_string(),
|
||||
IntermediateAggregationResult::Bucket(IntermediateBucketResult::Range(buckets)),
|
||||
);
|
||||
IntermediateAggregationResults(VecWithNames::from_entries(map.into_iter().collect()))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_fruits_tree_1() {
|
||||
let mut tree_left = get_test_tree(&[
|
||||
("red".to_string(), 50, "1900".to_string(), 25),
|
||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||
]);
|
||||
let tree_right = get_test_tree(&[
|
||||
("red".to_string(), 60, "1900".to_string(), 30),
|
||||
("blue".to_string(), 25, "1900".to_string(), 50),
|
||||
]);
|
||||
|
||||
tree_left.merge_fruits(&tree_right);
|
||||
|
||||
let tree_expected = get_test_tree(&[
|
||||
("red".to_string(), 110, "1900".to_string(), 55),
|
||||
("blue".to_string(), 55, "1900".to_string(), 80),
|
||||
]);
|
||||
|
||||
assert_eq!(tree_left, tree_expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_fruits_tree_2() {
|
||||
let mut tree_left = get_test_tree(&[
|
||||
("red".to_string(), 50, "1900".to_string(), 25),
|
||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||
]);
|
||||
let tree_right = get_test_tree(&[
|
||||
("red".to_string(), 60, "1900".to_string(), 30),
|
||||
("green".to_string(), 25, "1900".to_string(), 50),
|
||||
]);
|
||||
|
||||
tree_left.merge_fruits(&tree_right);
|
||||
|
||||
let tree_expected = get_test_tree(&[
|
||||
("red".to_string(), 110, "1900".to_string(), 55),
|
||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||
("green".to_string(), 25, "1900".to_string(), 50),
|
||||
]);
|
||||
|
||||
assert_eq!(tree_left, tree_expected);
|
||||
}
|
||||
}
|
||||
114
src/aggregation/metric/average.rs
Normal file
@@ -0,0 +1,114 @@
|
||||
use std::fmt::Debug;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::aggregation::f64_from_fastfield_u64;
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
|
||||
use crate::schema::Type;
|
||||
use crate::DocId;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
/// A single-value metric aggregation that computes the average of numeric values that are
|
||||
/// extracted from the aggregated documents.
|
||||
/// Supported field types are u64, i64, and f64.
|
||||
/// See [super::SingleMetricResult] for return value.
|
||||
///
|
||||
/// # JSON Format
|
||||
/// ```json
|
||||
/// {
|
||||
/// "avg": {
|
||||
/// "field": "score",
|
||||
/// }
|
||||
/// }
|
||||
/// ```
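///
/// # Rust construction (sketch)
///
/// The field name `"score"` below is only an illustrative assumption.
///
/// ```ignore
/// let avg_agg = AverageAggregation::from_field_name("score".to_string());
/// assert_eq!(avg_agg.field_name(), "score");
/// ```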
|
||||
pub struct AverageAggregation {
|
||||
/// The field name to compute the stats on.
|
||||
pub field: String,
|
||||
}
|
||||
impl AverageAggregation {
|
||||
/// Create new AverageAggregation from a field.
|
||||
pub fn from_field_name(field_name: String) -> Self {
|
||||
AverageAggregation { field: field_name }
|
||||
}
|
||||
/// Return the field name.
|
||||
pub fn field_name(&self) -> &str {
|
||||
&self.field
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq)]
|
||||
pub(crate) struct SegmentAverageCollector {
|
||||
pub data: IntermediateAverage,
|
||||
field_type: Type,
|
||||
}
|
||||
|
||||
impl Debug for SegmentAverageCollector {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("AverageCollector")
|
||||
.field("data", &self.data)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentAverageCollector {
|
||||
pub fn from_req(field_type: Type) -> Self {
|
||||
Self {
|
||||
field_type,
|
||||
data: Default::default(),
|
||||
}
|
||||
}
|
||||
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader<u64>) {
|
||||
let mut iter = doc.chunks_exact(4);
|
||||
for docs in iter.by_ref() {
|
||||
let val1 = field.get(docs[0]);
|
||||
let val2 = field.get(docs[1]);
|
||||
let val3 = field.get(docs[2]);
|
||||
let val4 = field.get(docs[3]);
|
||||
let val1 = f64_from_fastfield_u64(val1, &self.field_type);
|
||||
let val2 = f64_from_fastfield_u64(val2, &self.field_type);
|
||||
let val3 = f64_from_fastfield_u64(val3, &self.field_type);
|
||||
let val4 = f64_from_fastfield_u64(val4, &self.field_type);
|
||||
self.data.collect(val1);
|
||||
self.data.collect(val2);
|
||||
self.data.collect(val3);
|
||||
self.data.collect(val4);
|
||||
}
|
||||
for doc in iter.remainder() {
|
||||
let val = field.get(*doc);
|
||||
let val = f64_from_fastfield_u64(val, &self.field_type);
|
||||
self.data.collect(val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Contains mergeable version of average data.
|
||||
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateAverage {
|
||||
pub(crate) sum: f64,
|
||||
pub(crate) doc_count: u64,
|
||||
}
|
||||
|
||||
impl IntermediateAverage {
|
||||
pub(crate) fn from_collector(collector: SegmentAverageCollector) -> Self {
|
||||
collector.data
|
||||
}
|
||||
|
||||
/// Merge average data into this instance.
|
||||
pub fn merge_fruits(&mut self, other: &IntermediateAverage) {
|
||||
self.sum += other.sum;
|
||||
self.doc_count += other.doc_count;
|
||||
}
|
||||
/// Compute the final result.
|
||||
pub fn finalize(&self) -> Option<f64> {
|
||||
if self.doc_count == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(self.sum / self.doc_count as f64)
|
||||
}
|
||||
}
|
||||
#[inline]
|
||||
fn collect(&mut self, val: f64) {
|
||||
self.doc_count += 1;
|
||||
self.sum += val;
|
||||
}
|
||||
}
|
||||
30
src/aggregation/metric/mod.rs
Normal file
@@ -0,0 +1,30 @@
|
||||
//! Module for all metric aggregations.
|
||||
//!
|
||||
//! The aggregations in this family compute metrics, see [super::agg_req::MetricAggregation] for
|
||||
//! details.
|
||||
mod average;
|
||||
mod stats;
|
||||
pub use average::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
pub use stats::*;
|
||||
|
||||
/// Single-metric aggregations use this common result structure.
|
||||
///
|
||||
/// The main reason to wrap the value is to match the Elasticsearch output structure.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct SingleMetricResult {
|
||||
/// The value of the single value metric.
|
||||
pub value: Option<f64>,
|
||||
}
|
||||
|
||||
impl From<f64> for SingleMetricResult {
|
||||
fn from(value: f64) -> Self {
|
||||
Self { value: Some(value) }
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Option<f64>> for SingleMetricResult {
|
||||
fn from(value: Option<f64>) -> Self {
|
||||
Self { value }
|
||||
}
|
||||
}
|
||||
314
src/aggregation/metric/stats.rs
Normal file
@@ -0,0 +1,314 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::aggregation::f64_from_fastfield_u64;
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
|
||||
use crate::schema::Type;
|
||||
use crate::DocId;
|
||||
|
||||
/// A multi-value metric aggregation that computes stats of numeric values that are
|
||||
/// extracted from the aggregated documents.
|
||||
/// Supported field types are u64, i64, and f64.
|
||||
/// See [Stats] for returned statistics.
|
||||
///
|
||||
/// # JSON Format
|
||||
/// ```json
|
||||
/// {
|
||||
/// "stats": {
|
||||
/// "field": "score",
|
||||
/// }
|
||||
/// }
|
||||
/// ```
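///
/// # Rust construction (sketch)
///
/// The field name `"score"` below is only an illustrative assumption.
///
/// ```ignore
/// let stats_agg = StatsAggregation::from_field_name("score".to_string());
/// assert_eq!(stats_agg.field_name(), "score");
/// ```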
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct StatsAggregation {
|
||||
/// The field name to compute the stats on.
|
||||
pub field: String,
|
||||
}
|
||||
|
||||
impl StatsAggregation {
|
||||
/// Create new StatsAggregation from a field.
|
||||
pub fn from_field_name(field_name: String) -> Self {
|
||||
StatsAggregation { field: field_name }
|
||||
}
|
||||
/// Return the field name.
|
||||
pub fn field_name(&self) -> &str {
|
||||
&self.field
|
||||
}
|
||||
}
|
||||
|
||||
/// Stats contains a collection of statistics.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct Stats {
|
||||
/// The number of documents.
|
||||
pub count: usize,
|
||||
/// The sum of the fast field values.
|
||||
pub sum: f64,
|
||||
/// The standard deviation of the fast field values. None for count == 0.
|
||||
pub standard_deviation: Option<f64>,
|
||||
/// The min value of the fast field values.
|
||||
pub min: Option<f64>,
|
||||
/// The max value of the fast field values.
|
||||
pub max: Option<f64>,
|
||||
/// The average of the values. None for count == 0.
|
||||
pub avg: Option<f64>,
|
||||
}
|
||||
|
||||
/// IntermediateStats contains the mergeable version for stats.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateStats {
|
||||
count: usize,
|
||||
sum: f64,
|
||||
squared_sum: f64,
|
||||
min: f64,
|
||||
max: f64,
|
||||
}
|
||||
|
||||
impl IntermediateStats {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
count: 0,
|
||||
sum: 0.0,
|
||||
squared_sum: 0.0,
|
||||
min: f64::MAX,
|
||||
max: f64::MIN,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn avg(&self) -> Option<f64> {
|
||||
if self.count == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(self.sum / (self.count as f64))
|
||||
}
|
||||
}
|
||||
|
||||
fn square_mean(&self) -> f64 {
|
||||
self.squared_sum / (self.count as f64)
|
||||
}
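// Population standard deviation, computed from the running sums as sqrt(E[x^2] - E[x]^2).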
|
||||
|
||||
pub(crate) fn standard_deviation(&self) -> Option<f64> {
|
||||
self.avg()
|
||||
.map(|average| (self.square_mean() - average * average).sqrt())
|
||||
}
|
||||
|
||||
/// Merge data from other stats into this instance.
|
||||
pub fn merge_fruits(&mut self, other: &IntermediateStats) {
|
||||
self.count += other.count;
|
||||
self.sum += other.sum;
|
||||
self.squared_sum += other.squared_sum;
|
||||
self.min = self.min.min(other.min);
|
||||
self.max = self.max.max(other.max);
|
||||
}
|
||||
|
||||
/// Compute the final result.
|
||||
pub fn finalize(&self) -> Stats {
|
||||
let min = if self.count == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(self.min)
|
||||
};
|
||||
let max = if self.count == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(self.max)
|
||||
};
|
||||
Stats {
|
||||
count: self.count,
|
||||
sum: self.sum,
|
||||
standard_deviation: self.standard_deviation(),
|
||||
min,
|
||||
max,
|
||||
avg: self.avg(),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn collect(&mut self, value: f64) {
|
||||
self.count += 1;
|
||||
self.sum += value;
|
||||
self.squared_sum += value * value;
|
||||
self.min = self.min.min(value);
|
||||
self.max = self.max.max(value);
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub(crate) struct SegmentStatsCollector {
|
||||
pub(crate) stats: IntermediateStats,
|
||||
field_type: Type,
|
||||
}
|
||||
|
||||
impl SegmentStatsCollector {
|
||||
pub fn from_req(field_type: Type) -> Self {
|
||||
Self {
|
||||
field_type,
|
||||
stats: IntermediateStats::new(),
|
||||
}
|
||||
}
|
||||
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader<u64>) {
|
||||
let mut iter = doc.chunks_exact(4);
|
||||
for docs in iter.by_ref() {
|
||||
let val1 = field.get(docs[0]);
|
||||
let val2 = field.get(docs[1]);
|
||||
let val3 = field.get(docs[2]);
|
||||
let val4 = field.get(docs[3]);
|
||||
let val1 = f64_from_fastfield_u64(val1, &self.field_type);
|
||||
let val2 = f64_from_fastfield_u64(val2, &self.field_type);
|
||||
let val3 = f64_from_fastfield_u64(val3, &self.field_type);
|
||||
let val4 = f64_from_fastfield_u64(val4, &self.field_type);
|
||||
self.stats.collect(val1);
|
||||
self.stats.collect(val2);
|
||||
self.stats.collect(val3);
|
||||
self.stats.collect(val4);
|
||||
}
|
||||
for doc in iter.remainder() {
|
||||
let val = field.get(*doc);
|
||||
let val = f64_from_fastfield_u64(val, &self.field_type);
|
||||
self.stats.collect(val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use std::iter;
|
||||
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::aggregation::agg_req::{
|
||||
Aggregation, Aggregations, BucketAggregation, BucketAggregationType, MetricAggregation,
|
||||
RangeAggregation,
|
||||
};
|
||||
use crate::aggregation::agg_result::AggregationResults;
|
||||
use crate::aggregation::metric::StatsAggregation;
|
||||
use crate::aggregation::tests::get_test_index_2_segments;
|
||||
use crate::aggregation::AggregationCollector;
|
||||
use crate::query::TermQuery;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::Term;
|
||||
|
||||
#[test]
|
||||
fn test_aggregation_stats() -> crate::Result<()> {
|
||||
let index = get_test_index_2_segments(false)?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let text_field = reader.searcher().schema().get_field("text").unwrap();
|
||||
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, "cool"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
let agg_req_1: Aggregations = vec![
|
||||
(
|
||||
"stats_i64".to_string(),
|
||||
Aggregation::Metric(MetricAggregation::Stats(StatsAggregation::from_field_name(
|
||||
"score_i64".to_string(),
|
||||
))),
|
||||
),
|
||||
(
|
||||
"stats_f64".to_string(),
|
||||
Aggregation::Metric(MetricAggregation::Stats(StatsAggregation::from_field_name(
|
||||
"score_f64".to_string(),
|
||||
))),
|
||||
),
|
||||
(
|
||||
"stats".to_string(),
|
||||
Aggregation::Metric(MetricAggregation::Stats(StatsAggregation::from_field_name(
|
||||
"score".to_string(),
|
||||
))),
|
||||
),
|
||||
(
|
||||
"range".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Range(RangeAggregation {
|
||||
field: "score".to_string(),
|
||||
ranges: vec![
|
||||
(3f64..7f64).into(),
|
||||
(7f64..19f64).into(),
|
||||
(19f64..20f64).into(),
|
||||
],
|
||||
}),
|
||||
sub_aggregation: iter::once((
|
||||
"stats".to_string(),
|
||||
Aggregation::Metric(MetricAggregation::Stats(
|
||||
StatsAggregation::from_field_name("score".to_string()),
|
||||
)),
|
||||
))
|
||||
.collect(),
|
||||
}),
|
||||
),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
|
||||
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
assert_eq!(
|
||||
res["stats"],
|
||||
json!({
|
||||
"avg": 12.142857142857142,
|
||||
"count": 7,
|
||||
"max": 44.0,
|
||||
"min": 1.0,
|
||||
"standard_deviation": 13.65313748796613,
|
||||
"sum": 85.0
|
||||
})
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
res["stats_i64"],
|
||||
json!({
|
||||
"avg": 12.142857142857142,
|
||||
"count": 7,
|
||||
"max": 44.0,
|
||||
"min": 1.0,
|
||||
"standard_deviation": 13.65313748796613,
|
||||
"sum": 85.0
|
||||
})
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
res["stats_f64"],
|
||||
json!({
|
||||
"avg": 12.214285714285714,
|
||||
"count": 7,
|
||||
"max": 44.5,
|
||||
"min": 1.0,
|
||||
"standard_deviation": 13.819905785437443,
|
||||
"sum": 85.5
|
||||
})
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
res["range"]["buckets"][2]["stats"],
|
||||
json!({
|
||||
"avg": 10.666666666666666,
|
||||
"count": 3,
|
||||
"max": 14.0,
|
||||
"min": 7.0,
|
||||
"standard_deviation": 2.867441755680877,
|
||||
"sum": 32.0
|
||||
})
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
res["range"]["buckets"][3]["stats"],
|
||||
json!({
|
||||
"avg": serde_json::Value::Null,
|
||||
"count": 0,
|
||||
"max": serde_json::Value::Null,
|
||||
"min": serde_json::Value::Null,
|
||||
"standard_deviation": serde_json::Value::Null,
|
||||
"sum": 0.0,
|
||||
})
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
1224
src/aggregation/mod.rs
Normal file
File diff suppressed because it is too large
204
src/aggregation/segment_agg_result.rs
Normal file
@@ -0,0 +1,204 @@
|
||||
//! Contains aggregation trees which are used during collection in a segment.
|
||||
//! This tree contains data structures optimized for fast collection.
|
||||
//! The tree can be converted to an intermediate tree, which contains data structures optimized for
|
||||
//! merging.
|
||||
|
||||
use std::fmt::Debug;
|
||||
|
||||
use super::agg_req::MetricAggregation;
|
||||
use super::agg_req_with_accessor::{
|
||||
AggregationsWithAccessor, BucketAggregationWithAccessor, MetricAggregationWithAccessor,
|
||||
};
|
||||
use super::bucket::SegmentRangeCollector;
|
||||
use super::metric::{
|
||||
AverageAggregation, SegmentAverageCollector, SegmentStatsCollector, StatsAggregation,
|
||||
};
|
||||
use super::{Key, VecWithNames};
|
||||
use crate::aggregation::agg_req::BucketAggregationType;
|
||||
use crate::DocId;
|
||||
|
||||
pub(crate) const DOC_BLOCK_SIZE: usize = 256;
|
||||
pub(crate) type DocBlock = [DocId; DOC_BLOCK_SIZE];
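// Sketch of the staging flow: `collect` pushes doc ids into `staged_docs` until
// DOC_BLOCK_SIZE is reached, then `flush_staged_docs` hands the whole block to the metric and
// bucket collectors, which process it via their `collect_block` methods.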
|
||||
|
||||
#[derive(Clone, PartialEq)]
|
||||
pub(crate) struct SegmentAggregationResultsCollector {
|
||||
pub(crate) metrics: VecWithNames<SegmentMetricResultCollector>,
|
||||
pub(crate) buckets: VecWithNames<SegmentBucketResultCollector>,
|
||||
staged_docs: DocBlock,
|
||||
num_staged_docs: usize,
|
||||
}
|
||||
|
||||
impl Debug for SegmentAggregationResultsCollector {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("SegmentAggregationResultsCollector")
|
||||
.field("metrics", &self.metrics)
|
||||
.field("buckets", &self.buckets)
|
||||
.field("staged_docs", &&self.staged_docs[..self.num_staged_docs])
|
||||
.field("num_staged_docs", &self.num_staged_docs)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentAggregationResultsCollector {
|
||||
pub(crate) fn from_req_and_validate(req: &AggregationsWithAccessor) -> crate::Result<Self> {
|
||||
let buckets = req
|
||||
.buckets
|
||||
.entries()
|
||||
.map(|(key, req)| {
|
||||
Ok((
|
||||
key.to_string(),
|
||||
SegmentBucketResultCollector::from_req_and_validate(req)?,
|
||||
))
|
||||
})
|
||||
.collect::<crate::Result<_>>()?;
|
||||
let metrics = req
|
||||
.metrics
|
||||
.entries()
|
||||
.map(|(key, req)| {
|
||||
Ok((
|
||||
key.to_string(),
|
||||
SegmentMetricResultCollector::from_req_and_validate(req)?,
|
||||
))
|
||||
})
|
||||
.collect::<crate::Result<_>>()?;
|
||||
Ok(SegmentAggregationResultsCollector {
|
||||
metrics: VecWithNames::from_entries(metrics),
|
||||
buckets: VecWithNames::from_entries(buckets),
|
||||
staged_docs: [0; DOC_BLOCK_SIZE],
|
||||
num_staged_docs: 0,
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn collect(
|
||||
&mut self,
|
||||
doc: crate::DocId,
|
||||
agg_with_accessor: &AggregationsWithAccessor,
|
||||
) {
|
||||
self.staged_docs[self.num_staged_docs] = doc;
|
||||
self.num_staged_docs += 1;
|
||||
if self.num_staged_docs == self.staged_docs.len() {
|
||||
self.flush_staged_docs(agg_with_accessor, false);
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
pub(crate) fn flush_staged_docs(
|
||||
&mut self,
|
||||
agg_with_accessor: &AggregationsWithAccessor,
|
||||
force_flush: bool,
|
||||
) {
|
||||
for (agg_with_accessor, collector) in agg_with_accessor
|
||||
.metrics
|
||||
.values()
|
||||
.zip(self.metrics.values_mut())
|
||||
{
|
||||
collector.collect_block(&self.staged_docs[..self.num_staged_docs], agg_with_accessor);
|
||||
}
|
||||
for (agg_with_accessor, collector) in agg_with_accessor
|
||||
.buckets
|
||||
.values()
|
||||
.zip(self.buckets.values_mut())
|
||||
{
|
||||
collector.collect_block(
|
||||
&self.staged_docs[..self.num_staged_docs],
|
||||
agg_with_accessor,
|
||||
force_flush,
|
||||
);
|
||||
}
|
||||
|
||||
self.num_staged_docs = 0;
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub(crate) enum SegmentMetricResultCollector {
|
||||
Average(SegmentAverageCollector),
|
||||
Stats(SegmentStatsCollector),
|
||||
}
|
||||
|
||||
impl SegmentMetricResultCollector {
|
||||
pub fn from_req_and_validate(req: &MetricAggregationWithAccessor) -> crate::Result<Self> {
|
||||
match &req.metric {
|
||||
MetricAggregation::Average(AverageAggregation { field: _ }) => {
|
||||
Ok(SegmentMetricResultCollector::Average(
|
||||
SegmentAverageCollector::from_req(req.field_type),
|
||||
))
|
||||
}
|
||||
MetricAggregation::Stats(StatsAggregation { field: _ }) => {
|
||||
Ok(SegmentMetricResultCollector::Stats(
|
||||
SegmentStatsCollector::from_req(req.field_type),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
pub(crate) fn collect_block(&mut self, doc: &[DocId], metric: &MetricAggregationWithAccessor) {
|
||||
match self {
|
||||
SegmentMetricResultCollector::Average(avg_collector) => {
|
||||
avg_collector.collect_block(doc, &metric.accessor);
|
||||
}
|
||||
SegmentMetricResultCollector::Stats(stats_collector) => {
|
||||
stats_collector.collect_block(doc, &metric.accessor);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// SegmentBucketResultCollector holds specialized buckets for collection inside
|
||||
/// segments.
|
||||
/// The typical structure of Map<Key, Bucket> is not suitable during collection for performance
|
||||
/// reasons.
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub(crate) enum SegmentBucketResultCollector {
|
||||
Range(SegmentRangeCollector),
|
||||
}
|
||||
|
||||
impl SegmentBucketResultCollector {
|
||||
pub fn from_req_and_validate(req: &BucketAggregationWithAccessor) -> crate::Result<Self> {
|
||||
match &req.bucket_agg {
|
||||
BucketAggregationType::Range(range_req) => {
|
||||
Ok(Self::Range(SegmentRangeCollector::from_req_and_validate(
|
||||
range_req,
|
||||
&req.sub_aggregation,
|
||||
req.field_type,
|
||||
)?))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn collect_block(
|
||||
&mut self,
|
||||
doc: &[DocId],
|
||||
bucket_with_accessor: &BucketAggregationWithAccessor,
|
||||
force_flush: bool,
|
||||
) {
|
||||
match self {
|
||||
SegmentBucketResultCollector::Range(range) => {
|
||||
range.collect_block(doc, bucket_with_accessor, force_flush);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq)]
|
||||
pub(crate) struct SegmentRangeBucketEntry {
|
||||
pub key: Key,
|
||||
pub doc_count: u64,
|
||||
pub sub_aggregation: Option<SegmentAggregationResultsCollector>,
|
||||
/// The from range of the bucket. Equals f64::MIN when None.
|
||||
pub from: Option<f64>,
|
||||
/// The to range of the bucket. Equals f64::MAX when None.
|
||||
pub to: Option<f64>,
|
||||
}
|
||||
|
||||
impl Debug for SegmentRangeBucketEntry {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("SegmentRangeBucketEntry")
|
||||
.field("key", &self.key)
|
||||
.field("doc_count", &self.doc_count)
|
||||
.field("from", &self.from)
|
||||
.field("to", &self.to)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
@@ -19,7 +19,7 @@ use crate::{DocId, Score};
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// f64 field. are not supported.
|
||||
/// f64 fields are not supported.
|
||||
#[derive(Clone)]
|
||||
pub struct HistogramCollector {
|
||||
min_value: u64,
|
||||
|
||||
@@ -173,8 +173,7 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Return true iff at least K documents have gone through
|
||||
/// the collector.
|
||||
/// Return true if more documents have been collected than the limit.
|
||||
#[inline]
|
||||
pub(crate) fn at_capacity(&self) -> bool {
|
||||
self.heap.len() >= self.limit
|
||||
|
||||
@@ -64,7 +64,7 @@ fn load_metas(
|
||||
/// let body_field = schema_builder.add_text_field("body", TEXT);
|
||||
/// let number_field = schema_builder.add_u64_field(
|
||||
/// "number",
|
||||
/// IntOptions::default().set_fast(Cardinality::SingleValue),
|
||||
/// NumericOptions::default().set_fast(Cardinality::SingleValue),
|
||||
/// );
|
||||
///
|
||||
/// let schema = schema_builder.build();
|
||||
|
||||
@@ -88,7 +88,8 @@ impl InvertedIndexReader {
|
||||
let postings_slice = self
|
||||
.postings_file_slice
|
||||
.slice(term_info.postings_range.clone());
|
||||
block_postings.reset(term_info.doc_freq, postings_slice.read_bytes()?);
|
||||
let postings_bytes = postings_slice.read_bytes()?;
|
||||
block_postings.reset(term_info.doc_freq, postings_bytes)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -197,3 +198,36 @@ impl InvertedIndexReader {
|
||||
.unwrap_or(0u32))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
impl InvertedIndexReader {
|
||||
pub(crate) async fn get_term_info_async(
|
||||
&self,
|
||||
term: &Term,
|
||||
) -> crate::AsyncIoResult<Option<TermInfo>> {
|
||||
self.termdict.get_async(term.value_bytes()).await
|
||||
}
|
||||
|
||||
/// Returns a block postings given a `Term`.
|
||||
/// This method is for an advanced usage only.
|
||||
///
|
||||
/// Most user should prefer using `read_postings` instead.
|
||||
pub async fn warm_postings(
|
||||
&self,
|
||||
term: &Term,
|
||||
with_positions: bool,
|
||||
) -> crate::AsyncIoResult<()> {
|
||||
let term_info_opt = self.get_term_info_async(term).await?;
|
||||
if let Some(term_info) = term_info_opt {
|
||||
self.postings_file_slice
|
||||
.read_bytes_slice_async(term_info.postings_range.clone())
|
||||
.await?;
|
||||
if with_positions {
|
||||
self.positions_file_slice
|
||||
.read_bytes_slice_async(term_info.positions_range.clone())
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -110,6 +110,13 @@ impl Searcher {
|
||||
store_reader.get(doc_address.doc_id)
|
||||
}
|
||||
|
||||
/// Fetches a document in an asynchronous manner.
|
||||
#[cfg(feature = "quickwit")]
|
||||
pub async fn doc_async(&self, doc_address: DocAddress) -> crate::Result<Document> {
|
||||
let store_reader = &self.store_readers[doc_address.segment_ord as usize];
|
||||
store_reader.get_async(doc_address.doc_id).await
|
||||
}
|
||||
|
||||
/// Access the schema associated to the index of this searcher.
|
||||
pub fn schema(&self) -> &Schema {
|
||||
&self.schema
|
||||
|
||||
@@ -70,7 +70,7 @@ impl SegmentReader {
|
||||
self.max_doc - self.num_docs
|
||||
}
|
||||
|
||||
/// Returns true iff some of the documents of the segment have been deleted.
|
||||
/// Returns true if some of the documents of the segment have been deleted.
|
||||
pub fn has_deletes(&self) -> bool {
|
||||
self.num_deleted_docs() > 0
|
||||
}
|
||||
@@ -121,9 +121,8 @@ impl SegmentReader {
|
||||
self.fieldnorm_readers.get_field(field)?.ok_or_else(|| {
|
||||
let field_name = self.schema.get_field_name(field);
|
||||
let err_msg = format!(
|
||||
"Field norm not found for field {:?}. Was the field set to record norm during \
|
||||
indexing?",
|
||||
field_name
|
||||
"Field norm not found for field {field_name:?}. Was the field set to record norm \
|
||||
during indexing?"
|
||||
);
|
||||
crate::TantivyError::SchemaError(err_msg)
|
||||
})
|
||||
@@ -302,7 +301,7 @@ impl SegmentReader {
|
||||
self.alive_bitset_opt.as_ref()
|
||||
}
|
||||
|
||||
/// Returns true iff the `doc` is marked
|
||||
/// Returns true if the `doc` is marked
|
||||
/// as deleted.
|
||||
pub fn is_deleted(&self, doc: DocId) -> bool {
|
||||
self.alive_bitset()
|
||||
|
||||
@@ -96,9 +96,9 @@ fn retry_policy(is_blocking: bool) -> RetryPolicy {
|
||||
///
|
||||
/// There are currently two implementations of `Directory`
|
||||
///
|
||||
/// - The [`MMapDirectory`](struct.MmapDirectory.html), this
|
||||
/// - The [`MMapDirectory`][crate::directory::MmapDirectory], this
|
||||
/// should be your default choice.
|
||||
/// - The [`RamDirectory`](struct.RamDirectory.html), which
|
||||
/// - The [`RamDirectory`][crate::directory::RamDirectory], which
|
||||
/// should be used mostly for tests.
|
||||
pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
||||
/// Opens a file and returns a boxed `FileHandle`.
|
||||
@@ -128,7 +128,7 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
||||
/// `DeleteError::DoesNotExist`.
|
||||
fn delete(&self, path: &Path) -> Result<(), DeleteError>;
|
||||
|
||||
/// Returns true iff the file exists
|
||||
/// Returns true if and only if the file exists
|
||||
fn exists(&self, path: &Path) -> Result<bool, OpenReadError>;
|
||||
|
||||
/// Opens a writer for the *virtual file* associated with
|
||||
|
||||
@@ -2,6 +2,7 @@ use std::ops::{Deref, Range};
|
||||
use std::sync::{Arc, Weak};
|
||||
use std::{fmt, io};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use common::HasLen;
|
||||
use stable_deref_trait::StableDeref;
|
||||
|
||||
@@ -18,18 +19,35 @@ pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
|
||||
/// The underlying behavior is therefore specific to the `Directory` that created it.
|
||||
/// Despite its name, a `FileSlice` may or may not directly map to an actual file
|
||||
/// on the filesystem.
|
||||
|
||||
#[async_trait]
|
||||
pub trait FileHandle: 'static + Send + Sync + HasLen + fmt::Debug {
|
||||
/// Reads a slice of bytes.
|
||||
///
|
||||
/// This method may panic if the range requested is invalid.
|
||||
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes>;
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
#[doc(hidden)]
|
||||
async fn read_bytes_async(
|
||||
&self,
|
||||
_byte_range: Range<usize>,
|
||||
) -> crate::AsyncIoResult<OwnedBytes> {
|
||||
Err(crate::error::AsyncIoError::AsyncUnsupported)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl FileHandle for &'static [u8] {
|
||||
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
|
||||
let bytes = &self[range];
|
||||
Ok(OwnedBytes::new(bytes))
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
async fn read_bytes_async(&self, byte_range: Range<usize>) -> crate::AsyncIoResult<OwnedBytes> {
|
||||
Ok(self.read_bytes(byte_range)?)
|
||||
}
|
||||
}
|
||||
|
||||
impl<B> From<B> for FileSlice
|
||||
@@ -102,6 +120,12 @@ impl FileSlice {
|
||||
self.data.read_bytes(self.range.clone())
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
#[doc(hidden)]
|
||||
pub async fn read_bytes_async(&self) -> crate::AsyncIoResult<OwnedBytes> {
|
||||
self.data.read_bytes_async(self.range.clone()).await
|
||||
}
|
||||
|
||||
/// Reads a specific slice of data.
|
||||
///
|
||||
/// This is equivalent to running `file_slice.slice(from, to).read_bytes()`.
|
||||
@@ -116,6 +140,23 @@ impl FileSlice {
|
||||
.read_bytes(self.range.start + range.start..self.range.start + range.end)
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
#[doc(hidden)]
|
||||
pub async fn read_bytes_slice_async(
|
||||
&self,
|
||||
byte_range: Range<usize>,
|
||||
) -> crate::AsyncIoResult<OwnedBytes> {
|
||||
assert!(
|
||||
self.range.start + byte_range.end <= self.range.end,
|
||||
"`to` exceeds the fileslice length"
|
||||
);
|
||||
self.data
|
||||
.read_bytes_async(
|
||||
self.range.start + byte_range.start..self.range.start + byte_range.end,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Splits the FileSlice at the given offset and return two file slices.
|
||||
/// `file_slice[..split_offset]` and `file_slice[split_offset..]`.
|
||||
///
|
||||
@@ -160,10 +201,16 @@ impl FileSlice {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl FileHandle for FileSlice {
|
||||
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
|
||||
self.read_bytes_slice(range)
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
async fn read_bytes_async(&self, byte_range: Range<usize>) -> crate::AsyncIoResult<OwnedBytes> {
|
||||
self.read_bytes_slice_async(byte_range).await
|
||||
}
|
||||
}
|
||||
|
||||
impl HasLen for FileSlice {
|
||||
@@ -172,6 +219,19 @@ impl HasLen for FileSlice {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl FileHandle for OwnedBytes {
|
||||
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
|
||||
Ok(self.slice(range))
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
async fn read_bytes_async(&self, range: Range<usize>) -> crate::AsyncIoResult<OwnedBytes> {
|
||||
let bytes = self.read_bytes(range)?;
|
||||
Ok(bytes)
|
||||
}
|
||||
}
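Since `OwnedBytes` itself implements `FileHandle` (impl above), a small sketch of reading a sub-range back through the trait; with the `quickwit` feature the async variant simply delegates to the synchronous read:

```rust
use std::io;

use tantivy::directory::{FileHandle, OwnedBytes};

// Sketch only: read bytes 1..4 out of a five-byte buffer.
fn read_middle() -> io::Result<()> {
    let handle = OwnedBytes::new(vec![1u8, 2, 3, 4, 5]);
    let chunk = handle.read_bytes(1..4)?;
    assert_eq!(chunk.as_slice(), &[2, 3, 4]);
    Ok(())
}
```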
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::io;
|
||||
|
||||
@@ -16,7 +16,7 @@ use crate::directory::{
|
||||
use crate::error::DataCorruption;
|
||||
use crate::Directory;
|
||||
|
||||
/// Returns true iff the file is "managed".
|
||||
/// Returns true if the file is "managed".
|
||||
/// Non-managed files are not subject to garbage collection.
///
/// Filenames that start with a "." -typically locks-
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use std::collections::HashMap;
|
||||
use std::convert::From;
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{self, BufWriter, Read, Seek, SeekFrom, Write};
|
||||
use std::io::{self, BufWriter, Read, Seek, Write};
|
||||
use std::ops::Deref;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Arc, RwLock};
|
||||
@@ -265,7 +264,7 @@ impl Write for SafeFileWriter {
|
||||
}
|
||||
|
||||
impl Seek for SafeFileWriter {
|
||||
fn seek(&mut self, pos: SeekFrom) -> io::Result<u64> {
|
||||
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
|
||||
self.0.seek(pos)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,7 +9,6 @@ mod file_slice;
|
||||
mod file_watcher;
|
||||
mod footer;
|
||||
mod managed_directory;
|
||||
mod owned_bytes;
|
||||
mod ram_directory;
|
||||
mod watch_event_router;
|
||||
|
||||
@@ -22,13 +21,13 @@ use std::io::BufWriter;
|
||||
use std::path::PathBuf;
|
||||
|
||||
pub use common::{AntiCallToken, TerminatingWrite};
|
||||
pub use ownedbytes::OwnedBytes;
|
||||
|
||||
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
|
||||
pub use self::directory::{Directory, DirectoryClone, DirectoryLock};
|
||||
pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};
|
||||
pub(crate) use self::file_slice::{ArcBytes, WeakArcBytes};
|
||||
pub use self::file_slice::{FileHandle, FileSlice};
|
||||
pub use self::owned_bytes::OwnedBytes;
|
||||
pub use self::ram_directory::RamDirectory;
|
||||
pub use self::watch_event_router::{WatchCallback, WatchCallbackList, WatchHandle};
|
||||
|
||||
|
||||
@@ -1,12 +0,0 @@
|
||||
use std::io;
|
||||
use std::ops::Range;
|
||||
|
||||
pub use ownedbytes::OwnedBytes;
|
||||
|
||||
use crate::directory::FileHandle;
|
||||
|
||||
impl FileHandle for OwnedBytes {
|
||||
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
|
||||
Ok(self.slice(range))
|
||||
}
|
||||
}
|
||||
49
src/error.rs
@@ -1,9 +1,11 @@
|
||||
//! Definition of Tantivy's error and result.
|
||||
//! Definition of Tantivy's errors and results.
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::sync::PoisonError;
|
||||
use std::{fmt, io};
|
||||
|
||||
use thiserror::Error;
|
||||
|
||||
use crate::directory::error::{
|
||||
Incompatibility, LockError, OpenDirectoryError, OpenReadError, OpenWriteError,
|
||||
};
|
||||
@@ -12,7 +14,7 @@ use crate::{query, schema};
|
||||
|
||||
/// Represents a `DataCorruption` error.
|
||||
///
|
||||
/// When facing data corruption, tantivy actually panic or return this error.
|
||||
/// When facing data corruption, tantivy actually panics or returns this error.
|
||||
pub struct DataCorruption {
|
||||
filepath: Option<PathBuf>,
|
||||
comment: String,
|
||||
@@ -38,9 +40,9 @@ impl DataCorruption {
|
||||
|
||||
impl fmt::Debug for DataCorruption {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
write!(f, "Data corruption: ")?;
|
||||
write!(f, "Data corruption")?;
|
||||
if let Some(ref filepath) = &self.filepath {
|
||||
write!(f, "(in file `{:?}`)", filepath)?;
|
||||
write!(f, " (in file `{:?}`)", filepath)?;
|
||||
}
|
||||
write!(f, ": {}.", self.comment)?;
|
||||
Ok(())
|
||||
@@ -59,10 +61,10 @@ pub enum TantivyError {
|
||||
/// Failed to open a file for write.
|
||||
#[error("Failed to open file for write: '{0:?}'")]
|
||||
OpenWriteError(#[from] OpenWriteError),
|
||||
/// Index already exists in this directory
|
||||
/// Index already exists in this directory.
|
||||
#[error("Index already exists")]
|
||||
IndexAlreadyExists,
|
||||
/// Failed to acquire file lock
|
||||
/// Failed to acquire file lock.
|
||||
#[error("Failed to acquire Lockfile: {0:?}. {1:?}")]
|
||||
LockFailure(LockError, Option<String>),
|
||||
/// IO Error.
|
||||
@@ -74,26 +76,51 @@ pub enum TantivyError {
|
||||
/// A thread holding the lock panicked and poisoned the lock.
#[error("A thread holding the lock panicked and poisoned the lock")]
|
||||
Poisoned,
|
||||
/// The provided field name does not exist.
|
||||
#[error("The field does not exist: '{0}'")]
|
||||
FieldNotFound(String),
|
||||
/// Invalid argument was passed by the user.
|
||||
#[error("An invalid argument was passed: '{0}'")]
|
||||
InvalidArgument(String),
|
||||
/// An Error happened in one of the thread.
|
||||
/// An Error occurred in one of the threads.
|
||||
#[error("An error occurred in a thread: '{0}'")]
|
||||
ErrorInThread(String),
|
||||
/// An Error appeared related to opening or creating a index.
|
||||
/// An Error occurred related to opening or creating an index.
|
||||
#[error("Missing required index builder argument when open/create index: '{0}'")]
|
||||
IndexBuilderMissingArgument(&'static str),
|
||||
/// An Error appeared related to the schema.
|
||||
/// An Error occurred related to the schema.
|
||||
#[error("Schema error: '{0}'")]
|
||||
SchemaError(String),
|
||||
/// System error. (e.g.: We failed spawning a new thread)
|
||||
/// System error. (e.g.: We failed spawning a new thread).
|
||||
#[error("System error.'{0}'")]
|
||||
SystemError(String),
|
||||
/// Index incompatible with current version of tantivy
|
||||
/// Index incompatible with current version of Tantivy.
|
||||
#[error("{0:?}")]
|
||||
IncompatibleIndex(Incompatibility),
|
||||
}
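A sketch of matching on the variants listed above from calling code; the messages below are illustrative, not part of the API:

```rust
use tantivy::TantivyError;

// Sketch only: the variants carry plain data and can be branched on directly.
fn describe(err: &TantivyError) -> String {
    match err {
        TantivyError::FieldNotFound(field) => format!("unknown field '{field}'"),
        TantivyError::InvalidArgument(msg) => format!("invalid argument: {msg}"),
        TantivyError::SchemaError(msg) => format!("schema error: {msg}"),
        other => format!("{other}"), // falls back to the `thiserror` Display impl
    }
}
```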
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
#[derive(Error, Debug)]
|
||||
#[doc(hidden)]
|
||||
pub enum AsyncIoError {
|
||||
#[error("io::Error `{0}`")]
|
||||
Io(#[from] io::Error),
|
||||
#[error("Asynchronous API is unsupported by this directory")]
|
||||
AsyncUnsupported,
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
impl From<AsyncIoError> for TantivyError {
|
||||
fn from(async_io_err: AsyncIoError) -> Self {
|
||||
match async_io_err {
|
||||
AsyncIoError::Io(io_err) => TantivyError::from(io_err),
|
||||
AsyncIoError::AsyncUnsupported => {
|
||||
TantivyError::SystemError(format!("{:?}", async_io_err))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<DataCorruption> for TantivyError {
|
||||
fn from(data_corruption: DataCorruption) -> TantivyError {
|
||||
TantivyError::DataCorruption(data_corruption)
|
||||
|
||||
@@ -7,7 +7,7 @@ use ownedbytes::OwnedBytes;
|
||||
use crate::space_usage::ByteCount;
|
||||
use crate::DocId;
|
||||
|
||||
/// Write a alive `BitSet`
|
||||
/// Write an alive `BitSet`
|
||||
///
|
||||
/// where `alive_bitset` is the set of alive `DocId`.
|
||||
/// Warning: this function does not call terminate. The caller is in charge of
|
||||
@@ -55,19 +55,19 @@ impl AliveBitSet {
|
||||
AliveBitSet::from(readonly_bitset)
|
||||
}
|
||||
|
||||
/// Opens a delete bitset given its file.
|
||||
/// Opens an alive bitset given its file.
|
||||
pub fn open(bytes: OwnedBytes) -> AliveBitSet {
|
||||
let bitset = ReadOnlyBitSet::open(bytes);
|
||||
AliveBitSet::from(bitset)
|
||||
}
|
||||
|
||||
/// Returns true iff the document is still "alive". In other words, if it has not been deleted.
|
||||
/// Returns true if the document is still "alive". In other words, if it has not been deleted.
|
||||
#[inline]
|
||||
pub fn is_alive(&self, doc: DocId) -> bool {
|
||||
self.bitset.contains(doc)
|
||||
}
|
||||
|
||||
/// Returns true iff the document has been marked as deleted.
|
||||
/// Returns true if the document has been marked as deleted.
|
||||
#[inline]
|
||||
pub fn is_deleted(&self, doc: DocId) -> bool {
|
||||
!self.is_alive(doc)
|
||||
@@ -79,13 +79,13 @@ impl AliveBitSet {
|
||||
self.bitset.iter()
|
||||
}
|
||||
|
||||
/// Get underlying bitset
|
||||
/// Get underlying bitset.
|
||||
#[inline]
|
||||
pub fn bitset(&self) -> &ReadOnlyBitSet {
|
||||
&self.bitset
|
||||
}
|
||||
|
||||
/// The number of deleted docs
|
||||
/// The number of alive documents.
|
||||
pub fn num_alive_docs(&self) -> usize {
|
||||
self.num_alive_docs
|
||||
}
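A small sketch of the reader-side API touched by these doc fixes, using only the methods shown above; `bytes` is assumed to hold a bitset previously serialized by tantivy's alive-bitset writer:

```rust
use tantivy::fastfield::AliveBitSet;

// Sketch only: open a serialized alive bitset and query it.
fn inspect(bytes: tantivy::directory::OwnedBytes) {
    let alive = AliveBitSet::open(bytes);
    let doc = 0u32;
    assert_eq!(alive.is_deleted(doc), !alive.is_alive(doc));
    println!("{} documents are still alive", alive.num_alive_docs());
}
```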
|
||||
|
||||
@@ -86,7 +86,7 @@ mod tests {
|
||||
let field = searcher.schema().get_field("string_bytes").unwrap();
|
||||
let term = Term::from_field_bytes(field, b"lucene".as_ref());
|
||||
let term_query = TermQuery::new(term, IndexRecordOption::Basic);
|
||||
let term_weight = term_query.specialized_weight(&searcher, true)?;
|
||||
let term_weight = term_query.specialized_weight(&*searcher, true)?;
|
||||
let term_scorer = term_weight.specialized_scorer(searcher.segment_reader(0), 1.0)?;
|
||||
assert_eq!(term_scorer.doc(), 0u32);
|
||||
Ok(())
|
||||
@@ -99,7 +99,7 @@ mod tests {
|
||||
let field = searcher.schema().get_field("string_bytes").unwrap();
|
||||
let term = Term::from_field_bytes(field, b"lucene".as_ref());
|
||||
let term_query = TermQuery::new(term, IndexRecordOption::Basic);
|
||||
let term_weight_err = term_query.specialized_weight(&searcher, false);
|
||||
let term_weight_err = term_query.specialized_weight(&*searcher, false);
|
||||
assert!(matches!(
|
||||
term_weight_err,
|
||||
Err(crate::TantivyError::SchemaError(_))
|
||||
|
||||
@@ -7,7 +7,7 @@ use crate::DocId;
|
||||
|
||||
/// Writer for byte array (as in, any number of bytes per document) fast fields
|
||||
///
|
||||
/// This `BytesFastFieldWriter` is only useful for advanced user.
|
||||
/// This `BytesFastFieldWriter` is only useful for advanced users.
|
||||
/// The normal way to get your associated bytes in your index
|
||||
/// is to
|
||||
/// - declare your field with fast set to `Cardinality::SingleValue`
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
//!
|
||||
//! It is the equivalent of `Lucene`'s `DocValues`.
|
||||
//!
|
||||
//! Fast fields is a column-oriented fashion storage of `tantivy`.
|
||||
//! A fast field is a column-oriented storage for `tantivy`.
|
||||
//!
|
||||
//! It is designed for the fast random access of some document
|
||||
//! fields given a document id.
|
||||
@@ -12,11 +12,10 @@
|
||||
//!
|
||||
//!
|
||||
//! Fields have to be declared as `FAST` in the schema.
|
||||
//! Currently only 64-bits integers (signed or unsigned) are
|
||||
//! supported.
|
||||
//! Currently supported fields are: u64, i64, f64 and bytes.
|
||||
//!
|
||||
//! They are stored in a bit-packed fashion so that their
|
||||
//! memory usage is directly linear with the amplitude of the
|
||||
//! u64, i64 and f64 fields are stored in a bit-packed fashion so that
|
||||
//! their memory usage is directly linear with the amplitude of the
|
||||
//! values stored.
|
||||
//!
|
||||
//! Read access performance is comparable to that of an array lookup.
|
||||
@@ -28,6 +27,7 @@ pub use self::facet_reader::FacetReader;
|
||||
pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
|
||||
pub use self::reader::{DynamicFastFieldReader, FastFieldReader};
|
||||
pub use self::readers::FastFieldReaders;
|
||||
pub(crate) use self::readers::{type_and_cardinality, FastType};
|
||||
pub use self::serializer::{CompositeFastFieldSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||
use crate::chrono::{NaiveDateTime, Utc};
|
||||
@@ -212,7 +212,7 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
|
||||
use crate::merge_policy::NoMergePolicy;
|
||||
use crate::schema::{Document, Field, IntOptions, Schema, FAST};
|
||||
use crate::schema::{Document, Field, NumericOptions, Schema, FAST};
|
||||
use crate::{Index, SegmentId, SegmentReader};
|
||||
|
||||
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
|
||||
@@ -520,7 +520,7 @@ mod tests {
|
||||
let date_field = schema_builder.add_date_field("date", FAST);
|
||||
let multi_date_field = schema_builder.add_date_field(
|
||||
"multi_date",
|
||||
IntOptions::default().set_fast(Cardinality::MultiValues),
|
||||
NumericOptions::default().set_fast(Cardinality::MultiValues),
|
||||
);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
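For reference, a sketch of declaring a fast field with the renamed `NumericOptions` type used throughout these updated tests (the field name is illustrative):

```rust
use tantivy::schema::{Cardinality, NumericOptions, Schema};

// Sketch only: `NumericOptions` is the new name for what was `IntOptions`.
fn build_schema() -> Schema {
    let mut schema_builder = Schema::builder();
    let opts = NumericOptions::default()
        .set_fast(Cardinality::MultiValues)
        .set_indexed();
    schema_builder.add_u64_field("scores", opts);
    schema_builder.build()
}
```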
|
||||
|
||||
@@ -16,7 +16,7 @@ mod tests {
|
||||
use crate::collector::TopDocs;
|
||||
use crate::indexer::NoMergePolicy;
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::{Cardinality, Facet, FacetOptions, IntOptions, Schema};
|
||||
use crate::schema::{Cardinality, Facet, FacetOptions, NumericOptions, Schema};
|
||||
use crate::{Document, Index, Term};
|
||||
|
||||
#[test]
|
||||
@@ -24,7 +24,7 @@ mod tests {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_u64_field(
|
||||
"multifield",
|
||||
IntOptions::default().set_fast(Cardinality::MultiValues),
|
||||
NumericOptions::default().set_fast(Cardinality::MultiValues),
|
||||
);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
@@ -59,14 +59,14 @@ mod tests {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let date_field = schema_builder.add_date_field(
|
||||
"multi_date_field",
|
||||
IntOptions::default()
|
||||
NumericOptions::default()
|
||||
.set_fast(Cardinality::MultiValues)
|
||||
.set_indexed()
|
||||
.set_fieldnorm()
|
||||
.set_stored(),
|
||||
);
|
||||
let time_i =
|
||||
schema_builder.add_i64_field("time_stamp_i", IntOptions::default().set_stored());
|
||||
schema_builder.add_i64_field("time_stamp_i", NumericOptions::default().set_stored());
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
@@ -94,8 +94,11 @@ mod tests {
|
||||
assert_eq!(reader.num_docs(), 5);
|
||||
|
||||
{
|
||||
let parser = QueryParser::for_index(&index, vec![date_field]);
|
||||
let query = parser.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()))?;
|
||||
let parser = QueryParser::for_index(&index, vec![]);
|
||||
let query = parser.parse_query(&format!(
|
||||
"multi_date_field:\"{}\"",
|
||||
first_time_stamp.to_rfc3339()
|
||||
))?;
|
||||
let results = searcher.search(&query, &TopDocs::with_limit(5))?;
|
||||
assert_eq!(results.len(), 1);
|
||||
for (_score, doc_address) in results {
|
||||
@@ -150,7 +153,7 @@ mod tests {
|
||||
{
|
||||
let parser = QueryParser::for_index(&index, vec![date_field]);
|
||||
let range_q = format!(
|
||||
"[{} TO {}}}",
|
||||
"multi_date_field:[{} TO {}}}",
|
||||
(first_time_stamp + Duration::seconds(1)).to_rfc3339(),
|
||||
(first_time_stamp + Duration::seconds(3)).to_rfc3339()
|
||||
);
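A sketch of the explicit-field query syntax the updated tests rely on, assuming the index built in the test above (the timestamps are placeholders):

```rust
use tantivy::query::QueryParser;
use tantivy::Index;

// Sketch only: no default fields are registered, so each clause names the field itself.
fn build_date_queries(index: &Index) -> tantivy::Result<()> {
    let parser = QueryParser::for_index(index, vec![]);
    let _exact = parser.parse_query(r#"multi_date_field:"2019-10-12T07:20:50Z""#)?;
    let _range =
        parser.parse_query("multi_date_field:[2019-10-12T00:00:00Z TO 2019-10-13T00:00:00Z}")?;
    Ok(())
}
```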
|
||||
@@ -196,7 +199,7 @@ mod tests {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_i64_field(
|
||||
"multifield",
|
||||
IntOptions::default().set_fast(Cardinality::MultiValues),
|
||||
NumericOptions::default().set_fast(Cardinality::MultiValues),
|
||||
);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
@@ -226,7 +229,7 @@ mod tests {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_u64_field(
|
||||
"multifield",
|
||||
IntOptions::default()
|
||||
NumericOptions::default()
|
||||
.set_fast(Cardinality::MultiValues)
|
||||
.set_indexed(),
|
||||
);
|
||||
|
||||
@@ -90,7 +90,7 @@ impl<Item: FastValue> MultiValueLength for MultiValuedFastFieldReader<Item> {
|
||||
mod tests {
|
||||
|
||||
use crate::core::Index;
|
||||
use crate::schema::{Cardinality, Facet, FacetOptions, IntOptions, Schema};
|
||||
use crate::schema::{Cardinality, Facet, FacetOptions, NumericOptions, Schema};
|
||||
|
||||
#[test]
|
||||
fn test_multifastfield_reader() -> crate::Result<()> {
|
||||
@@ -148,7 +148,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_multifastfield_reader_min_max() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field_options = IntOptions::default()
|
||||
let field_options = NumericOptions::default()
|
||||
.set_indexed()
|
||||
.set_fast(Cardinality::MultiValues);
|
||||
let item_field = schema_builder.add_i64_field("items", field_options);
|
||||
|
||||
@@ -14,7 +14,7 @@ use crate::DocId;
|
||||
/// Writer for multi-valued (as in, more than one value per document)
|
||||
/// int fast field.
|
||||
///
|
||||
/// This `Writer` is only useful for advanced user.
|
||||
/// This `Writer` is only useful for advanced users.
|
||||
/// The normal way to get your multivalued int in your index
|
||||
/// is to
|
||||
/// - declare your field with fast set to `Cardinality::MultiValues`
|
||||
@@ -23,10 +23,11 @@ use crate::DocId;
|
||||
///
|
||||
/// The `MultiValuedFastFieldWriter` can be acquired from the
|
||||
/// fastfield writer, by calling
|
||||
/// [`.get_multivalue_writer(...)`](./struct.FastFieldsWriter.html#method.get_multivalue_writer).
|
||||
/// [`.get_multivalue_writer_mut(...)`](./struct.FastFieldsWriter.html#method.
|
||||
/// get_multivalue_writer_mut).
|
||||
///
|
||||
/// Once acquired, writing is done by calling calls to
|
||||
/// `.add_document_vals(&[u64])` once per document.
|
||||
/// Once acquired, writing is done by calling
|
||||
/// [`.add_document_vals(&[u64])`](MultiValuedFastFieldWriter::add_document_vals) once per document.
|
||||
///
|
||||
/// The serializer makes it possible to remap all of the values
|
||||
/// that were pushed to the writer using a mapping.
|
||||
|
||||
@@ -112,6 +112,7 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
|
||||
}
|
||||
|
||||
impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
#[inline]
|
||||
fn get(&self, doc: DocId) -> Item {
|
||||
match self {
|
||||
Self::Bitpacked(reader) => reader.get(doc),
|
||||
@@ -119,6 +120,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
Self::MultiLinearInterpol(reader) => reader.get(doc),
|
||||
}
|
||||
}
|
||||
#[inline]
|
||||
fn get_range(&self, start: u64, output: &mut [Item]) {
|
||||
match self {
|
||||
Self::Bitpacked(reader) => reader.get_range(start, output),
|
||||
@@ -174,6 +176,7 @@ impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item,
|
||||
_phantom: PhantomData,
|
||||
})
|
||||
}
|
||||
#[inline]
|
||||
pub(crate) fn get_u64(&self, doc: u64) -> Item {
|
||||
Item::from_u64(self.reader.get_u64(doc, self.bytes.as_slice()))
|
||||
}
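A sketch of using the `FastFieldReader` trait methods dispatched here, under the assumption that a `DynamicFastFieldReader<u64>` has already been obtained from a segment holding at least four documents:

```rust
use tantivy::fastfield::{DynamicFastFieldReader, FastFieldReader};

// Sketch only: bulk reads via `get_range` versus point lookups via `get`.
fn first_values(reader: &DynamicFastFieldReader<u64>) -> Vec<u64> {
    let mut out = vec![0u64; 4];
    reader.get_range(0, &mut out); // bulk read starting at doc 0
    debug_assert_eq!(out[0], reader.get(0)); // point lookup agrees with the bulk read
    out
}
```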
|
||||
|
||||
@@ -17,14 +17,14 @@ pub struct FastFieldReaders {
|
||||
fast_fields_composite: CompositeFile,
|
||||
}
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
enum FastType {
|
||||
pub(crate) enum FastType {
|
||||
I64,
|
||||
U64,
|
||||
F64,
|
||||
Date,
|
||||
}
|
||||
|
||||
fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality)> {
|
||||
pub(crate) fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality)> {
|
||||
match field_type {
|
||||
FieldType::U64(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
@@ -55,7 +55,8 @@ impl FastFieldReaders {
|
||||
self.fast_fields_composite.space_usage()
|
||||
}
|
||||
|
||||
fn fast_field_data(&self, field: Field, idx: usize) -> crate::Result<FileSlice> {
|
||||
#[doc(hidden)]
|
||||
pub fn fast_field_data(&self, field: Field, idx: usize) -> crate::Result<FileSlice> {
|
||||
self.fast_fields_composite
|
||||
.open_read_with_idx(field, idx)
|
||||
.ok_or_else(|| {
|
||||
|
||||
@@ -197,7 +197,7 @@ impl CompositeFastFieldSerializer {
|
||||
|
||||
/// Closes the serializer
|
||||
///
|
||||
/// After this call the data must be persistently save on disk.
|
||||
/// After this call the data must be persistently saved on disk.
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
self.composite_write.close()
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@ use crate::postings::UnorderedTermId;
|
||||
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
|
||||
use crate::termdict::TermOrdinal;
|
||||
|
||||
/// The fastfieldswriter regroup all of the fast field writers.
|
||||
/// The `FastFieldsWriter` groups all of the fast field writers.
|
||||
pub struct FastFieldsWriter {
|
||||
single_value_writers: Vec<IntFastFieldWriter>,
|
||||
multi_values_writers: Vec<MultiValuedFastFieldWriter>,
|
||||
|
||||
@@ -35,8 +35,7 @@ fn test_functional_store() -> crate::Result<()> {
|
||||
let mut doc_set: Vec<u64> = Vec::new();
|
||||
|
||||
let mut doc_id = 0u64;
|
||||
for iteration in 0..get_num_iterations() {
|
||||
dbg!(iteration);
|
||||
for _iteration in 0..get_num_iterations() {
|
||||
let num_docs: usize = rng.gen_range(0..4);
|
||||
if !doc_set.is_empty() {
|
||||
let doc_to_remove_id = rng.gen_range(0..doc_set.len());
|
||||
|
||||
@@ -221,7 +221,7 @@ impl DeleteCursor {
|
||||
}
|
||||
|
||||
/// Advance to the next delete operation.
|
||||
/// Returns true iff there is such an operation.
|
||||
/// Returns true if and only if there is such an operation.
|
||||
pub fn advance(&mut self) -> bool {
|
||||
if self.load_block_if_required() {
|
||||
self.pos += 1;
|
||||
|
||||
@@ -168,12 +168,12 @@ mod tests_indexsorting {
|
||||
let my_string_field = schema_builder.add_text_field("string_field", STRING | STORED);
|
||||
let my_number = schema_builder.add_u64_field(
|
||||
"my_number",
|
||||
IntOptions::default().set_fast(Cardinality::SingleValue),
|
||||
NumericOptions::default().set_fast(Cardinality::SingleValue),
|
||||
);
|
||||
|
||||
let multi_numbers = schema_builder.add_u64_field(
|
||||
"multi_numbers",
|
||||
IntOptions::default().set_fast(Cardinality::MultiValues),
|
||||
NumericOptions::default().set_fast(Cardinality::MultiValues),
|
||||
);
|
||||
|
||||
let schema = schema_builder.build();
|
||||
|
||||
@@ -794,8 +794,8 @@ mod tests {
|
||||
use crate::indexer::NoMergePolicy;
|
||||
use crate::query::{QueryParser, TermQuery};
|
||||
use crate::schema::{
|
||||
self, Cardinality, Facet, FacetOptions, IndexRecordOption, IntOptions, TextFieldIndexing,
|
||||
TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
|
||||
self, Cardinality, Facet, FacetOptions, IndexRecordOption, NumericOptions,
|
||||
TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
|
||||
};
|
||||
use crate::{DocAddress, Index, IndexSettings, IndexSortByField, Order, ReloadPolicy, Term};
|
||||
|
||||
@@ -1404,7 +1404,7 @@ mod tests {
|
||||
|
||||
let multi_numbers = schema_builder.add_u64_field(
|
||||
"multi_numbers",
|
||||
IntOptions::default()
|
||||
NumericOptions::default()
|
||||
.set_fast(Cardinality::MultiValues)
|
||||
.set_stored(),
|
||||
);
|
||||
|
||||
415
src/indexer/json_term_writer.rs
Normal file
@@ -0,0 +1,415 @@
|
||||
use chrono::Utc;
|
||||
use fnv::FnvHashMap;
|
||||
use murmurhash32::murmurhash2;
|
||||
|
||||
use crate::fastfield::FastValue;
|
||||
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
|
||||
use crate::schema::term::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
|
||||
use crate::schema::Type;
|
||||
use crate::tokenizer::TextAnalyzer;
|
||||
use crate::{DocId, Term};
|
||||
|
||||
/// This object is a map storing the last position for a given path for the current document
/// being indexed.
///
/// It is key to solve the following problem:
/// If we index a JsonObject emitting several terms with the same path,
/// we do not want to create false positives in phrase queries.
///
/// For instance:
///
/// ```json
/// {"bands": [
///     {"band_name": "Elliot Smith"},
///     {"band_name": "The Who"},
/// ]}
/// ```
///
/// If we are careless and index each band name independently,
/// `Elliot` and `The` will end up indexed at position 0, and `Smith` and `Who` will be indexed at
/// position 1.
/// As a result, with lemmatization, "The Smiths" will match our object.
///
/// Worse, if the same term appears in the second object, a non-increasing value would be pushed
/// to the position recorder, probably provoking a panic.
///
/// This problem is solved for regular multivalued objects by offsetting the positions
/// of values with a position gap. Here we would like `The` and `Who` to get indexed at
/// positions 2 and 3 respectively.
///
/// With regular fields, we sort the fields beforehand, so that all terms with the same
/// path are indexed consecutively.
///
/// In a JSON object, we do not have this comfort, so we need to record these position offsets in
/// a map.
///
/// Note that using a single position for the entire object would not hurt correctness.
/// It would however hurt compression.
///
/// We can therefore afford working with a map that is not perfect. It is fine if several
/// paths map to the same indexing position, as long as the probability is relatively low.
|
||||
#[derive(Default)]
|
||||
struct IndexingPositionsPerPath {
|
||||
positions_per_path: FnvHashMap<u32, IndexingPosition>,
|
||||
}
|
||||
|
||||
impl IndexingPositionsPerPath {
|
||||
fn get_position(&mut self, term: &Term) -> &mut IndexingPosition {
|
||||
self.positions_per_path
|
||||
.entry(murmurhash2(term.as_slice()))
|
||||
.or_insert_with(Default::default)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn index_json_values<'a>(
|
||||
doc: DocId,
|
||||
json_values: impl Iterator<Item = crate::Result<&'a serde_json::Map<String, serde_json::Value>>>,
|
||||
text_analyzer: &TextAnalyzer,
|
||||
term_buffer: &mut Term,
|
||||
postings_writer: &mut dyn PostingsWriter,
|
||||
ctx: &mut IndexingContext,
|
||||
) -> crate::Result<()> {
|
||||
let mut json_term_writer = JsonTermWriter::wrap(term_buffer);
|
||||
let mut positions_per_path: IndexingPositionsPerPath = Default::default();
|
||||
for json_value_res in json_values {
|
||||
let json_value = json_value_res?;
|
||||
index_json_object(
|
||||
doc,
|
||||
json_value,
|
||||
text_analyzer,
|
||||
&mut json_term_writer,
|
||||
postings_writer,
|
||||
ctx,
|
||||
&mut positions_per_path,
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn index_json_object<'a>(
|
||||
doc: DocId,
|
||||
json_value: &serde_json::Map<String, serde_json::Value>,
|
||||
text_analyzer: &TextAnalyzer,
|
||||
json_term_writer: &mut JsonTermWriter<'a>,
|
||||
postings_writer: &mut dyn PostingsWriter,
|
||||
ctx: &mut IndexingContext,
|
||||
positions_per_path: &mut IndexingPositionsPerPath,
|
||||
) {
|
||||
for (json_path_segment, json_value) in json_value {
|
||||
json_term_writer.push_path_segment(json_path_segment);
|
||||
index_json_value(
|
||||
doc,
|
||||
json_value,
|
||||
text_analyzer,
|
||||
json_term_writer,
|
||||
postings_writer,
|
||||
ctx,
|
||||
positions_per_path,
|
||||
);
|
||||
json_term_writer.pop_path_segment();
|
||||
}
|
||||
}
|
||||
|
||||
fn index_json_value<'a>(
|
||||
doc: DocId,
|
||||
json_value: &serde_json::Value,
|
||||
text_analyzer: &TextAnalyzer,
|
||||
json_term_writer: &mut JsonTermWriter<'a>,
|
||||
postings_writer: &mut dyn PostingsWriter,
|
||||
ctx: &mut IndexingContext,
|
||||
positions_per_path: &mut IndexingPositionsPerPath,
|
||||
) {
|
||||
match json_value {
|
||||
serde_json::Value::Null => {}
|
||||
serde_json::Value::Bool(val_bool) => {
|
||||
let bool_u64 = if *val_bool { 1u64 } else { 0u64 };
|
||||
json_term_writer.set_fast_value(bool_u64);
|
||||
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
|
||||
}
|
||||
serde_json::Value::Number(number) => {
|
||||
if let Some(number_u64) = number.as_u64() {
|
||||
json_term_writer.set_fast_value(number_u64);
|
||||
} else if let Some(number_i64) = number.as_i64() {
|
||||
json_term_writer.set_fast_value(number_i64);
|
||||
} else if let Some(number_f64) = number.as_f64() {
|
||||
json_term_writer.set_fast_value(number_f64);
|
||||
}
|
||||
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
|
||||
}
|
||||
serde_json::Value::String(text) => match infer_type_from_str(text) {
|
||||
TextOrDateTime::Text(text) => {
|
||||
let mut token_stream = text_analyzer.token_stream(text);
|
||||
// TODO make sure the chain position works out.
|
||||
json_term_writer.close_path_and_set_type(Type::Str);
|
||||
let indexing_position = positions_per_path.get_position(json_term_writer.term());
|
||||
postings_writer.index_text(
|
||||
doc,
|
||||
&mut *token_stream,
|
||||
json_term_writer.term_buffer,
|
||||
ctx,
|
||||
indexing_position,
|
||||
);
|
||||
}
|
||||
TextOrDateTime::DateTime(dt) => {
|
||||
json_term_writer.set_fast_value(dt);
|
||||
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
|
||||
}
|
||||
},
|
||||
serde_json::Value::Array(arr) => {
|
||||
for val in arr {
|
||||
index_json_value(
|
||||
doc,
|
||||
val,
|
||||
text_analyzer,
|
||||
json_term_writer,
|
||||
postings_writer,
|
||||
ctx,
|
||||
positions_per_path,
|
||||
);
|
||||
}
|
||||
}
|
||||
serde_json::Value::Object(map) => {
|
||||
index_json_object(
|
||||
doc,
|
||||
map,
|
||||
text_analyzer,
|
||||
json_term_writer,
|
||||
postings_writer,
|
||||
ctx,
|
||||
positions_per_path,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum TextOrDateTime<'a> {
|
||||
Text(&'a str),
|
||||
DateTime(crate::DateTime),
|
||||
}
|
||||
|
||||
fn infer_type_from_str(text: &str) -> TextOrDateTime {
|
||||
match chrono::DateTime::parse_from_rfc3339(text) {
|
||||
Ok(dt) => {
|
||||
let dt_utc = dt.with_timezone(&Utc);
|
||||
TextOrDateTime::DateTime(dt_utc)
|
||||
}
|
||||
Err(_) => TextOrDateTime::Text(text),
|
||||
}
|
||||
}
|
||||
|
||||
pub struct JsonTermWriter<'a> {
|
||||
term_buffer: &'a mut Term,
|
||||
path_stack: Vec<usize>,
|
||||
}
|
||||
|
||||
impl<'a> JsonTermWriter<'a> {
|
||||
pub fn wrap(term_buffer: &'a mut Term) -> Self {
|
||||
term_buffer.clear_with_type(Type::Json);
|
||||
let mut path_stack = Vec::with_capacity(10);
|
||||
path_stack.push(5);
|
||||
Self {
|
||||
term_buffer,
|
||||
path_stack,
|
||||
}
|
||||
}
|
||||
|
||||
fn trim_to_end_of_path(&mut self) {
|
||||
let end_of_path = *self.path_stack.last().unwrap();
|
||||
self.term_buffer.truncate(end_of_path);
|
||||
}
|
||||
|
||||
pub fn close_path_and_set_type(&mut self, typ: Type) {
|
||||
self.trim_to_end_of_path();
|
||||
let buffer = self.term_buffer.as_mut();
|
||||
let buffer_len = buffer.len();
|
||||
buffer[buffer_len - 1] = JSON_END_OF_PATH;
|
||||
buffer.push(typ.to_code());
|
||||
}
|
||||
|
||||
pub fn push_path_segment(&mut self, segment: &str) {
|
||||
// the path stack should never be empty.
|
||||
self.trim_to_end_of_path();
|
||||
let buffer = self.term_buffer.as_mut();
|
||||
let buffer_len = buffer.len();
|
||||
if self.path_stack.len() > 1 {
|
||||
buffer[buffer_len - 1] = JSON_PATH_SEGMENT_SEP;
|
||||
}
|
||||
buffer.extend(segment.as_bytes());
|
||||
buffer.push(JSON_PATH_SEGMENT_SEP);
|
||||
self.path_stack.push(buffer.len());
|
||||
}
|
||||
|
||||
pub fn pop_path_segment(&mut self) {
|
||||
self.path_stack.pop();
|
||||
assert!(!self.path_stack.is_empty());
|
||||
self.trim_to_end_of_path();
|
||||
}
|
||||
|
||||
/// Returns the json path of the term being currently built.
|
||||
#[cfg(test)]
|
||||
pub(crate) fn path(&self) -> &[u8] {
|
||||
let end_of_path = self.path_stack.last().cloned().unwrap_or(6);
|
||||
&self.term().as_slice()[5..end_of_path - 1]
|
||||
}
|
||||
|
||||
pub fn set_fast_value<T: FastValue>(&mut self, val: T) {
|
||||
self.close_path_and_set_type(T::to_type());
|
||||
self.term_buffer
|
||||
.as_mut()
|
||||
.extend_from_slice(val.to_u64().to_be_bytes().as_slice());
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn set_str(&mut self, text: &str) {
|
||||
self.close_path_and_set_type(Type::Str);
|
||||
self.term_buffer.as_mut().extend_from_slice(text.as_bytes());
|
||||
}
|
||||
|
||||
pub fn term(&self) -> &Term {
|
||||
self.term_buffer
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::JsonTermWriter;
|
||||
use crate::schema::{Field, Type};
|
||||
use crate::Term;
|
||||
|
||||
#[test]
|
||||
fn test_json_writer() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_writer.push_path_segment("attributes");
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_str("red");
|
||||
assert_eq!(
|
||||
format!("{:?}", json_writer.term()),
|
||||
"Term(type=Json, field=1, path=attributes.color, vtype=Str, \"red\")"
|
||||
);
|
||||
json_writer.set_str("blue");
|
||||
assert_eq!(
|
||||
format!("{:?}", json_writer.term()),
|
||||
"Term(type=Json, field=1, path=attributes.color, vtype=Str, \"blue\")"
|
||||
);
|
||||
json_writer.pop_path_segment();
|
||||
json_writer.push_path_segment("dimensions");
|
||||
json_writer.push_path_segment("width");
|
||||
json_writer.set_fast_value(400i64);
|
||||
assert_eq!(
|
||||
format!("{:?}", json_writer.term()),
|
||||
"Term(type=Json, field=1, path=attributes.dimensions.width, vtype=I64, 400)"
|
||||
);
|
||||
json_writer.pop_path_segment();
|
||||
json_writer.push_path_segment("height");
|
||||
json_writer.set_fast_value(300i64);
|
||||
assert_eq!(
|
||||
format!("{:?}", json_writer.term()),
|
||||
"Term(type=Json, field=1, path=attributes.dimensions.height, vtype=I64, 300)"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_string_term() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_str("red");
|
||||
assert_eq!(
|
||||
json_writer.term().as_slice(),
|
||||
b"\x00\x00\x00\x01jcolor\x00sred"
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_i64_term() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_fast_value(-4i64);
|
||||
assert_eq!(
|
||||
json_writer.term().as_slice(),
|
||||
b"\x00\x00\x00\x01jcolor\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc"
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_u64_term() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_fast_value(4u64);
|
||||
assert_eq!(
|
||||
json_writer.term().as_slice(),
|
||||
b"\x00\x00\x00\x01jcolor\x00u\x00\x00\x00\x00\x00\x00\x00\x04"
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_f64_term() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_fast_value(4.0f64);
|
||||
assert_eq!(
|
||||
json_writer.term().as_slice(),
|
||||
b"\x00\x00\x00\x01jcolor\x00f\xc0\x10\x00\x00\x00\x00\x00\x00"
|
||||
)
|
||||
}
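For readers decoding the expected byte strings in the assertions above, a small sketch that rebuilds the u64 case by hand (the layout is inferred from these tests, not from documented API):

```rust
#[test]
fn decode_expected_u64_term_bytes() {
    // Inferred layout: 4-byte big-endian field id, one byte for the field
    // type code (`j` = Json), the path with `\x01` separators, a `\x00`
    // end-of-path marker, one byte for the value type code, then the value
    // bytes (8 bytes big-endian for numeric types).
    let mut expected = Vec::new();
    expected.extend_from_slice(&1u32.to_be_bytes()); // field id = 1
    expected.push(b'j');                             // field type: Json
    expected.extend_from_slice(b"color");            // path segment
    expected.push(0u8);                              // end of path
    expected.push(b'u');                             // value type: U64
    expected.extend_from_slice(&4u64.to_be_bytes()); // value 4
    assert_eq!(
        expected,
        b"\x00\x00\x00\x01jcolor\x00u\x00\x00\x00\x00\x00\x00\x00\x04".to_vec()
    );
}
```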
|
||||
|
||||
#[test]
|
||||
fn test_push_after_set_path_segment() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_writer.push_path_segment("attribute");
|
||||
json_writer.set_str("something");
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_str("red");
|
||||
assert_eq!(
|
||||
json_writer.term().as_slice(),
|
||||
b"\x00\x00\x00\x01jattribute\x01color\x00sred"
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pop_segment() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.push_path_segment("hue");
|
||||
json_writer.pop_path_segment();
|
||||
json_writer.set_str("red");
|
||||
assert_eq!(
|
||||
json_writer.term().as_slice(),
|
||||
b"\x00\x00\x00\x01jcolor\x00sred"
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_json_writer_path() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_writer.push_path_segment("color");
|
||||
assert_eq!(json_writer.path(), b"color");
|
||||
json_writer.push_path_segment("hue");
|
||||
assert_eq!(json_writer.path(), b"color\x01hue");
|
||||
json_writer.set_str("pink");
|
||||
assert_eq!(json_writer.path(), b"color\x01hue");
|
||||
}
|
||||
}
|
||||
@@ -278,7 +278,7 @@ impl IndexMerger {
|
||||
mut term_ord_mappings: HashMap<Field, TermOrdinalMapping>,
|
||||
doc_id_mapping: &SegmentDocIdMapping,
|
||||
) -> crate::Result<()> {
|
||||
debug_time!("write_fast_fields");
|
||||
debug_time!("write-fast-fields");
|
||||
|
||||
for (field, field_entry) in self.schema.fields() {
|
||||
let field_type = field_entry.field_type();
|
||||
@@ -307,16 +307,16 @@ impl IndexMerger {
|
||||
}
|
||||
None => {}
|
||||
},
|
||||
FieldType::Str(_) => {
|
||||
// We don't handle str fast field for the moment
|
||||
// They can be implemented using what is done
|
||||
// for facets in the future.
|
||||
}
|
||||
FieldType::Bytes(byte_options) => {
|
||||
if byte_options.is_fast() {
|
||||
self.write_bytes_fast_field(field, fast_field_serializer, doc_id_mapping)?;
|
||||
}
|
||||
}
|
||||
FieldType::Str(_) | FieldType::JsonObject(_) => {
|
||||
// We don't handle json / string fast field for the moment
|
||||
// They can be implemented using what is done
|
||||
// for facets in the future
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
@@ -597,7 +597,7 @@ impl IndexMerger {
|
||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_mapping: &SegmentDocIdMapping,
|
||||
) -> crate::Result<()> {
|
||||
debug_time!("write_hierarchical_facet_field");
|
||||
debug_time!("write-hierarchical-facet-field");
|
||||
|
||||
// Multifastfield consists of 2 fastfields.
|
||||
// The first serves as an index into the second one and is strictly increasing.
|
||||
@@ -827,7 +827,7 @@ impl IndexMerger {
|
||||
fieldnorm_reader: Option<FieldNormReader>,
|
||||
doc_id_mapping: &SegmentDocIdMapping,
|
||||
) -> crate::Result<Option<TermOrdinalMapping>> {
|
||||
debug_time!("write_postings_for_field");
|
||||
debug_time!("write-postings-for-field");
|
||||
let mut positions_buffer: Vec<u32> = Vec::with_capacity(1_000);
|
||||
let mut delta_computer = DeltaComputer::new();
|
||||
|
||||
@@ -1023,7 +1023,8 @@ impl IndexMerger {
|
||||
store_writer: &mut StoreWriter,
|
||||
doc_id_mapping: &SegmentDocIdMapping,
|
||||
) -> crate::Result<()> {
|
||||
debug_time!("write_storable_fields");
|
||||
debug_time!("write-storable-fields");
|
||||
debug!("write-storable-field");
|
||||
|
||||
let store_readers: Vec<_> = self
|
||||
.readers
|
||||
@@ -1036,6 +1037,7 @@ impl IndexMerger {
|
||||
.map(|(i, store)| store.iter_raw(self.readers[i].alive_bitset()))
|
||||
.collect();
|
||||
if !doc_id_mapping.is_trivial() {
|
||||
debug!("non-trivial-doc-id-mapping");
|
||||
for (old_doc_id, reader_ordinal) in doc_id_mapping.iter() {
|
||||
let doc_bytes_it = &mut document_iterators[*reader_ordinal as usize];
|
||||
if let Some(doc_bytes_res) = doc_bytes_it.next() {
|
||||
@@ -1050,6 +1052,7 @@ impl IndexMerger {
|
||||
}
|
||||
}
|
||||
} else {
|
||||
debug!("trivial-doc-id-mapping");
|
||||
for reader in &self.readers {
|
||||
let store_reader = reader.get_store_reader()?;
|
||||
if reader.has_deletes()
|
||||
@@ -1099,10 +1102,11 @@ impl IndexMerger {
|
||||
} else {
|
||||
self.get_doc_id_from_concatenated_data()?
|
||||
};
|
||||
|
||||
debug!("write-fieldnorms");
|
||||
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
|
||||
self.write_fieldnorms(fieldnorms_serializer, &doc_id_mapping)?;
|
||||
}
|
||||
debug!("write-postings");
|
||||
let fieldnorm_data = serializer
|
||||
.segment()
|
||||
.open_read(SegmentComponent::FieldNorms)?;
|
||||
@@ -1112,12 +1116,15 @@ impl IndexMerger {
|
||||
fieldnorm_readers,
|
||||
&doc_id_mapping,
|
||||
)?;
|
||||
debug!("write-fastfields");
|
||||
self.write_fast_fields(
|
||||
serializer.get_fast_field_serializer(),
|
||||
term_ord_mappings,
|
||||
&doc_id_mapping,
|
||||
)?;
|
||||
debug!("write-storagefields");
|
||||
self.write_storable_fields(serializer.get_store_writer(), &doc_id_mapping)?;
|
||||
debug!("close-serializer");
|
||||
serializer.close()?;
|
||||
Ok(self.max_doc)
|
||||
}
|
||||
@@ -1137,7 +1144,7 @@ mod tests {
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::query::{AllQuery, BooleanQuery, Scorer, TermQuery};
|
||||
use crate::schema::{
|
||||
Cardinality, Document, Facet, FacetOptions, IndexRecordOption, IntOptions, Term,
|
||||
Cardinality, Document, Facet, FacetOptions, IndexRecordOption, NumericOptions, Term,
|
||||
TextFieldIndexing, INDEXED, TEXT,
|
||||
};
|
||||
use crate::{
|
||||
@@ -1157,7 +1164,7 @@ mod tests {
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let date_field = schema_builder.add_date_field("date", INDEXED);
|
||||
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
|
||||
let score_fieldtype = schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
|
||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
|
||||
let bytes_score_field = schema_builder.add_bytes_field("score_bytes", FAST);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
@@ -1306,7 +1313,7 @@ mod tests {
|
||||
)
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
|
||||
let score_fieldtype = schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
|
||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
|
||||
let bytes_score_field = schema_builder.add_bytes_field("score_bytes", FAST);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
@@ -1666,7 +1673,7 @@ mod tests {
|
||||
fn test_merge_facets(index_settings: Option<IndexSettings>, force_segment_value_overlap: bool) {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
let int_options = IntOptions::default()
|
||||
let int_options = NumericOptions::default()
|
||||
.set_fast(Cardinality::SingleValue)
|
||||
.set_indexed();
|
||||
let int_field = schema_builder.add_u64_field("intval", int_options);
|
||||
@@ -1830,7 +1837,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_merge_multivalued_int_fields_all_deleted() -> crate::Result<()> {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let int_options = IntOptions::default()
|
||||
let int_options = NumericOptions::default()
|
||||
.set_fast(Cardinality::MultiValues)
|
||||
.set_indexed();
|
||||
let int_field = schema_builder.add_u64_field("intvals", int_options);
|
||||
@@ -1867,7 +1874,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_merge_multivalued_int_fields_simple() -> crate::Result<()> {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let int_options = IntOptions::default()
|
||||
let int_options = NumericOptions::default()
|
||||
.set_fast(Cardinality::MultiValues)
|
||||
.set_indexed();
|
||||
let int_field = schema_builder.add_u64_field("intvals", int_options);
|
||||
@@ -1994,7 +2001,7 @@ mod tests {
|
||||
fn merges_f64_fast_fields_correctly() -> crate::Result<()> {
|
||||
let mut builder = schema::SchemaBuilder::new();
|
||||
|
||||
let fast_multi = IntOptions::default().set_fast(Cardinality::MultiValues);
|
||||
let fast_multi = NumericOptions::default().set_fast(Cardinality::MultiValues);
|
||||
|
||||
let field = builder.add_f64_field("f64", schema::FAST);
|
||||
let multi_field = builder.add_f64_field("f64s", fast_multi);
|
||||
|
||||
@@ -7,14 +7,14 @@ mod tests {
|
||||
use crate::fastfield::{AliveBitSet, FastFieldReader, MultiValuedFastFieldReader};
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::{
|
||||
self, BytesOptions, Cardinality, Facet, FacetOptions, IndexRecordOption, IntOptions,
|
||||
self, BytesOptions, Cardinality, Facet, FacetOptions, IndexRecordOption, NumericOptions,
|
||||
TextFieldIndexing, TextOptions,
|
||||
};
|
||||
use crate::{DocAddress, DocSet, IndexSettings, IndexSortByField, Order, Postings, Term};
|
||||
|
||||
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let int_options = IntOptions::default()
|
||||
let int_options = NumericOptions::default()
|
||||
.set_fast(Cardinality::SingleValue)
|
||||
.set_indexed();
|
||||
let int_field = schema_builder.add_u64_field("intval", int_options);
|
||||
@@ -63,7 +63,7 @@ mod tests {
|
||||
force_disjunct_segment_sort_values: bool,
|
||||
) -> crate::Result<Index> {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let int_options = IntOptions::default()
|
||||
let int_options = NumericOptions::default()
|
||||
.set_fast(Cardinality::SingleValue)
|
||||
.set_stored()
|
||||
.set_indexed();
|
||||
@@ -75,7 +75,7 @@ mod tests {
|
||||
|
||||
let multi_numbers = schema_builder.add_u64_field(
|
||||
"multi_numbers",
|
||||
IntOptions::default().set_fast(Cardinality::MultiValues),
|
||||
NumericOptions::default().set_fast(Cardinality::MultiValues),
|
||||
);
|
||||
let text_field_options = TextOptions::default()
|
||||
.set_indexing_options(
|
||||
@@ -486,11 +486,11 @@ mod bench_sorted_index_merge {
|
||||
// use cratedoc_id, readerdoc_id_mappinglet vals = reader.fate::schema;
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
|
||||
use crate::indexer::merger::IndexMerger;
|
||||
use crate::schema::{Cardinality, Document, IntOptions, Schema};
|
||||
use crate::schema::{Cardinality, Document, NumericOptions, Schema};
|
||||
use crate::{IndexSettings, IndexSortByField, IndexWriter, Order};
|
||||
fn create_index(sort_by_field: Option<IndexSortByField>) -> Index {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let int_options = IntOptions::default()
|
||||
let int_options = NumericOptions::default()
|
||||
.set_fast(Cardinality::SingleValue)
|
||||
.set_indexed();
|
||||
let int_field = schema_builder.add_u64_field("intval", int_options);
|
||||
|
||||
@@ -5,6 +5,7 @@ pub mod doc_id_mapping;
|
||||
mod doc_opstamp_mapping;
|
||||
pub mod index_writer;
|
||||
mod index_writer_status;
|
||||
mod json_term_writer;
|
||||
mod log_merge_policy;
|
||||
mod merge_operation;
|
||||
pub mod merge_policy;
|
||||
@@ -24,6 +25,7 @@ use crossbeam::channel;
|
||||
use smallvec::SmallVec;
|
||||
|
||||
pub use self::index_writer::IndexWriter;
|
||||
pub(crate) use self::json_term_writer::JsonTermWriter;
|
||||
pub use self::log_merge_policy::LogMergePolicy;
|
||||
pub use self::merge_operation::MergeOperation;
|
||||
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
|
||||
|
||||
@@ -3,14 +3,16 @@ use super::operation::AddOperation;
|
||||
use crate::core::Segment;
|
||||
use crate::fastfield::FastFieldsWriter;
|
||||
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
|
||||
use crate::indexer::json_term_writer::index_json_values;
|
||||
use crate::indexer::segment_serializer::SegmentSerializer;
|
||||
use crate::postings::{
|
||||
compute_table_size, serialize_postings, IndexingContext, PerFieldPostingsWriter, PostingsWriter,
|
||||
compute_table_size, serialize_postings, IndexingContext, IndexingPosition,
|
||||
PerFieldPostingsWriter, PostingsWriter,
|
||||
};
|
||||
use crate::schema::{Field, FieldEntry, FieldType, FieldValue, Schema, Term, Type, Value};
|
||||
use crate::schema::{FieldEntry, FieldType, FieldValue, Schema, Term, Value};
|
||||
use crate::store::{StoreReader, StoreWriter};
|
||||
use crate::tokenizer::{
|
||||
BoxTokenStream, FacetTokenizer, PreTokenizedStream, TextAnalyzer, TokenStreamChain, Tokenizer,
|
||||
BoxTokenStream, FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer,
|
||||
};
|
||||
use crate::{DocId, Document, Opstamp, SegmentComponent};
|
||||
|
||||
@@ -54,13 +56,13 @@ fn remap_doc_opstamps(
|
||||
/// The segment is layed on disk when the segment gets `finalized`.
|
||||
pub struct SegmentWriter {
|
||||
pub(crate) max_doc: DocId,
|
||||
pub(crate) indexing_context: IndexingContext,
|
||||
pub(crate) ctx: IndexingContext,
|
||||
pub(crate) per_field_postings_writers: PerFieldPostingsWriter,
|
||||
pub(crate) segment_serializer: SegmentSerializer,
|
||||
pub(crate) fast_field_writers: FastFieldsWriter,
|
||||
pub(crate) fieldnorms_writer: FieldNormsWriter,
|
||||
pub(crate) doc_opstamps: Vec<Opstamp>,
|
||||
tokenizers: Vec<Option<TextAnalyzer>>,
|
||||
per_field_text_analyzers: Vec<TextAnalyzer>,
|
||||
term_buffer: Term,
|
||||
schema: Schema,
|
||||
}
|
||||
@@ -84,29 +86,33 @@ impl SegmentWriter {
|
||||
let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
|
||||
let segment_serializer = SegmentSerializer::for_segment(segment, false)?;
|
||||
let per_field_postings_writers = PerFieldPostingsWriter::for_schema(&schema);
|
||||
let tokenizers = schema
|
||||
let per_field_text_analyzers = schema
|
||||
.fields()
|
||||
.map(
|
||||
|(_, field_entry): (Field, &FieldEntry)| match field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => text_options
|
||||
.get_indexing_options()
|
||||
.and_then(|text_index_option| {
|
||||
let tokenizer_name = &text_index_option.tokenizer();
|
||||
tokenizer_manager.get(tokenizer_name)
|
||||
}),
|
||||
.map(|(_, field_entry): (_, &FieldEntry)| {
|
||||
let text_options = match field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => text_options.get_indexing_options(),
|
||||
FieldType::JsonObject(ref json_object_options) => {
|
||||
json_object_options.get_text_indexing_options()
|
||||
}
|
||||
_ => None,
|
||||
},
|
||||
)
|
||||
};
|
||||
text_options
|
||||
.and_then(|text_index_option| {
|
||||
let tokenizer_name = &text_index_option.tokenizer();
|
||||
tokenizer_manager.get(tokenizer_name)
|
||||
})
|
||||
.unwrap_or_default()
|
||||
})
|
||||
.collect();
|
||||
Ok(SegmentWriter {
|
||||
max_doc: 0,
|
||||
indexing_context: IndexingContext::new(table_size),
|
||||
ctx: IndexingContext::new(table_size),
|
||||
per_field_postings_writers,
|
||||
fieldnorms_writer: FieldNormsWriter::for_schema(&schema),
|
||||
segment_serializer,
|
||||
fast_field_writers: FastFieldsWriter::from_schema(&schema),
|
||||
doc_opstamps: Vec::with_capacity(1_000),
|
||||
tokenizers,
|
||||
per_field_text_analyzers,
|
||||
term_buffer: Term::new(),
|
||||
schema,
|
||||
})
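The per-field analyzer resolved above is looked up by name in the index's tokenizer manager, so a custom analyzer has to be registered before indexing; a hedged sketch (the tokenizer name is illustrative, not one defined in this PR):

```rust
use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer};
use tantivy::Index;

// Sketch only: register an analyzer under the name a text field refers to.
fn register_custom_tokenizer(index: &Index) {
    let analyzer = TextAnalyzer::from(SimpleTokenizer).filter(LowerCaser);
    index.tokenizers().register("custom_en", analyzer);
}
```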
|
||||
@@ -129,7 +135,7 @@ impl SegmentWriter {
|
||||
.transpose()?;
|
||||
remap_and_write(
|
||||
&self.per_field_postings_writers,
|
||||
self.indexing_context,
|
||||
self.ctx,
|
||||
&self.fast_field_writers,
|
||||
&self.fieldnorms_writer,
|
||||
&self.schema,
|
||||
@@ -141,7 +147,7 @@ impl SegmentWriter {
|
||||
}
|
||||
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.indexing_context.mem_usage()
|
||||
self.ctx.mem_usage()
|
||||
+ self.fieldnorms_writer.mem_usage()
|
||||
+ self.fast_field_writers.mem_usage()
|
||||
+ self.segment_serializer.mem_usage()
|
||||
@@ -161,13 +167,12 @@ impl SegmentWriter {
|
||||
if !field_entry.is_indexed() {
|
||||
continue;
|
||||
}
|
||||
let (term_buffer, indexing_context) =
|
||||
(&mut self.term_buffer, &mut self.indexing_context);
|
||||
let (term_buffer, ctx) = (&mut self.term_buffer, &mut self.ctx);
|
||||
let postings_writer: &mut dyn PostingsWriter =
|
||||
self.per_field_postings_writers.get_for_field_mut(field);
|
||||
term_buffer.set_field(field_entry.field_type().value_type(), field);
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Facet(_) => {
|
||||
term_buffer.set_field(Type::Facet, field);
|
||||
for value in values {
|
||||
let facet = value.as_facet().ok_or_else(make_schema_error)?;
|
||||
let facet_str = facet.encoded_str();
|
||||
@@ -176,12 +181,8 @@ impl SegmentWriter {
|
||||
.token_stream(facet_str)
|
||||
.process(&mut |token| {
|
||||
term_buffer.set_text(&token.text);
|
||||
let unordered_term_id = postings_writer.subscribe(
|
||||
doc_id,
|
||||
0u32,
|
||||
term_buffer,
|
||||
indexing_context,
|
||||
);
|
||||
let unordered_term_id =
|
||||
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
|
||||
// TODO pass indexing context directly in subscribe function
|
||||
unordered_term_id_opt = Some(unordered_term_id);
|
||||
});
|
||||
@@ -209,72 +210,79 @@ impl SegmentWriter {
|
||||
.push(PreTokenizedStream::from(tok_str.clone()).into());
|
||||
}
|
||||
Value::Str(ref text) => {
|
||||
if let Some(ref mut tokenizer) =
|
||||
self.tokenizers[field.field_id() as usize]
|
||||
{
|
||||
offsets.push(total_offset);
|
||||
total_offset += text.len();
|
||||
token_streams.push(tokenizer.token_stream(text));
|
||||
}
|
||||
let text_analyzer =
|
||||
&self.per_field_text_analyzers[field.field_id() as usize];
|
||||
offsets.push(total_offset);
|
||||
total_offset += text.len();
|
||||
token_streams.push(text_analyzer.token_stream(text));
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
|
||||
let num_tokens = if token_streams.is_empty() {
|
||||
0
|
||||
} else {
|
||||
let mut token_stream = TokenStreamChain::new(offsets, token_streams);
|
||||
let mut indexing_position = IndexingPosition::default();
|
||||
for mut token_stream in token_streams {
|
||||
assert_eq!(term_buffer.as_slice().len(), 5);
|
||||
postings_writer.index_text(
|
||||
doc_id,
|
||||
field,
|
||||
&mut token_stream,
|
||||
&mut *token_stream,
|
||||
term_buffer,
|
||||
indexing_context,
|
||||
)
|
||||
};
|
||||
self.fieldnorms_writer.record(doc_id, field, num_tokens);
|
||||
ctx,
|
||||
&mut indexing_position,
|
||||
);
|
||||
}
|
||||
self.fieldnorms_writer
|
||||
.record(doc_id, field, indexing_position.num_tokens);
|
||||
}
|
||||
FieldType::U64(_) => {
|
||||
for value in values {
|
||||
term_buffer.set_field(Type::U64, field);
|
||||
let u64_val = value.as_u64().ok_or_else(make_schema_error)?;
|
||||
term_buffer.set_u64(u64_val);
|
||||
postings_writer.subscribe(doc_id, 0u32, term_buffer, indexing_context);
|
||||
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
|
||||
}
|
||||
}
|
||||
FieldType::Date(_) => {
|
||||
for value in values {
|
||||
term_buffer.set_field(Type::Date, field);
|
||||
let date_val = value.as_date().ok_or_else(make_schema_error)?;
|
||||
term_buffer.set_i64(date_val.timestamp());
|
||||
postings_writer.subscribe(doc_id, 0u32, term_buffer, indexing_context);
|
||||
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
|
||||
}
|
||||
}
|
||||
FieldType::I64(_) => {
|
||||
for value in values {
|
||||
term_buffer.set_field(Type::I64, field);
|
||||
let i64_val = value.as_i64().ok_or_else(make_schema_error)?;
|
||||
term_buffer.set_i64(i64_val);
|
||||
postings_writer.subscribe(doc_id, 0u32, term_buffer, indexing_context);
|
||||
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
|
||||
}
|
||||
}
|
||||
FieldType::F64(_) => {
|
||||
for value in values {
|
||||
term_buffer.set_field(Type::F64, field);
|
||||
let f64_val = value.as_f64().ok_or_else(make_schema_error)?;
|
||||
term_buffer.set_f64(f64_val);
|
||||
postings_writer.subscribe(doc_id, 0u32, term_buffer, indexing_context);
|
||||
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
|
||||
}
|
||||
}
|
||||
FieldType::Bytes(_) => {
|
||||
for value in values {
|
||||
term_buffer.set_field(Type::Bytes, field);
|
||||
let bytes = value.as_bytes().ok_or_else(make_schema_error)?;
|
||||
term_buffer.set_bytes(bytes);
|
||||
postings_writer.subscribe(doc_id, 0u32, term_buffer, indexing_context);
|
||||
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
|
||||
}
|
||||
}
|
||||
FieldType::JsonObject(_) => {
|
||||
let text_analyzer = &self.per_field_text_analyzers[field.field_id() as usize];
|
||||
let json_values_it = values
|
||||
.iter()
|
||||
.map(|value| value.as_json().ok_or_else(make_schema_error));
|
||||
index_json_values(
|
||||
doc_id,
|
||||
json_values_it,
|
||||
text_analyzer,
|
||||
term_buffer,
|
||||
postings_writer,
|
||||
ctx,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
@@ -323,13 +331,14 @@ impl SegmentWriter {
|
||||
/// `doc_id_map` is used to map to the new doc_id order.
|
||||
fn remap_and_write(
|
||||
per_field_postings_writers: &PerFieldPostingsWriter,
|
||||
indexing_context: IndexingContext,
|
||||
ctx: IndexingContext,
|
||||
fast_field_writers: &FastFieldsWriter,
|
||||
fieldnorms_writer: &FieldNormsWriter,
|
||||
schema: &Schema,
|
||||
mut serializer: SegmentSerializer,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> crate::Result<()> {
|
||||
debug!("remap-and-write");
|
||||
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
|
||||
fieldnorms_writer.serialize(fieldnorms_serializer, doc_id_map)?;
|
||||
}
|
||||
@@ -338,19 +347,21 @@ fn remap_and_write(
|
||||
.open_read(SegmentComponent::FieldNorms)?;
|
||||
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
|
||||
let term_ord_map = serialize_postings(
|
||||
indexing_context,
|
||||
ctx,
|
||||
per_field_postings_writers,
|
||||
fieldnorm_readers,
|
||||
doc_id_map,
|
||||
schema,
|
||||
serializer.get_postings_serializer(),
|
||||
)?;
|
||||
debug!("fastfield-serialize");
|
||||
fast_field_writers.serialize(
|
||||
serializer.get_fast_field_serializer(),
|
||||
&term_ord_map,
|
||||
doc_id_map,
|
||||
)?;
|
||||
|
||||
debug!("resort-docstore");
|
||||
// finalize temp docstore and create version, which reflects the doc_id_map
|
||||
if let Some(doc_id_map) = doc_id_map {
|
||||
let store_write = serializer
|
||||
@@ -373,6 +384,7 @@ fn remap_and_write(
|
||||
}
|
||||
}
|
||||
|
||||
debug!("serializer-close");
|
||||
serializer.close()?;
|
||||
|
||||
Ok(())
|
||||
@@ -402,10 +414,16 @@ pub fn prepare_doc_for_store(doc: Document, schema: &Schema) -> Document {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use chrono::Utc;
|
||||
|
||||
use super::compute_initial_table_size;
|
||||
use crate::schema::{Schema, STORED, TEXT};
|
||||
use crate::collector::Count;
|
||||
use crate::indexer::json_term_writer::JsonTermWriter;
|
||||
use crate::postings::TermInfo;
|
||||
use crate::query::PhraseQuery;
|
||||
use crate::schema::{IndexRecordOption, Schema, Type, STORED, STRING, TEXT};
|
||||
use crate::tokenizer::{PreTokenizedString, Token};
|
||||
use crate::Document;
|
||||
use crate::{DocAddress, DocSet, Document, Index, Postings, Term, TERMINATED};
|
||||
|
||||
#[test]
|
||||
fn test_hashmap_size() {
|
||||
@@ -444,4 +462,247 @@ mod tests {
|
||||
Some("title")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_json_indexing() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_field = schema_builder.add_json_field("json", STORED | TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
|
||||
r#"{
|
||||
"toto": "titi",
|
||||
"float": -0.2,
|
||||
"unsigned": 1,
|
||||
"signed": -2,
|
||||
"complexobject": {
|
||||
"field.with.dot": 1
|
||||
},
|
||||
"date": "1985-04-12T23:20:50.52Z",
|
||||
"my_arr": [2, 3, {"my_key": "two tokens"}, 4]
|
||||
}"#,
|
||||
)
|
||||
.unwrap();
|
||||
let doc = doc!(json_field=>json_val.clone());
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let mut writer = index.writer_for_tests().unwrap();
|
||||
writer.add_document(doc).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let doc = searcher
|
||||
.doc(DocAddress {
|
||||
segment_ord: 0u32,
|
||||
doc_id: 0u32,
|
||||
})
|
||||
.unwrap();
|
||||
let serdeser_json_val = serde_json::from_str::<serde_json::Map<String, serde_json::Value>>(
|
||||
&schema.to_json(&doc),
|
||||
)
|
||||
.unwrap()
|
||||
.get("json")
|
||||
.unwrap()[0]
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone();
|
||||
assert_eq!(json_val, serdeser_json_val);
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let inv_idx = segment_reader.inverted_index(json_field).unwrap();
|
||||
let term_dict = inv_idx.terms();
|
||||
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, json_field);
|
||||
let mut term_stream = term_dict.stream().unwrap();
|
||||
|
||||
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_term_writer.push_path_segment("complexobject");
|
||||
json_term_writer.push_path_segment("field.with.dot");
|
||||
json_term_writer.set_fast_value(1u64);
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
|
||||
json_term_writer.pop_path_segment();
|
||||
json_term_writer.pop_path_segment();
|
||||
json_term_writer.push_path_segment("date");
|
||||
json_term_writer.set_fast_value(
|
||||
chrono::DateTime::parse_from_rfc3339("1985-04-12T23:20:50.52Z")
|
||||
.unwrap()
|
||||
.with_timezone(&Utc),
|
||||
);
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
|
||||
json_term_writer.pop_path_segment();
|
||||
json_term_writer.push_path_segment("float");
|
||||
json_term_writer.set_fast_value(-0.2f64);
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
|
||||
json_term_writer.pop_path_segment();
|
||||
json_term_writer.push_path_segment("my_arr");
|
||||
json_term_writer.set_fast_value(2u64);
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
|
||||
json_term_writer.set_fast_value(3u64);
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
|
||||
json_term_writer.set_fast_value(4u64);
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
|
||||
json_term_writer.push_path_segment("my_key");
|
||||
json_term_writer.set_str("tokens");
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
|
||||
json_term_writer.set_str("two");
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
|
||||
json_term_writer.pop_path_segment();
|
||||
json_term_writer.pop_path_segment();
|
||||
json_term_writer.push_path_segment("signed");
|
||||
json_term_writer.set_fast_value(-2i64);
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
|
||||
json_term_writer.pop_path_segment();
|
||||
json_term_writer.push_path_segment("toto");
|
||||
json_term_writer.set_str("titi");
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
|
||||
json_term_writer.pop_path_segment();
|
||||
json_term_writer.push_path_segment("unsigned");
|
||||
json_term_writer.set_fast_value(1u64);
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
assert!(!term_stream.advance());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_json_tokenized_with_position() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_field = schema_builder.add_json_field("json", STORED | TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let mut doc = Document::default();
|
||||
let json_val: serde_json::Map<String, serde_json::Value> =
|
||||
serde_json::from_str(r#"{"mykey": "repeated token token"}"#).unwrap();
|
||||
doc.add_json_object(json_field, json_val);
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut writer = index.writer_for_tests().unwrap();
|
||||
writer.add_document(doc).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let inv_index = segment_reader.inverted_index(json_field).unwrap();
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, json_field);
|
||||
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_term_writer.push_path_segment("mykey");
|
||||
json_term_writer.set_str("token");
|
||||
let term_info = inv_index
|
||||
.get_term_info(json_term_writer.term())
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
term_info,
|
||||
TermInfo {
|
||||
doc_freq: 1,
|
||||
postings_range: 2..4,
|
||||
positions_range: 2..5
|
||||
}
|
||||
);
|
||||
let mut postings = inv_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert_eq!(postings.doc(), 0);
|
||||
assert_eq!(postings.term_freq(), 2);
|
||||
let mut positions = Vec::new();
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(&positions[..], &[1, 2]);
|
||||
assert_eq!(postings.advance(), TERMINATED);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_json_raw_no_position() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_field = schema_builder.add_json_field("json", STRING);
|
||||
let schema = schema_builder.build();
|
||||
let json_val: serde_json::Map<String, serde_json::Value> =
|
||||
serde_json::from_str(r#"{"mykey": "two tokens"}"#).unwrap();
|
||||
let doc = doc!(json_field=>json_val);
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut writer = index.writer_for_tests().unwrap();
|
||||
writer.add_document(doc).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let inv_index = segment_reader.inverted_index(json_field).unwrap();
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, json_field);
|
||||
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_term_writer.push_path_segment("mykey");
|
||||
json_term_writer.set_str("two tokens");
|
||||
let term_info = inv_index
|
||||
.get_term_info(json_term_writer.term())
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
term_info,
|
||||
TermInfo {
|
||||
doc_freq: 1,
|
||||
postings_range: 0..1,
|
||||
positions_range: 0..0
|
||||
}
|
||||
);
|
||||
let mut postings = inv_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqs)
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert_eq!(postings.doc(), 0);
|
||||
assert_eq!(postings.term_freq(), 1);
|
||||
let mut positions = Vec::new();
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(postings.advance(), TERMINATED);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_position_overlapping_path() {
|
||||
// This test checks that we do not end up detecting a phrase query match due
// to several string literals in the same json object having overlapping positions.
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_field = schema_builder.add_json_field("json", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
|
||||
r#"{"mykey": [{"field": "hello happy tax payer"}, {"field": "nothello"}]}"#,
|
||||
)
|
||||
.unwrap();
|
||||
let doc = doc!(json_field=>json_val);
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut writer = index.writer_for_tests().unwrap();
|
||||
writer.add_document(doc).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, json_field);
|
||||
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_term_writer.push_path_segment("mykey");
|
||||
json_term_writer.push_path_segment("field");
|
||||
json_term_writer.set_str("hello");
|
||||
let hello_term = json_term_writer.term().clone();
|
||||
json_term_writer.set_str("nothello");
|
||||
let nothello_term = json_term_writer.term().clone();
|
||||
json_term_writer.set_str("happy");
|
||||
let happy_term = json_term_writer.term().clone();
|
||||
let phrase_query = PhraseQuery::new(vec![hello_term, happy_term.clone()]);
|
||||
assert_eq!(searcher.search(&phrase_query, &Count).unwrap(), 1);
|
||||
let phrase_query = PhraseQuery::new(vec![nothello_term, happy_term]);
|
||||
assert_eq!(searcher.search(&phrase_query, &Count).unwrap(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -134,6 +134,10 @@ pub use crate::error::TantivyError;
|
||||
/// and instead, refer to this as `crate::Result<T>`.
|
||||
pub type Result<T> = std::result::Result<T, TantivyError>;
|
||||
|
||||
/// Result for an Async io operation.
|
||||
#[cfg(feature = "quickwit")]
|
||||
pub type AsyncIoResult<T> = std::result::Result<T, crate::error::AsyncIoError>;
|
||||
|
||||
/// Tantivy DateTime
|
||||
pub type DateTime = chrono::DateTime<chrono::Utc>;
|
||||
|
||||
@@ -144,6 +148,7 @@ mod indexer;
|
||||
pub mod error;
|
||||
pub mod tokenizer;
|
||||
|
||||
pub mod aggregation;
|
||||
pub mod collector;
|
||||
pub mod directory;
|
||||
pub mod fastfield;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::io;
|
||||
|
||||
use common::{BinarySerializable, VInt};
|
||||
use common::VInt;
|
||||
|
||||
use crate::directory::{FileSlice, OwnedBytes};
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
@@ -28,9 +28,7 @@ pub struct BlockSegmentPostings {
|
||||
freq_decoder: BlockDecoder,
|
||||
freq_reading_option: FreqReadingOption,
|
||||
block_max_score_cache: Option<Score>,
|
||||
|
||||
doc_freq: u32,
|
||||
|
||||
data: OwnedBytes,
|
||||
pub(crate) skip_reader: SkipReader,
|
||||
}
|
||||
@@ -70,13 +68,13 @@ fn decode_vint_block(
|
||||
fn split_into_skips_and_postings(
|
||||
doc_freq: u32,
|
||||
mut bytes: OwnedBytes,
|
||||
) -> (Option<OwnedBytes>, OwnedBytes) {
|
||||
) -> io::Result<(Option<OwnedBytes>, OwnedBytes)> {
|
||||
if doc_freq < COMPRESSION_BLOCK_SIZE as u32 {
|
||||
return (None, bytes);
|
||||
return Ok((None, bytes));
|
||||
}
|
||||
let skip_len = VInt::deserialize(&mut bytes).expect("Data corrupted").0 as usize;
|
||||
let skip_len = VInt::deserialize_u64(&mut bytes)? as usize;
|
||||
let (skip_data, postings_data) = bytes.split(skip_len);
|
||||
(Some(skip_data), postings_data)
|
||||
Ok((Some(skip_data), postings_data))
|
||||
}
|
||||
|
||||
impl BlockSegmentPostings {
|
||||
@@ -92,8 +90,8 @@ impl BlockSegmentPostings {
|
||||
(_, _) => FreqReadingOption::ReadFreq,
|
||||
};
|
||||
|
||||
let (skip_data_opt, postings_data) =
|
||||
split_into_skips_and_postings(doc_freq, data.read_bytes()?);
|
||||
let bytes = data.read_bytes()?;
|
||||
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, bytes)?;
|
||||
let skip_reader = match skip_data_opt {
|
||||
Some(skip_data) => SkipReader::new(skip_data, doc_freq, record_option),
|
||||
None => SkipReader::new(OwnedBytes::empty(), doc_freq, record_option),
|
||||
@@ -166,8 +164,9 @@ impl BlockSegmentPostings {
|
||||
// # Warning
|
||||
//
|
||||
// This does not reset the positions list.
|
||||
pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: OwnedBytes) {
|
||||
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, postings_data);
|
||||
pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: OwnedBytes) -> io::Result<()> {
|
||||
let (skip_data_opt, postings_data) =
|
||||
split_into_skips_and_postings(doc_freq, postings_data)?;
|
||||
self.data = postings_data;
|
||||
self.block_max_score_cache = None;
|
||||
self.loaded_offset = std::usize::MAX;
|
||||
@@ -178,6 +177,7 @@ impl BlockSegmentPostings {
|
||||
}
|
||||
self.doc_freq = doc_freq;
|
||||
self.load_block();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns the overall number of documents in the block postings.
|
||||
@@ -322,7 +322,7 @@ impl BlockSegmentPostings {
|
||||
|
||||
/// Advance to the next block.
|
||||
///
|
||||
/// Returns false iff there was no remaining blocks.
|
||||
/// Returns false if and only if there is no remaining block.
|
||||
pub fn advance(&mut self) {
|
||||
self.skip_reader.advance();
|
||||
self.block_max_score_cache = None;
|
||||
|
||||
94
src/postings/json_postings_writer.rs
Normal file
@@ -0,0 +1,94 @@
|
||||
use std::io;
|
||||
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::postings_writer::SpecializedPostingsWriter;
|
||||
use crate::postings::recorder::{BufferLender, NothingRecorder, Recorder};
|
||||
use crate::postings::stacker::Addr;
|
||||
use crate::postings::{
|
||||
FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter, UnorderedTermId,
|
||||
};
|
||||
use crate::schema::term::as_json_path_type_value_bytes;
|
||||
use crate::schema::Type;
|
||||
use crate::tokenizer::TokenStream;
|
||||
use crate::{DocId, Term};
|
||||
|
||||
#[derive(Default)]
|
||||
pub(crate) struct JsonPostingsWriter<Rec: Recorder> {
|
||||
str_posting_writer: SpecializedPostingsWriter<Rec>,
|
||||
non_str_posting_writer: SpecializedPostingsWriter<NothingRecorder>,
|
||||
}
|
||||
|
||||
impl<Rec: Recorder> From<JsonPostingsWriter<Rec>> for Box<dyn PostingsWriter> {
|
||||
fn from(json_postings_writer: JsonPostingsWriter<Rec>) -> Box<dyn PostingsWriter> {
|
||||
Box::new(json_postings_writer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
||||
fn subscribe(
|
||||
&mut self,
|
||||
doc: crate::DocId,
|
||||
pos: u32,
|
||||
term: &crate::Term,
|
||||
ctx: &mut IndexingContext,
|
||||
) -> UnorderedTermId {
|
||||
self.non_str_posting_writer.subscribe(doc, pos, term, ctx)
|
||||
}
|
||||
|
||||
fn index_text(
|
||||
&mut self,
|
||||
doc_id: DocId,
|
||||
token_stream: &mut dyn TokenStream,
|
||||
term_buffer: &mut Term,
|
||||
ctx: &mut IndexingContext,
|
||||
indexing_position: &mut IndexingPosition,
|
||||
) {
|
||||
self.str_posting_writer.index_text(
|
||||
doc_id,
|
||||
token_stream,
|
||||
term_buffer,
|
||||
ctx,
|
||||
indexing_position,
|
||||
);
|
||||
}
|
||||
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
ctx: &IndexingContext,
|
||||
serializer: &mut FieldSerializer,
|
||||
) -> io::Result<()> {
|
||||
let mut buffer_lender = BufferLender::default();
|
||||
for (term, addr, _) in term_addrs {
|
||||
// TODO optimization opportunity here.
|
||||
if let Some((_, typ, _)) = as_json_path_type_value_bytes(term.value_bytes()) {
|
||||
if typ == Type::Str {
|
||||
SpecializedPostingsWriter::<Rec>::serialize_one_term(
|
||||
term,
|
||||
*addr,
|
||||
doc_id_map,
|
||||
&mut buffer_lender,
|
||||
ctx,
|
||||
serializer,
|
||||
)?;
|
||||
} else {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::serialize_one_term(
|
||||
term,
|
||||
*addr,
|
||||
doc_id_map,
|
||||
&mut buffer_lender,
|
||||
ctx,
|
||||
serializer,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn total_num_tokens(&self) -> u64 {
|
||||
self.str_posting_writer.total_num_tokens() + self.non_str_posting_writer.total_num_tokens()
|
||||
}
|
||||
}
|
||||
@@ -7,6 +7,7 @@ pub(crate) use self::block_search::branchless_binary_search;
|
||||
mod block_segment_postings;
|
||||
pub(crate) mod compression;
|
||||
mod indexing_context;
|
||||
mod json_postings_writer;
|
||||
mod per_field_postings_writer;
|
||||
mod postings;
|
||||
mod postings_writer;
|
||||
@@ -21,7 +22,7 @@ pub use self::block_segment_postings::BlockSegmentPostings;
|
||||
pub(crate) use self::indexing_context::IndexingContext;
|
||||
pub(crate) use self::per_field_postings_writer::PerFieldPostingsWriter;
|
||||
pub use self::postings::Postings;
|
||||
pub(crate) use self::postings_writer::{serialize_postings, PostingsWriter};
|
||||
pub(crate) use self::postings_writer::{serialize_postings, IndexingPosition, PostingsWriter};
|
||||
pub use self::segment_postings::SegmentPostings;
|
||||
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
|
||||
pub(crate) use self::skip::{BlockInfo, SkipReader};
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use crate::postings::json_postings_writer::JsonPostingsWriter;
|
||||
use crate::postings::postings_writer::SpecializedPostingsWriter;
|
||||
use crate::postings::recorder::{NothingRecorder, TermFrequencyRecorder, TfAndPositionRecorder};
|
||||
use crate::postings::PostingsWriter;
|
||||
@@ -33,21 +34,38 @@ fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box<dyn Postings
|
||||
.get_indexing_options()
|
||||
.map(|indexing_options| match indexing_options.index_option() {
|
||||
IndexRecordOption::Basic => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
|
||||
SpecializedPostingsWriter::<NothingRecorder>::default().into()
|
||||
}
|
||||
IndexRecordOption::WithFreqs => {
|
||||
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed()
|
||||
SpecializedPostingsWriter::<TermFrequencyRecorder>::default().into()
|
||||
}
|
||||
IndexRecordOption::WithFreqsAndPositions => {
|
||||
SpecializedPostingsWriter::<TfAndPositionRecorder>::new_boxed()
|
||||
SpecializedPostingsWriter::<TfAndPositionRecorder>::default().into()
|
||||
}
|
||||
})
|
||||
.unwrap_or_else(SpecializedPostingsWriter::<NothingRecorder>::new_boxed),
|
||||
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::default().into()),
|
||||
FieldType::U64(_)
|
||||
| FieldType::I64(_)
|
||||
| FieldType::F64(_)
|
||||
| FieldType::Date(_)
|
||||
| FieldType::Bytes(_)
|
||||
| FieldType::Facet(_) => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(),
|
||||
| FieldType::Facet(_) => Box::new(SpecializedPostingsWriter::<NothingRecorder>::default()),
|
||||
FieldType::JsonObject(ref json_object_options) => {
|
||||
if let Some(text_indexing_option) = json_object_options.get_text_indexing_options() {
|
||||
match text_indexing_option.index_option() {
|
||||
IndexRecordOption::Basic => {
|
||||
JsonPostingsWriter::<NothingRecorder>::default().into()
|
||||
}
|
||||
IndexRecordOption::WithFreqs => {
|
||||
JsonPostingsWriter::<TermFrequencyRecorder>::default().into()
|
||||
}
|
||||
IndexRecordOption::WithFreqsAndPositions => {
|
||||
JsonPostingsWriter::<TfAndPositionRecorder>::default().into()
|
||||
}
|
||||
}
|
||||
} else {
|
||||
JsonPostingsWriter::<NothingRecorder>::default().into()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,11 +13,13 @@ use crate::postings::{
|
||||
FieldSerializer, IndexingContext, InvertedIndexSerializer, PerFieldPostingsWriter,
|
||||
UnorderedTermId,
|
||||
};
|
||||
use crate::schema::{Field, FieldType, Schema, Term, Type};
|
||||
use crate::schema::{Field, FieldType, Schema, Term};
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::tokenizer::{Token, TokenStream, MAX_TOKEN_LEN};
|
||||
use crate::DocId;
|
||||
|
||||
const POSITION_GAP: u32 = 1;
|
||||
|
||||
fn make_field_partition(
|
||||
term_offsets: &[(Term<&[u8]>, Addr, UnorderedTermId)],
|
||||
) -> Vec<(Field, Range<usize>)> {
|
||||
@@ -47,7 +49,7 @@ fn make_field_partition(
|
||||
/// It pushes all term, one field at a time, towards the
|
||||
/// postings serializer.
|
||||
pub(crate) fn serialize_postings(
|
||||
indexing_context: IndexingContext,
|
||||
ctx: IndexingContext,
|
||||
per_field_postings_writers: &PerFieldPostingsWriter,
|
||||
fieldnorm_readers: FieldNormReaders,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
@@ -55,15 +57,13 @@ pub(crate) fn serialize_postings(
|
||||
serializer: &mut InvertedIndexSerializer,
|
||||
) -> crate::Result<HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>> {
|
||||
let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> =
|
||||
Vec::with_capacity(indexing_context.term_index.len());
|
||||
term_offsets.extend(indexing_context.term_index.iter());
|
||||
Vec::with_capacity(ctx.term_index.len());
|
||||
term_offsets.extend(ctx.term_index.iter());
|
||||
term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone());
|
||||
|
||||
let mut unordered_term_mappings: HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>> =
|
||||
HashMap::new();
|
||||
|
||||
let field_offsets = make_field_partition(&term_offsets);
|
||||
|
||||
for (field, byte_offsets) in field_offsets {
|
||||
let field_entry = schema.get_field_entry(field);
|
||||
match *field_entry.field_type() {
|
||||
@@ -83,6 +83,7 @@ pub(crate) fn serialize_postings(
|
||||
}
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) | FieldType::Date(_) => {}
|
||||
FieldType::Bytes(_) => {}
|
||||
FieldType::JsonObject(_) => {}
|
||||
}
|
||||
|
||||
let postings_writer = per_field_postings_writers.get_for_field(field);
|
||||
@@ -92,7 +93,7 @@ pub(crate) fn serialize_postings(
|
||||
postings_writer.serialize(
|
||||
&term_offsets[byte_offsets],
|
||||
doc_id_map,
|
||||
&indexing_context,
|
||||
&ctx,
|
||||
&mut field_serializer,
|
||||
)?;
|
||||
field_serializer.close()?;
|
||||
@@ -100,6 +101,12 @@ pub(crate) fn serialize_postings(
|
||||
Ok(unordered_term_mappings)
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub(crate) struct IndexingPosition {
|
||||
pub num_tokens: u32,
|
||||
pub end_position: u32,
|
||||
}
|
||||
|
||||
/// The `PostingsWriter` is in charge of receiving documents
|
||||
/// and building a `Segment` in anonymous memory.
|
||||
///
|
||||
@@ -110,14 +117,14 @@ pub(crate) trait PostingsWriter {
|
||||
/// * doc - the document id
|
||||
/// * pos - the term position (expressed in tokens)
|
||||
/// * term - the term
|
||||
/// * indexing_context - Contains a term hashmap and a memory arena to store all necessary
|
||||
/// posting list information.
|
||||
/// * ctx - Contains a term hashmap and a memory arena to store all necessary posting list
|
||||
/// information.
|
||||
fn subscribe(
|
||||
&mut self,
|
||||
doc: DocId,
|
||||
pos: u32,
|
||||
term: &Term,
|
||||
indexing_context: &mut IndexingContext,
|
||||
ctx: &mut IndexingContext,
|
||||
) -> UnorderedTermId;
|
||||
|
||||
/// Serializes the postings on disk.
|
||||
@@ -126,7 +133,7 @@ pub(crate) trait PostingsWriter {
|
||||
&self,
|
||||
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
indexing_context: &IndexingContext,
|
||||
ctx: &IndexingContext,
|
||||
serializer: &mut FieldSerializer,
|
||||
) -> io::Result<()>;
|
||||
|
||||
@@ -134,27 +141,35 @@ pub(crate) trait PostingsWriter {
|
||||
fn index_text(
|
||||
&mut self,
|
||||
doc_id: DocId,
|
||||
field: Field,
|
||||
token_stream: &mut dyn TokenStream,
|
||||
term_buffer: &mut Term,
|
||||
indexing_context: &mut IndexingContext,
|
||||
) -> u32 {
|
||||
term_buffer.set_field(Type::Str, field);
|
||||
let mut sink = |token: &Token| {
|
||||
ctx: &mut IndexingContext,
|
||||
indexing_position: &mut IndexingPosition,
|
||||
) {
|
||||
let end_of_path_idx = term_buffer.as_slice().len();
|
||||
let mut num_tokens = 0;
|
||||
let mut end_position = 0;
|
||||
token_stream.process(&mut |token: &Token| {
|
||||
// We skip all tokens with a len greater than u16.
|
||||
if token.text.len() <= MAX_TOKEN_LEN {
|
||||
term_buffer.set_text(token.text.as_str());
|
||||
self.subscribe(doc_id, token.position as u32, term_buffer, indexing_context);
|
||||
} else {
|
||||
if token.text.len() > MAX_TOKEN_LEN {
|
||||
warn!(
|
||||
"A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \
|
||||
MAX_TOKEN_LEN in the documentation for more information.",
|
||||
token.text.len(),
|
||||
MAX_TOKEN_LEN
|
||||
);
|
||||
return;
|
||||
}
|
||||
};
|
||||
token_stream.process(&mut sink)
|
||||
term_buffer.truncate(end_of_path_idx);
|
||||
term_buffer.append_bytes(token.text.as_bytes());
|
||||
let start_position = indexing_position.end_position + token.position as u32;
|
||||
end_position = start_position + token.position_length as u32;
|
||||
self.subscribe(doc_id, start_position, term_buffer, ctx);
|
||||
num_tokens += 1;
|
||||
});
|
||||
indexing_position.end_position = end_position + POSITION_GAP;
|
||||
indexing_position.num_tokens += num_tokens;
|
||||
term_buffer.truncate(end_of_path_idx);
|
||||
}
|
||||
|
||||
fn total_num_tokens(&self) -> u64;
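The default `index_text` body above is what makes positions accumulate across the several values of a multi-valued field: each value starts at the previous value's `end_position` plus `POSITION_GAP`, and `num_tokens` keeps growing so the fieldnorm reflects all values. A standalone sketch of that accumulation, not part of this diff and assuming every token has `position_length == 1`:

const POSITION_GAP: u32 = 1;

#[derive(Default)]
struct IndexingPosition {
    num_tokens: u32,
    end_position: u32,
}

// Mirrors the accumulation in the `index_text` default above: each value
// starts after the previous value's end position plus POSITION_GAP.
fn record_value(token_positions: &[u32], indexing_position: &mut IndexingPosition) {
    let mut end_position = 0;
    for &token_position in token_positions {
        let start_position = indexing_position.end_position + token_position;
        end_position = start_position + 1; // position_length assumed to be 1
        indexing_position.num_tokens += 1;
    }
    indexing_position.end_position = end_position + POSITION_GAP;
}

fn main() {
    let mut position = IndexingPosition::default();
    record_value(&[0, 1], &mut position); // first value, e.g. "hello world"
    record_value(&[0], &mut position); // second value starts at position 3, not 0
    assert_eq!(position.num_tokens, 3);
    assert_eq!(position.end_position, 5);
}

This accumulation is what the `test_position_overlapping_path` test earlier in this diff relies on: tokens from different string values of the same json object never receive adjacent positions, so they cannot be stitched together into a phrase match.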
|
||||
@@ -162,40 +177,50 @@ pub(crate) trait PostingsWriter {
|
||||
|
||||
/// The `SpecializedPostingsWriter` is just here to remove dynamic
|
||||
/// dispatch to the recorder information.
|
||||
pub(crate) struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
|
||||
#[derive(Default)]
|
||||
pub(crate) struct SpecializedPostingsWriter<Rec: Recorder> {
|
||||
total_num_tokens: u64,
|
||||
_recorder_type: PhantomData<Rec>,
|
||||
}
|
||||
|
||||
impl<Rec: Recorder + 'static> SpecializedPostingsWriter<Rec> {
|
||||
/// constructor
|
||||
pub fn new() -> SpecializedPostingsWriter<Rec> {
|
||||
SpecializedPostingsWriter {
|
||||
total_num_tokens: 0u64,
|
||||
_recorder_type: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds a `SpecializedPostingsWriter` storing its data in a memory arena.
|
||||
pub fn new_boxed() -> Box<dyn PostingsWriter> {
|
||||
Box::new(SpecializedPostingsWriter::<Rec>::new())
|
||||
impl<Rec: Recorder> From<SpecializedPostingsWriter<Rec>> for Box<dyn PostingsWriter> {
|
||||
fn from(
|
||||
specialized_postings_writer: SpecializedPostingsWriter<Rec>,
|
||||
) -> Box<dyn PostingsWriter> {
|
||||
Box::new(specialized_postings_writer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec> {
|
||||
impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
|
||||
#[inline]
|
||||
pub(crate) fn serialize_one_term(
|
||||
term: &Term<&[u8]>,
|
||||
addr: Addr,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
buffer_lender: &mut BufferLender,
|
||||
ctx: &IndexingContext,
|
||||
serializer: &mut FieldSerializer,
|
||||
) -> io::Result<()> {
|
||||
let recorder: Rec = ctx.term_index.read(addr);
|
||||
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
|
||||
serializer.new_term(term.value_bytes(), term_doc_freq)?;
|
||||
recorder.serialize(&ctx.arena, doc_id_map, serializer, buffer_lender);
|
||||
serializer.close_term()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
|
||||
fn subscribe(
|
||||
&mut self,
|
||||
doc: DocId,
|
||||
position: u32,
|
||||
term: &Term,
|
||||
indexing_context: &mut IndexingContext,
|
||||
ctx: &mut IndexingContext,
|
||||
) -> UnorderedTermId {
|
||||
debug_assert!(term.as_slice().len() >= 4);
|
||||
self.total_num_tokens += 1;
|
||||
let (term_index, arena) = (
|
||||
&mut indexing_context.term_index,
|
||||
&mut indexing_context.arena,
|
||||
);
|
||||
let (term_index, arena) = (&mut ctx.term_index, &mut ctx.arena);
|
||||
term_index.mutate_or_create(term.as_slice(), |opt_recorder: Option<Rec>| {
|
||||
if let Some(mut recorder) = opt_recorder {
|
||||
let current_doc = recorder.current_doc();
|
||||
@@ -206,7 +231,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
|
||||
recorder.record_position(position, arena);
|
||||
recorder
|
||||
} else {
|
||||
let mut recorder = Rec::new();
|
||||
let mut recorder = Rec::default();
|
||||
recorder.new_doc(doc, arena);
|
||||
recorder.record_position(position, arena);
|
||||
recorder
|
||||
@@ -218,21 +243,12 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
|
||||
&self,
|
||||
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
indexing_context: &IndexingContext,
|
||||
ctx: &IndexingContext,
|
||||
serializer: &mut FieldSerializer,
|
||||
) -> io::Result<()> {
|
||||
let mut buffer_lender = BufferLender::default();
|
||||
for (term, addr, _) in term_addrs {
|
||||
let recorder: Rec = indexing_context.term_index.read(*addr);
|
||||
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
|
||||
serializer.new_term(term.value_bytes(), term_doc_freq)?;
|
||||
recorder.serialize(
|
||||
&indexing_context.arena,
|
||||
doc_id_map,
|
||||
serializer,
|
||||
&mut buffer_lender,
|
||||
);
|
||||
serializer.close_term()?;
|
||||
Self::serialize_one_term(term, *addr, doc_id_map, &mut buffer_lender, ctx, serializer)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use common::{read_u32_vint, write_u32_vint};
|
||||
use common::read_u32_vint;
|
||||
|
||||
use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
@@ -56,9 +56,7 @@ impl<'a> Iterator for VInt32Reader<'a> {
|
||||
/// * the document id
|
||||
/// * the term frequency
|
||||
/// * the term positions
|
||||
pub(crate) trait Recorder: Copy + 'static {
|
||||
///
|
||||
fn new() -> Self;
|
||||
pub(crate) trait Recorder: Copy + Default + 'static {
|
||||
/// Returns the current document
|
||||
fn current_doc(&self) -> u32;
|
||||
/// Starts recording information about a new document
|
||||
@@ -90,21 +88,23 @@ pub struct NothingRecorder {
|
||||
current_doc: DocId,
|
||||
}
|
||||
|
||||
impl Recorder for NothingRecorder {
|
||||
fn new() -> Self {
|
||||
impl Default for NothingRecorder {
|
||||
fn default() -> Self {
|
||||
NothingRecorder {
|
||||
stack: ExpUnrolledLinkedList::new(),
|
||||
current_doc: u32::max_value(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Recorder for NothingRecorder {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
|
||||
self.current_doc = doc;
|
||||
let _ = write_u32_vint(doc, &mut self.stack.writer(arena));
|
||||
self.stack.writer(arena).write_u32_vint(doc);
|
||||
}
|
||||
|
||||
fn record_position(&mut self, _position: u32, _arena: &mut MemoryArena) {}
|
||||
@@ -152,8 +152,8 @@ pub struct TermFrequencyRecorder {
|
||||
term_doc_freq: u32,
|
||||
}
|
||||
|
||||
impl Recorder for TermFrequencyRecorder {
|
||||
fn new() -> Self {
|
||||
impl Default for TermFrequencyRecorder {
|
||||
fn default() -> Self {
|
||||
TermFrequencyRecorder {
|
||||
stack: ExpUnrolledLinkedList::new(),
|
||||
current_doc: 0,
|
||||
@@ -161,7 +161,9 @@ impl Recorder for TermFrequencyRecorder {
|
||||
term_doc_freq: 0u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Recorder for TermFrequencyRecorder {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
@@ -169,7 +171,7 @@ impl Recorder for TermFrequencyRecorder {
|
||||
fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
|
||||
self.term_doc_freq += 1;
|
||||
self.current_doc = doc;
|
||||
let _ = write_u32_vint(doc, &mut self.stack.writer(arena));
|
||||
self.stack.writer(arena).write_u32_vint(doc);
|
||||
}
|
||||
|
||||
fn record_position(&mut self, _position: u32, _arena: &mut MemoryArena) {
|
||||
@@ -178,7 +180,7 @@ impl Recorder for TermFrequencyRecorder {
|
||||
|
||||
fn close_doc(&mut self, arena: &mut MemoryArena) {
|
||||
debug_assert!(self.current_tf > 0);
|
||||
let _ = write_u32_vint(self.current_tf, &mut self.stack.writer(arena));
|
||||
self.stack.writer(arena).write_u32_vint(self.current_tf);
|
||||
self.current_tf = 0;
|
||||
}
|
||||
|
||||
@@ -223,15 +225,18 @@ pub struct TfAndPositionRecorder {
|
||||
current_doc: DocId,
|
||||
term_doc_freq: u32,
|
||||
}
|
||||
impl Recorder for TfAndPositionRecorder {
|
||||
fn new() -> Self {
|
||||
|
||||
impl Default for TfAndPositionRecorder {
|
||||
fn default() -> Self {
|
||||
TfAndPositionRecorder {
|
||||
stack: ExpUnrolledLinkedList::new(),
|
||||
current_doc: u32::max_value(),
|
||||
term_doc_freq: 0u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Recorder for TfAndPositionRecorder {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
@@ -239,15 +244,17 @@ impl Recorder for TfAndPositionRecorder {
|
||||
fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
|
||||
self.current_doc = doc;
|
||||
self.term_doc_freq += 1u32;
|
||||
let _ = write_u32_vint(doc, &mut self.stack.writer(arena));
|
||||
self.stack.writer(arena).write_u32_vint(doc);
|
||||
}
|
||||
|
||||
fn record_position(&mut self, position: u32, arena: &mut MemoryArena) {
|
||||
let _ = write_u32_vint(position + 1u32, &mut self.stack.writer(arena));
|
||||
self.stack
|
||||
.writer(arena)
|
||||
.write_u32_vint(position.wrapping_add(1u32));
|
||||
}
|
||||
|
||||
fn close_doc(&mut self, arena: &mut MemoryArena) {
|
||||
let _ = write_u32_vint(POSITION_END, &mut self.stack.writer(arena));
|
||||
self.stack.writer(arena).write_u32_vint(POSITION_END);
|
||||
}
|
||||
|
||||
fn serialize(
|
||||
@@ -300,7 +307,9 @@ impl Recorder for TfAndPositionRecorder {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::{write_u32_vint, BufferLender, VInt32Reader};
|
||||
use common::write_u32_vint;
|
||||
|
||||
use super::{BufferLender, VInt32Reader};
|
||||
|
||||
#[test]
|
||||
fn test_buffer_lender() {
|
||||
|
||||
@@ -76,7 +76,7 @@ impl InvertedIndexSerializer {
|
||||
field: Field,
|
||||
total_num_tokens: u64,
|
||||
fieldnorm_reader: Option<FieldNormReader>,
|
||||
) -> io::Result<FieldSerializer<'_>> {
|
||||
) -> io::Result<FieldSerializer> {
|
||||
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
|
||||
let term_dictionary_write = self.terms_write.for_field(field);
|
||||
let postings_write = self.postings_write.for_field(field);
|
||||
@@ -122,24 +122,21 @@ impl<'a> FieldSerializer<'a> {
|
||||
fieldnorm_reader: Option<FieldNormReader>,
|
||||
) -> io::Result<FieldSerializer<'a>> {
|
||||
total_num_tokens.serialize(postings_write)?;
|
||||
let mode = match field_type {
|
||||
FieldType::Str(ref text_options) => {
|
||||
if let Some(text_indexing_options) = text_options.get_indexing_options() {
|
||||
text_indexing_options.index_option()
|
||||
} else {
|
||||
IndexRecordOption::Basic
|
||||
}
|
||||
}
|
||||
_ => IndexRecordOption::Basic,
|
||||
};
|
||||
let index_record_option = field_type
|
||||
.index_record_option()
|
||||
.unwrap_or(IndexRecordOption::Basic);
|
||||
let term_dictionary_builder = TermDictionaryBuilder::create(term_dictionary_write)?;
|
||||
let average_fieldnorm = fieldnorm_reader
|
||||
.as_ref()
|
||||
.map(|ff_reader| (total_num_tokens as Score / ff_reader.num_docs() as Score))
|
||||
.unwrap_or(0.0);
|
||||
let postings_serializer =
|
||||
PostingsSerializer::new(postings_write, average_fieldnorm, mode, fieldnorm_reader);
|
||||
let positions_serializer_opt = if mode.has_positions() {
|
||||
let postings_serializer = PostingsSerializer::new(
|
||||
postings_write,
|
||||
average_fieldnorm,
|
||||
index_record_option,
|
||||
fieldnorm_reader,
|
||||
);
|
||||
let positions_serializer_opt = if index_record_option.has_positions() {
|
||||
Some(PositionSerializer::new(positions_write))
|
||||
} else {
|
||||
None
|
||||
@@ -203,6 +200,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
self.current_term_info.doc_freq += 1;
|
||||
self.postings_serializer.write_doc(doc_id, term_freq);
|
||||
if let Some(ref mut positions_serializer) = self.positions_serializer_opt.as_mut() {
|
||||
assert_eq!(term_freq as usize, position_deltas.len());
|
||||
positions_serializer.write_positions_delta(position_deltas);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
use std::{io, mem};
|
||||
use std::mem;
|
||||
|
||||
use common::serialize_vint_u32;
|
||||
|
||||
use super::{Addr, MemoryArena};
|
||||
use crate::postings::stacker::memory_arena::{load, store};
|
||||
@@ -97,12 +99,13 @@ fn ensure_capacity<'a>(
|
||||
}
|
||||
|
||||
impl<'a> ExpUnrolledLinkedListWriter<'a> {
|
||||
pub fn write_u32_vint(&mut self, val: u32) {
|
||||
let mut buf = [0u8; 8];
|
||||
let data = serialize_vint_u32(val, &mut buf);
|
||||
self.extend_from_slice(data);
|
||||
}
|
||||
|
||||
pub fn extend_from_slice(&mut self, mut buf: &[u8]) {
|
||||
if buf.is_empty() {
|
||||
// we need to cut early, because `ensure_capacity`
|
||||
// allocates if there is no capacity at all right now.
|
||||
return;
|
||||
}
|
||||
while !buf.is_empty() {
|
||||
let add_len: usize;
|
||||
{
|
||||
@@ -117,25 +120,6 @@ impl<'a> ExpUnrolledLinkedListWriter<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> io::Write for ExpUnrolledLinkedListWriter<'a> {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
// There is no use case to only write the capacity.
|
||||
// This is not IO after all, so we write the whole
|
||||
// buffer even if the contract of `.write` is looser.
|
||||
self.extend_from_slice(buf);
|
||||
Ok(buf.len())
|
||||
}
|
||||
|
||||
fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
|
||||
self.extend_from_slice(buf);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl ExpUnrolledLinkedList {
|
||||
pub fn new() -> ExpUnrolledLinkedList {
|
||||
ExpUnrolledLinkedList {
|
||||
@@ -178,8 +162,7 @@ impl ExpUnrolledLinkedList {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
|
||||
use common::{read_u32_vint, write_u32_vint};
|
||||
|
||||
use super::super::MemoryArena;
|
||||
use super::{len_to_capacity, *};
|
||||
@@ -205,18 +188,14 @@ mod tests {
|
||||
let mut eull = ExpUnrolledLinkedList::new();
|
||||
let data: Vec<u32> = (0..100).collect();
|
||||
for &el in &data {
|
||||
assert!(eull
|
||||
.writer(&mut arena)
|
||||
.write_u32::<LittleEndian>(el)
|
||||
.is_ok());
|
||||
eull.writer(&mut arena).write_u32_vint(el);
|
||||
}
|
||||
let mut buffer = Vec::new();
|
||||
eull.read_to_end(&arena, &mut buffer);
|
||||
let mut result = vec![];
|
||||
let mut remaining = &buffer[..];
|
||||
while !remaining.is_empty() {
|
||||
result.push(LittleEndian::read_u32(&remaining[..4]));
|
||||
remaining = &remaining[4..];
|
||||
result.push(read_u32_vint(&mut remaining));
|
||||
}
|
||||
assert_eq!(&result[..], &data[..]);
|
||||
}
|
||||
@@ -231,14 +210,11 @@ mod tests {
|
||||
let mut vec2: Vec<u8> = vec![];
|
||||
|
||||
for i in 0..9 {
|
||||
assert!(stack.writer(&mut eull).write_u32::<LittleEndian>(i).is_ok());
|
||||
assert!(vec1.write_u32::<LittleEndian>(i).is_ok());
|
||||
stack.writer(&mut eull).write_u32_vint(i);
|
||||
assert!(write_u32_vint(i, &mut vec1).is_ok());
|
||||
if i % 2 == 0 {
|
||||
assert!(stack2
|
||||
.writer(&mut eull)
|
||||
.write_u32::<LittleEndian>(i)
|
||||
.is_ok());
|
||||
assert!(vec2.write_u32::<LittleEndian>(i).is_ok());
|
||||
stack2.writer(&mut eull).write_u32_vint(i);
|
||||
assert!(write_u32_vint(i, &mut vec2).is_ok());
|
||||
}
|
||||
}
|
||||
let mut res1 = vec![];
|
||||
@@ -303,7 +279,6 @@ mod tests {
|
||||
mod bench {
|
||||
use std::iter;
|
||||
|
||||
use byteorder::{NativeEndian, WriteBytesExt};
|
||||
use test::Bencher;
|
||||
|
||||
use super::super::MemoryArena;
|
||||
@@ -339,7 +314,9 @@ mod bench {
|
||||
for s in 0..NUM_STACK {
|
||||
for i in 0u32..STACK_SIZE {
|
||||
let t = s * 392017 % NUM_STACK;
|
||||
let _ = stacks[t].writer(&mut arena).write_u32::<NativeEndian>(i);
|
||||
stacks[t]
|
||||
.writer(&mut arena)
|
||||
.extend_from_slice(&i.to_ne_bytes());
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
@@ -47,7 +47,7 @@ fn find_pivot_doc(
|
||||
/// scorer in scorers[..pivot_len] and `scorer.doc()` for scorer in scorers[pivot_len..].
|
||||
/// Note: before and after calling this method, scorers need to be sorted by their `.doc()`.
|
||||
fn block_max_was_too_low_advance_one_scorer(
|
||||
scorers: &mut Vec<TermScorerWithMaxScore>,
|
||||
scorers: &mut [TermScorerWithMaxScore],
|
||||
pivot_len: usize,
|
||||
) {
|
||||
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
|
||||
@@ -82,7 +82,7 @@ fn block_max_was_too_low_advance_one_scorer(
|
||||
// Given a list of term_scorers and a `ord` and assuming that `term_scorers[ord]` is sorted
|
||||
// except term_scorers[ord] that might be in advance compared to its ranks,
|
||||
// bubble up term_scorers[ord] in order to restore the ordering.
|
||||
fn restore_ordering(term_scorers: &mut Vec<TermScorerWithMaxScore>, ord: usize) {
|
||||
fn restore_ordering(term_scorers: &mut [TermScorerWithMaxScore], ord: usize) {
|
||||
let doc = term_scorers[ord].doc();
|
||||
for i in ord + 1..term_scorers.len() {
|
||||
if term_scorers[i].doc() >= doc {
|
||||
|
||||
@@ -204,8 +204,8 @@ impl BooleanQuery {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::BooleanQuery;
|
||||
use crate::collector::DocSetCollector;
|
||||
use crate::query::{QueryClone, TermQuery};
|
||||
use crate::collector::{Count, DocSetCollector};
|
||||
use crate::query::{QueryClone, QueryParser, TermQuery};
|
||||
use crate::schema::{IndexRecordOption, Schema, TEXT};
|
||||
use crate::{DocAddress, Index, Term};
|
||||
|
||||
@@ -282,4 +282,42 @@ mod tests {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_json_array_pitfall_bag_of_terms() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_field = schema_builder.add_json_field("json", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(json_field=>json!({
|
||||
"cart": [
|
||||
{"product_type": "sneakers", "attributes": {"color": "white"}},
|
||||
{"product_type": "t-shirt", "attributes": {"color": "red"}},
|
||||
{"product_type": "cd", "attributes": {"genre": "blues"}},
|
||||
]
|
||||
})))?;
|
||||
index_writer.commit()?;
|
||||
}
|
||||
let searcher = index.reader()?.searcher();
|
||||
let doc_matches = |query: &str| {
|
||||
let query_parser = QueryParser::for_index(&index, vec![json_field]);
|
||||
let query = query_parser.parse_query(query).unwrap();
|
||||
searcher.search(&query, &Count).unwrap() == 1
|
||||
};
|
||||
// As expected
|
||||
assert!(doc_matches(
|
||||
r#"cart.product_type:sneakers AND cart.attributes.color:white"#
|
||||
));
|
||||
// Unexpected match, due to the fact that arrays do not act as nested docs.
|
||||
assert!(doc_matches(
|
||||
r#"cart.product_type:sneakers AND cart.attributes.color:red"#
|
||||
));
|
||||
// However, obviously this works...
|
||||
assert!(!doc_matches(
|
||||
r#"cart.product_type:sneakers AND cart.attributes.color:blues"#
|
||||
));
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,7 +35,7 @@ where TScoreCombiner: ScoreCombiner {
|
||||
.iter()
|
||||
.all(|scorer| scorer.freq_reading_option() == FreqReadingOption::ReadFreq)
|
||||
{
|
||||
// Block wand is only available iff we read frequencies.
|
||||
// Block wand is only available if we read frequencies.
|
||||
return SpecializedScorer::TermUnion(scorers);
|
||||
} else {
|
||||
return SpecializedScorer::Other(Box::new(Union::<_, TScoreCombiner>::from(
|
||||
|
||||
@@ -9,10 +9,12 @@ pub use self::phrase_weight::PhraseWeight;
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
use serde_json::json;
|
||||
|
||||
use super::*;
|
||||
use crate::collector::tests::{TEST_COLLECTOR_WITHOUT_SCORE, TEST_COLLECTOR_WITH_SCORE};
|
||||
use crate::core::Index;
|
||||
use crate::query::Weight;
|
||||
use crate::query::{QueryParser, Weight};
|
||||
use crate::schema::{Schema, Term, TEXT};
|
||||
use crate::{assert_nearly_equals, DocAddress, DocId, TERMINATED};
|
||||
|
||||
@@ -179,6 +181,90 @@ pub mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[ignore]
|
||||
#[test]
|
||||
pub fn test_phrase_score_with_slop() -> crate::Result<()> {
|
||||
let index = create_index(&["a c b", "a b c a b"])?;
|
||||
let schema = index.schema();
|
||||
let text_field = schema.get_field("text").unwrap();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let test_query = |texts: Vec<&str>| {
|
||||
let terms: Vec<Term> = texts
|
||||
.iter()
|
||||
.map(|text| Term::from_field_text(text_field, text))
|
||||
.collect();
|
||||
let mut phrase_query = PhraseQuery::new(terms);
|
||||
phrase_query.set_slop(1);
|
||||
searcher
|
||||
.search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
|
||||
.expect("search should succeed")
|
||||
.scores()
|
||||
.to_vec()
|
||||
};
|
||||
let scores = test_query(vec!["a", "b"]);
|
||||
assert_nearly_equals!(scores[0], 0.40618482);
|
||||
assert_nearly_equals!(scores[1], 0.46844664);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_score_with_slop_size() -> crate::Result<()> {
|
||||
let index = create_index(&["a b e c", "a e e e c", "a e e e e c"])?;
|
||||
let schema = index.schema();
|
||||
let text_field = schema.get_field("text").unwrap();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let test_query = |texts: Vec<&str>| {
|
||||
let terms: Vec<Term> = texts
|
||||
.iter()
|
||||
.map(|text| Term::from_field_text(text_field, text))
|
||||
.collect();
|
||||
let mut phrase_query = PhraseQuery::new(terms);
|
||||
phrase_query.set_slop(3);
|
||||
searcher
|
||||
.search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
|
||||
.expect("search should succeed")
|
||||
.scores()
|
||||
.to_vec()
|
||||
};
|
||||
let scores = test_query(vec!["a", "c"]);
|
||||
assert_nearly_equals!(scores[0], 0.29086056);
|
||||
assert_nearly_equals!(scores[1], 0.26706287);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_score_with_slop_ordering() -> crate::Result<()> {
|
||||
let index = create_index(&[
|
||||
"a e b e c",
|
||||
"a e e e e e b e e e e c",
|
||||
"a c b",
|
||||
"a c e b e",
|
||||
"a e c b",
|
||||
"a e b c",
|
||||
])?;
|
||||
let schema = index.schema();
|
||||
let text_field = schema.get_field("text").unwrap();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let test_query = |texts: Vec<&str>| {
|
||||
let terms: Vec<Term> = texts
|
||||
.iter()
|
||||
.map(|text| Term::from_field_text(text_field, text))
|
||||
.collect();
|
||||
let mut phrase_query = PhraseQuery::new(terms);
|
||||
phrase_query.set_slop(3);
|
||||
searcher
|
||||
.search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
|
||||
.expect("search should succeed")
|
||||
.scores()
|
||||
.to_vec()
|
||||
};
|
||||
let scores = test_query(vec!["a", "b", "c"]);
|
||||
// The first and last matches.
|
||||
assert_nearly_equals!(scores[0], 0.23091172);
|
||||
assert_nearly_equals!(scores[1], 0.25024384);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test] // motivated by #234
|
||||
pub fn test_phrase_query_docfreq_order() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -248,4 +334,56 @@ pub mod tests {
|
||||
assert_eq!(test_query(vec![(1, "a"), (3, "c")]), vec![0]);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_query_on_json() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_field = schema_builder.add_json_field("json", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(json_field=>json!({
|
||||
"text": "elliot smith the happy who"
|
||||
})))?;
|
||||
index_writer.add_document(doc!(json_field=>json!({
|
||||
"text": "the who elliot smith"
|
||||
})))?;
|
||||
index_writer.add_document(doc!(json_field=>json!({
|
||||
"arr": [{"text":"the who"}, {"text":"elliot smith"}]
|
||||
})))?;
|
||||
index_writer.add_document(doc!(json_field=>json!({
|
||||
"text2": "the smith"
|
||||
})))?;
|
||||
index_writer.commit()?;
|
||||
}
|
||||
let searcher = index.reader()?.searcher();
|
||||
let matching_docs = |query: &str| {
|
||||
let query_parser = QueryParser::for_index(&index, vec![json_field]);
|
||||
let phrase_query = query_parser.parse_query(query).unwrap();
|
||||
let phrase_weight = phrase_query.weight(&*searcher, false).unwrap();
|
||||
let mut phrase_scorer = phrase_weight
|
||||
.scorer(searcher.segment_reader(0), 1.0f32)
|
||||
.unwrap();
|
||||
let mut docs = Vec::new();
|
||||
loop {
|
||||
let doc = phrase_scorer.doc();
|
||||
if doc == TERMINATED {
|
||||
break;
|
||||
}
|
||||
docs.push(doc);
|
||||
phrase_scorer.advance();
|
||||
}
|
||||
docs
|
||||
};
|
||||
assert!(matching_docs(r#"text:"the smith""#).is_empty());
|
||||
assert_eq!(&matching_docs(r#"text:the"#), &[0u32, 1u32]);
|
||||
assert_eq!(&matching_docs(r#"text:"the""#), &[0u32, 1u32]);
|
||||
assert_eq!(&matching_docs(r#"text:"smith""#), &[0u32, 1u32]);
|
||||
assert_eq!(&matching_docs(r#"text:"elliot smith""#), &[0u32, 1u32]);
|
||||
assert_eq!(&matching_docs(r#"text2:"the smith""#), &[3u32]);
|
||||
assert!(&matching_docs(r#"arr.text:"the smith""#).is_empty());
|
||||
assert_eq!(&matching_docs(r#"arr.text:"elliot smith""#), &[2]);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@ use crate::schema::{Field, IndexRecordOption, Term};
|
||||
pub struct PhraseQuery {
|
||||
field: Field,
|
||||
phrase_terms: Vec<(usize, Term)>,
|
||||
slop: u32,
|
||||
}
|
||||
|
||||
impl PhraseQuery {
|
||||
@@ -53,9 +54,15 @@ impl PhraseQuery {
|
||||
PhraseQuery {
|
||||
field,
|
||||
phrase_terms: terms,
|
||||
slop: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Slop allowed for the phrase.
|
||||
pub fn set_slop(&mut self, value: u32) {
|
||||
self.slop = value;
|
||||
}
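A hedged usage sketch of this new setter from the caller's side (not part of the diff; `searcher` and `text_field` are assumed to come from an already-built index with a text field):

use tantivy::collector::Count;
use tantivy::query::PhraseQuery;
use tantivy::schema::Field;
use tantivy::{Searcher, Term};

// Counts documents containing "elliot smith" while tolerating up to one
// extra token between the two terms, via the slop setter added above.
fn count_sloppy_phrase(searcher: &Searcher, text_field: Field) -> tantivy::Result<usize> {
    let mut phrase_query = PhraseQuery::new(vec![
        Term::from_field_text(text_field, "elliot"),
        Term::from_field_text(text_field, "smith"),
    ]);
    phrase_query.set_slop(1);
    searcher.search(&phrase_query, &Count)
}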
|
||||
|
||||
/// The `Field` this `PhraseQuery` is targeting.
|
||||
pub fn field(&self) -> Field {
|
||||
self.field
|
||||
@@ -94,11 +101,11 @@ impl PhraseQuery {
|
||||
}
|
||||
let terms = self.phrase_terms();
|
||||
let bm25_weight = Bm25Weight::for_terms(searcher, &terms)?;
|
||||
Ok(PhraseWeight::new(
|
||||
self.phrase_terms.clone(),
|
||||
bm25_weight,
|
||||
scoring_enabled,
|
||||
))
|
||||
let mut weight = PhraseWeight::new(self.phrase_terms.clone(), bm25_weight, scoring_enabled);
|
||||
if self.slop > 0 {
|
||||
weight.slop(self.slop);
|
||||
}
|
||||
Ok(weight)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -52,24 +52,25 @@ pub struct PhraseScorer<TPostings: Postings> {
|
||||
fieldnorm_reader: FieldNormReader,
|
||||
similarity_weight: Bm25Weight,
|
||||
scoring_enabled: bool,
|
||||
slop: u32,
|
||||
}
|
||||
|
||||
/// Returns true iff the two sorted array contain a common element
|
||||
/// Returns true if and only if the two sorted arrays contain a common element
|
||||
fn intersection_exists(left: &[u32], right: &[u32]) -> bool {
|
||||
let mut left_i = 0;
|
||||
let mut right_i = 0;
|
||||
while left_i < left.len() && right_i < right.len() {
|
||||
let left_val = left[left_i];
|
||||
let right_val = right[right_i];
|
||||
let mut left_index = 0;
|
||||
let mut right_index = 0;
|
||||
while left_index < left.len() && right_index < right.len() {
|
||||
let left_val = left[left_index];
|
||||
let right_val = right[right_index];
|
||||
match left_val.cmp(&right_val) {
|
||||
Ordering::Less => {
|
||||
left_i += 1;
|
||||
left_index += 1;
|
||||
}
|
||||
Ordering::Equal => {
|
||||
return true;
|
||||
}
|
||||
Ordering::Greater => {
|
||||
right_i += 1;
|
||||
right_index += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -77,23 +78,23 @@ fn intersection_exists(left: &[u32], right: &[u32]) -> bool {
|
||||
}
|
||||
|
||||
fn intersection_count(left: &[u32], right: &[u32]) -> usize {
|
||||
let mut left_i = 0;
|
||||
let mut right_i = 0;
|
||||
let mut left_index = 0;
|
||||
let mut right_index = 0;
|
||||
let mut count = 0;
|
||||
while left_i < left.len() && right_i < right.len() {
|
||||
let left_val = left[left_i];
|
||||
let right_val = right[right_i];
|
||||
while left_index < left.len() && right_index < right.len() {
|
||||
let left_val = left[left_index];
|
||||
let right_val = right[right_index];
|
||||
match left_val.cmp(&right_val) {
|
||||
Ordering::Less => {
|
||||
left_i += 1;
|
||||
left_index += 1;
|
||||
}
|
||||
Ordering::Equal => {
|
||||
count += 1;
|
||||
left_i += 1;
|
||||
right_i += 1;
|
||||
left_index += 1;
|
||||
right_index += 1;
|
||||
}
|
||||
Ordering::Greater => {
|
||||
right_i += 1;
|
||||
right_index += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -105,38 +106,91 @@ fn intersection_count(left: &[u32], right: &[u32]) -> usize {
|
||||
///
|
||||
/// Returns the length of the intersection
|
||||
fn intersection(left: &mut [u32], right: &[u32]) -> usize {
|
||||
let mut left_i = 0;
|
||||
let mut right_i = 0;
|
||||
let mut left_index = 0;
|
||||
let mut right_index = 0;
|
||||
let mut count = 0;
|
||||
let left_len = left.len();
|
||||
let right_len = right.len();
|
||||
while left_i < left_len && right_i < right_len {
|
||||
let left_val = left[left_i];
|
||||
let right_val = right[right_i];
|
||||
while left_index < left_len && right_index < right_len {
|
||||
let left_val = left[left_index];
|
||||
let right_val = right[right_index];
|
||||
match left_val.cmp(&right_val) {
|
||||
Ordering::Less => {
|
||||
left_i += 1;
|
||||
left_index += 1;
|
||||
}
|
||||
Ordering::Equal => {
|
||||
left[count] = left_val;
|
||||
count += 1;
|
||||
left_i += 1;
|
||||
right_i += 1;
|
||||
left_index += 1;
|
||||
right_index += 1;
|
||||
}
|
||||
Ordering::Greater => {
|
||||
right_i += 1;
|
||||
right_index += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
count
|
||||
}
|
||||
|
||||
/// Intersects two sorted arrays `left` and `right` and writes the
/// resulting array into `left`.
|
||||
///
|
||||
/// The condition for a match is that the value stored in `left` is less than or equal to
/// the value in `right`, and that the distance to the previous token is less than or equal to the slop.
|
||||
///
|
||||
/// Returns the length of the intersection
|
||||
fn intersection_with_slop(left: &mut [u32], right: &[u32], slop: u32) -> usize {
|
||||
let mut left_index = 0;
|
||||
let mut right_index = 0;
|
||||
let mut count = 0;
|
||||
let left_len = left.len();
|
||||
let right_len = right.len();
|
||||
while left_index < left_len && right_index < right_len {
|
||||
let left_val = left[left_index];
|
||||
let right_val = right[right_index];
|
||||
|
||||
// The three conditions are:
|
||||
// left_val < right_slop -> left index increment.
|
||||
// right_slop <= left_val <= right -> find the best match.
|
||||
// left_val > right -> right index increment.
|
||||
let right_slop = if right_val >= slop {
|
||||
right_val - slop
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
if left_val < right_slop {
|
||||
left_index += 1;
|
||||
} else if right_slop <= left_val && left_val <= right_val {
|
||||
while left_index + 1 < left_len {
|
||||
// there could be a better match
|
||||
let next_left_val = left[left_index + 1];
|
||||
if next_left_val > right_val {
|
||||
// the next value is outside the range, so current one is the best.
|
||||
break;
|
||||
}
|
||||
// the next value is better.
|
||||
left_index += 1;
|
||||
}
|
||||
// store the match in left.
|
||||
left[count] = right_val;
|
||||
count += 1;
|
||||
left_index += 1;
|
||||
right_index += 1;
|
||||
} else if left_val > right_val {
|
||||
right_index += 1;
|
||||
}
|
||||
}
|
||||
count
|
||||
}
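To make the matching rule concrete, here is a check in the style of the `test_slop` cases further down (a hypothetical in-module test, since the helper is private). Note that on a match it is the position from `right` that is kept:

#[test]
fn slop_keeps_the_right_position() {
    // With slop = 1: 5 matches 5 (distance 0), 11 matches 12 (distance 1), 7 matches nothing.
    let mut left = vec![5u32, 7, 11];
    let right = [1u32, 5, 10, 12];
    let len = intersection_with_slop(&mut left[..], &right[..], 1);
    assert_eq!(&left[..len], &[5, 12]);
}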
|
||||
|
||||
impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
pub fn new(
|
||||
term_postings: Vec<(usize, TPostings)>,
|
||||
similarity_weight: Bm25Weight,
|
||||
fieldnorm_reader: FieldNormReader,
|
||||
scoring_enabled: bool,
|
||||
slop: u32,
|
||||
) -> PhraseScorer<TPostings> {
|
||||
let max_offset = term_postings
|
||||
.iter()
|
||||
@@ -159,6 +213,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
similarity_weight,
|
||||
fieldnorm_reader,
|
||||
scoring_enabled,
|
||||
slop,
|
||||
};
|
||||
if scorer.doc() != TERMINATED && !scorer.phrase_match() {
|
||||
scorer.advance();
|
||||
@@ -181,51 +236,54 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
}
|
||||
|
||||
fn phrase_exists(&mut self) -> bool {
|
||||
self.intersection_docset
|
||||
.docset_mut_specialized(0)
|
||||
.positions(&mut self.left);
|
||||
let mut intersection_len = self.left.len();
|
||||
for i in 1..self.num_terms - 1 {
|
||||
{
|
||||
self.intersection_docset
|
||||
.docset_mut_specialized(i)
|
||||
.positions(&mut self.right);
|
||||
}
|
||||
intersection_len = intersection(&mut self.left[..intersection_len], &self.right[..]);
|
||||
if intersection_len == 0 {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
self.intersection_docset
|
||||
.docset_mut_specialized(self.num_terms - 1)
|
||||
.positions(&mut self.right);
|
||||
let intersection_len = self.compute_phrase_match();
|
||||
intersection_exists(&self.left[..intersection_len], &self.right[..])
|
||||
}
|
||||
|
||||
fn compute_phrase_count(&mut self) -> u32 {
|
||||
let intersection_len = self.compute_phrase_match();
|
||||
intersection_count(&self.left[..intersection_len], &self.right[..]) as u32
|
||||
}
|
||||
|
||||
fn compute_phrase_match(&mut self) -> usize {
|
||||
{
|
||||
self.intersection_docset
|
||||
.docset_mut_specialized(0)
|
||||
.positions(&mut self.left);
|
||||
}
|
||||
let mut intersection_len = self.left.len();
|
||||
for i in 1..self.num_terms - 1 {
|
||||
let end_term = if self.has_slop() {
|
||||
self.num_terms
|
||||
} else {
|
||||
self.num_terms - 1
|
||||
};
|
||||
for i in 1..end_term {
|
||||
{
|
||||
self.intersection_docset
|
||||
.docset_mut_specialized(i)
|
||||
.positions(&mut self.right);
|
||||
}
|
||||
intersection_len = intersection(&mut self.left[..intersection_len], &self.right[..]);
|
||||
intersection_len = if self.has_slop() {
|
||||
intersection_with_slop(
|
||||
&mut self.left[..intersection_len],
|
||||
&self.right[..],
|
||||
self.slop,
|
||||
)
|
||||
} else {
|
||||
intersection(&mut self.left[..intersection_len], &self.right[..])
|
||||
};
|
||||
if intersection_len == 0 {
|
||||
return 0u32;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
self.intersection_docset
|
||||
.docset_mut_specialized(self.num_terms - 1)
|
||||
.positions(&mut self.right);
|
||||
intersection_count(&self.left[..intersection_len], &self.right[..]) as u32
|
||||
intersection_len
|
||||
}
|
||||
|
||||
fn has_slop(&self) -> bool {
|
||||
self.slop > 0
|
||||
}
|
||||
}
|
||||
|
||||
@@ -268,18 +326,26 @@ impl<TPostings: Postings> Scorer for PhraseScorer<TPostings> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{intersection, intersection_count};
|
||||
use super::{intersection, intersection_count, intersection_with_slop};
|
||||
|
||||
fn test_intersection_sym(left: &[u32], right: &[u32], expected: &[u32]) {
|
||||
test_intersection_aux(left, right, expected);
|
||||
test_intersection_aux(right, left, expected);
|
||||
test_intersection_aux(left, right, expected, 0);
|
||||
test_intersection_aux(right, left, expected, 0);
|
||||
}
|
||||
|
||||
fn test_intersection_aux(left: &[u32], right: &[u32], expected: &[u32]) {
|
||||
fn test_intersection_aux(left: &[u32], right: &[u32], expected: &[u32], slop: u32) {
|
||||
let mut left_vec = Vec::from(left);
|
||||
let left_mut = &mut left_vec[..];
|
||||
assert_eq!(intersection_count(left_mut, right), expected.len());
|
||||
let count = intersection(left_mut, right);
|
||||
if slop == 0 {
|
||||
let left_mut = &mut left_vec[..];
|
||||
assert_eq!(intersection_count(left_mut, right), expected.len());
|
||||
let count = intersection(left_mut, right);
|
||||
assert_eq!(&left_mut[..count], expected);
|
||||
return;
|
||||
}
|
||||
let mut right_vec = Vec::from(right);
|
||||
let right_mut = &mut right_vec[..];
|
||||
let count = intersection_with_slop(left_mut, right_mut, slop);
|
||||
assert_eq!(&left_mut[..count], expected);
|
||||
}
|
||||
|
||||
@@ -291,6 +357,36 @@ mod tests {
|
||||
test_intersection_sym(&[5, 7], &[1, 5, 10, 12], &[5]);
|
||||
test_intersection_sym(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12]);
|
||||
}
|
||||
#[test]
|
||||
fn test_slop() {
|
||||
// The slop is not symmetric. It does not allow the phrase tokens to be out of order.
|
||||
test_intersection_aux(&[1], &[2], &[2], 1);
|
||||
test_intersection_aux(&[1], &[3], &[], 1);
|
||||
test_intersection_aux(&[1], &[3], &[3], 2);
|
||||
test_intersection_aux(&[], &[2], &[], 100000);
|
||||
test_intersection_aux(&[5, 7, 11], &[1, 5, 10, 12], &[5, 12], 1);
|
||||
test_intersection_aux(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12], 1);
|
||||
test_intersection_aux(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12], 10);
|
||||
test_intersection_aux(&[1, 3, 5], &[2, 4, 6], &[2, 4, 6], 1);
|
||||
test_intersection_aux(&[1, 3, 5], &[2, 4, 6], &[], 0);
|
||||
}
|
||||
|
||||
fn test_merge(left: &[u32], right: &[u32], expected_left: &[u32], slop: u32) {
|
||||
let mut left_vec = Vec::from(left);
|
||||
let left_mut = &mut left_vec[..];
|
||||
let mut right_vec = Vec::from(right);
|
||||
let right_mut = &mut right_vec[..];
|
||||
let count = intersection_with_slop(left_mut, right_mut, slop);
|
||||
assert_eq!(&left_mut[..count], expected_left);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_slop() {
|
||||
test_merge(&[1, 2], &[1], &[1], 1);
|
||||
test_merge(&[3], &[4], &[4], 2);
|
||||
test_merge(&[3], &[4], &[4], 2);
|
||||
test_merge(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12], 10);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
|
||||
@@ -12,6 +12,7 @@ pub struct PhraseWeight {
|
||||
phrase_terms: Vec<(usize, Term)>,
|
||||
similarity_weight: Bm25Weight,
|
||||
scoring_enabled: bool,
|
||||
slop: u32,
|
||||
}
|
||||
|
||||
impl PhraseWeight {
|
||||
@@ -21,23 +22,26 @@ impl PhraseWeight {
|
||||
similarity_weight: Bm25Weight,
|
||||
scoring_enabled: bool,
|
||||
) -> PhraseWeight {
|
||||
let slop = 0;
|
||||
PhraseWeight {
|
||||
phrase_terms,
|
||||
similarity_weight,
|
||||
scoring_enabled,
|
||||
slop,
|
||||
}
|
||||
}
|
||||
|
||||
fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result<FieldNormReader> {
|
||||
let field = self.phrase_terms[0].1.field();
|
||||
if self.scoring_enabled {
|
||||
reader.get_fieldnorms_reader(field)
|
||||
} else {
|
||||
Ok(FieldNormReader::constant(reader.max_doc(), 1))
|
||||
if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(field)? {
|
||||
return Ok(fieldnorm_reader);
|
||||
}
|
||||
}
|
||||
Ok(FieldNormReader::constant(reader.max_doc(), 1))
|
||||
}
|
||||
|
||||
fn phrase_scorer(
|
||||
pub(crate) fn phrase_scorer(
|
||||
&self,
|
||||
reader: &SegmentReader,
|
||||
boost: Score,
|
||||
@@ -73,8 +77,13 @@ impl PhraseWeight {
|
||||
similarity_weight,
|
||||
fieldnorm_reader,
|
||||
self.scoring_enabled,
|
||||
self.slop,
|
||||
)))
|
||||
}
|
||||
|
||||
pub fn slop(&mut self, slop: u32) {
|
||||
self.slop = slop;
|
||||
}
|
||||
}
|
||||
|
||||
impl Weight for PhraseWeight {
|
||||
|
||||
File diff suppressed because it is too large
@@ -174,7 +174,7 @@ mod tests {
|
||||
);
|
||||
assert_eq!(
|
||||
format!("{:?}", term_query),
|
||||
r#"TermQuery(Term(type=Str, field=1, val="hello"))"#
|
||||
r#"TermQuery(Term(type=Str, field=1, "hello"))"#
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -93,6 +93,10 @@ impl TermWeight {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn term(&self) -> &Term {
|
||||
&self.term
|
||||
}
|
||||
|
||||
pub(crate) fn specialized_scorer(
|
||||
&self,
|
||||
reader: &SegmentReader,
|
||||
|
||||
@@ -122,7 +122,7 @@ impl IndexReaderBuilder {
|
||||
|
||||
/// Sets the number of [Searcher] to pool.
|
||||
///
|
||||
/// See [Self::searcher()].
|
||||
/// See [IndexReader::searcher()].
|
||||
#[must_use]
|
||||
pub fn num_searchers(mut self, num_searchers: usize) -> IndexReaderBuilder {
|
||||
self.num_searchers = num_searchers;
|
||||
|
||||
@@ -10,7 +10,7 @@ pub const GC_INTERVAL: Duration = Duration::from_secs(1);
|
||||
|
||||
/// `Warmer` can be used to maintain segment-level state e.g. caches.
|
||||
///
|
||||
/// They must be registered with the [IndexReaderBuilder].
|
||||
/// They must be registered with the [super::IndexReaderBuilder].
|
||||
pub trait Warmer: Sync + Send {
|
||||
/// Perform any warming work using the provided [Searcher].
|
||||
fn warm(&self, searcher: &Searcher) -> crate::Result<()>;
|
||||
|
||||
@@ -14,10 +14,10 @@ pub struct BytesOptions {
|
||||
}
|
||||
|
||||
/// For backward compatibility we add an intermediary to interpret the
|
||||
/// lack of fieldnorms attribute as "true" iff indexed.
|
||||
/// lack of fieldnorms attribute as "true" if and only if indexed.
|
||||
///
|
||||
/// (Downstream, for the moment, this attribute is not used anyway if not indexed...)
|
||||
/// Note that: newly serialized IntOptions will include the new attribute.
|
||||
/// (Downstream, for the moment, this attribute is not used if not indexed...)
|
||||
/// Note that: newly serialized NumericOptions will include the new attribute.
|
||||
#[derive(Deserialize)]
|
||||
struct BytesOptionsDeser {
|
||||
indexed: bool,
|
||||
@@ -39,22 +39,22 @@ impl From<BytesOptionsDeser> for BytesOptions {
|
||||
}
|
||||
|
||||
impl BytesOptions {
|
||||
/// Returns true iff the value is indexed.
|
||||
/// Returns true if the value is indexed.
|
||||
pub fn is_indexed(&self) -> bool {
|
||||
self.indexed
|
||||
}
|
||||
|
||||
/// Returns true iff the value is normed.
|
||||
/// Returns true if and only if the value is normed.
|
||||
pub fn fieldnorms(&self) -> bool {
|
||||
self.fieldnorms
|
||||
}
|
||||
|
||||
/// Returns true iff the value is a fast field.
|
||||
/// Returns true if the value is a fast field.
|
||||
pub fn is_fast(&self) -> bool {
|
||||
self.fast
|
||||
}
|
||||
|
||||
/// Returns true iff the value is stored.
|
||||
/// Returns true if the value is stored.
|
||||
pub fn is_stored(&self) -> bool {
|
||||
self.stored
|
||||
}
|
||||
|
||||
@@ -71,7 +71,7 @@ impl Document {
|
||||
self.field_values.len()
|
||||
}
|
||||
|
||||
/// Returns true iff the document contains no fields.
|
||||
/// Returns true if the document contains no fields.
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.field_values.is_empty()
|
||||
}
|
||||
@@ -117,7 +117,16 @@ impl Document {
|
||||
|
||||
/// Add a bytes field
|
||||
pub fn add_bytes<T: Into<Vec<u8>>>(&mut self, field: Field, value: T) {
|
||||
self.add_field_value(field, value.into())
|
||||
self.add_field_value(field, value.into());
|
||||
}
|
||||
|
||||
/// Add a json object field
|
||||
pub fn add_json_object(
|
||||
&mut self,
|
||||
field: Field,
|
||||
json_object: serde_json::Map<String, serde_json::Value>,
|
||||
) {
|
||||
self.add_field_value(field, json_object);
|
||||
}
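A short sketch of the new `add_json_object` used without the `doc!` macro (the `metadata` field is illustrative and assumed to be a json field of the schema):

use tantivy::schema::Field;
use tantivy::Document;

fn attach_metadata(metadata: Field) -> Document {
    let mut doc = Document::default();
    let json_object = serde_json::json!({"tags": ["rust", "search"], "rank": 3})
        .as_object()
        .expect("the literal above is a json object")
        .clone();
    doc.add_json_object(metadata, json_object);
    doc
}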
|
||||
|
||||
/// Add a (field, value) to the document.
|
||||
|
||||
@@ -49,7 +49,7 @@ impl Facet {
|
||||
Facet("".to_string())
|
||||
}
|
||||
|
||||
/// Returns true iff the facet is the root facet `/`.
|
||||
/// Returns true if the facet is the root facet `/`.
|
||||
pub fn is_root(&self) -> bool {
|
||||
self.encoded_str().is_empty()
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ pub struct FacetOptions {
|
||||
}
|
||||
|
||||
impl FacetOptions {
|
||||
/// Returns true iff the value is stored.
|
||||
/// Returns true if the value is stored.
|
||||
pub fn is_stored(&self) -> bool {
|
||||
self.stored
|
||||
}
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::schema::bytes_options::BytesOptions;
|
||||
use crate::schema::{is_valid_field_name, FacetOptions, FieldType, IntOptions, TextOptions};
|
||||
use crate::schema::{
|
||||
is_valid_field_name, FacetOptions, FieldType, JsonObjectOptions, NumericOptions, TextOptions,
|
||||
};
|
||||
|
||||
/// A `FieldEntry` represents a field and its configuration.
|
||||
/// A `Schema` is a collection of `FieldEntry`s.
|
||||
@@ -27,71 +29,44 @@ impl FieldEntry {
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new u64 field entry in the schema, given
|
||||
/// a name, and some options.
|
||||
/// Creates a new text field entry.
|
||||
pub fn new_text(field_name: String, text_options: TextOptions) -> FieldEntry {
|
||||
assert!(is_valid_field_name(&field_name));
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::Str(text_options),
|
||||
}
|
||||
Self::new(field_name, FieldType::Str(text_options))
|
||||
}
|
||||
|
||||
/// Creates a new u64 field entry in the schema, given
|
||||
/// a name, and some options.
|
||||
pub fn new_u64(field_name: String, field_type: IntOptions) -> FieldEntry {
|
||||
assert!(is_valid_field_name(&field_name));
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::U64(field_type),
|
||||
}
|
||||
/// Creates a new u64 field entry.
|
||||
pub fn new_u64(field_name: String, int_options: NumericOptions) -> FieldEntry {
|
||||
Self::new(field_name, FieldType::U64(int_options))
|
||||
}
|
||||
|
||||
/// Creates a new i64 field entry in the schema, given
|
||||
/// a name, and some options.
|
||||
pub fn new_i64(field_name: String, field_type: IntOptions) -> FieldEntry {
|
||||
assert!(is_valid_field_name(&field_name));
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::I64(field_type),
|
||||
}
|
||||
/// Creates a new i64 field entry.
|
||||
pub fn new_i64(field_name: String, int_options: NumericOptions) -> FieldEntry {
|
||||
Self::new(field_name, FieldType::I64(int_options))
|
||||
}
|
||||
|
||||
/// Creates a new f64 field entry in the schema, given
|
||||
/// a name, and some options.
|
||||
pub fn new_f64(field_name: String, field_type: IntOptions) -> FieldEntry {
|
||||
assert!(is_valid_field_name(&field_name));
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::F64(field_type),
|
||||
}
|
||||
/// Creates a new f64 field entry.
|
||||
pub fn new_f64(field_name: String, f64_options: NumericOptions) -> FieldEntry {
|
||||
Self::new(field_name, FieldType::F64(f64_options))
|
||||
}
|
||||
|
||||
/// Creates a new date field entry in the schema, given
|
||||
/// a name, and some options.
|
||||
pub fn new_date(field_name: String, field_type: IntOptions) -> FieldEntry {
|
||||
assert!(is_valid_field_name(&field_name));
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::Date(field_type),
|
||||
}
|
||||
/// Creates a new date field entry.
|
||||
pub fn new_date(field_name: String, date_options: NumericOptions) -> FieldEntry {
|
||||
Self::new(field_name, FieldType::Date(date_options))
|
||||
}
|
||||
|
||||
/// Creates a field entry for a facet.
|
||||
pub fn new_facet(field_name: String, field_type: FacetOptions) -> FieldEntry {
|
||||
assert!(is_valid_field_name(&field_name));
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::Facet(field_type),
|
||||
}
|
||||
pub fn new_facet(field_name: String, facet_options: FacetOptions) -> FieldEntry {
|
||||
Self::new(field_name, FieldType::Facet(facet_options))
|
||||
}
|
||||
|
||||
/// Creates a field entry for a bytes field
|
||||
pub fn new_bytes(field_name: String, bytes_type: BytesOptions) -> FieldEntry {
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::Bytes(bytes_type),
|
||||
}
|
||||
pub fn new_bytes(field_name: String, bytes_options: BytesOptions) -> FieldEntry {
|
||||
Self::new(field_name, FieldType::Bytes(bytes_options))
|
||||
}
|
||||
|
||||
/// Creates a field entry for a json field
|
||||
pub fn new_json(field_name: String, json_object_options: JsonObjectOptions) -> FieldEntry {
|
||||
Self::new(field_name, FieldType::JsonObject(json_object_options))
|
||||
}
|
||||
|
||||
/// Returns the name of the field
|
||||
@@ -104,19 +79,19 @@ impl FieldEntry {
|
||||
&self.field_type
|
||||
}
|
||||
|
||||
/// Returns true iff the field is indexed.
|
||||
/// Returns true if the field is indexed.
|
||||
///
|
||||
/// An indexed field is searchable.
|
||||
pub fn is_indexed(&self) -> bool {
|
||||
self.field_type.is_indexed()
|
||||
}
|
||||
|
||||
/// Returns true iff the field is normed
|
||||
/// Returns true if the field is normed
|
||||
pub fn has_fieldnorms(&self) -> bool {
|
||||
self.field_type.has_fieldnorms()
|
||||
}
|
||||
|
||||
/// Returns true iff the field is a int (signed or unsigned) fast field
|
||||
/// Returns true if the field is an int (signed or unsigned) fast field
|
||||
pub fn is_fast(&self) -> bool {
|
||||
match self.field_type {
|
||||
FieldType::U64(ref options)
|
||||
@@ -127,7 +102,7 @@ impl FieldEntry {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true iff the field is stored
|
||||
/// Returns true if the field is stored
|
||||
pub fn is_stored(&self) -> bool {
|
||||
match self.field_type {
|
||||
FieldType::U64(ref options)
|
||||
@@ -137,6 +112,7 @@ impl FieldEntry {
|
||||
FieldType::Str(ref options) => options.is_stored(),
|
||||
FieldType::Facet(ref options) => options.is_stored(),
|
||||
FieldType::Bytes(ref options) => options.is_stored(),
|
||||
FieldType::JsonObject(ref options) => options.is_stored(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,25 +1,32 @@
|
||||
use chrono::{FixedOffset, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value as JsonValue;
|
||||
use thiserror::Error;
|
||||
|
||||
use crate::schema::bytes_options::BytesOptions;
|
||||
use crate::schema::facet_options::FacetOptions;
|
||||
use crate::schema::{Facet, IndexRecordOption, IntOptions, TextFieldIndexing, TextOptions, Value};
|
||||
use crate::schema::{
|
||||
Facet, IndexRecordOption, JsonObjectOptions, NumericOptions, TextFieldIndexing, TextOptions,
|
||||
Value,
|
||||
};
|
||||
use crate::tokenizer::PreTokenizedString;
|
||||
|
||||
/// Possible error that may occur while parsing a field value
|
||||
/// At this point the JSON is known to be valid.
|
||||
#[derive(Debug, PartialEq)]
|
||||
#[derive(Debug, PartialEq, Error)]
|
||||
pub enum ValueParsingError {
|
||||
/// Encountered a numerical value that overflows or underflows its integer type.
|
||||
OverflowError(String),
|
||||
/// The json node is not of the correct type.
|
||||
/// (e.g. 3 for a `Str` type or `"abc"` for a u64 type)
|
||||
/// Tantivy will try to autocast values.
|
||||
TypeError(String),
|
||||
/// The json node is a string but contains json that is
|
||||
/// not valid base64.
|
||||
InvalidBase64(String),
|
||||
#[error("Overflow error. Expected {expected}, got {json}")]
|
||||
OverflowError {
|
||||
expected: &'static str,
|
||||
json: serde_json::Value,
|
||||
},
|
||||
#[error("Type error. Expected {expected}, got {json}")]
|
||||
TypeError {
|
||||
expected: &'static str,
|
||||
json: serde_json::Value,
|
||||
},
|
||||
#[error("Invalid base64: {base64}")]
|
||||
InvalidBase64 { base64: String },
|
||||
}
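With `thiserror`, the struct-like variants now carry the offending JSON value, so the rendered message is self-describing. A small in-crate sketch of the resulting `Display` output (the enum lives in `crate::schema::field_type`, as the tests below show):

#[test]
fn value_parsing_error_display_sketch() {
    use crate::schema::field_type::ValueParsingError;

    let err = ValueParsingError::TypeError {
        expected: "a string",
        json: serde_json::json!(3),
    };
    assert_eq!(err.to_string(), "Type error. Expected a string, got 3");
}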
|
||||
|
||||
/// Type of the value that a field can take.
|
||||
@@ -43,9 +50,11 @@ pub enum Type {
|
||||
Facet = b'h',
|
||||
/// `Vec<u8>`
|
||||
Bytes = b'b',
|
||||
/// Leaf in a Json object.
|
||||
Json = b'j',
|
||||
}
|
||||
|
||||
const ALL_TYPES: [Type; 7] = [
|
||||
const ALL_TYPES: [Type; 8] = [
|
||||
Type::Str,
|
||||
Type::U64,
|
||||
Type::I64,
|
||||
@@ -53,6 +62,7 @@ const ALL_TYPES: [Type; 7] = [
|
||||
Type::Date,
|
||||
Type::Facet,
|
||||
Type::Bytes,
|
||||
Type::Json,
|
||||
];
|
||||
|
||||
impl Type {
|
||||
@@ -67,6 +77,20 @@ impl Type {
|
||||
*self as u8
|
||||
}
|
||||
|
||||
/// Returns a human readable name for the Type.
|
||||
pub fn name(&self) -> &'static str {
|
||||
match self {
|
||||
Type::Str => "Str",
|
||||
Type::U64 => "U64",
|
||||
Type::I64 => "I64",
|
||||
Type::F64 => "F64",
|
||||
Type::Date => "Date",
|
||||
Type::Facet => "Facet",
|
||||
Type::Bytes => "Bytes",
|
||||
Type::Json => "Json",
|
||||
}
|
||||
}
|
||||
|
||||
/// Interprets a 1byte code as a type.
|
||||
/// Returns None if the code is invalid.
|
||||
pub fn from_code(code: u8) -> Option<Self> {
|
||||
@@ -78,6 +102,7 @@ impl Type {
|
||||
b'd' => Some(Type::Date),
|
||||
b'h' => Some(Type::Facet),
|
||||
b'b' => Some(Type::Bytes),
|
||||
b'j' => Some(Type::Json),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
@@ -93,17 +118,19 @@ pub enum FieldType {
|
||||
#[serde(rename = "text")]
|
||||
Str(TextOptions),
|
||||
/// Unsigned 64-bits integers field type configuration
|
||||
U64(IntOptions),
|
||||
U64(NumericOptions),
|
||||
/// Signed 64-bits integers 64 field type configuration
|
||||
I64(IntOptions),
|
||||
I64(NumericOptions),
|
||||
/// 64-bits float 64 field type configuration
|
||||
F64(IntOptions),
|
||||
F64(NumericOptions),
|
||||
/// Signed 64-bits Date 64 field type configuration,
|
||||
Date(IntOptions),
|
||||
Date(NumericOptions),
|
||||
/// Hierarchical Facet
|
||||
Facet(FacetOptions),
|
||||
/// Bytes (one per document)
|
||||
Bytes(BytesOptions),
|
||||
/// Json object
|
||||
JsonObject(JsonObjectOptions),
|
||||
}
|
||||
|
||||
impl FieldType {
|
||||
@@ -117,10 +144,11 @@ impl FieldType {
|
||||
FieldType::Date(_) => Type::Date,
|
||||
FieldType::Facet(_) => Type::Facet,
|
||||
FieldType::Bytes(_) => Type::Bytes,
|
||||
FieldType::JsonObject(_) => Type::Json,
|
||||
}
|
||||
}
|
||||
|
||||
/// returns true iff the field is indexed.
|
||||
/// returns true if the field is indexed.
|
||||
pub fn is_indexed(&self) -> bool {
|
||||
match *self {
|
||||
FieldType::Str(ref text_options) => text_options.get_indexing_options().is_some(),
|
||||
@@ -130,10 +158,32 @@ impl FieldType {
|
||||
FieldType::Date(ref date_options) => date_options.is_indexed(),
|
||||
FieldType::Facet(ref _facet_options) => true,
|
||||
FieldType::Bytes(ref bytes_options) => bytes_options.is_indexed(),
|
||||
FieldType::JsonObject(ref json_object_options) => json_object_options.is_indexed(),
|
||||
}
|
||||
}
|
||||
|
||||
/// returns true iff the field is normed.
|
||||
/// Returns the index record option for the field.
|
||||
///
|
||||
/// If the field is not indexed, returns `None`.
|
||||
pub fn index_record_option(&self) -> Option<IndexRecordOption> {
|
||||
match self {
|
||||
FieldType::Str(text_options) => text_options
|
||||
.get_indexing_options()
|
||||
.map(|text_indexing| text_indexing.index_option()),
|
||||
FieldType::JsonObject(json_object_options) => json_object_options
|
||||
.get_text_indexing_options()
|
||||
.map(|text_indexing| text_indexing.index_option()),
|
||||
field_type => {
|
||||
if field_type.is_indexed() {
|
||||
Some(IndexRecordOption::Basic)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// returns true if the field is normed.
|
||||
pub fn has_fieldnorms(&self) -> bool {
|
||||
match *self {
|
||||
FieldType::Str(ref text_options) => text_options
|
||||
@@ -146,12 +196,17 @@ impl FieldType {
|
||||
| FieldType::Date(ref int_options) => int_options.fieldnorms(),
|
||||
FieldType::Facet(_) => false,
|
||||
FieldType::Bytes(ref bytes_options) => bytes_options.fieldnorms(),
|
||||
FieldType::JsonObject(ref _json_object_options) => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Given a field configuration, return the maximal possible
|
||||
/// `IndexRecordOption` available.
|
||||
///
|
||||
/// For a Json object, this does not necessarily mean that this index record
/// option level is available for all terms.
|
||||
/// (Non string terms have the Basic indexing option at most.)
|
||||
///
|
||||
/// If the field is not indexed, then returns `None`.
|
||||
pub fn get_index_record_option(&self) -> Option<IndexRecordOption> {
|
||||
match *self {
|
||||
@@ -176,6 +231,9 @@ impl FieldType {
|
||||
None
|
||||
}
|
||||
}
|
||||
FieldType::JsonObject(ref json_obj_options) => json_obj_options
|
||||
.get_text_indexing_options()
|
||||
.map(TextFieldIndexing::index_option),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -184,91 +242,100 @@ impl FieldType {
|
||||
/// Tantivy will not try to cast values.
|
||||
/// For instance, If the json value is the integer `3` and the
|
||||
/// target field is a `Str`, this method will return an Error.
|
||||
pub fn value_from_json(&self, json: &JsonValue) -> Result<Value, ValueParsingError> {
|
||||
match *json {
|
||||
JsonValue::String(ref field_text) => match *self {
|
||||
pub fn value_from_json(&self, json: JsonValue) -> Result<Value, ValueParsingError> {
|
||||
match json {
|
||||
JsonValue::String(field_text) => match *self {
|
||||
FieldType::Date(_) => {
|
||||
let dt_with_fixed_tz: chrono::DateTime<FixedOffset> =
|
||||
chrono::DateTime::parse_from_rfc3339(field_text).map_err(|err| {
|
||||
ValueParsingError::TypeError(format!(
|
||||
"Failed to parse date from JSON. Expected rfc3339 format, got {}. \
|
||||
{:?}",
|
||||
field_text, err
|
||||
))
|
||||
chrono::DateTime::parse_from_rfc3339(&field_text).map_err(|_err| {
|
||||
ValueParsingError::TypeError {
|
||||
expected: "rfc3339 format",
|
||||
json: JsonValue::String(field_text),
|
||||
}
|
||||
})?;
|
||||
Ok(Value::Date(dt_with_fixed_tz.with_timezone(&Utc)))
|
||||
}
|
||||
FieldType::Str(_) => Ok(Value::Str(field_text.clone())),
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => Err(
|
||||
ValueParsingError::TypeError(format!("Expected an integer, got {:?}", json)),
|
||||
),
|
||||
FieldType::Facet(_) => Ok(Value::Facet(Facet::from(field_text))),
|
||||
FieldType::Bytes(_) => base64::decode(field_text).map(Value::Bytes).map_err(|_| {
|
||||
ValueParsingError::InvalidBase64(format!(
|
||||
"Expected base64 string, got {:?}",
|
||||
field_text
|
||||
))
|
||||
FieldType::Str(_) => Ok(Value::Str(field_text)),
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => {
|
||||
Err(ValueParsingError::TypeError {
|
||||
expected: "an integer",
|
||||
json: JsonValue::String(field_text),
|
||||
})
|
||||
}
|
||||
FieldType::Facet(_) => Ok(Value::Facet(Facet::from(&field_text))),
|
||||
FieldType::Bytes(_) => base64::decode(&field_text)
|
||||
.map(Value::Bytes)
|
||||
.map_err(|_| ValueParsingError::InvalidBase64 { base64: field_text }),
|
||||
FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
|
||||
expected: "a json object",
|
||||
json: JsonValue::String(field_text),
|
||||
}),
|
||||
},
|
||||
JsonValue::Number(ref field_val_num) => match *self {
|
||||
JsonValue::Number(field_val_num) => match self {
|
||||
FieldType::I64(_) | FieldType::Date(_) => {
|
||||
if let Some(field_val_i64) = field_val_num.as_i64() {
|
||||
Ok(Value::I64(field_val_i64))
|
||||
} else {
|
||||
let msg = format!("Expected an i64 int, got {:?}", json);
|
||||
Err(ValueParsingError::OverflowError(msg))
|
||||
Err(ValueParsingError::OverflowError {
|
||||
expected: "an i64 int",
|
||||
json: JsonValue::Number(field_val_num),
|
||||
})
|
||||
}
|
||||
}
|
||||
FieldType::U64(_) => {
|
||||
if let Some(field_val_u64) = field_val_num.as_u64() {
|
||||
Ok(Value::U64(field_val_u64))
|
||||
} else {
|
||||
let msg = format!("Expected a u64 int, got {:?}", json);
|
||||
Err(ValueParsingError::OverflowError(msg))
|
||||
Err(ValueParsingError::OverflowError {
|
||||
expected: "u64",
|
||||
json: JsonValue::Number(field_val_num),
|
||||
})
|
||||
}
|
||||
}
|
||||
FieldType::F64(_) => {
|
||||
if let Some(field_val_f64) = field_val_num.as_f64() {
|
||||
Ok(Value::F64(field_val_f64))
|
||||
} else {
|
||||
let msg = format!("Expected a f64 int, got {:?}", json);
|
||||
Err(ValueParsingError::OverflowError(msg))
|
||||
Err(ValueParsingError::OverflowError {
|
||||
expected: "a f64",
|
||||
json: JsonValue::Number(field_val_num),
|
||||
})
|
||||
}
|
||||
}
|
||||
FieldType::Str(_) | FieldType::Facet(_) | FieldType::Bytes(_) => {
|
||||
let msg = format!("Expected a string, got {:?}", json);
|
||||
Err(ValueParsingError::TypeError(msg))
|
||||
Err(ValueParsingError::TypeError {
|
||||
expected: "a string",
|
||||
json: JsonValue::Number(field_val_num),
|
||||
})
|
||||
}
|
||||
FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
|
||||
expected: "a json object",
|
||||
json: JsonValue::Number(field_val_num),
|
||||
}),
|
||||
},
|
||||
JsonValue::Object(_) => match *self {
|
||||
JsonValue::Object(json_map) => match self {
|
||||
FieldType::Str(_) => {
|
||||
if let Ok(tok_str_val) =
|
||||
serde_json::from_value::<PreTokenizedString>(json.clone())
|
||||
{
|
||||
if let Ok(tok_str_val) = serde_json::from_value::<PreTokenizedString>(
|
||||
serde_json::Value::Object(json_map.clone()),
|
||||
) {
|
||||
Ok(Value::PreTokStr(tok_str_val))
|
||||
} else {
|
||||
let msg = format!(
|
||||
"Json value {:?} cannot be translated to PreTokenizedString.",
|
||||
json
|
||||
);
|
||||
Err(ValueParsingError::TypeError(msg))
|
||||
Err(ValueParsingError::TypeError {
|
||||
expected: "a string or an pretokenized string",
|
||||
json: JsonValue::Object(json_map),
|
||||
})
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
let msg = format!(
|
||||
"Json value not supported error {:?}. Expected {:?}",
|
||||
json, self
|
||||
);
|
||||
Err(ValueParsingError::TypeError(msg))
|
||||
}
|
||||
FieldType::JsonObject(_) => Ok(Value::JsonObject(json_map)),
|
||||
_ => Err(ValueParsingError::TypeError {
|
||||
expected: self.value_type().name(),
|
||||
json: JsonValue::Object(json_map),
|
||||
}),
|
||||
},
|
||||
_ => {
|
||||
let msg = format!(
|
||||
"Json value not supported error {:?}. Expected {:?}",
|
||||
json, self
|
||||
);
|
||||
Err(ValueParsingError::TypeError(msg))
|
||||
}
|
||||
_ => Err(ValueParsingError::TypeError {
|
||||
expected: self.value_type().name(),
|
||||
json: json.clone(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -276,6 +343,7 @@ impl FieldType {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use chrono::{NaiveDate, NaiveDateTime, NaiveTime, Utc};
|
||||
use serde_json::json;
|
||||
|
||||
use super::FieldType;
|
||||
use crate::schema::field_type::ValueParsingError;
|
||||
@@ -311,19 +379,19 @@ mod tests {
|
||||
#[test]
|
||||
fn test_bytes_value_from_json() {
|
||||
let result = FieldType::Bytes(Default::default())
|
||||
.value_from_json(&json!("dGhpcyBpcyBhIHRlc3Q="))
|
||||
.value_from_json(json!("dGhpcyBpcyBhIHRlc3Q="))
|
||||
.unwrap();
|
||||
assert_eq!(result, Value::Bytes("this is a test".as_bytes().to_vec()));
|
||||
|
||||
let result = FieldType::Bytes(Default::default()).value_from_json(&json!(521));
|
||||
let result = FieldType::Bytes(Default::default()).value_from_json(json!(521));
|
||||
match result {
|
||||
Err(ValueParsingError::TypeError(_)) => {}
|
||||
Err(ValueParsingError::TypeError { .. }) => {}
|
||||
_ => panic!("Expected parse failure for wrong type"),
|
||||
}
|
||||
|
||||
let result = FieldType::Bytes(Default::default()).value_from_json(&json!("-"));
|
||||
let result = FieldType::Bytes(Default::default()).value_from_json(json!("-"));
|
||||
match result {
|
||||
Err(ValueParsingError::InvalidBase64(_)) => {}
|
||||
Err(ValueParsingError::InvalidBase64 { .. }) => {}
|
||||
_ => panic!("Expected parse failure for invalid base64"),
|
||||
}
|
||||
}
|
||||
@@ -385,7 +453,7 @@ mod tests {
|
||||
});
|
||||
|
||||
let deserialized_value = FieldType::Str(TextOptions::default())
|
||||
.value_from_json(&serde_json::from_str(pre_tokenized_string_json).unwrap())
|
||||
.value_from_json(serde_json::from_str(pre_tokenized_string_json).unwrap())
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(deserialized_value, expected_value);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::ops::BitOr;
|
||||
|
||||
use crate::schema::{IntOptions, TextOptions};
|
||||
use crate::schema::{NumericOptions, TextOptions};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct StoredFlag;
|
||||
@@ -22,8 +22,8 @@ pub const STORED: SchemaFlagList<StoredFlag, ()> = SchemaFlagList {
|
||||
pub struct IndexedFlag;
|
||||
/// Flag to mark the field as indexed. An indexed field is searchable and has a fieldnorm.
|
||||
///
|
||||
/// The `INDEXED` flag can only be used when building `IntOptions` (`u64`, `i64` and `f64` fields)
|
||||
/// Of course, text fields can also be indexed... But this is expressed by using either the
|
||||
/// The `INDEXED` flag can only be used when building `NumericOptions` (`u64`, `i64` and `f64`
|
||||
/// fields) Of course, text fields can also be indexed... But this is expressed by using either the
|
||||
/// `STRING` (untokenized) or `TEXT` (tokenized with the english tokenizer) flags.
|
||||
pub const INDEXED: SchemaFlagList<IndexedFlag, ()> = SchemaFlagList {
|
||||
head: IndexedFlag,
|
||||
@@ -36,7 +36,7 @@ pub struct FastFlag;
|
||||
///
|
||||
/// Fast fields can be random-accessed rapidly. Fields useful for scoring, filtering
|
||||
/// or collection should be marked as fast fields.
|
||||
/// The `FAST` flag can only be used when building `IntOptions` (`u64`, `i64` and `f64` fields)
|
||||
/// The `FAST` flag can only be used when building `NumericOptions` (`u64`, `i64` and `f64` fields)
|
||||
pub const FAST: SchemaFlagList<FastFlag, ()> = SchemaFlagList {
|
||||
head: FastFlag,
|
||||
tail: (),
|
||||
@@ -58,10 +58,10 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Clone + Into<IntOptions>> BitOr<IntOptions> for SchemaFlagList<T, ()> {
|
||||
type Output = IntOptions;
|
||||
impl<T: Clone + Into<NumericOptions>> BitOr<NumericOptions> for SchemaFlagList<T, ()> {
|
||||
type Output = NumericOptions;
|
||||
|
||||
fn bitor(self, rhs: IntOptions) -> Self::Output {
|
||||
fn bitor(self, rhs: NumericOptions) -> Self::Output {
|
||||
self.head.into() | rhs
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,7 +30,7 @@ pub enum IndexRecordOption {
|
||||
}
|
||||
|
||||
impl IndexRecordOption {
|
||||
/// Returns true iff this option includes encoding
|
||||
/// Returns true if this option includes encoding
|
||||
/// term frequencies.
|
||||
pub fn has_freq(self) -> bool {
|
||||
match self {
|
||||
@@ -39,7 +39,7 @@ impl IndexRecordOption {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true iff this option include encoding
|
||||
/// Returns true if this option include encoding
|
||||
/// term positions.
|
||||
pub fn has_positions(self) -> bool {
|
||||
match self {
|
||||
|
||||
109 src/schema/json_object_options.rs Normal file
@@ -0,0 +1,109 @@
|
||||
use std::ops::BitOr;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::schema::flags::{SchemaFlagList, StoredFlag};
|
||||
use crate::schema::{TextFieldIndexing, TextOptions};
|
||||
|
||||
/// `JsonObjectOptions` makes it possible to
|
||||
/// configure how a json object field should be indexed and stored.
|
||||
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct JsonObjectOptions {
|
||||
stored: bool,
|
||||
// If set to some, int, date, f64 and text will be indexed.
|
||||
// Text will use the TextFieldIndexing setting for indexing.
|
||||
indexing: Option<TextFieldIndexing>,
|
||||
}
|
||||
|
||||
impl JsonObjectOptions {
|
||||
/// Returns `true` if the json object should be stored.
|
||||
pub fn is_stored(&self) -> bool {
|
||||
self.stored
|
||||
}
|
||||
|
||||
/// Returns `true` iff the json object should be indexed.
|
||||
pub fn is_indexed(&self) -> bool {
|
||||
self.indexing.is_some()
|
||||
}
|
||||
|
||||
/// Returns the text indexing options.
|
||||
///
|
||||
/// If set to `Some` then both int and str values will be indexed.
|
||||
/// The inner `TextFieldIndexing` will however, only apply to the str values
|
||||
/// in the json object.
|
||||
pub fn get_text_indexing_options(&self) -> Option<&TextFieldIndexing> {
|
||||
self.indexing.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<StoredFlag> for JsonObjectOptions {
|
||||
fn from(_stored_flag: StoredFlag) -> Self {
|
||||
JsonObjectOptions {
|
||||
stored: true,
|
||||
indexing: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<()> for JsonObjectOptions {
|
||||
fn from(_: ()) -> Self {
|
||||
Self::default()
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Into<JsonObjectOptions>> BitOr<T> for JsonObjectOptions {
|
||||
type Output = JsonObjectOptions;
|
||||
|
||||
fn bitor(self, other: T) -> Self {
|
||||
let other = other.into();
|
||||
JsonObjectOptions {
|
||||
indexing: self.indexing.or(other.indexing),
|
||||
stored: self.stored | other.stored,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<Head, Tail> From<SchemaFlagList<Head, Tail>> for JsonObjectOptions
|
||||
where
|
||||
Head: Clone,
|
||||
Tail: Clone,
|
||||
Self: BitOr<Output = Self> + From<Head> + From<Tail>,
|
||||
{
|
||||
fn from(head_tail: SchemaFlagList<Head, Tail>) -> Self {
|
||||
Self::from(head_tail.head) | Self::from(head_tail.tail)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<TextOptions> for JsonObjectOptions {
|
||||
fn from(text_options: TextOptions) -> Self {
|
||||
JsonObjectOptions {
|
||||
stored: text_options.is_stored(),
|
||||
indexing: text_options.get_indexing_options().cloned(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::schema::{STORED, TEXT};
|
||||
|
||||
#[test]
|
||||
fn test_json_options() {
|
||||
{
|
||||
let json_options: JsonObjectOptions = (STORED | TEXT).into();
|
||||
assert!(json_options.is_stored());
|
||||
assert!(json_options.is_indexed());
|
||||
}
|
||||
{
|
||||
let json_options: JsonObjectOptions = TEXT.into();
|
||||
assert!(!json_options.is_stored());
|
||||
assert!(json_options.is_indexed());
|
||||
}
|
||||
{
|
||||
let json_options: JsonObjectOptions = STORED.into();
|
||||
assert!(json_options.is_stored());
|
||||
assert!(!json_options.is_indexed());
|
||||
}
|
||||
}
|
||||
}
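End-to-end sketch of the new json field support, mirroring the `test_phrase_query_on_json` test earlier in this diff (field name, document content and writer heap size are illustrative):

use serde_json::json;
use tantivy::collector::Count;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let attrs = schema_builder.add_json_field("attrs", STORED | TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer(15_000_000)?;
    writer.add_document(doc!(attrs => json!({"color": "red", "maker": "acme"})))?;
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    // With the json field as the default field, paths are given relative to it.
    let query_parser = QueryParser::for_index(&index, vec![attrs]);
    let query = query_parser.parse_query("color:red").expect("valid query");
    assert_eq!(searcher.search(&query, &Count)?, 1);
    Ok(())
}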
|
||||
@@ -60,7 +60,7 @@
|
||||
//! ```
|
||||
//! use tantivy::schema::*;
|
||||
//! let mut schema_builder = Schema::builder();
|
||||
//! let num_stars_options = IntOptions::default()
|
||||
//! let num_stars_options = NumericOptions::default()
|
||||
//! .set_stored()
|
||||
//! .set_indexed();
|
||||
//! schema_builder.add_u64_field("num_stars", num_stars_options);
|
||||
@@ -104,7 +104,7 @@ mod document;
|
||||
mod facet;
|
||||
mod facet_options;
|
||||
mod schema;
|
||||
mod term;
|
||||
pub(crate) mod term;
|
||||
|
||||
mod field_entry;
|
||||
mod field_type;
|
||||
@@ -112,14 +112,14 @@ mod field_value;
|
||||
|
||||
mod bytes_options;
|
||||
mod field;
|
||||
mod flags;
|
||||
mod index_record_option;
|
||||
mod int_options;
|
||||
mod json_object_options;
|
||||
mod named_field_document;
|
||||
mod numeric_options;
|
||||
mod text_options;
|
||||
mod value;
|
||||
|
||||
mod flags;
|
||||
|
||||
pub use self::bytes_options::BytesOptions;
|
||||
pub use self::document::Document;
|
||||
pub(crate) use self::facet::FACET_SEP_BYTE;
|
||||
@@ -131,8 +131,11 @@ pub use self::field_type::{FieldType, Type};
|
||||
pub use self::field_value::FieldValue;
|
||||
pub use self::flags::{FAST, INDEXED, STORED};
|
||||
pub use self::index_record_option::IndexRecordOption;
|
||||
pub use self::int_options::{Cardinality, IntOptions};
|
||||
pub use self::json_object_options::JsonObjectOptions;
|
||||
pub use self::named_field_document::NamedFieldDocument;
|
||||
pub use self::numeric_options::NumericOptions;
|
||||
#[allow(deprecated)]
|
||||
pub use self::numeric_options::{Cardinality, IntOptions};
|
||||
pub use self::schema::{DocParsingError, Schema, SchemaBuilder};
|
||||
pub use self::term::Term;
|
||||
pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};
|
||||
|
||||
@@ -16,10 +16,14 @@ pub enum Cardinality {
|
||||
MultiValues,
|
||||
}
|
||||
|
||||
#[deprecated(since = "0.17.0", note = "Use NumericOptions instead.")]
|
||||
/// Deprecated use [NumericOptions] instead.
|
||||
pub type IntOptions = NumericOptions;
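Migration sketch for the rename: `IntOptions` still compiles as a deprecated alias, but new code should spell it `NumericOptions` (the options below mirror the tests further down):

use tantivy::schema::{Cardinality, NumericOptions, Schema};

fn build_schema() -> Schema {
    let mut schema_builder = Schema::builder();
    let popularity_options = NumericOptions::default()
        .set_stored()
        .set_fast(Cardinality::SingleValue);
    schema_builder.add_u64_field("popularity", popularity_options);
    schema_builder.build()
}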
|
||||
|
||||
/// Defines how a u64, i64, or f64 field should be handled by tantivy.
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
#[serde(from = "IntOptionsDeser")]
|
||||
pub struct IntOptions {
|
||||
#[serde(from = "NumericOptionsDeser")]
|
||||
pub struct NumericOptions {
|
||||
indexed: bool,
|
||||
// This boolean has no effect if the field is not marked as indexed too.
|
||||
fieldnorms: bool, // This attribute only has an effect if indexed is true.
|
||||
@@ -29,12 +33,12 @@ pub struct IntOptions {
|
||||
}
|
||||
|
||||
/// For backward compatibility we add an intermediary to interpret the
|
||||
/// lack of fieldnorms attribute as "true" iff indexed.
|
||||
/// lack of fieldnorms attribute as "true" if and only if indexed.
|
||||
///
|
||||
/// (Downstream, for the moment, this attribute is not used anyway if not indexed...)
|
||||
/// Note that: newly serialized IntOptions will include the new attribute.
|
||||
/// Note that: newly serialized NumericOptions will include the new attribute.
|
||||
#[derive(Deserialize)]
|
||||
struct IntOptionsDeser {
|
||||
struct NumericOptionsDeser {
|
||||
indexed: bool,
|
||||
#[serde(default)]
|
||||
fieldnorms: Option<bool>, // This attribute only has an effect if indexed is true.
|
||||
@@ -43,9 +47,9 @@ struct IntOptionsDeser {
|
||||
stored: bool,
|
||||
}
|
||||
|
||||
impl From<IntOptionsDeser> for IntOptions {
|
||||
fn from(deser: IntOptionsDeser) -> Self {
|
||||
IntOptions {
|
||||
impl From<NumericOptionsDeser> for NumericOptions {
|
||||
fn from(deser: NumericOptionsDeser) -> Self {
|
||||
NumericOptions {
|
||||
indexed: deser.indexed,
|
||||
fieldnorms: deser.fieldnorms.unwrap_or(deser.indexed),
|
||||
fast: deser.fast,
|
||||
@@ -54,7 +58,7 @@ impl From<IntOptionsDeser> for IntOptions {
|
||||
}
|
||||
}
|
||||
|
||||
impl IntOptions {
|
||||
impl NumericOptions {
|
||||
/// Returns true iff the value is stored.
|
||||
pub fn is_stored(&self) -> bool {
|
||||
self.stored
|
||||
@@ -70,6 +74,15 @@ impl IntOptions {
|
||||
self.fieldnorms && self.indexed
|
||||
}
|
||||
|
||||
/// Returns true iff the value is a fast field and multivalue.
|
||||
pub fn is_multivalue_fast(&self) -> bool {
|
||||
if let Some(cardinality) = self.fast {
|
||||
cardinality == Cardinality::MultiValues
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true iff the value is a fast field.
|
||||
pub fn is_fast(&self) -> bool {
|
||||
self.fast.is_some()
|
||||
@@ -80,7 +93,7 @@ impl IntOptions {
|
||||
/// Only the fields that are set as *stored* are
|
||||
/// persisted into the Tantivy's store.
|
||||
#[must_use]
|
||||
pub fn set_stored(mut self) -> IntOptions {
|
||||
pub fn set_stored(mut self) -> NumericOptions {
|
||||
self.stored = true;
|
||||
self
|
||||
}
|
||||
@@ -92,7 +105,7 @@ impl IntOptions {
|
||||
///
|
||||
/// This is required for the field to be searchable.
|
||||
#[must_use]
|
||||
pub fn set_indexed(mut self) -> IntOptions {
|
||||
pub fn set_indexed(mut self) -> NumericOptions {
|
||||
self.indexed = true;
|
||||
self
|
||||
}
|
||||
@@ -102,7 +115,7 @@ impl IntOptions {
|
||||
/// Setting an integer as fieldnorm will generate
|
||||
/// the fieldnorm data for it.
|
||||
#[must_use]
|
||||
pub fn set_fieldnorm(mut self) -> IntOptions {
|
||||
pub fn set_fieldnorm(mut self) -> NumericOptions {
|
||||
self.fieldnorms = true;
|
||||
self
|
||||
}
|
||||
@@ -114,7 +127,7 @@ impl IntOptions {
|
||||
/// If more than one value is associated to a fast field, only the last one is
|
||||
/// kept.
|
||||
#[must_use]
|
||||
pub fn set_fast(mut self, cardinality: Cardinality) -> IntOptions {
|
||||
pub fn set_fast(mut self, cardinality: Cardinality) -> NumericOptions {
|
||||
self.fast = Some(cardinality);
|
||||
self
|
||||
}
|
||||
@@ -128,15 +141,15 @@ impl IntOptions {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<()> for IntOptions {
|
||||
fn from(_: ()) -> IntOptions {
|
||||
IntOptions::default()
|
||||
impl From<()> for NumericOptions {
|
||||
fn from(_: ()) -> NumericOptions {
|
||||
NumericOptions::default()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<FastFlag> for IntOptions {
|
||||
impl From<FastFlag> for NumericOptions {
|
||||
fn from(_: FastFlag) -> Self {
|
||||
IntOptions {
|
||||
NumericOptions {
|
||||
indexed: false,
|
||||
fieldnorms: false,
|
||||
stored: false,
|
||||
@@ -145,9 +158,9 @@ impl From<FastFlag> for IntOptions {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<StoredFlag> for IntOptions {
|
||||
impl From<StoredFlag> for NumericOptions {
|
||||
fn from(_: StoredFlag) -> Self {
|
||||
IntOptions {
|
||||
NumericOptions {
|
||||
indexed: false,
|
||||
fieldnorms: false,
|
||||
stored: true,
|
||||
@@ -156,9 +169,9 @@ impl From<StoredFlag> for IntOptions {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<IndexedFlag> for IntOptions {
|
||||
impl From<IndexedFlag> for NumericOptions {
|
||||
fn from(_: IndexedFlag) -> Self {
|
||||
IntOptions {
|
||||
NumericOptions {
|
||||
indexed: true,
|
||||
fieldnorms: true,
|
||||
stored: false,
|
||||
@@ -167,12 +180,12 @@ impl From<IndexedFlag> for IntOptions {
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Into<IntOptions>> BitOr<T> for IntOptions {
|
||||
type Output = IntOptions;
|
||||
impl<T: Into<NumericOptions>> BitOr<T> for NumericOptions {
|
||||
type Output = NumericOptions;
|
||||
|
||||
fn bitor(self, other: T) -> IntOptions {
|
||||
fn bitor(self, other: T) -> NumericOptions {
|
||||
let other = other.into();
|
||||
IntOptions {
|
||||
NumericOptions {
|
||||
indexed: self.indexed | other.indexed,
|
||||
fieldnorms: self.fieldnorms | other.fieldnorms,
|
||||
stored: self.stored | other.stored,
|
||||
@@ -181,7 +194,7 @@ impl<T: Into<IntOptions>> BitOr<T> for IntOptions {
|
||||
}
|
||||
}
|
||||
|
||||
impl<Head, Tail> From<SchemaFlagList<Head, Tail>> for IntOptions
|
||||
impl<Head, Tail> From<SchemaFlagList<Head, Tail>> for NumericOptions
|
||||
where
|
||||
Head: Clone,
|
||||
Tail: Clone,
|
||||
@@ -202,10 +215,10 @@ mod tests {
|
||||
"indexed": true,
|
||||
"stored": false
|
||||
}"#;
|
||||
let int_options: IntOptions = serde_json::from_str(json).unwrap();
|
||||
let int_options: NumericOptions = serde_json::from_str(json).unwrap();
|
||||
assert_eq!(
|
||||
&int_options,
|
||||
&IntOptions {
|
||||
&NumericOptions {
|
||||
indexed: true,
|
||||
fieldnorms: true,
|
||||
fast: None,
|
||||
@@ -220,10 +233,10 @@ mod tests {
|
||||
"indexed": false,
|
||||
"stored": false
|
||||
}"#;
|
||||
let int_options: IntOptions = serde_json::from_str(json).unwrap();
|
||||
let int_options: NumericOptions = serde_json::from_str(json).unwrap();
|
||||
assert_eq!(
|
||||
&int_options,
|
||||
&IntOptions {
|
||||
&NumericOptions {
|
||||
indexed: false,
|
||||
fieldnorms: false,
|
||||
fast: None,
|
||||
@@ -239,10 +252,10 @@ mod tests {
|
||||
"fieldnorms": false,
|
||||
"stored": false
|
||||
}"#;
|
||||
let int_options: IntOptions = serde_json::from_str(json).unwrap();
|
||||
let int_options: NumericOptions = serde_json::from_str(json).unwrap();
|
||||
assert_eq!(
|
||||
&int_options,
|
||||
&IntOptions {
|
||||
&NumericOptions {
|
||||
indexed: true,
|
||||
fieldnorms: false,
|
||||
fast: None,
|
||||
@@ -259,10 +272,10 @@ mod tests {
|
||||
"fieldnorms": true,
|
||||
"stored": false
|
||||
}"#;
|
||||
let int_options: IntOptions = serde_json::from_str(json).unwrap();
|
||||
let int_options: NumericOptions = serde_json::from_str(json).unwrap();
|
||||
assert_eq!(
|
||||
&int_options,
|
||||
&IntOptions {
|
||||
&NumericOptions {
|
||||
indexed: false,
|
||||
fieldnorms: true,
|
||||
fast: None,
|
||||
@@ -5,7 +5,7 @@ use std::sync::Arc;
|
||||
use serde::de::{SeqAccess, Visitor};
|
||||
use serde::ser::SerializeSeq;
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
use serde_json::{self, Map as JsonObject, Value as JsonValue};
|
||||
use serde_json::{self, Value as JsonValue};
|
||||
|
||||
use super::*;
|
||||
use crate::schema::bytes_options::BytesOptions;
|
||||
@@ -52,7 +52,7 @@ impl SchemaBuilder {
|
||||
/// by the second one.
|
||||
/// The first field will get a field id
|
||||
/// but only the second one will be indexed
|
||||
pub fn add_u64_field<T: Into<IntOptions>>(
|
||||
pub fn add_u64_field<T: Into<NumericOptions>>(
|
||||
&mut self,
|
||||
field_name_str: &str,
|
||||
field_options: T,
|
||||
@@ -72,7 +72,7 @@ impl SchemaBuilder {
|
||||
/// by the second one.
|
||||
/// The first field will get a field id
|
||||
/// but only the second one will be indexed
|
||||
pub fn add_i64_field<T: Into<IntOptions>>(
|
||||
pub fn add_i64_field<T: Into<NumericOptions>>(
|
||||
&mut self,
|
||||
field_name_str: &str,
|
||||
field_options: T,
|
||||
@@ -92,7 +92,7 @@ impl SchemaBuilder {
|
||||
/// by the second one.
|
||||
/// The first field will get a field id
|
||||
/// but only the second one will be indexed
|
||||
pub fn add_f64_field<T: Into<IntOptions>>(
|
||||
pub fn add_f64_field<T: Into<NumericOptions>>(
|
||||
&mut self,
|
||||
field_name_str: &str,
|
||||
field_options: T,
|
||||
@@ -114,7 +114,7 @@ impl SchemaBuilder {
|
||||
/// by the second one.
|
||||
/// The first field will get a field id
|
||||
/// but only the second one will be indexed
|
||||
pub fn add_date_field<T: Into<IntOptions>>(
|
||||
pub fn add_date_field<T: Into<NumericOptions>>(
|
||||
&mut self,
|
||||
field_name_str: &str,
|
||||
field_options: T,
|
||||
@@ -173,6 +173,16 @@ impl SchemaBuilder {
|
||||
self.add_field(field_entry)
|
||||
}
|
||||
|
||||
/// Adds a json object field to the schema.
|
||||
pub fn add_json_field<T: Into<JsonObjectOptions>>(
|
||||
&mut self,
|
||||
field_name: &str,
|
||||
field_options: T,
|
||||
) -> Field {
|
||||
let field_entry = FieldEntry::new_json(field_name.to_string(), field_options.into());
|
||||
self.add_field(field_entry)
|
||||
}
|
||||
|
||||
/// Adds a field entry to the schema in build.
|
||||
pub fn add_field(&mut self, field_entry: FieldEntry) -> Field {
|
||||
let field = Field::from_field_id(self.fields.len() as u32);
|
||||
@@ -298,23 +308,23 @@ impl Schema {
|
||||
|
||||
/// Build a document object from a json-object.
|
||||
pub fn parse_document(&self, doc_json: &str) -> Result<Document, DocParsingError> {
|
||||
let json_obj: JsonObject<String, JsonValue> =
|
||||
serde_json::from_str(doc_json).map_err(|_| {
|
||||
let doc_json_sample: String = if doc_json.len() < 20 {
|
||||
String::from(doc_json)
|
||||
} else {
|
||||
format!("{:?}...", &doc_json[0..20])
|
||||
};
|
||||
DocParsingError::NotJson(doc_json_sample)
|
||||
})?;
|
||||
let json_obj: serde_json::Map<String, JsonValue> =
|
||||
serde_json::from_str(doc_json).map_err(|_| DocParsingError::invalid_json(doc_json))?;
|
||||
self.json_object_to_doc(json_obj)
|
||||
}
|
||||
|
||||
/// Build a document object from a json-object.
|
||||
pub fn json_object_to_doc(
|
||||
&self,
|
||||
json_obj: serde_json::Map<String, JsonValue>,
|
||||
) -> Result<Document, DocParsingError> {
|
||||
let mut doc = Document::default();
|
||||
for (field_name, json_value) in json_obj.iter() {
|
||||
if let Some(field) = self.get_field(field_name) {
|
||||
for (field_name, json_value) in json_obj {
|
||||
if let Some(field) = self.get_field(&field_name) {
|
||||
let field_entry = self.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
match *json_value {
|
||||
JsonValue::Array(ref json_items) => {
|
||||
match json_value {
|
||||
JsonValue::Array(json_items) => {
|
||||
for json_item in json_items {
|
||||
let value = field_type
|
||||
.value_from_json(json_item)
@@ -383,12 +393,24 @@ impl<'de> Deserialize<'de> for Schema {
pub enum DocParsingError {
    /// The payload given is not valid JSON.
    #[error("The provided string is not valid JSON")]
    NotJson(String),
    InvalidJson(String),
    /// One of the value node could not be parsed.
    #[error("The field '{0:?}' could not be parsed: {1:?}")]
    ValueError(String, ValueParsingError),
}

impl DocParsingError {
    /// Builds a NotJson DocParsingError
    fn invalid_json(invalid_json: &str) -> Self {
        let sample_json: String = if invalid_json.len() < 20 {
            invalid_json.to_string()
        } else {
            format!("{:?}...", &invalid_json[0..20])
        };
        DocParsingError::InvalidJson(sample_json)
    }
}

#[cfg(test)]
mod tests {

@@ -398,8 +420,8 @@ mod tests {
    use serde_json;

    use crate::schema::field_type::ValueParsingError;
    use crate::schema::int_options::Cardinality::SingleValue;
    use crate::schema::schema::DocParsingError::NotJson;
    use crate::schema::numeric_options::Cardinality::SingleValue;
    use crate::schema::schema::DocParsingError::InvalidJson;
    use crate::schema::*;

    #[test]
@@ -413,13 +435,13 @@ mod tests {
    #[test]
    pub fn test_schema_serialization() {
        let mut schema_builder = Schema::builder();
        let count_options = IntOptions::default()
        let count_options = NumericOptions::default()
            .set_stored()
            .set_fast(Cardinality::SingleValue);
        let popularity_options = IntOptions::default()
        let popularity_options = NumericOptions::default()
            .set_stored()
            .set_fast(Cardinality::SingleValue);
        let score_options = IntOptions::default()
        let score_options = NumericOptions::default()
            .set_indexed()
            .set_fieldnorm()
            .set_fast(Cardinality::SingleValue);
@@ -529,7 +551,7 @@ mod tests {
    #[test]
    pub fn test_document_to_json() {
        let mut schema_builder = Schema::builder();
        let count_options = IntOptions::default()
        let count_options = NumericOptions::default()
            .set_stored()
            .set_fast(Cardinality::SingleValue);
        schema_builder.add_text_field("title", TEXT);
@@ -594,13 +616,13 @@ mod tests {
    #[test]
    pub fn test_parse_document() {
        let mut schema_builder = Schema::builder();
        let count_options = IntOptions::default()
        let count_options = NumericOptions::default()
            .set_stored()
            .set_fast(Cardinality::SingleValue);
        let popularity_options = IntOptions::default()
        let popularity_options = NumericOptions::default()
            .set_stored()
            .set_fast(Cardinality::SingleValue);
        let score_options = IntOptions::default()
        let score_options = NumericOptions::default()
            .set_indexed()
            .set_fast(Cardinality::SingleValue);
        let title_field = schema_builder.add_text_field("title", TEXT);
@@ -666,7 +688,7 @@ mod tests {
            json_err,
            Err(DocParsingError::ValueError(
                _,
                ValueParsingError::TypeError(_)
                ValueParsingError::TypeError { .. }
            ))
        );
    }
@@ -684,7 +706,7 @@ mod tests {
            json_err,
            Err(DocParsingError::ValueError(
                _,
                ValueParsingError::OverflowError(_)
                ValueParsingError::OverflowError { .. }
            ))
        );
    }
@@ -702,7 +724,7 @@ mod tests {
            json_err,
            Err(DocParsingError::ValueError(
                _,
                ValueParsingError::OverflowError(_)
                ValueParsingError::OverflowError { .. }
            ))
        ));
    }
@@ -720,7 +742,7 @@ mod tests {
            json_err,
            Err(DocParsingError::ValueError(
                _,
                ValueParsingError::OverflowError(_)
                ValueParsingError::OverflowError { .. }
            ))
        );
    }
@@ -732,7 +754,7 @@ mod tests {
                "count": 50,
            }"#,
        );
        assert_matches!(json_err, Err(NotJson(_)));
        assert_matches!(json_err, Err(InvalidJson(_)));
    }
}

@@ -744,7 +766,7 @@ mod tests {
                .set_tokenizer("raw")
                .set_index_option(IndexRecordOption::Basic),
        );
        let timestamp_options = IntOptions::default()
        let timestamp_options = NumericOptions::default()
            .set_stored()
            .set_indexed()
            .set_fieldnorm()

@@ -1,3 +1,4 @@
use std::convert::TryInto;
use std::hash::{Hash, Hasher};
use std::{fmt, str};

@@ -8,8 +9,26 @@ use crate::DateTime;

/// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term.
/// <field> + <type byte> + <value len>
///
/// - <field> is a big endian encoded u32 field id
/// - <type_byte>'s most significant bit expresses whether the term is a json term or not
///   The remaining 7 bits are used to encode the type of the value.
///   If this is a JSON term, the type is the type of the leaf of the json.
///
/// - <value> is, if this is not the json term, a binary representation specific to the type.
///   If it is a JSON Term, then it is preprended with the path that leads to this leaf value.
const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8;

/// Separates the different segments of
/// the json path.
pub const JSON_PATH_SEGMENT_SEP: u8 = 1u8;
pub const JSON_PATH_SEGMENT_SEP_STR: &str =
    unsafe { std::str::from_utf8_unchecked(&[JSON_PATH_SEGMENT_SEP]) };

/// Separates the json path and the value in
/// a JSON term binary representation.
pub const JSON_END_OF_PATH: u8 = 0u8;

/// Term represents the value that the token can take.
///
/// It actually wraps a `Vec<u8>`.
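
To make the layout documented above concrete: a fast-value term is 4 + 1 + 8 = 13 bytes, with the value stored as a big-endian u64, while a JSON term replaces the fixed 8-byte value with `<path><JSON_END_OF_PATH><leaf type code><leaf value>`. The sketch below builds such a buffer by hand; it is an illustration only, and the two type-code bytes are stand-ins rather than tantivy's real `Type` codes:

// Illustration of the term layout documented above; not tantivy code.
const PATH_SEGMENT_SEP: u8 = 1; // JSON_PATH_SEGMENT_SEP
const END_OF_PATH: u8 = 0; // JSON_END_OF_PATH

/// Builds the bytes of a hypothetical JSON term on field 3 for the path
/// "attributes.color" with the string value "red".
fn json_term_bytes() -> Vec<u8> {
    let mut term = Vec::new();
    term.extend_from_slice(&3u32.to_be_bytes()); // <field>: big-endian u32 field id
    term.push(b'j'); // <type byte>: stand-in marking this as a json term
    term.extend_from_slice(b"attributes");
    term.push(PATH_SEGMENT_SEP); // 0x01 between path segments
    term.extend_from_slice(b"color");
    term.push(END_OF_PATH); // 0x00 terminates the path
    term.push(b's'); // stand-in type code of the leaf value (a string here)
    term.extend_from_slice(b"red"); // leaf value bytes
    term
}
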
@@ -17,6 +36,12 @@ const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8;
pub struct Term<B = Vec<u8>>(B)
where B: AsRef<[u8]>;

impl AsMut<Vec<u8>> for Term {
    fn as_mut(&mut self) -> &mut Vec<u8> {
        &mut self.0
    }
}

impl Term {
    pub(crate) fn new() -> Term {
        Term(Vec::with_capacity(100))
@@ -120,6 +145,22 @@ impl Term {
    pub fn set_text(&mut self, text: &str) {
        self.set_bytes(text.as_bytes());
    }

    /// Removes the value_bytes and set the type code.
    pub fn clear_with_type(&mut self, typ: Type) {
        self.truncate(5);
        self.0[4] = typ.to_code();
    }

    /// Truncate the term right after the field and the type code.
    pub fn truncate(&mut self, len: usize) {
        self.0.truncate(len);
    }

    /// Truncate the term right after the field and the type code.
    pub fn append_bytes(&mut self, bytes: &[u8]) {
        self.0.extend_from_slice(bytes);
    }
}
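
The new helpers rely on the first five bytes always being `<field><type byte>`: `truncate(5)` keeps exactly that prefix, `clear_with_type` additionally rewrites the type code, and `append_bytes` grows the value part. A rough equivalent on a plain buffer (illustration only, not the tantivy API):

// Rough equivalent of the helpers above on a plain Vec<u8>: bytes 0..4 hold the
// big-endian field id, byte 4 holds the type code, everything after is the value.
fn clear_with_type(term_buf: &mut Vec<u8>, type_code: u8) {
    term_buf.truncate(4 + 1); // keep <field> + <type byte>, drop the value bytes
    term_buf[4] = type_code;
}

fn append_value_bytes(term_buf: &mut Vec<u8>, bytes: &[u8]) {
    term_buf.extend_from_slice(bytes); // grows the value part, e.g. a json path segment
}
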

impl<B> Ord for Term<B>
@@ -164,13 +205,16 @@ where B: AsRef<[u8]>
        Term(data)
    }

    fn typ_code(&self) -> u8 {
        *self
            .as_slice()
            .get(4)
            .expect("the byte representation is too short")
    }

    /// Return the type of the term.
    pub fn typ(&self) -> Type {
        assert!(
            self.as_slice().len() >= 5,
            "the type does byte representation is too short"
        );
        Type::from_code(self.as_slice()[4]).expect("The term has an invalid type code")
        Type::from_code(self.typ_code()).expect("The term has an invalid type code")
    }

    /// Returns the field.
@@ -189,10 +233,14 @@ where B: AsRef<[u8]>
    }

    fn get_fast_type<T: FastValue>(&self) -> Option<T> {
        if self.typ() != T::to_type() || self.as_slice().len() != FAST_VALUE_TERM_LEN {
        if self.typ() != T::to_type() {
            return None;
        }
        let mut value_bytes = [0u8; 8];
        let bytes = self.value_bytes();
        if bytes.len() != 8 {
            return None;
        }
        value_bytes.copy_from_slice(self.value_bytes());
        let value_u64 = u64::from_be_bytes(value_bytes);
        Some(FastValue::from_u64(value_u64))
@@ -290,40 +338,74 @@ fn write_opt<T: std::fmt::Debug>(f: &mut fmt::Formatter, val_opt: Option<T>) ->
    Ok(())
}

impl fmt::Debug for Term {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let field_id = self.field().field_id();
        let typ = self.typ();
        write!(f, "Term(type={:?}, field={}, val=", typ, field_id,)?;
        match typ {
            Type::Str => {
                let s = str::from_utf8(self.value_bytes()).ok();
                write_opt(f, s)?;
            }
            Type::U64 => {
                write_opt(f, self.as_u64())?;
            }
            Type::I64 => {
                let val_i64 = self.as_i64();
                write_opt(f, val_i64)?;
            }
            Type::F64 => {
                let val_f64 = self.as_f64();
                write_opt(f, val_f64)?;
            }
            // TODO pretty print these types too.
            Type::Date => {
                let val_date = self.as_date();
                write_opt(f, val_date)?;
            }
            Type::Facet => {
                let facet = self.as_facet().map(|facet| facet.to_path_string());
                write_opt(f, facet)?;
            }
            Type::Bytes => {
                write_opt(f, self.as_bytes())?;
fn as_str(value_bytes: &[u8]) -> Option<&str> {
    std::str::from_utf8(value_bytes).ok()
}

fn get_fast_type<T: FastValue>(bytes: &[u8]) -> Option<T> {
    let value_u64 = u64::from_be_bytes(bytes.try_into().ok()?);
    Some(FastValue::from_u64(value_u64))
}

/// Returns the json path (without non-human friendly separators, the type of the value, and the
/// value bytes). Returns None if the value is not JSON or is not valid.
pub(crate) fn as_json_path_type_value_bytes(bytes: &[u8]) -> Option<(&str, Type, &[u8])> {
    let pos = bytes.iter().cloned().position(|b| b == JSON_END_OF_PATH)?;
    let json_path = str::from_utf8(&bytes[..pos]).ok()?;
    let type_code = *bytes.get(pos + 1)?;
    let typ = Type::from_code(type_code)?;
    Some((json_path, typ, &bytes[pos + 2..]))
}
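
Complementing the term-layout sketch earlier, this is the shape of the value bytes that `as_json_path_type_value_bytes` splits apart (the leaf type code below is a stand-in; the real codes come from `Type::to_code`). The `Debug` implementation further down rewrites the 0x01 segment separator as `.`, so the path prints as `attributes.color`:

// Hypothetical value bytes for a JSON term, illustration only.
let value_bytes: Vec<u8> = [
    &b"attributes"[..],
    &[1u8][..],  // JSON_PATH_SEGMENT_SEP between "attributes" and "color"
    &b"color"[..],
    &[0u8][..],  // JSON_END_OF_PATH terminates the path
    &[b's'][..], // stand-in for the leaf type code
    &b"red"[..], // leaf value bytes
]
.concat();
// as_json_path_type_value_bytes(&value_bytes) would yield the raw path
// "attributes\u{1}color", the decoded leaf Type, and b"red" as the value bytes.
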

fn debug_value_bytes(typ: Type, bytes: &[u8], f: &mut fmt::Formatter) -> fmt::Result {
    match typ {
        Type::Str => {
            let s = as_str(bytes);
            write_opt(f, s)?;
        }
        Type::U64 => {
            write_opt(f, get_fast_type::<u64>(bytes))?;
        }
        Type::I64 => {
            write_opt(f, get_fast_type::<i64>(bytes))?;
        }
        Type::F64 => {
            write_opt(f, get_fast_type::<f64>(bytes))?;
        }
        // TODO pretty print these types too.
        Type::Date => {
            write_opt(f, get_fast_type::<crate::DateTime>(bytes))?;
        }
        Type::Facet => {
            let facet_str = str::from_utf8(bytes)
                .ok()
                .map(ToString::to_string)
                .map(Facet::from_encoded_string)
                .map(|facet| facet.to_path_string());
            write_opt(f, facet_str)?;
        }
        Type::Bytes => {
            write_opt(f, Some(bytes))?;
        }
        Type::Json => {
            if let Some((path, typ, bytes)) = as_json_path_type_value_bytes(bytes) {
                let path_pretty = path.replace(JSON_PATH_SEGMENT_SEP_STR, ".");
                write!(f, "path={path_pretty}, vtype={typ:?}, ")?;
                debug_value_bytes(typ, bytes, f)?;
            }
        }
    }
    Ok(())
}

impl<B> fmt::Debug for Term<B>
where B: AsRef<[u8]>
{
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let field_id = self.field().field_id();
        let typ = self.typ();
        write!(f, "Term(type={typ:?}, field={field_id}, ")?;
        debug_value_bytes(typ, self.value_bytes(), f)?;
        write!(f, ")",)?;
        Ok(())
    }

@@ -19,7 +19,7 @@ impl TextOptions {
        self.indexing.as_ref()
    }

    /// Returns true iff the text is to be stored.
    /// Returns true if the text is to be stored.
    pub fn is_stored(&self) -> bool {
        self.stored
    }
@@ -46,7 +46,7 @@ impl TextOptions {
/// Essentially, should we store the term frequency and/or the positions (See
/// [`IndexRecordOption`](./enum.IndexRecordOption.html)).
/// - the name of the `Tokenizer` that should be used to process the field.
#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)]
pub struct TextFieldIndexing {
    record: IndexRecordOption,
    fieldnorms: bool,
@@ -83,7 +83,7 @@ impl TextFieldIndexing {
        self
    }

    /// Returns true iff fieldnorms are stored.
    /// Returns true if and only if fieldnorms are stored.
    pub fn fieldnorms(&self) -> bool {
        self.fieldnorms
    }
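
A sketch of how the new `fieldnorms` flag is driven from the builder side (illustrative: the `set_fieldnorms` setter is an assumption based on the flag and getter added in this hunk; the other calls appear elsewhere in this diff or in tantivy's long-standing `TextOptions` API):

use tantivy::schema::{IndexRecordOption, TextFieldIndexing, TextOptions};

fn no_fieldnorms_text_options() -> TextOptions {
    // set_fieldnorms(false) is the assumed setter for the `fieldnorms` flag above;
    // the fieldnorms() getter shown in this hunk reads the flag back.
    let indexing = TextFieldIndexing::default()
        .set_tokenizer("default")
        .set_fieldnorms(false)
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    TextOptions::default()
        .set_indexing_options(indexing)
        .set_stored()
}
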

@@ -2,6 +2,7 @@ use std::fmt;

use serde::de::Visitor;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use serde_json::Map;

use crate::schema::Facet;
use crate::tokenizer::PreTokenizedString;
@@ -27,6 +28,8 @@ pub enum Value {
    Facet(Facet),
    /// Arbitrarily sized byte array
    Bytes(Vec<u8>),
    /// Json object value.
    JsonObject(serde_json::Map<String, serde_json::Value>),
}

impl Eq for Value {}
@@ -43,6 +46,7 @@ impl Serialize for Value {
            Value::Date(ref date) => serializer.serialize_str(&date.to_rfc3339()),
            Value::Facet(ref facet) => facet.serialize(serializer),
            Value::Bytes(ref bytes) => serializer.serialize_bytes(bytes),
            Value::JsonObject(ref obj) => obj.serialize(serializer),
        }
    }
}
@@ -168,6 +172,17 @@ impl Value {
            None
        }
    }

    /// Returns the json object, provided the value is of the JsonObject type.
    ///
    /// Returns None if the value is not of type JsonObject.
    pub fn as_json(&self) -> Option<&Map<String, serde_json::Value>> {
        if let Value::JsonObject(json) = self {
            Some(json)
        } else {
            None
        }
    }
}

impl From<String> for Value {
@@ -230,6 +245,23 @@ impl From<PreTokenizedString> for Value {
    }
}

impl From<serde_json::Map<String, serde_json::Value>> for Value {
    fn from(json_object: serde_json::Map<String, serde_json::Value>) -> Value {
        Value::JsonObject(json_object)
    }
}

impl From<serde_json::Value> for Value {
    fn from(json_value: serde_json::Value) -> Value {
        match json_value {
            serde_json::Value::Object(json_object) => Value::JsonObject(json_object),
            _ => {
                panic!("Expected a json object.");
            }
        }
    }
}
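
A small sketch of the new JSON plumbing in `Value` (illustrative; it assumes `serde_json` as a direct dependency of the calling crate):

use serde_json::json;
use tantivy::schema::Value;

fn json_value_round_trip() {
    // A serde_json object becomes Value::JsonObject; as_json exposes it again.
    let value = Value::from(json!({"color": "red", "dimensions": {"width": 10}}));
    let obj = value.as_json().expect("a json object");
    assert_eq!(obj["color"], json!("red"));
    // Per the From<serde_json::Value> impl above, a non-object JSON value panics.
}
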

mod binary_serialize {
    use std::io::{self, Read, Write};

@@ -248,6 +280,7 @@ mod binary_serialize {
    const DATE_CODE: u8 = 5;
    const F64_CODE: u8 = 6;
    const EXT_CODE: u8 = 7;
    const JSON_OBJ_CODE: u8 = 8;

    // extended types

@@ -296,8 +329,14 @@ mod binary_serialize {
                BYTES_CODE.serialize(writer)?;
                bytes.serialize(writer)
            }
            Value::JsonObject(ref map) => {
                JSON_OBJ_CODE.serialize(writer)?;
                serde_json::to_writer(writer, &map)?;
                Ok(())
            }
        }
    }

    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
        let type_code = u8::deserialize(reader)?;
        match type_code {
@@ -347,6 +386,10 @@ mod binary_serialize {
                    )),
                }
            }
            JSON_OBJ_CODE => {
                let map = serde_json::from_reader(reader)?;
                Ok(Value::JsonObject(map))
            }
            _ => Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!("No field type is associated with code {:?}", type_code),

@@ -4,11 +4,12 @@ use std::sync::{Arc, Mutex};

use common::{BinarySerializable, HasLen, VInt};
use lru::LruCache;
use ownedbytes::OwnedBytes;

use super::footer::DocStoreFooter;
use super::index::SkipIndex;
use super::Compressor;
use crate::directory::{FileSlice, OwnedBytes};
use crate::directory::FileSlice;
use crate::error::DataCorruption;
use crate::fastfield::AliveBitSet;
use crate::schema::Document;
@@ -239,6 +240,60 @@ impl StoreReader {
    }
}

#[cfg(feature = "quickwit")]
impl StoreReader {
    async fn read_block_async(&self, checkpoint: &Checkpoint) -> crate::AsyncIoResult<Block> {
        if let Some(block) = self.cache.lock().unwrap().get(&checkpoint.byte_range.start) {
            self.cache_hits.fetch_add(1, Ordering::SeqCst);
            return Ok(block.clone());
        }

        self.cache_misses.fetch_add(1, Ordering::SeqCst);

        let compressed_block = self
            .data
            .slice(checkpoint.byte_range.clone())
            .read_bytes_async()
            .await?;
        let mut decompressed_block = vec![];
        self.compressor
            .decompress(compressed_block.as_slice(), &mut decompressed_block)?;

        let block = OwnedBytes::new(decompressed_block);
        self.cache
            .lock()
            .unwrap()
            .put(checkpoint.byte_range.start, block.clone());

        Ok(block)
    }

    /// Fetches a document asynchronously.
    async fn get_document_bytes_async(&self, doc_id: DocId) -> crate::Result<OwnedBytes> {
        let checkpoint = self.block_checkpoint(doc_id).ok_or_else(|| {
            crate::TantivyError::InvalidArgument(format!("Failed to lookup Doc #{}.", doc_id))
        })?;
        let block = self.read_block_async(&checkpoint).await?;
        let mut cursor = &block[..];
        let cursor_len_before = cursor.len();
        for _ in checkpoint.doc_range.start..doc_id {
            let doc_length = VInt::deserialize(&mut cursor)?.val() as usize;
            cursor = &cursor[doc_length..];
        }
        let doc_length = VInt::deserialize(&mut cursor)?.val() as usize;
        let start_pos = cursor_len_before - cursor.len();
        let end_pos = cursor_len_before - cursor.len() + doc_length;
        Ok(block.slice(start_pos..end_pos))
    }

    /// Reads raw bytes of a given document. Returns `RawDocument`, which contains the block of a
    /// document and its start and end position within the block.
    pub(crate) async fn get_async(&self, doc_id: DocId) -> crate::Result<Document> {
        let mut doc_bytes = self.get_document_bytes_async(doc_id).await?;
        Ok(Document::deserialize(&mut doc_bytes)?)
    }
}

#[cfg(test)]
mod tests {
    use std::path::Path;
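
Stepping back from the diff: the async `StoreReader` methods above locate a document by decompressing one block and then skipping length-prefixed entries until the requested doc id. The sketch below mirrors that walk on a plain byte slice; `read_len` stands in for `VInt::deserialize` from the `common` crate, and the function is an illustration, not tantivy code:

/// Illustration only: each document in a decompressed doc-store block is laid out
/// as <length prefix><serialized document>, so reaching `doc_id` means skipping
/// the length-prefixed entries that precede it in the block.
fn doc_range_in_block(
    block: &[u8],
    first_doc_in_block: u32,
    doc_id: u32,
    read_len: impl Fn(&mut &[u8]) -> usize, // stand-in for VInt::deserialize
) -> std::ops::Range<usize> {
    let mut cursor = block;
    let len_before = cursor.len();
    // Skip the documents that precede `doc_id` in this block.
    for _ in first_doc_in_block..doc_id {
        let doc_len = read_len(&mut cursor);
        cursor = &cursor[doc_len..];
    }
    // The next length prefix belongs to `doc_id` itself.
    let doc_len = read_len(&mut cursor);
    let start = len_before - cursor.len();
    start..start + doc_len
}

In the real code this walk happens on the cached `OwnedBytes` block, and the resulting range is sliced out with `block.slice(start_pos..end_pos)` as shown above.
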
Some files were not shown because too many files have changed in this diff.