Compare commits


22 Commits

Author SHA1 Message Date
Pascal Seitz
f5a716e827 update basic_search example 2024-05-30 21:56:22 +08:00
Meng Zhang
4143d31865 chore: fix build as the rev is gone (#2417) 2024-05-29 09:49:16 +08:00
Hamir Mahal
0c634adbe1 style: simplify strings with string interpolation (#2412)
* style: simplify strings with string interpolation

* fix: formatting
2024-05-27 09:16:47 +02:00
PSeitz
2e3641c2ae return CompactDocValue instead of trait (#2410)
CompactDocValue is easier to handle than the trait in some cases, such as comparison
and conversion.
2024-05-27 07:33:50 +02:00
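A hedged illustration of why returning a concrete value type is easier to handle than a trait object: a plain enum can derive comparison and conversion directly, whereas a `dyn` trait value needs extra plumbing for both. The types below are hypothetical stand-ins, not tantivy's actual `CompactDocValue`.
```rust
// Hypothetical stand-ins, not tantivy's actual types: a concrete enum can
// derive PartialEq and implement From, so comparison and conversion are
// one-liners; a boxed trait object cannot be compared without downcasting.
#[derive(Debug, Clone, PartialEq)]
enum CompactValue {
    U64(u64),
    Str(String),
}

impl From<u64> for CompactValue {
    fn from(v: u64) -> Self {
        CompactValue::U64(v)
    }
}

fn main() {
    let a = CompactValue::from(42u64);
    let b = CompactValue::Str("42".to_string());
    assert_ne!(a, b); // direct comparison, no downcasting needed
}
```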
Paul Masurel
b806122c81 Fixing flaky test (#2407) 2024-05-22 10:10:55 +09:00
PSeitz
e1679f3fb9 compact doc (#2402)
* compact doc

* add any value type

* pass references when building CompactDoc

* remove OwnedValue from API

* clippy

* clippy

* fail on large documents

* fmt

* cleanup

* cleanup

* implement Value for different types

fix serde_json date Value implementation

* fmt

* cleanup

* fmt

* cleanup

* store positions instead of pos+len

* remove nodes array

* remove mediumvec

* cleanup

* infallible serialize into vec

* remove positions indirection

* remove 24MB limitation in document

use u32 for Addr
Remove the 3 byte addressing limitation and use VInt instead

* cleanup

* extend test

* cleanup, add comments

* rename, remove pub
2024-05-21 10:16:08 +02:00
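The "remove 24MB limitation" bullet above refers to dropping fixed 3-byte addressing in favour of variable-length integers. Below is a minimal LEB128-style sketch of variable-length `u32` encoding; it is illustrative only and not the actual tantivy implementation.
```rust
// A minimal sketch of LEB128-style variable-length encoding for a u32,
// similar in spirit to the VInt mentioned above (illustrative only).
fn write_vint_u32(mut val: u32, out: &mut Vec<u8>) {
    loop {
        let byte = (val & 0x7F) as u8;
        val >>= 7;
        if val == 0 {
            out.push(byte);
            return;
        }
        // Set the continuation bit for every byte except the last one.
        out.push(byte | 0x80);
    }
}

fn main() {
    let mut buf = Vec::new();
    write_vint_u32(300, &mut buf);
    // 300 = 0b1_0010_1100 -> two bytes: 0xAC, 0x02
    assert_eq!(buf, vec![0xAC, 0x02]);
}
```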
dependabot[bot]
5a80420b10 --- (#2406)
updated-dependencies:
- dependency-name: binggan
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-05-21 04:36:32 +02:00
dependabot[bot]
aa26ff5029 Update binggan requirement from 0.6.2 to 0.7.0 (#2401)
---
updated-dependencies:
- dependency-name: binggan
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-05-17 02:53:25 +02:00
dependabot[bot]
e197b59258 Update itertools requirement from 0.12.0 to 0.13.0 (#2400)
Updates the requirements on [itertools](https://github.com/rust-itertools/itertools) to permit the latest version.
- [Changelog](https://github.com/rust-itertools/itertools/blob/master/CHANGELOG.md)
- [Commits](https://github.com/rust-itertools/itertools/compare/v0.12.0...v0.13.0)

---
updated-dependencies:
- dependency-name: itertools
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-05-17 02:53:02 +02:00
PSeitz
5b7cca13e5 lower contention on AggregationLimits (#2394)
PR https://github.com/quickwit-oss/quickwit/pull/4962 fixes an issue
where the AggregationLimits were not passed correctly. Now that the
AggregationLimits are shared properly, we run into contention issues.

This PR includes some straightforward improvements to reduce contention,
by only updating when the memory changed and by avoiding the second read.

We probably need some sharding with multiple counters or local caching before updating the
global after some threshold.
2024-05-15 12:25:40 +02:00
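A rough sketch of the contention-reduction idea described above: track memory changes locally and only touch the shared counter when the accumulated delta crosses a threshold. The names and structure are hypothetical, not tantivy's actual `AggregationLimits`.
```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

// Hypothetical sketch: batch local memory deltas and update the shared
// counter only when the accumulated change crosses a threshold, so most
// calls never contend on the shared atomic.
struct LocalMemCounter {
    shared: Arc<AtomicU64>,
    local_delta: u64,
    flush_threshold: u64,
}

impl LocalMemCounter {
    fn add(&mut self, bytes: u64) {
        self.local_delta += bytes;
        if self.local_delta >= self.flush_threshold {
            self.shared.fetch_add(self.local_delta, Ordering::Relaxed);
            self.local_delta = 0;
        }
    }
}

fn main() {
    let shared = Arc::new(AtomicU64::new(0));
    let mut local = LocalMemCounter {
        shared: Arc::clone(&shared),
        local_delta: 0,
        flush_threshold: 1024,
    };
    for _ in 0..10 {
        local.add(200); // only one of these ten calls hits the shared counter
    }
    assert_eq!(shared.load(Ordering::Relaxed), 1200); // 800 bytes still pending locally
}
```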
dependabot[bot]
a79590477e Update binggan requirement from 0.5.2 to 0.6.2 (#2399)
---
updated-dependencies:
- dependency-name: binggan
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-05-15 05:40:37 +02:00
Paul Masurel
6181c1eb5e Small changes in the Executor API. (#2391)
Warning: this change is mildly backward-incompatible,
so I bumped tantivy's version.
2024-05-10 17:19:12 +09:00
Adam Reichold
1ee5f90761 Give allocation control to the caller instead of forcing a clone (#2389)
Achieved by moving the boxes out of the temporary reference wrappers, which are
themselves cloneable; i.e. if required, the caller can clone them or consume them
to reuse the existing allocations.
2024-05-09 16:01:13 +09:00
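A hedged sketch of the pattern described above: the temporary wrapper owns a `Box`, and in addition to a borrowing accessor it offers a consuming accessor so the caller can take over the allocation instead of being forced into a clone. The types are illustrative, not the actual tantivy wrappers.
```rust
// Illustrative only: the wrapper owns a Box; callers choose between borrowing,
// cloning the wrapper, or consuming it to reuse the existing allocation.
#[derive(Clone)]
struct TokenizedRef {
    tokens: Box<Vec<String>>,
}

impl TokenizedRef {
    // Borrowing accessor: cheap, but ownership stays inside the wrapper.
    fn tokens(&self) -> &[String] {
        self.tokens.as_slice()
    }

    // Consuming accessor: the caller takes over the allocation, no clone needed.
    fn into_tokens(self) -> Box<Vec<String>> {
        self.tokens
    }
}

fn main() {
    let wrapper = TokenizedRef {
        tokens: Box::new(vec!["hello".into(), "world".into()]),
    };
    assert_eq!(wrapper.tokens().len(), 2);
    // If the caller wants ownership, it can clone the wrapper or consume it.
    let owned = wrapper.into_tokens();
    assert_eq!(owned.len(), 2);
}
```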
PSeitz
71f3b4e4e3 fix ReferenceValue API flaw (#2372)
* fix ReferenceValue API flaw

Remove `Facet` and `TokenizedString` values from the `ReferenceValue` API,
as this requires the trait value to have them stored somewhere.

Since `TokenizedString` is quite niche, I just copy it into a Box,
instead of designing a reference API around it.

* fix comment link
2024-05-09 06:14:42 +02:00
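A hedged sketch of the trade-off this commit describes: rather than extending a borrowed-value API with a lifetime-bound variant for a niche type, the niche value is boxed and owned. The types are illustrative only, not the actual `ReferenceValue` API.
```rust
// Illustrative only: a borrowed value enum where the rare pre-tokenized case
// is boxed and owned, so the enum does not need a place to borrow it from.
struct PreTokenizedString {
    text: String,
    tokens: Vec<String>,
}

enum RefValue<'a> {
    Str(&'a str),
    U64(u64),
    // Niche case: owned and boxed instead of borrowed.
    PreTokStr(Box<PreTokenizedString>),
}

fn main() {
    let v = RefValue::PreTokStr(Box::new(PreTokenizedString {
        text: "hello world".to_string(),
        tokens: vec!["hello".to_string(), "world".to_string()],
    }));
    match v {
        RefValue::Str(_) | RefValue::U64(_) => {}
        RefValue::PreTokStr(p) => {
            assert_eq!(p.text, "hello world");
            assert_eq!(p.tokens.len(), 2);
        }
    }
}
```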
trinity-1686a
8cd7ddc535 run block decompression from executor (#2386)
* run block decompression from executor

* add a wrapper with is_closed to oneshot channel

* add cancellation test to Executor::spawn_blocking
2024-05-08 12:22:44 +02:00
Paul Masurel
2b76335a95 Removed usage of num_cpus (#2387)
* Removed usage of num_cpus
* handling error
2024-05-08 13:32:52 +09:00
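The standard library's `std::thread::available_parallelism` covers the common `num_cpus` use case; below is a hedged sketch of a single-thread fallback when it errors. Whether this mirrors the exact error handling in the PR is an assumption.
```rust
use std::num::NonZeroUsize;
use std::thread::available_parallelism;

// Sketch: query the available parallelism from std and fall back to a single
// thread if it cannot be determined (e.g. unsupported platform).
fn default_num_threads() -> NonZeroUsize {
    available_parallelism().unwrap_or(NonZeroUsize::new(1).unwrap())
}

fn main() {
    println!("default thread count: {}", default_num_threads());
}
```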
PSeitz
c6b213d8f0 use binggan for agg benchmark (#2378)
* use binggan for agg benchmark

use binggan for the agg benchmark, which also reports memory consumption

Output:
```
full
histogram                     Memory: 15.8 KB              Avg: 10.9322ms  (+5.44%)    Median: 10.8790ms  (+9.28%)     Min: 10.7470ms    Max: 11.3263ms
histogram_hard_bounds         Memory: 15.5 KB              Avg: 5.1939ms  (+6.61%)     Median: 5.1722ms  (+10.98%)     Min: 5.0432ms     Max: 5.3910ms
histogram_with_avg_sub_agg    Memory: 48.7 KB              Avg: 23.8165ms  (+4.57%)    Median: 23.7264ms  (+10.06%)    Min: 23.4995ms    Max: 24.8107ms
dense
histogram                     Memory: 17.3 KB              Avg: 15.6810ms  (-8.54%)    Median: 15.6174ms  (-8.89%)    Min: 15.4953ms    Max: 16.0702ms
histogram_hard_bounds         Memory: 15.4 KB              Avg: 10.0720ms  (-7.33%)    Median: 10.0572ms  (-7.06%)    Min: 9.8500ms     Max: 10.4819ms
histogram_with_avg_sub_agg    Memory: 50.1 KB              Avg: 33.0993ms  (-7.04%)    Median: 32.9499ms  (-6.86%)    Min: 32.8284ms    Max: 34.0529ms
sparse
histogram                     Memory: 16.3 KB              Avg: 19.2325ms  (-0.44%)    Median: 19.1211ms  (-1.26%)    Min: 19.0348ms    Max: 19.7902ms
histogram_hard_bounds         Memory: 16.1 KB              Avg: 18.5179ms  (-0.61%)    Median: 18.4552ms  (-0.90%)    Min: 18.3799ms    Max: 19.0535ms
histogram_with_avg_sub_agg    Memory: 34.7 KB              Avg: 21.2589ms  (-0.69%)    Median: 21.1867ms  (-1.05%)    Min: 21.0342ms    Max: 21.9900ms
```

* add more bench with term as sub agg
2024-05-07 11:29:49 +02:00
PSeitz
eea70030bf cleanup top level exports (#2382)
remove some top level exports
2024-05-07 09:59:41 +02:00
PSeitz
92b5526310 allow more JSON values, fix i64 special case (#2383)
This changes three things:
- Reuse positions_per_path hashmap instead of allocating one per
  indexed JSON value
- Try to cast u64 values to i64 to streamline with search behaviour
- Allow top-level JSON values to be of any type, instead of limiting them
  to JSON objects. Remove the special JSON object handling method.

TODO: We should probably also try converting f64 to i64 and u64 when
indexing, as values may get converted to f64 by the JSON parser
2024-05-01 12:08:12 +02:00
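A small illustrative sketch of the u64-to-i64 casting mentioned above (not the actual tantivy code path): store an unsigned value as `i64` when it fits, so indexed integers line up with how queries interpret them.
```rust
// Illustrative only: coerce a u64 to i64 when it fits in the signed range,
// otherwise keep it as u64.
fn coerce_to_i64(val: u64) -> Option<i64> {
    if val <= i64::MAX as u64 {
        Some(val as i64)
    } else {
        None // too large: keep it as u64
    }
}

fn main() {
    assert_eq!(coerce_to_i64(42), Some(42));
    assert_eq!(coerce_to_i64(u64::MAX), None);
}
```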
PSeitz
99a59ad37e remove zero byte check (#2379)
remove zero byte checks in columnar. zero bytes are converted during serialization now.
unify code paths
extend test for expected column names
2024-04-26 06:03:28 +02:00
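An illustrative sketch of one way zero bytes can be handled during serialization instead of being rejected up front; the escape scheme here is hypothetical and not necessarily what tantivy's columnar crate does.
```rust
// Hypothetical escape scheme: replace NUL bytes in column names so they are
// safe for a 0-byte-delimited format, instead of asserting they never appear.
fn escape_nul(name: &str) -> Vec<u8> {
    let mut out = Vec::with_capacity(name.len());
    for &b in name.as_bytes() {
        if b == 0 {
            out.extend_from_slice(&[0x01, 0x02]); // escape sequence for NUL
        } else {
            out.push(b);
        }
    }
    out
}

fn main() {
    assert_eq!(escape_nul("a\u{0}b"), vec![b'a', 0x01, 0x02, b'b']);
}
```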
trinity-1686a
6a66a71cbb modify fastfield range query heuristic (#2375) 2024-04-25 10:06:11 +02:00
PSeitz
ff40764204 make convert_to_fast_value_and_append_to_json_term pub (#2370)
* make convert_to_fast_value_and_append_to_json_term pub

* clippy
2024-04-23 04:05:41 +02:00
72 changed files with 1884 additions and 1515 deletions

View File

@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.22.0"
+version = "0.23.0"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -15,12 +15,16 @@ rust-version = "1.63"
 exclude = ["benches/*.json", "benches/*.txt"]
 
 [dependencies]
-oneshot = "0.1.5"
+# Switch back to the non-forked oneshot crate once https://github.com/faern/oneshot/pull/35 is merged
+oneshot = { git = "https://github.com/fulmicoton/oneshot.git", rev = "b208f49" }
 base64 = "0.22.0"
 byteorder = "1.4.3"
 crc32fast = "1.3.2"
 once_cell = "1.10.0"
-regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
+regex = { version = "1.5.5", default-features = false, features = [
+    "std",
+    "unicode",
+] }
 aho-corasick = "1.0"
 tantivy-fst = "0.5"
 memmap2 = { version = "0.9.0", optional = true }
@@ -30,14 +34,15 @@ tempfile = { version = "3.3.0", optional = true }
 log = "0.4.16"
 serde = { version = "1.0.136", features = ["derive"] }
 serde_json = "1.0.79"
-num_cpus = "1.13.1"
 fs4 = { version = "0.8.0", optional = true }
 levenshtein_automata = "0.2.1"
 uuid = { version = "1.0.0", features = ["v4", "serde"] }
 crossbeam-channel = "0.5.4"
 rust-stemmers = "1.2.0"
 downcast-rs = "1.2.0"
-bitpacking = { version = "0.9.2", default-features = false, features = ["bitpacker4x"] }
+bitpacking = { version = "0.9.2", default-features = false, features = [
+    "bitpacker4x",
+] }
 census = "0.4.2"
 rustc-hash = "1.1.0"
 thiserror = "1.0.30"
@@ -48,26 +53,26 @@ smallvec = "1.8.0"
 rayon = "1.5.2"
 lru = "0.12.0"
 fastdivide = "0.4.0"
-itertools = "0.12.0"
+itertools = "0.13.0"
 measure_time = "0.8.2"
 arc-swap = "1.5.0"
-columnar = { version= "0.3", path="./columnar", package ="tantivy-columnar" }
-sstable = { version= "0.3", path="./sstable", package ="tantivy-sstable", optional = true }
-stacker = { version= "0.3", path="./stacker", package ="tantivy-stacker" }
-query-grammar = { version= "0.22.0", path="./query-grammar", package = "tantivy-query-grammar" }
-tantivy-bitpacker = { version= "0.6", path="./bitpacker" }
-common = { version= "0.7", path = "./common/", package = "tantivy-common" }
-tokenizer-api = { version= "0.3", path="./tokenizer-api", package="tantivy-tokenizer-api" }
+columnar = { version = "0.3", path = "./columnar", package = "tantivy-columnar" }
+sstable = { version = "0.3", path = "./sstable", package = "tantivy-sstable", optional = true }
+stacker = { version = "0.3", path = "./stacker", package = "tantivy-stacker" }
+query-grammar = { version = "0.22.0", path = "./query-grammar", package = "tantivy-query-grammar" }
+tantivy-bitpacker = { version = "0.6", path = "./bitpacker" }
+common = { version = "0.7", path = "./common/", package = "tantivy-common" }
+tokenizer-api = { version = "0.3", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
 sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
 futures-util = { version = "0.3.28", optional = true }
 fnv = "1.0.7"
-mediumvec = "1.3.0"
 
 [target.'cfg(windows)'.dependencies]
 winapi = "0.3.9"
 
 [dev-dependencies]
+binggan = "0.8.0"
 rand = "0.8.5"
 maplit = "1.0.2"
 matches = "0.1.9"
@@ -82,7 +87,6 @@ time = { version = "0.3.10", features = ["serde-well-known", "macros"] }
 postcard = { version = "1.0.4", features = [
     "use-std",
 ], default-features = false }
-peakmem-alloc = "0.3.0"
 
 [target.'cfg(not(windows))'.dev-dependencies]
 criterion = { version = "0.5", default-features = false }
@@ -114,17 +118,26 @@ lz4-compression = ["lz4_flex"]
 zstd-compression = ["zstd"]
 failpoints = ["fail", "fail/failpoints"]
 unstable = [] # useful for benches.
 quickwit = ["sstable", "futures-util"]
 
 # Compares only the hash of a string when indexing data.
 # Increases indexing speed, but may lead to extremely rare missing terms, when there's a hash collision.
 # Uses 64bit ahash.
 compare_hash_only = ["stacker/compare_hash_only"]
 
 [workspace]
-members = ["query-grammar", "bitpacker", "common", "ownedbytes", "stacker", "sstable", "tokenizer-api", "columnar"]
+members = [
+    "query-grammar",
+    "bitpacker",
+    "common",
+    "ownedbytes",
+    "stacker",
+    "sstable",
+    "tokenizer-api",
+    "columnar",
+]
 
 # Following the "fail" crate best practises, we isolate
 # tests that define specific behavior in fail check points
@@ -145,3 +158,7 @@ harness = false
 [[bench]]
 name = "index-bench"
 harness = false
+
+[[bench]]
+name = "agg_bench"
+harness = false

benches/agg_bench.rs Normal file
View File

@@ -0,0 +1,413 @@
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use rand_distr::Distribution;
use serde_json::json;
use tantivy::aggregation::agg_req::Aggregations;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::{AllQuery, TermQuery};
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
use tantivy::{doc, Index, Term};
#[global_allocator]
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
/// Mini macro to register a function via its name
/// runner.register("average_u64", move |index| average_u64(index));
macro_rules! register {
($runner:expr, $func:ident) => {
$runner.register(stringify!($func), move |index| $func(index))
};
}
fn main() {
let inputs = vec![
("full", get_test_index_bench(Cardinality::Full).unwrap()),
(
"dense",
get_test_index_bench(Cardinality::OptionalDense).unwrap(),
),
(
"sparse",
get_test_index_bench(Cardinality::OptionalSparse).unwrap(),
),
(
"multivalue",
get_test_index_bench(Cardinality::Multivalued).unwrap(),
),
];
bench_agg(InputGroup::new_with_inputs(inputs));
}
fn bench_agg(mut group: InputGroup<Index>) {
group.set_alloc(GLOBAL); // Set the peak mem allocator. This will enable peak memory reporting.
register!(group, average_u64);
register!(group, average_f64);
register!(group, average_f64_u64);
register!(group, stats_f64);
register!(group, percentiles_f64);
register!(group, terms_few);
register!(group, terms_many);
register!(group, terms_many_order_by_term);
register!(group, terms_many_with_top_hits);
register!(group, terms_many_with_avg_sub_agg);
register!(group, terms_many_json_mixed_type_with_sub_agg_card);
register!(group, range_agg);
register!(group, range_agg_with_avg_sub_agg);
register!(group, range_agg_with_term_agg_few);
register!(group, range_agg_with_term_agg_many);
register!(group, histogram);
register!(group, histogram_hard_bounds);
register!(group, histogram_with_avg_sub_agg);
register!(group, avg_and_range_with_avg_sub_agg);
group.run();
}
fn exec_term_with_agg(index: &Index, agg_req: serde_json::Value) {
let agg_req: Aggregations = serde_json::from_value(agg_req).unwrap();
let reader = index.reader().unwrap();
let text_field = reader.searcher().schema().get_field("text").unwrap();
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
let collector = get_collector(agg_req);
let searcher = reader.searcher();
black_box(searcher.search(&term_query, &collector).unwrap());
}
fn average_u64(index: &Index) {
let agg_req = json!({
"average": { "avg": { "field": "score", } }
});
exec_term_with_agg(index, agg_req)
}
fn average_f64(index: &Index) {
let agg_req = json!({
"average": { "avg": { "field": "score_f64", } }
});
exec_term_with_agg(index, agg_req)
}
fn average_f64_u64(index: &Index) {
let agg_req = json!({
"average_f64": { "avg": { "field": "score_f64" } },
"average": { "avg": { "field": "score" } },
});
exec_term_with_agg(index, agg_req)
}
fn stats_f64(index: &Index) {
let agg_req = json!({
"average_f64": { "stats": { "field": "score_f64", } }
});
exec_term_with_agg(index, agg_req)
}
fn percentiles_f64(index: &Index) {
let agg_req = json!({
"mypercentiles": {
"percentiles": {
"field": "score_f64",
"percents": [ 95, 99, 99.9 ]
}
}
});
execute_agg(index, agg_req);
}
fn terms_few(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_few_terms" } },
});
execute_agg(index, agg_req);
}
fn terms_many(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_many_terms" } },
});
execute_agg(index, agg_req);
}
fn terms_many_order_by_term(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_many_terms", "order": { "_key": "desc" } } },
});
execute_agg(index, agg_req);
}
fn terms_many_with_top_hits(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "text_many_terms" },
"aggs": {
"top_hits": { "top_hits":
{
"sort": [
{ "score": "desc" }
],
"size": 2,
"doc_value_fields": ["score_f64"]
}
}
}
},
});
execute_agg(index, agg_req);
}
fn terms_many_with_avg_sub_agg(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "text_many_terms" },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
});
execute_agg(index, agg_req);
}
fn terms_many_json_mixed_type_with_sub_agg_card(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "json.mixed_type" },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
});
execute_agg(index, agg_req);
}
fn execute_agg(index: &Index, agg_req: serde_json::Value) {
let agg_req: Aggregations = serde_json::from_value(agg_req).unwrap();
let collector = get_collector(agg_req);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
black_box(searcher.search(&AllQuery, &collector).unwrap());
}
fn range_agg(index: &Index) {
let agg_req = json!({
"range_f64": { "range": { "field": "score_f64", "ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
] } },
});
execute_agg(index, agg_req);
}
fn range_agg_with_avg_sub_agg(index: &Index) {
let agg_req = json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
]
},
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
});
execute_agg(index, agg_req);
}
fn range_agg_with_term_agg_few(index: &Index) {
let agg_req = json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
]
},
"aggs": {
"my_texts": { "terms": { "field": "text_few_terms" } },
}
},
});
execute_agg(index, agg_req);
}
fn range_agg_with_term_agg_many(index: &Index) {
let agg_req = json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
]
},
"aggs": {
"my_texts": { "terms": { "field": "text_many_terms" } },
}
},
});
execute_agg(index, agg_req);
}
fn histogram(index: &Index) {
let agg_req = json!({
"rangef64": {
"histogram": {
"field": "score_f64",
"interval": 100 // 1000 buckets
},
}
});
execute_agg(index, agg_req);
}
fn histogram_hard_bounds(index: &Index) {
let agg_req = json!({
"rangef64": { "histogram": { "field": "score_f64", "interval": 100, "hard_bounds": { "min": 1000, "max": 300000 } } },
});
execute_agg(index, agg_req);
}
fn histogram_with_avg_sub_agg(index: &Index) {
let agg_req = json!({
"rangef64": {
"histogram": { "field": "score_f64", "interval": 100 },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
}
});
execute_agg(index, agg_req);
}
fn avg_and_range_with_avg_sub_agg(index: &Index) {
let agg_req = json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 60000 }
]
},
"aggs": {
"average_in_range": { "avg": { "field": "score" } }
}
},
"average": { "avg": { "field": "score" } }
});
execute_agg(index, agg_req);
}
#[derive(Clone, Copy, Hash, Default, Debug, PartialEq, Eq, PartialOrd, Ord)]
enum Cardinality {
/// All documents contain exactly one value.
/// `Full` is the default for auto-detecting the Cardinality, since it is the most strict.
#[default]
Full = 0,
/// All documents contain at most one value.
OptionalDense = 1,
/// All documents may contain any number of values.
Multivalued = 2,
/// 1 / 20 documents has a value
OptionalSparse = 3,
}
fn get_collector(agg_req: Aggregations) -> AggregationCollector {
AggregationCollector::from_aggs(agg_req, Default::default())
}
fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
let mut schema_builder = Schema::builder();
let text_fieldtype = tantivy::schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let json_field = schema_builder.add_json_field("json", FAST);
let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST);
let text_field_few_terms = schema_builder.add_text_field("text_few_terms", STRING | FAST);
let score_fieldtype = tantivy::schema::NumericOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
let index = Index::create_from_tempdir(schema_builder.build())?;
let few_terms_data = ["INFO", "ERROR", "WARN", "DEBUG"];
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
let many_terms_data = (0..150_000)
.map(|num| format!("author{num}"))
.collect::<Vec<_>>();
{
let mut rng = StdRng::from_seed([1u8; 32]);
let mut index_writer = index.writer_with_num_threads(1, 200_000_000)?;
// To make the different test cases comparable we just change one doc to force the
// cardinality
if cardinality == Cardinality::OptionalDense {
index_writer.add_document(doc!())?;
}
if cardinality == Cardinality::Multivalued {
index_writer.add_document(doc!(
json_field => json!({"mixed_type": 10.0}),
json_field => json!({"mixed_type": 10.0}),
text_field => "cool",
text_field => "cool",
text_field_many_terms => "cool",
text_field_many_terms => "cool",
text_field_few_terms => "cool",
text_field_few_terms => "cool",
score_field => 1u64,
score_field => 1u64,
score_field_f64 => lg_norm.sample(&mut rng),
score_field_f64 => lg_norm.sample(&mut rng),
score_field_i64 => 1i64,
score_field_i64 => 1i64,
))?;
}
let mut doc_with_value = 1_000_000;
if cardinality == Cardinality::OptionalSparse {
doc_with_value /= 20;
}
let _val_max = 1_000_000.0;
for _ in 0..doc_with_value {
let val: f64 = rng.gen_range(0.0..1_000_000.0);
let json = if rng.gen_bool(0.1) {
// 10% are numeric values
json!({ "mixed_type": val })
} else {
json!({"mixed_type": many_terms_data.choose(&mut rng).unwrap().to_string()})
};
index_writer.add_document(doc!(
text_field => "cool",
json_field => json,
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
score_field => val as u64,
score_field_f64 => lg_norm.sample(&mut rng),
score_field_i64 => val as i64,
))?;
if cardinality == Cardinality::OptionalSparse {
for _ in 0..20 {
index_writer.add_document(doc!(text_field => "cool"))?;
}
}
}
// writing the segment
index_writer.commit()?;
}
Ok(index)
}

View File

@@ -18,7 +18,7 @@ fn benchmark(
         benchmark_dynamic_json(b, input, schema, commit, parse_json)
     } else {
         _benchmark(b, input, schema, commit, parse_json, |schema, doc_json| {
-            TantivyDocument::parse_json(&schema, doc_json).unwrap()
+            TantivyDocument::parse_json(schema, doc_json).unwrap()
         })
     }
 }
@@ -90,8 +90,7 @@ fn benchmark_dynamic_json(
 ) {
     let json_field = schema.get_field("json").unwrap();
     _benchmark(b, input, schema, commit, parse_json, |_schema, doc_json| {
-        let json_val: serde_json::Map<String, serde_json::Value> =
-            serde_json::from_str(doc_json).unwrap();
+        let json_val: serde_json::Value = serde_json::from_str(doc_json).unwrap();
         tantivy::doc!(json_field=>json_val)
     })
 }
@@ -138,15 +137,16 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
     for (prefix, schema, is_dynamic) in benches {
         for commit in [false, true] {
             let suffix = if commit { "with-commit" } else { "no-commit" };
-            for parse_json in [false] {
+            {
+                let parse_json = false;
                 // for parse_json in [false, true] {
                 let suffix = if parse_json {
-                    format!("{}-with-json-parsing", suffix)
+                    format!("{suffix}-with-json-parsing")
                 } else {
-                    format!("{}", suffix)
+                    suffix.to_string()
                 };
-                let bench_name = format!("{}{}", prefix, suffix);
+                let bench_name = format!("{prefix}{suffix}");
                 group.bench_function(bench_name, |b| {
                     benchmark(b, HDFS_LOGS, schema.clone(), commit, parse_json, is_dynamic)
                 });

View File

@@ -9,7 +9,7 @@ description = "column oriented storage for tantivy"
 categories = ["database-implementations", "data-structures", "compression"]
 
 [dependencies]
-itertools = "0.12.0"
+itertools = "0.13.0"
 fastdivide = "0.4.0"
 stacker = { version= "0.3", path = "../stacker", package="tantivy-stacker"}

View File

@@ -59,22 +59,6 @@ pub struct ColumnarWriter {
     buffers: SpareBuffers,
 }
 
-#[inline]
-fn mutate_or_create_column<V, TMutator>(
-    arena_hash_map: &mut ArenaHashMap,
-    column_name: &str,
-    updater: TMutator,
-) where
-    V: Copy + 'static,
-    TMutator: FnMut(Option<V>) -> V,
-{
-    assert!(
-        !column_name.as_bytes().contains(&0u8),
-        "key may not contain the 0 byte"
-    );
-    arena_hash_map.mutate_or_create(column_name.as_bytes(), updater);
-}
-
 impl ColumnarWriter {
     pub fn mem_usage(&self) -> usize {
         self.arena.mem_usage()
@@ -175,9 +159,8 @@ impl ColumnarWriter {
                     },
                     &mut self.dictionaries,
                 );
-                mutate_or_create_column(
-                    hash_map,
-                    column_name,
+                hash_map.mutate_or_create(
+                    column_name.as_bytes(),
                     |column_opt: Option<StrOrBytesColumnWriter>| {
                         let mut column_writer = if let Some(column_writer) = column_opt {
                             column_writer
@@ -192,24 +175,21 @@ impl ColumnarWriter {
                 );
             }
             ColumnType::Bool => {
-                mutate_or_create_column(
-                    &mut self.bool_field_hash_map,
-                    column_name,
+                self.bool_field_hash_map.mutate_or_create(
+                    column_name.as_bytes(),
                     |column_opt: Option<ColumnWriter>| column_opt.unwrap_or_default(),
                 );
             }
             ColumnType::DateTime => {
-                mutate_or_create_column(
-                    &mut self.datetime_field_hash_map,
-                    column_name,
+                self.datetime_field_hash_map.mutate_or_create(
+                    column_name.as_bytes(),
                     |column_opt: Option<ColumnWriter>| column_opt.unwrap_or_default(),
                 );
             }
             ColumnType::I64 | ColumnType::F64 | ColumnType::U64 => {
                 let numerical_type = column_type.numerical_type().unwrap();
-                mutate_or_create_column(
-                    &mut self.numerical_field_hash_map,
-                    column_name,
+                self.numerical_field_hash_map.mutate_or_create(
+                    column_name.as_bytes(),
                     |column_opt: Option<NumericalColumnWriter>| {
                         let mut column: NumericalColumnWriter = column_opt.unwrap_or_default();
                         column.force_numerical_type(numerical_type);
@@ -217,9 +197,8 @@ impl ColumnarWriter {
                     },
                 );
             }
-            ColumnType::IpAddr => mutate_or_create_column(
-                &mut self.ip_addr_field_hash_map,
-                column_name,
+            ColumnType::IpAddr => self.ip_addr_field_hash_map.mutate_or_create(
+                column_name.as_bytes(),
                 |column_opt: Option<ColumnWriter>| column_opt.unwrap_or_default(),
             ),
         }
@@ -232,9 +211,8 @@ impl ColumnarWriter {
         numerical_value: T,
     ) {
         let (hash_map, arena) = (&mut self.numerical_field_hash_map, &mut self.arena);
-        mutate_or_create_column(
-            hash_map,
-            column_name,
+        hash_map.mutate_or_create(
+            column_name.as_bytes(),
             |column_opt: Option<NumericalColumnWriter>| {
                 let mut column: NumericalColumnWriter = column_opt.unwrap_or_default();
                 column.record_numerical_value(doc, numerical_value.into(), arena);
@@ -244,10 +222,6 @@ impl ColumnarWriter {
     }
 
     pub fn record_ip_addr(&mut self, doc: RowId, column_name: &str, ip_addr: Ipv6Addr) {
-        assert!(
-            !column_name.as_bytes().contains(&0u8),
-            "key may not contain the 0 byte"
-        );
         let (hash_map, arena) = (&mut self.ip_addr_field_hash_map, &mut self.arena);
         hash_map.mutate_or_create(
             column_name.as_bytes(),
@@ -261,24 +235,30 @@ impl ColumnarWriter {
     pub fn record_bool(&mut self, doc: RowId, column_name: &str, val: bool) {
         let (hash_map, arena) = (&mut self.bool_field_hash_map, &mut self.arena);
-        mutate_or_create_column(hash_map, column_name, |column_opt: Option<ColumnWriter>| {
-            let mut column: ColumnWriter = column_opt.unwrap_or_default();
-            column.record(doc, val, arena);
-            column
-        });
+        hash_map.mutate_or_create(
+            column_name.as_bytes(),
+            |column_opt: Option<ColumnWriter>| {
+                let mut column: ColumnWriter = column_opt.unwrap_or_default();
+                column.record(doc, val, arena);
+                column
+            },
+        );
     }
 
     pub fn record_datetime(&mut self, doc: RowId, column_name: &str, datetime: common::DateTime) {
         let (hash_map, arena) = (&mut self.datetime_field_hash_map, &mut self.arena);
-        mutate_or_create_column(hash_map, column_name, |column_opt: Option<ColumnWriter>| {
-            let mut column: ColumnWriter = column_opt.unwrap_or_default();
-            column.record(
-                doc,
-                NumericalValue::I64(datetime.into_timestamp_nanos()),
-                arena,
-            );
-            column
-        });
+        hash_map.mutate_or_create(
+            column_name.as_bytes(),
+            |column_opt: Option<ColumnWriter>| {
+                let mut column: ColumnWriter = column_opt.unwrap_or_default();
+                column.record(
+                    doc,
+                    NumericalValue::I64(datetime.into_timestamp_nanos()),
+                    arena,
+                );
+                column
+            },
+        );
     }
 
     pub fn record_str(&mut self, doc: RowId, column_name: &str, value: &str) {
@@ -303,10 +283,6 @@ impl ColumnarWriter {
     }
 
     pub fn record_bytes(&mut self, doc: RowId, column_name: &str, value: &[u8]) {
-        assert!(
-            !column_name.as_bytes().contains(&0u8),
-            "key may not contain the 0 byte"
-        );
         let (hash_map, arena, dictionaries) = (
            &mut self.bytes_field_hash_map,
            &mut self.arena,

View File

@@ -151,7 +151,7 @@ pub fn read_u32_vint_no_advance(data: &[u8]) -> (u32, usize) {
     (result, vlen)
 }
 
 /// Write a `u32` as a vint payload.
-pub fn write_u32_vint<W: io::Write>(val: u32, writer: &mut W) -> io::Result<()> {
+pub fn write_u32_vint<W: io::Write + ?Sized>(val: u32, writer: &mut W) -> io::Result<()> {
     let mut buf = [0u8; 8];
     let data = serialize_vint_u32(val, &mut buf);
     writer.write_all(data)

View File

@@ -19,13 +19,14 @@ use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
 use tempfile::TempDir;
 
 fn main() -> tantivy::Result<()> {
-    // Let's create a temporary directory for the
-    // sake of this example
+    // Normally you would use `MMapDirectory` instead to persist data on disk.
+    // https://docs.rs/tantivy/latest/tantivy/directory/struct.MmapDirectory.html
+    // But for this example, we will use a temporary directory `TempDir`.
     let index_path = TempDir::new()?;
 
     // # Defining the schema
     //
-    // The Tantivy index requires a very strict schema.
+    // The Tantivy index requires a schema.
     // The schema declares which fields are in the index,
     // and for each field, its type and "the way it should
     // be indexed".

View File

@@ -11,9 +11,10 @@ use columnar::Column;
 // ---
 // Importing tantivy...
 use tantivy::collector::{Collector, SegmentCollector};
+use tantivy::index::SegmentReader;
 use tantivy::query::QueryParser;
 use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
-use tantivy::{doc, Index, IndexWriter, Score, SegmentReader};
+use tantivy::{doc, Index, IndexWriter, Score};
 
 #[derive(Default)]
 struct Stats {

View File

@@ -4,7 +4,7 @@
 use tantivy::collector::TopDocs;
 use tantivy::query::QueryParser;
-use tantivy::schema::{DateOptions, Document, OwnedValue, Schema, INDEXED, STORED, STRING};
+use tantivy::schema::{DateOptions, Document, Schema, Value, INDEXED, STORED, STRING};
 use tantivy::{Index, IndexWriter, TantivyDocument};
 
 fn main() -> tantivy::Result<()> {
@@ -13,7 +13,7 @@ fn main() -> tantivy::Result<()> {
     let opts = DateOptions::from(INDEXED)
         .set_stored()
         .set_fast()
-        .set_precision(tantivy::DateTimePrecision::Seconds);
+        .set_precision(tantivy::schema::DateTimePrecision::Seconds);
     // Add `occurred_at` date field type
     let occurred_at = schema_builder.add_date_field("occurred_at", opts);
     let event_type = schema_builder.add_text_field("event", STRING | STORED);
@@ -61,10 +61,12 @@ fn main() -> tantivy::Result<()> {
     assert_eq!(count_docs.len(), 1);
     for (_score, doc_address) in count_docs {
         let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
-        assert!(matches!(
-            retrieved_doc.get_first(occurred_at),
-            Some(OwnedValue::Date(_))
-        ));
+        assert!(retrieved_doc
+            .get_first(occurred_at)
+            .unwrap()
+            .as_value()
+            .as_datetime()
+            .is_some(),);
         assert_eq!(
             retrieved_doc.to_json(&schema),
             r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"#

View File

@@ -1,335 +0,0 @@
#![allow(unused_imports)]
#![allow(dead_code)]
use std::alloc::System;
use std::env::args;
use std::net::Ipv6Addr;
use columnar::{MonotonicallyMappableToU128, MonotonicallyMappableToU64};
use common::{BinarySerializable, CountingWriter, DateTime, FixedSize};
use peakmem_alloc::*;
use tantivy::schema::{Field, FieldValue, OwnedValue, FAST, INDEXED, STRING, TEXT};
use tantivy::tokenizer::PreTokenizedString;
use tantivy::{doc, TantivyDocument};
const GH_LOGS: &str = include_str!("../benches/gh.json");
const HDFS_LOGS: &str = include_str!("../benches/hdfs.json");
#[global_allocator]
static GLOBAL: &PeakMemAlloc<System> = &INSTRUMENTED_SYSTEM;
fn main() {
dbg!(std::mem::size_of::<TantivyDocument>());
dbg!(std::mem::size_of::<DocContainerRef>());
dbg!(std::mem::size_of::<OwnedValue>());
dbg!(std::mem::size_of::<OwnedValueMedVec>());
dbg!(std::mem::size_of::<ValueContainerRef>());
dbg!(std::mem::size_of::<mediumvec::vec32::Vec32::<u8>>());
let filter = args().nth(1);
measure_fn(
test_hdfs::<TantivyDocument>,
"hdfs TantivyDocument",
&filter,
);
measure_fn(
test_hdfs::<TantivyDocumentMedVec>,
"hdfs TantivyDocumentMedVec",
&filter,
);
measure_fn(
test_hdfs::<DocContainerRef>,
"hdfs DocContainerRef",
&filter,
);
measure_fn(test_gh::<TantivyDocument>, "gh TantivyDocument", &filter);
measure_fn(
test_gh::<TantivyDocumentMedVec>,
"gh TantivyDocumentMedVec",
&filter,
);
measure_fn(test_gh::<DocContainerRef>, "gh DocContainerRef", &filter);
}
fn measure_fn<F: FnOnce()>(f: F, name: &str, filter: &Option<std::string::String>) {
if let Some(filter) = filter {
if !name.contains(filter) {
return;
}
}
GLOBAL.reset_peak_memory();
f();
println!("Peak Memory {} : {:#?}", GLOBAL.get_peak_memory(), name);
}
fn test_hdfs<T: From<TantivyDocument>>() {
let schema = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_u64_field("timestamp", INDEXED);
schema_builder.add_text_field("body", TEXT);
schema_builder.add_text_field("severity", STRING);
schema_builder.build()
};
let mut docs: Vec<T> = Vec::with_capacity(HDFS_LOGS.lines().count());
for doc_json in HDFS_LOGS.lines() {
let doc = TantivyDocument::parse_json(&schema, doc_json)
.unwrap()
.into();
docs.push(doc);
}
}
fn test_gh<T: From<TantivyDocument>>() {
let schema = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_json_field("json", FAST);
schema_builder.build()
};
let mut docs: Vec<T> = Vec::with_capacity(GH_LOGS.lines().count());
for doc_json in GH_LOGS.lines() {
let json_field = schema.get_field("json").unwrap();
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val).into();
docs.push(doc);
}
}
#[derive(Clone, Debug, Default)]
#[allow(dead_code)]
pub struct TantivyDocumentMedVec {
field_values: mediumvec::Vec32<FieldValueMedVec>,
}
#[derive(Debug, Clone, PartialEq)]
pub struct FieldValueMedVec {
pub field: Field,
pub value: OwnedValueMedVec,
}
/// This is a owned variant of `Value`, that can be passed around without lifetimes.
/// Represents the value of a any field.
/// It is an enum over all over all of the possible field type.
#[derive(Debug, Clone, PartialEq)]
pub enum OwnedValueMedVec {
/// A null value.
Null,
/// The str type is used for any text information.
Str(mediumvec::vec32::Vec32<u8>),
/// Unsigned 64-bits Integer `u64`
U64(u64),
/// Signed 64-bits Integer `i64`
I64(i64),
/// 64-bits Float `f64`
F64(f64),
/// Bool value
Bool(bool),
/// Date/time with nanoseconds precision
Date(DateTime),
Array(mediumvec::vec32::Vec32<Self>),
/// Dynamic object value.
Object(mediumvec::vec32::Vec32<(String, Self)>),
/// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`.
IpAddr(Ipv6Addr),
/// Pre-tokenized str type,
PreTokStr(Box<PreTokenizedString>),
/// Arbitrarily sized byte array
Bytes(mediumvec::vec32::Vec32<u8>),
}
impl From<TantivyDocument> for TantivyDocumentMedVec {
fn from(doc: TantivyDocument) -> Self {
let field_values = doc
.into_iter()
.map(|fv| FieldValueMedVec {
field: fv.field,
value: fv.value.into(),
})
.collect();
TantivyDocumentMedVec { field_values }
}
}
impl From<OwnedValue> for OwnedValueMedVec {
fn from(value: OwnedValue) -> Self {
match value {
OwnedValue::Null => OwnedValueMedVec::Null,
OwnedValue::Str(s) => {
let bytes = s.into_bytes();
let vec = mediumvec::vec32::Vec32::from_vec(bytes);
OwnedValueMedVec::Str(vec)
}
OwnedValue::U64(u) => OwnedValueMedVec::U64(u),
OwnedValue::I64(i) => OwnedValueMedVec::I64(i),
OwnedValue::F64(f) => OwnedValueMedVec::F64(f),
OwnedValue::Bool(b) => OwnedValueMedVec::Bool(b),
OwnedValue::Date(d) => OwnedValueMedVec::Date(d),
OwnedValue::Array(arr) => {
let arr = arr.into_iter().map(|v| v.into()).collect();
OwnedValueMedVec::Array(arr)
}
OwnedValue::Object(obj) => {
let obj = obj.into_iter().map(|(k, v)| (k, v.into())).collect();
OwnedValueMedVec::Object(obj)
}
OwnedValue::IpAddr(ip) => OwnedValueMedVec::IpAddr(ip),
_ => panic!("Unsupported value type {:?}", value),
}
}
}
#[repr(packed)]
pub struct FieldValueContainerRef {
pub field: u16,
pub value: ValueContainerRef,
}
#[repr(packed)]
struct DocContainerRef {
container: OwnedValueRefContainer,
field_values: mediumvec::Vec32<FieldValueContainerRef>,
}
#[derive(Default)]
struct OwnedValueRefContainer {
nodes: mediumvec::Vec32<ValueContainerRef>,
node_data: mediumvec::Vec32<u8>,
}
impl OwnedValueRefContainer {
fn shrink_to_fit(&mut self) {
self.nodes.shrink_to_fit();
self.node_data.shrink_to_fit();
}
}
impl From<TantivyDocument> for DocContainerRef {
fn from(doc: TantivyDocument) -> Self {
let mut container = OwnedValueRefContainer::default();
let field_values = doc
.into_iter()
.map(|fv| FieldValueContainerRef {
field: fv.field.field_id().try_into().unwrap(),
value: container.add_value(fv.value),
})
.collect();
container.shrink_to_fit();
Self {
field_values,
container,
}
}
}
// References to positions in two array, one for the OwnedValueRef and the other for the encoded
// bytes
#[derive(Debug, Clone, PartialEq)]
pub enum ValueContainerRef {
/// A null value.
Null,
/// The str type is used for any text information.
Str(u32),
/// Unsigned 64-bits Integer `u64`
U64(u32), // position of the serialized 8 bytes in the data array
/// Signed 64-bits Integer `i64`
I64(u32), // position of the serialized 8 bytes in the data array
/// 64-bits Float `f64`
F64(u32), // position of the serialized 8 bytes in the data array
/// Bool value
Bool(bool), // inlined bool
/// Date/time with nanoseconds precision
Date(u32), // position of the serialized 8 byte in the data array
Array(NodeAddress),
/// Dynamic object value.
Object(NodeAddress),
/// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`.
IpAddr(u32), // position of the serialized 16 bytes in the data array
/// Arbitrarily sized byte array
Bytes(u32),
}
#[derive(Debug, Clone, PartialEq)]
pub struct NodeAddress {
pos: u32,
num_nodes: u32,
}
impl OwnedValueRefContainer {
pub fn add_value(&mut self, value: OwnedValue) -> ValueContainerRef {
match value {
OwnedValue::Null => ValueContainerRef::Null,
OwnedValue::U64(num) => ValueContainerRef::U64(write_into(&mut self.node_data, num)),
OwnedValue::I64(num) => ValueContainerRef::I64(write_into(&mut self.node_data, num)),
OwnedValue::F64(num) => ValueContainerRef::F64(write_into(&mut self.node_data, num)),
OwnedValue::Bool(b) => ValueContainerRef::Bool(b),
OwnedValue::Date(date) => ValueContainerRef::Date(write_into(
&mut self.node_data,
date.into_timestamp_nanos(),
)),
OwnedValue::Str(bytes) => {
ValueContainerRef::Str(write_into(&mut self.node_data, bytes))
}
OwnedValue::Bytes(bytes) => {
ValueContainerRef::Bytes(write_into(&mut self.node_data, bytes))
}
OwnedValue::Array(elements) => {
let pos = self.nodes.len() as u32;
let len = elements.len() as u32;
for elem in elements {
let ref_elem = self.add_value(elem);
self.nodes.push(ref_elem);
}
ValueContainerRef::Array(NodeAddress {
pos,
num_nodes: len,
})
}
OwnedValue::Object(entries) => {
let pos = self.nodes.len() as u32;
let len = entries.len() as u32;
for (key, value) in entries {
let ref_key = self.add_value(OwnedValue::Str(key));
let ref_value = self.add_value(value);
self.nodes.push(ref_key);
self.nodes.push(ref_value);
}
ValueContainerRef::Object(NodeAddress {
pos,
num_nodes: len,
})
}
OwnedValue::IpAddr(num) => {
ValueContainerRef::IpAddr(write_into(&mut self.node_data, num.to_u128()))
}
OwnedValue::PreTokStr(_) => todo!(),
OwnedValue::Facet(_) => todo!(),
}
}
}
fn write_into<T: BinarySerializable>(data: &mut mediumvec::Vec32<u8>, value: T) -> u32 {
let pos = data.len() as u32;
data.as_vec(|vec| value.serialize(vec).unwrap());
pos
}
fn write_into_2<T: BinarySerializable>(data: &mut mediumvec::Vec32<u8>, value: T) -> NodeAddress {
let pos = data.len() as u32;
let mut len = 0;
data.as_vec(|vec| {
let mut wrt = CountingWriter::wrap(vec);
value.serialize(&mut wrt).unwrap();
len = wrt.written_bytes() as u32;
});
NodeAddress {
pos,
num_nodes: len,
}
}
// impl From<ContainerDocRef> for TantivyDocument {
// fn from(doc: ContainerDocRef) -> Self {
// let mut doc2 = TantivyDocument::new();
// for fv in doc.field_values {
// let field = Field::from_field_id(fv.field as u32);
// let value = doc.container.get_value(fv.value);
// doc2.add(FieldValue::new(field, value));
//}
// doc2
//}

View File

@@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> {
     let reader = index.reader()?;
     let searcher = reader.searcher();
     {
-        let facets = vec![
+        let facets = [
             Facet::from("/ingredient/egg"),
             Facet::from("/ingredient/oil"),
             Facet::from("/ingredient/garlic"),
@@ -94,9 +94,8 @@ fn main() -> tantivy::Result<()> {
                     .doc::<TantivyDocument>(*doc_id)
                     .unwrap()
                     .get_first(title)
-                    .and_then(|v| v.as_str())
+                    .and_then(|v| v.as_str().map(|el| el.to_string()))
                     .unwrap()
-                    .to_owned()
             })
             .collect();
         assert_eq!(titles, vec!["Fried egg", "Egg rolls"]);

View File

@@ -61,7 +61,7 @@ fn main() -> tantivy::Result<()> {
                     debris of the winters flooding; and sycamores with mottled, white, recumbent \
                     limbs and branches that arch over the pool"
             ))?;
-            println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
+            println!("add doc {i} from thread 1 - opstamp {opstamp}");
             thread::sleep(Duration::from_millis(20));
         }
         Result::<(), TantivyError>::Ok(())
@@ -82,7 +82,7 @@ fn main() -> tantivy::Result<()> {
                     body => "Some great book description..."
                 ))?
             };
-            println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
+            println!("add doc {i} from thread 2 - opstamp {opstamp}");
             thread::sleep(Duration::from_millis(10));
         }
         Result::<(), TantivyError>::Ok(())

View File

@@ -7,10 +7,11 @@
 // the list of documents containing a term, getting
 // its term frequency, and accessing its positions.
 
+use tantivy::postings::Postings;
 // ---
 // Importing tantivy...
 use tantivy::schema::*;
-use tantivy::{doc, DocSet, Index, IndexWriter, Postings, TERMINATED};
+use tantivy::{doc, DocSet, Index, IndexWriter, TERMINATED};
 
 fn main() -> tantivy::Result<()> {
     // We first create a schema for the sake of the

View File

@@ -3,10 +3,11 @@ use std::collections::{HashMap, HashSet};
 use std::sync::{Arc, RwLock, Weak};
 
 use tantivy::collector::TopDocs;
+use tantivy::index::SegmentId;
 use tantivy::query::QueryParser;
 use tantivy::schema::{Schema, FAST, TEXT};
 use tantivy::{
-    doc, DocAddress, DocId, Index, IndexWriter, Opstamp, Searcher, SearcherGeneration, SegmentId,
+    doc, DocAddress, DocId, Index, IndexWriter, Opstamp, Searcher, SearcherGeneration,
     SegmentReader, Warmer,
 };

View File

@@ -1,585 +0,0 @@
#[cfg(all(test, feature = "unstable"))]
mod bench {
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use rand_distr::Distribution;
use serde_json::json;
use test::{self, Bencher};
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::AggregationCollector;
use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
use crate::{Index, Term};
#[derive(Clone, Copy, Hash, Default, Debug, PartialEq, Eq, PartialOrd, Ord)]
enum Cardinality {
/// All documents contain exactly one value.
/// `Full` is the default for auto-detecting the Cardinality, since it is the most strict.
#[default]
Full = 0,
/// All documents contain at most one value.
Optional = 1,
/// All documents may contain any number of values.
Multivalued = 2,
/// 1 / 20 documents has a value
Sparse = 3,
}
fn get_collector(agg_req: Aggregations) -> AggregationCollector {
AggregationCollector::from_aggs(agg_req, Default::default())
}
fn get_test_index_bench(cardinality: Cardinality) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
let text_fieldtype = crate::schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let json_field = schema_builder.add_json_field("json", FAST);
let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST);
let text_field_few_terms = schema_builder.add_text_field("text_few_terms", STRING | FAST);
let score_fieldtype = crate::schema::NumericOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
let index = Index::create_from_tempdir(schema_builder.build())?;
let few_terms_data = ["INFO", "ERROR", "WARN", "DEBUG"];
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
let many_terms_data = (0..150_000)
.map(|num| format!("author{}", num))
.collect::<Vec<_>>();
{
let mut rng = StdRng::from_seed([1u8; 32]);
let mut index_writer = index.writer_with_num_threads(1, 200_000_000)?;
// To make the different test cases comparable we just change one doc to force the
// cardinality
if cardinality == Cardinality::Optional {
index_writer.add_document(doc!())?;
}
if cardinality == Cardinality::Multivalued {
index_writer.add_document(doc!(
json_field => json!({"mixed_type": 10.0}),
json_field => json!({"mixed_type": 10.0}),
text_field => "cool",
text_field => "cool",
text_field_many_terms => "cool",
text_field_many_terms => "cool",
text_field_few_terms => "cool",
text_field_few_terms => "cool",
score_field => 1u64,
score_field => 1u64,
score_field_f64 => lg_norm.sample(&mut rng),
score_field_f64 => lg_norm.sample(&mut rng),
score_field_i64 => 1i64,
score_field_i64 => 1i64,
))?;
}
let mut doc_with_value = 1_000_000;
if cardinality == Cardinality::Sparse {
doc_with_value /= 20;
}
let _val_max = 1_000_000.0;
for _ in 0..doc_with_value {
let val: f64 = rng.gen_range(0.0..1_000_000.0);
let json = if rng.gen_bool(0.1) {
// 10% are numeric values
json!({ "mixed_type": val })
} else {
json!({"mixed_type": many_terms_data.choose(&mut rng).unwrap().to_string()})
};
index_writer.add_document(doc!(
text_field => "cool",
json_field => json,
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
score_field => val as u64,
score_field_f64 => lg_norm.sample(&mut rng),
score_field_i64 => val as i64,
))?;
if cardinality == Cardinality::Sparse {
for _ in 0..20 {
index_writer.add_document(doc!(text_field => "cool"))?;
}
}
}
// writing the segment
index_writer.commit()?;
}
Ok(index)
}
use paste::paste;
#[macro_export]
macro_rules! bench_all_cardinalities {
( $x:ident ) => {
paste! {
#[bench]
fn $x(b: &mut Bencher) {
[<$x _card>](b, Cardinality::Full)
}
#[bench]
fn [<$x _opt>](b: &mut Bencher) {
[<$x _card>](b, Cardinality::Optional)
}
#[bench]
fn [<$x _multi>](b: &mut Bencher) {
[<$x _card>](b, Cardinality::Multivalued)
}
#[bench]
fn [<$x _sparse>](b: &mut Bencher) {
[<$x _card>](b, Cardinality::Sparse)
}
}
};
}
bench_all_cardinalities!(bench_aggregation_average_u64);
fn bench_aggregation_average_u64_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
let text_field = reader.searcher().schema().get_field("text").unwrap();
b.iter(|| {
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average": { "avg": { "field": "score", } }
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_stats_f64);
fn bench_aggregation_stats_f64_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
let text_field = reader.searcher().schema().get_field("text").unwrap();
b.iter(|| {
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average_f64": { "stats": { "field": "score_f64", } }
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_average_f64);
fn bench_aggregation_average_f64_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
let text_field = reader.searcher().schema().get_field("text").unwrap();
b.iter(|| {
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average_f64": { "avg": { "field": "score_f64", } }
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_percentiles_f64);
fn bench_aggregation_percentiles_f64_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_str = r#"
{
"mypercentiles": {
"percentiles": {
"field": "score_f64",
"percents": [ 95, 99, 99.9 ]
}
}
} "#;
let agg_req_1: Aggregations = serde_json::from_str(agg_req_str).unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_average_u64_and_f64);
fn bench_aggregation_average_u64_and_f64_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
let text_field = reader.searcher().schema().get_field("text").unwrap();
b.iter(|| {
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average_f64": { "avg": { "field": "score_f64" } },
"average": { "avg": { "field": "score" } },
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_terms_few);
fn bench_aggregation_terms_few_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": { "terms": { "field": "text_few_terms" } },
}))
.unwrap();
let collector = get_collector(agg_req);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_terms_many_with_top_hits_agg);
fn bench_aggregation_terms_many_with_top_hits_agg_card(
b: &mut Bencher,
cardinality: Cardinality,
) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": {
"terms": { "field": "text_many_terms" },
"aggs": {
"top_hits": { "top_hits":
{
"sort": [
{ "score": "desc" }
],
"size": 2,
"doc_value_fields": ["score_f64"]
}
}
}
},
}))
.unwrap();
let collector = get_collector(agg_req);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_terms_many_with_sub_agg);
fn bench_aggregation_terms_many_with_sub_agg_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": {
"terms": { "field": "text_many_terms" },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
}))
.unwrap();
let collector = get_collector(agg_req);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_terms_many_json_mixed_type_with_sub_agg);
fn bench_aggregation_terms_many_json_mixed_type_with_sub_agg_card(
b: &mut Bencher,
cardinality: Cardinality,
) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": {
"terms": { "field": "json.mixed_type" },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
}))
.unwrap();
let collector = get_collector(agg_req);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_terms_many2);
fn bench_aggregation_terms_many2_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": { "terms": { "field": "text_many_terms" } },
}))
.unwrap();
let collector = get_collector(agg_req);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_terms_many_order_by_term);
fn bench_aggregation_terms_many_order_by_term_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": { "terms": { "field": "text_many_terms", "order": { "_key": "desc" } } },
}))
.unwrap();
let collector = get_collector(agg_req);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_range_only);
fn bench_aggregation_range_only_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = serde_json::from_value(json!({
"range_f64": { "range": { "field": "score_f64", "ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
] } },
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_range_with_avg);
fn bench_aggregation_range_with_avg_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
]
},
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
// hard_bounds uses a different algorithm, because it actually limits the collection range.
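// With `interval: 100` and `hard_bounds: { min: 1000, max: 300000 }` below, collection is
// limited to the 1000..300000 value range, i.e. roughly (300000 - 1000) / 100 = 2990 buckets.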
bench_all_cardinalities!(bench_aggregation_histogram_only_hard_bounds);
fn bench_aggregation_histogram_only_hard_bounds_card(
b: &mut Bencher,
cardinality: Cardinality,
) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": { "histogram": { "field": "score_f64", "interval": 100, "hard_bounds": { "min": 1000, "max": 300000 } } },
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_histogram_with_avg);
fn bench_aggregation_histogram_with_avg_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": {
"histogram": { "field": "score_f64", "interval": 100 },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
}
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_histogram_only);
fn bench_aggregation_histogram_only_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": {
"histogram": {
"field": "score_f64",
"interval": 100 // 1000 buckets
},
}
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_avg_and_range_with_avg);
fn bench_aggregation_avg_and_range_with_avg_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
let text_field = reader.searcher().schema().get_field("text").unwrap();
b.iter(|| {
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 60000 }
]
},
"aggs": {
"average_in_range": { "avg": { "field": "score" } }
}
},
"average": { "avg": { "field": "score" } }
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap()
});
}
}

View File

@@ -81,10 +81,11 @@ impl AggregationLimits {
}
}
- pub(crate) fn add_memory_consumed(&self, num_bytes: u64) -> crate::Result<()> {
- self.memory_consumption
- .fetch_add(num_bytes, Ordering::Relaxed);
- validate_memory_consumption(&self.memory_consumption, self.memory_limit)?;
+ pub(crate) fn add_memory_consumed(&self, add_num_bytes: u64) -> crate::Result<()> {
+ let prev_value = self
+ .memory_consumption
+ .fetch_add(add_num_bytes, Ordering::Relaxed);
+ validate_memory_consumption(prev_value + add_num_bytes, self.memory_limit)?;
Ok(())
}
@@ -94,11 +95,11 @@ impl AggregationLimits {
}
fn validate_memory_consumption(
- memory_consumption: &AtomicU64,
+ memory_consumption: u64,
memory_limit: ByteCount,
) -> Result<(), AggregationError> {
// Load the estimated memory consumed by the aggregations
- let memory_consumed: ByteCount = memory_consumption.load(Ordering::Relaxed).into();
+ let memory_consumed: ByteCount = memory_consumption.into();
if memory_consumed > memory_limit {
return Err(AggregationError::MemoryExceeded {
limit: memory_limit,
@@ -118,10 +119,11 @@ pub struct ResourceLimitGuard {
}
impl ResourceLimitGuard {
- pub(crate) fn add_memory_consumed(&self, num_bytes: u64) -> crate::Result<()> {
- self.memory_consumption
- .fetch_add(num_bytes, Ordering::Relaxed);
- validate_memory_consumption(&self.memory_consumption, self.memory_limit)?;
+ pub(crate) fn add_memory_consumed(&self, add_num_bytes: u64) -> crate::Result<()> {
+ let prev_value = self
+ .memory_consumption
+ .fetch_add(add_num_bytes, Ordering::Relaxed);
+ validate_memory_consumption(prev_value + add_num_bytes, self.memory_limit)?;
Ok(())
}
}

View File

@@ -17,7 +17,8 @@ use super::metric::{
use super::segment_agg_result::AggregationLimits;
use super::VecWithNames;
use crate::aggregation::{f64_to_fastfield_u64, Key};
- use crate::{SegmentOrdinal, SegmentReader};
+ use crate::index::SegmentReader;
+ use crate::SegmentOrdinal;
#[derive(Default)]
pub(crate) struct AggregationsWithAccessor {
@@ -334,8 +335,8 @@ fn get_missing_val(
}
_ => {
return Err(crate::TantivyError::InvalidArgument(format!(
- "Missing value {:?} for field {} is not supported for column type {:?}",
- missing, field_name, column_type
+ "Missing value {missing:?} for field {field_name} is not supported for column \
+ type {column_type:?}"
)));
}
};
@@ -402,7 +403,7 @@ fn get_dynamic_columns(
.iter()
.map(|h| h.open())
.collect::<io::Result<_>>()?;
- assert!(!ff_fields.is_empty(), "field {} not found", field_name);
+ assert!(!ff_fields.is_empty(), "field {field_name} not found");
Ok(cols)
}

View File

@@ -331,9 +331,11 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
}
let mem_delta = self.get_memory_consumption() - mem_pre;
- bucket_agg_accessor
- .limits
- .add_memory_consumed(mem_delta as u64)?;
+ if mem_delta > 0 {
+ bucket_agg_accessor
+ .limits
+ .add_memory_consumed(mem_delta as u64)?;
+ }
Ok(())
}

View File

@@ -324,9 +324,11 @@ impl SegmentAggregationCollector for SegmentTermCollector {
}
let mem_delta = self.get_memory_consumption() - mem_pre;
- bucket_agg_accessor
- .limits
- .add_memory_consumed(mem_delta as u64)?;
+ if mem_delta > 0 {
+ bucket_agg_accessor
+ .limits
+ .add_memory_consumed(mem_delta as u64)?;
+ }
Ok(())
}
@@ -355,8 +357,7 @@ impl SegmentTermCollector {
) -> crate::Result<Self> {
if field_type == ColumnType::Bytes {
return Err(TantivyError::InvalidArgument(format!(
- "terms aggregation is not supported for column type {:?}",
- field_type
+ "terms aggregation is not supported for column type {field_type:?}"
)));
}
let term_buckets = TermBuckets::default();

View File

@@ -8,7 +8,8 @@ use super::segment_agg_result::{
};
use crate::aggregation::agg_req_with_accessor::get_aggs_with_segment_accessor_and_validate;
use crate::collector::{Collector, SegmentCollector};
- use crate::{DocId, SegmentOrdinal, SegmentReader, TantivyError};
+ use crate::index::SegmentReader;
+ use crate::{DocId, SegmentOrdinal, TantivyError};
/// The default max bucket count, before the aggregation fails.
pub const DEFAULT_BUCKET_LIMIT: u32 = 65000;

View File

@@ -131,8 +131,8 @@ impl<'de> Deserialize<'de> for KeyOrder {
))?; ))?;
if key_order.next().is_some() { if key_order.next().is_some() {
return Err(serde::de::Error::custom(format!( return Err(serde::de::Error::custom(format!(
"Expected exactly one key-value pair in sort parameter of top_hits, found {:?}", "Expected exactly one key-value pair in sort parameter of top_hits, found \
key_order {key_order:?}"
))); )));
} }
Ok(Self { field, order }) Ok(Self { field, order })
@@ -144,27 +144,22 @@ fn globbed_string_to_regex(glob: &str) -> Result<Regex, crate::TantivyError> {
// Replace `*` glob with `.*` regex // Replace `*` glob with `.*` regex
let sanitized = format!("^{}$", regex::escape(glob).replace(r"\*", ".*")); let sanitized = format!("^{}$", regex::escape(glob).replace(r"\*", ".*"));
Regex::new(&sanitized.replace('*', ".*")).map_err(|e| { Regex::new(&sanitized.replace('*', ".*")).map_err(|e| {
crate::TantivyError::SchemaError(format!( crate::TantivyError::SchemaError(format!("Invalid regex '{glob}' in docvalue_fields: {e}"))
"Invalid regex '{}' in docvalue_fields: {}",
glob, e
))
}) })
} }
fn use_doc_value_fields_err(parameter: &str) -> crate::Result<()> { fn use_doc_value_fields_err(parameter: &str) -> crate::Result<()> {
Err(crate::TantivyError::AggregationError( Err(crate::TantivyError::AggregationError(
AggregationError::InvalidRequest(format!( AggregationError::InvalidRequest(format!(
"The `{}` parameter is not supported, only `docvalue_fields` is supported in \ "The `{parameter}` parameter is not supported, only `docvalue_fields` is supported in \
`top_hits` aggregation", `top_hits` aggregation"
parameter
)), )),
)) ))
} }
fn unsupported_err(parameter: &str) -> crate::Result<()> { fn unsupported_err(parameter: &str) -> crate::Result<()> {
Err(crate::TantivyError::AggregationError( Err(crate::TantivyError::AggregationError(
AggregationError::InvalidRequest(format!( AggregationError::InvalidRequest(format!(
"The `{}` parameter is not supported in the `top_hits` aggregation", "The `{parameter}` parameter is not supported in the `top_hits` aggregation"
parameter
)), )),
)) ))
} }
@@ -217,8 +212,7 @@ impl TopHitsAggregation {
.collect::<Vec<_>>(); .collect::<Vec<_>>();
assert!( assert!(
!fields.is_empty(), !fields.is_empty(),
"No fields matched the glob '{}' in docvalue_fields", "No fields matched the glob '{field}' in docvalue_fields"
field
); );
Ok(fields) Ok(fields)
}) })
@@ -254,7 +248,7 @@ impl TopHitsAggregation {
.map(|field| { .map(|field| {
let accessors = accessors let accessors = accessors
.get(field) .get(field)
.unwrap_or_else(|| panic!("field '{}' not found in accessors", field)); .unwrap_or_else(|| panic!("field '{field}' not found in accessors"));
let values: Vec<FastFieldValue> = accessors let values: Vec<FastFieldValue> = accessors
.iter() .iter()

View File

@@ -143,8 +143,6 @@ use std::fmt::Display;
#[cfg(test)] #[cfg(test)]
mod agg_tests; mod agg_tests;
mod agg_bench;
use core::fmt; use core::fmt;
pub use agg_limits::AggregationLimits; pub use agg_limits::AggregationLimits;
@@ -160,15 +158,14 @@ use serde::de::{self, Visitor};
use serde::{Deserialize, Deserializer, Serialize}; use serde::{Deserialize, Deserializer, Serialize};
fn parse_str_into_f64<E: de::Error>(value: &str) -> Result<f64, E> { fn parse_str_into_f64<E: de::Error>(value: &str) -> Result<f64, E> {
let parsed = value.parse::<f64>().map_err(|_err| { let parsed = value
de::Error::custom(format!("Failed to parse f64 from string: {:?}", value)) .parse::<f64>()
})?; .map_err(|_err| de::Error::custom(format!("Failed to parse f64 from string: {value:?}")))?;
// Check if the parsed value is NaN or infinity // Check if the parsed value is NaN or infinity
if parsed.is_nan() || parsed.is_infinite() { if parsed.is_nan() || parsed.is_infinite() {
Err(de::Error::custom(format!( Err(de::Error::custom(format!(
"Value is not a valid f64 (NaN or Infinity): {:?}", "Value is not a valid f64 (NaN or Infinity): {value:?}"
value
))) )))
} else { } else {
Ok(parsed) Ok(parsed)

View File

@@ -598,7 +598,7 @@ mod tests {
let mid = n % 4; let mid = n % 4;
n /= 4; n /= 4;
let leaf = n % 5; let leaf = n % 5;
Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf)) Facet::from(&format!("/top{top}/mid{mid}/leaf{leaf}"))
}) })
.collect(); .collect();
for i in 0..num_facets * 10 { for i in 0..num_facets * 10 {
@@ -737,7 +737,7 @@ mod tests {
vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)] vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
.into_iter() .into_iter()
.flat_map(|(c, count)| { .flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet/{}", c)); let facet = Facet::from(&format!("/facet/{c}"));
let doc = doc!(facet_field => facet); let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count) iter::repeat(doc).take(count)
}) })
@@ -785,7 +785,7 @@ mod tests {
let docs: Vec<TantivyDocument> = vec![("b", 2), ("a", 2), ("c", 4)] let docs: Vec<TantivyDocument> = vec![("b", 2), ("a", 2), ("c", 4)]
.into_iter() .into_iter()
.flat_map(|(c, count)| { .flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet/{}", c)); let facet = Facet::from(&format!("/facet/{c}"));
let doc = doc!(facet_field => facet); let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count) iter::repeat(doc).take(count)
}) })

View File

@@ -4,7 +4,8 @@ use std::marker::PhantomData;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::top_score_collector::TopNComputer; use super::top_score_collector::TopNComputer;
use crate::{DocAddress, DocId, SegmentOrdinal, SegmentReader}; use crate::index::SegmentReader;
use crate::{DocAddress, DocId, SegmentOrdinal};
/// Contains a feature (field, score, etc.) of a document along with the document address. /// Contains a feature (field, score, etc.) of a document along with the document address.
/// ///

View File

@@ -1,19 +1,25 @@
use rayon::{ThreadPool, ThreadPoolBuilder}; use std::sync::Arc;
#[cfg(feature = "quickwit")]
use futures_util::{future::Either, FutureExt};
use crate::TantivyError; use crate::TantivyError;
/// Search executor whether search request are single thread or multithread. /// Executor makes it possible to run tasks in single thread or
/// /// in a thread pool.
/// We don't expose Rayon thread pool directly here for several reasons. #[derive(Clone)]
///
/// First dependency hell. It is not a good idea to expose the
/// API of a dependency, knowing it might conflict with a different version
/// used by the client. Second, we may stop using rayon in the future.
pub enum Executor { pub enum Executor {
/// Single thread variant of an Executor /// Single thread variant of an Executor
SingleThread, SingleThread,
/// Thread pool variant of an Executor /// Thread pool variant of an Executor
ThreadPool(ThreadPool), ThreadPool(Arc<rayon::ThreadPool>),
}
#[cfg(feature = "quickwit")]
impl From<Arc<rayon::ThreadPool>> for Executor {
fn from(thread_pool: Arc<rayon::ThreadPool>) -> Self {
Executor::ThreadPool(thread_pool)
}
} }
impl Executor { impl Executor {
@@ -24,11 +30,11 @@ impl Executor {
/// Creates an Executor that dispatches the tasks in a thread pool. /// Creates an Executor that dispatches the tasks in a thread pool.
pub fn multi_thread(num_threads: usize, prefix: &'static str) -> crate::Result<Executor> { pub fn multi_thread(num_threads: usize, prefix: &'static str) -> crate::Result<Executor> {
let pool = ThreadPoolBuilder::new() let pool = rayon::ThreadPoolBuilder::new()
.num_threads(num_threads) .num_threads(num_threads)
.thread_name(move |num| format!("{prefix}{num}")) .thread_name(move |num| format!("{prefix}{num}"))
.build()?; .build()?;
Ok(Executor::ThreadPool(pool)) Ok(Executor::ThreadPool(Arc::new(pool)))
} }
/// Perform a map in the thread pool. /// Perform a map in the thread pool.
@@ -91,11 +97,36 @@ impl Executor {
} }
} }
} }
/// Spawn a task on the pool, returning a future completing on task success.
///
/// If the task panic, returns `Err(())`.
#[cfg(feature = "quickwit")]
pub fn spawn_blocking<T: Send + 'static>(
&self,
cpu_intensive_task: impl FnOnce() -> T + Send + 'static,
) -> impl std::future::Future<Output = Result<T, ()>> {
match self {
Executor::SingleThread => Either::Left(std::future::ready(Ok(cpu_intensive_task()))),
Executor::ThreadPool(pool) => {
let (sender, receiver) = oneshot::channel();
pool.spawn(|| {
if sender.is_closed() {
return;
}
let task_result = cpu_intensive_task();
let _ = sender.send(task_result);
});
let res = receiver.map(|res| res.map_err(|_| ()));
Either::Right(res)
}
}
}
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::Executor; use super::Executor;
#[test] #[test]
@@ -147,4 +178,62 @@ mod tests {
assert_eq!(result[i], i * 2); assert_eq!(result[i], i * 2);
} }
} }
#[cfg(feature = "quickwit")]
#[test]
fn test_cancel_cpu_intensive_tasks() {
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
let counter: Arc<AtomicU64> = Default::default();
let other_counter: Arc<AtomicU64> = Default::default();
let mut futures = Vec::new();
let mut other_futures = Vec::new();
let (tx, rx) = crossbeam_channel::bounded::<()>(0);
let rx = Arc::new(rx);
let executor = Executor::multi_thread(3, "search-test").unwrap();
for i in 0..1000 {
let counter_clone: Arc<AtomicU64> = counter.clone();
let other_counter_clone: Arc<AtomicU64> = other_counter.clone();
let rx_clone = rx.clone();
let rx_clone2 = rx.clone();
let fut = executor.spawn_blocking(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
let () = rx_clone.recv().unwrap();
});
futures.push(fut);
let other_fut = executor.spawn_blocking(move || {
other_counter_clone.fetch_add(1, Ordering::SeqCst);
let () = rx_clone2.recv().unwrap();
});
other_futures.push(other_fut);
}
// We execute 100 futures.
for i in 0..100 {
tx.send(()).unwrap();
}
let counter_val = counter.load(Ordering::SeqCst);
let other_counter_val = other_counter.load(Ordering::SeqCst);
assert!(counter_val >= 30);
assert!(other_counter_val >= 30);
drop(other_futures);
// We execute 100 futures.
for i in 0..100 {
tx.send(()).unwrap();
}
let counter_val2 = counter.load(Ordering::SeqCst);
assert!(counter_val2 >= counter_val + 100 - 6);
let other_counter_val2 = other_counter.load(Ordering::SeqCst);
assert!(other_counter_val2 <= other_counter_val + 6);
}
} }

View File

@@ -31,7 +31,7 @@ use crate::{DateTime, DocId, Term};
/// position 1. /// position 1.
/// As a result, with lemmatization, "The Smiths" will match our object. /// As a result, with lemmatization, "The Smiths" will match our object.
/// ///
/// Worse, if a same term is appears in the second object, a non increasing value would be pushed /// Worse, if a same term appears in the second object, a non increasing value would be pushed
/// to the position recorder probably provoking a panic. /// to the position recorder probably provoking a panic.
/// ///
/// This problem is solved for regular multivalued object by offsetting the position /// This problem is solved for regular multivalued object by offsetting the position
@@ -50,7 +50,7 @@ use crate::{DateTime, DocId, Term};
/// We can therefore afford working with a map that is not imperfect. It is fine if several /// We can therefore afford working with a map that is not imperfect. It is fine if several
/// path map to the same index position as long as the probability is relatively low. /// path map to the same index position as long as the probability is relatively low.
#[derive(Default)] #[derive(Default)]
struct IndexingPositionsPerPath { pub(crate) struct IndexingPositionsPerPath {
positions_per_path: FxHashMap<u32, IndexingPosition>, positions_per_path: FxHashMap<u32, IndexingPosition>,
} }
@@ -58,6 +58,9 @@ impl IndexingPositionsPerPath {
fn get_position_from_id(&mut self, id: u32) -> &mut IndexingPosition { fn get_position_from_id(&mut self, id: u32) -> &mut IndexingPosition {
self.positions_per_path.entry(id).or_default() self.positions_per_path.entry(id).or_default()
} }
pub fn clear(&mut self) {
self.positions_per_path.clear();
}
} }
/// Convert JSON_PATH_SEGMENT_SEP to a dot. /// Convert JSON_PATH_SEGMENT_SEP to a dot.
@@ -68,36 +71,6 @@ pub fn json_path_sep_to_dot(path: &mut str) {
} }
} }
#[allow(clippy::too_many_arguments)]
pub(crate) fn index_json_values<'a, V: Value<'a>>(
doc: DocId,
json_visitors: impl Iterator<Item = crate::Result<V::ObjectIter>>,
text_analyzer: &mut TextAnalyzer,
expand_dots_enabled: bool,
term_buffer: &mut Term,
postings_writer: &mut dyn PostingsWriter,
json_path_writer: &mut JsonPathWriter,
ctx: &mut IndexingContext,
) -> crate::Result<()> {
json_path_writer.clear();
json_path_writer.set_expand_dots(expand_dots_enabled);
let mut positions_per_path: IndexingPositionsPerPath = Default::default();
for json_visitor_res in json_visitors {
let json_visitor = json_visitor_res?;
index_json_object::<V>(
doc,
json_visitor,
text_analyzer,
term_buffer,
json_path_writer,
postings_writer,
ctx,
&mut positions_per_path,
);
}
Ok(())
}
#[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
fn index_json_object<'a, V: Value<'a>>( fn index_json_object<'a, V: Value<'a>>(
doc: DocId, doc: DocId,
@@ -126,7 +99,7 @@ fn index_json_object<'a, V: Value<'a>>(
} }
#[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
fn index_json_value<'a, V: Value<'a>>( pub(crate) fn index_json_value<'a, V: Value<'a>>(
doc: DocId, doc: DocId,
json_value: V, json_value: V,
text_analyzer: &mut TextAnalyzer, text_analyzer: &mut TextAnalyzer,
@@ -166,12 +139,18 @@ fn index_json_value<'a, V: Value<'a>>(
); );
} }
ReferenceValueLeaf::U64(val) => { ReferenceValueLeaf::U64(val) => {
// try to parse to i64, since when querying we will apply the same logic and prefer
// i64 values
set_path_id( set_path_id(
term_buffer, term_buffer,
ctx.path_to_unordered_id ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()), .get_or_allocate_unordered_id(json_path_writer.as_str()),
); );
term_buffer.append_type_and_fast_value(val); if let Ok(i64_val) = val.try_into() {
term_buffer.append_type_and_fast_value::<i64>(i64_val);
} else {
term_buffer.append_type_and_fast_value(val);
}
postings_writer.subscribe(doc, 0u32, term_buffer, ctx); postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
} }
ReferenceValueLeaf::I64(val) => { ReferenceValueLeaf::I64(val) => {
@@ -257,10 +236,7 @@ fn index_json_value<'a, V: Value<'a>>(
/// Tries to infer a JSON type from a string and append it to the term. /// Tries to infer a JSON type from a string and append it to the term.
/// ///
/// The term must be json + JSON path. /// The term must be json + JSON path.
pub(crate) fn convert_to_fast_value_and_append_to_json_term( pub fn convert_to_fast_value_and_append_to_json_term(mut term: Term, phrase: &str) -> Option<Term> {
mut term: Term,
phrase: &str,
) -> Option<Term> {
assert_eq!( assert_eq!(
term.value() term.value()
.as_json_value_bytes() .as_json_value_bytes()
@@ -362,14 +338,14 @@ mod tests {
let mut term = Term::from_field_json_path(field, "attributes.color", false); let mut term = Term::from_field_json_path(field, "attributes.color", false);
term.append_type_and_str("red"); term.append_type_and_str("red");
assert_eq!( assert_eq!(
format!("{:?}", term), format!("{term:?}"),
"Term(field=1, type=Json, path=attributes.color, type=Str, \"red\")" "Term(field=1, type=Json, path=attributes.color, type=Str, \"red\")"
); );
let mut term = Term::from_field_json_path(field, "attributes.dimensions.width", false); let mut term = Term::from_field_json_path(field, "attributes.dimensions.width", false);
term.append_type_and_fast_value(400i64); term.append_type_and_fast_value(400i64);
assert_eq!( assert_eq!(
format!("{:?}", term), format!("{term:?}"),
"Term(field=1, type=Json, path=attributes.dimensions.width, type=I64, 400)" "Term(field=1, type=Json, path=attributes.dimensions.width, type=I64, 400)"
); );
} }

View File

@@ -4,13 +4,13 @@ use std::{fmt, io};
use crate::collector::Collector; use crate::collector::Collector;
use crate::core::Executor; use crate::core::Executor;
use crate::index::SegmentReader; use crate::index::{SegmentId, SegmentReader};
use crate::query::{Bm25StatisticsProvider, EnableScoring, Query}; use crate::query::{Bm25StatisticsProvider, EnableScoring, Query};
use crate::schema::document::DocumentDeserialize; use crate::schema::document::DocumentDeserialize;
use crate::schema::{Schema, Term}; use crate::schema::{Schema, Term};
use crate::space_usage::SearcherSpaceUsage; use crate::space_usage::SearcherSpaceUsage;
use crate::store::{CacheStats, StoreReader}; use crate::store::{CacheStats, StoreReader};
use crate::{DocAddress, Index, Opstamp, SegmentId, TrackedObject}; use crate::{DocAddress, Index, Opstamp, TrackedObject};
/// Identifies the searcher generation accessed by a [`Searcher`]. /// Identifies the searcher generation accessed by a [`Searcher`].
/// ///
@@ -109,8 +109,9 @@ impl Searcher {
&self, &self,
doc_address: DocAddress, doc_address: DocAddress,
) -> crate::Result<D> { ) -> crate::Result<D> {
let executor = self.inner.index.search_executor();
let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize]; let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize];
store_reader.get_async(doc_address.doc_id).await store_reader.get_async(doc_address.doc_id, executor).await
} }
/// Access the schema associated with the index of this searcher. /// Access the schema associated with the index of this searcher.

View File

@@ -1,12 +1,14 @@
use crate::collector::Count; use crate::collector::Count;
use crate::directory::{RamDirectory, WatchCallback}; use crate::directory::{RamDirectory, WatchCallback};
use crate::index::SegmentId;
use crate::indexer::{LogMergePolicy, NoMergePolicy}; use crate::indexer::{LogMergePolicy, NoMergePolicy};
use crate::postings::Postings;
use crate::query::TermQuery; use crate::query::TermQuery;
use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, STRING, TEXT}; use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, STRING, TEXT};
use crate::tokenizer::TokenizerManager; use crate::tokenizer::TokenizerManager;
use crate::{ use crate::{
Directory, DocSet, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, Postings, Directory, DocSet, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, ReloadPolicy,
ReloadPolicy, SegmentId, TantivyDocument, Term, TantivyDocument, Term,
}; };
#[test] #[test]
@@ -417,7 +419,7 @@ fn test_non_text_json_term_freq() {
let inv_idx = segment_reader.inverted_index(field).unwrap(); let inv_idx = segment_reader.inverted_index(field).unwrap();
let mut term = Term::from_field_json_path(field, "tenant_id", false); let mut term = Term::from_field_json_path(field, "tenant_id", false);
term.append_type_and_fast_value(75u64); term.append_type_and_fast_value(75i64);
let postings = inv_idx let postings = inv_idx
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions) .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
@@ -451,7 +453,7 @@ fn test_non_text_json_term_freq_bitpacked() {
let inv_idx = segment_reader.inverted_index(field).unwrap(); let inv_idx = segment_reader.inverted_index(field).unwrap();
let mut term = Term::from_field_json_path(field, "tenant_id", false); let mut term = Term::from_field_json_path(field, "tenant_id", false);
term.append_type_and_fast_value(75u64); term.append_type_and_fast_value(75i64);
let mut postings = inv_idx let mut postings = inv_idx
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions) .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)

View File

@@ -566,7 +566,7 @@ mod tests {
let mmap_directory = MmapDirectory::create_from_tempdir().unwrap(); let mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
let num_paths = 10; let num_paths = 10;
let paths: Vec<PathBuf> = (0..num_paths) let paths: Vec<PathBuf> = (0..num_paths)
.map(|i| PathBuf::from(&*format!("file_{}", i))) .map(|i| PathBuf::from(&*format!("file_{i}")))
.collect(); .collect();
{ {
for path in &paths { for path in &paths {

View File

@@ -62,8 +62,7 @@ impl FacetReader {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::schema::document::Value; use crate::schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED};
use crate::schema::{Facet, FacetOptions, SchemaBuilder, STORED};
use crate::{DocAddress, Index, IndexWriter, TantivyDocument}; use crate::{DocAddress, Index, IndexWriter, TantivyDocument};
#[test] #[test]
@@ -89,7 +88,9 @@ mod tests {
let doc = searcher let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32)) .doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))
.unwrap(); .unwrap();
let value = doc.get_first(facet_field).and_then(|v| v.as_facet()); let value = doc
.get_first(facet_field)
.and_then(|v| v.as_value().as_facet());
assert_eq!(value, None); assert_eq!(value, None);
} }
@@ -146,8 +147,11 @@ mod tests {
facet_ords.extend(facet_reader.facet_ords(0u32)); facet_ords.extend(facet_reader.facet_ords(0u32));
assert_eq!(&facet_ords, &[0u64]); assert_eq!(&facet_ords, &[0u64]);
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))?; let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))?;
let value: Option<&Facet> = doc.get_first(facet_field).and_then(|v| v.as_facet()); let value: Option<Facet> = doc
assert_eq!(value, Facet::from_text("/a/b").ok().as_ref()); .get_first(facet_field)
.and_then(|v| v.as_facet())
.map(|facet| Facet::from_encoded_string(facet.to_string()));
assert_eq!(value, Facet::from_text("/a/b").ok());
Ok(()) Ok(())
} }

View File

@@ -80,7 +80,7 @@ mod tests {
use std::path::Path; use std::path::Path;
use columnar::StrColumn; use columnar::StrColumn;
use common::{ByteCount, HasLen, TerminatingWrite}; use common::{ByteCount, DateTimePrecision, HasLen, TerminatingWrite};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use rand::prelude::SliceRandom; use rand::prelude::SliceRandom;
use rand::rngs::StdRng; use rand::rngs::StdRng;
@@ -88,14 +88,15 @@ mod tests {
use super::*; use super::*;
use crate::directory::{Directory, RamDirectory, WritePtr}; use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::index::SegmentId;
use crate::merge_policy::NoMergePolicy; use crate::merge_policy::NoMergePolicy;
use crate::schema::{ use crate::schema::{
Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder, TantivyDocument, DateOptions, Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder,
TextOptions, FAST, INDEXED, STORED, STRING, TEXT, TantivyDocument, TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
}; };
use crate::time::OffsetDateTime; use crate::time::OffsetDateTime;
use crate::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager}; use crate::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager};
use crate::{DateOptions, DateTimePrecision, Index, IndexWriter, SegmentId, SegmentReader}; use crate::{Index, IndexWriter, SegmentReader};
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| { pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();

View File

@@ -1,14 +1,14 @@
use std::io; use std::io;
use columnar::{ColumnarWriter, NumericalValue}; use columnar::{ColumnarWriter, NumericalValue};
use common::JsonPathWriter; use common::{DateTimePrecision, JsonPathWriter};
use tokenizer_api::Token; use tokenizer_api::Token;
use crate::indexer::doc_id_mapping::DocIdMapping; use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::schema::document::{Document, ReferenceValue, ReferenceValueLeaf, Value}; use crate::schema::document::{Document, ReferenceValue, ReferenceValueLeaf, Value};
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema, Type}; use crate::schema::{value_type_to_column_type, Field, FieldType, Schema, Type};
use crate::tokenizer::{TextAnalyzer, TokenizerManager}; use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::{DateTimePrecision, DocId, TantivyError}; use crate::{DocId, TantivyError};
/// Only index JSON down to a depth of 20. /// Only index JSON down to a depth of 20.
/// This is mostly to guard us from a stack overflow triggered by malicious input. /// This is mostly to guard us from a stack overflow triggered by malicious input.
@@ -183,8 +183,7 @@ impl FastFieldsWriter {
.record_datetime(doc_id, field_name, truncated_datetime); .record_datetime(doc_id, field_name, truncated_datetime);
} }
ReferenceValueLeaf::Facet(val) => { ReferenceValueLeaf::Facet(val) => {
self.columnar_writer self.columnar_writer.record_str(doc_id, field_name, val);
.record_str(doc_id, field_name, val.encoded_str());
} }
ReferenceValueLeaf::Bytes(val) => { ReferenceValueLeaf::Bytes(val) => {
self.columnar_writer.record_bytes(doc_id, field_name, val); self.columnar_writer.record_bytes(doc_id, field_name, val);

View File

@@ -3,7 +3,7 @@ use std::fmt;
#[cfg(feature = "mmap")] #[cfg(feature = "mmap")]
use std::path::Path; use std::path::Path;
use std::path::PathBuf; use std::path::PathBuf;
use std::sync::Arc; use std::thread::available_parallelism;
use super::segment::Segment; use super::segment::Segment;
use super::segment_reader::merge_field_meta_data; use super::segment_reader::merge_field_meta_data;
@@ -252,9 +252,8 @@ impl IndexBuilder {
let field_type = entry.field_type().value_type(); let field_type = entry.field_type().value_type();
if !supported_field_types.contains(&field_type) { if !supported_field_types.contains(&field_type) {
return Err(TantivyError::InvalidArgument(format!( return Err(TantivyError::InvalidArgument(format!(
"Unsupported field type in sort_by_field: {:?}. Supported field types: \ "Unsupported field type in sort_by_field: {field_type:?}. Supported field \
{:?} ", types: {supported_field_types:?} ",
field_type, supported_field_types,
))); )));
} }
} }
@@ -293,7 +292,7 @@ pub struct Index {
directory: ManagedDirectory, directory: ManagedDirectory,
schema: Schema, schema: Schema,
settings: IndexSettings, settings: IndexSettings,
executor: Arc<Executor>, executor: Executor,
tokenizers: TokenizerManager, tokenizers: TokenizerManager,
fast_field_tokenizers: TokenizerManager, fast_field_tokenizers: TokenizerManager,
inventory: SegmentMetaInventory, inventory: SegmentMetaInventory,
@@ -318,29 +317,25 @@ impl Index {
/// ///
/// By default the executor is single thread, and simply runs in the calling thread. /// By default the executor is single thread, and simply runs in the calling thread.
pub fn search_executor(&self) -> &Executor { pub fn search_executor(&self) -> &Executor {
self.executor.as_ref() &self.executor
} }
/// Replace the default single thread search executor pool /// Replace the default single thread search executor pool
/// by a thread pool with a given number of threads. /// by a thread pool with a given number of threads.
pub fn set_multithread_executor(&mut self, num_threads: usize) -> crate::Result<()> { pub fn set_multithread_executor(&mut self, num_threads: usize) -> crate::Result<()> {
self.executor = Arc::new(Executor::multi_thread(num_threads, "tantivy-search-")?); self.executor = Executor::multi_thread(num_threads, "tantivy-search-")?;
Ok(()) Ok(())
} }
/// Custom thread pool by a outer thread pool. /// Custom thread pool by a outer thread pool.
pub fn set_shared_multithread_executor( pub fn set_executor(&mut self, executor: Executor) {
&mut self, self.executor = executor;
shared_thread_pool: Arc<Executor>,
) -> crate::Result<()> {
self.executor = shared_thread_pool.clone();
Ok(())
} }
/// Replace the default single thread search executor pool /// Replace the default single thread search executor pool
/// by a thread pool with as many threads as there are CPUs on the system. /// by a thread pool with as many threads as there are CPUs on the system.
pub fn set_default_multithread_executor(&mut self) -> crate::Result<()> { pub fn set_default_multithread_executor(&mut self) -> crate::Result<()> {
let default_num_threads = num_cpus::get(); let default_num_threads = available_parallelism()?.get();
self.set_multithread_executor(default_num_threads) self.set_multithread_executor(default_num_threads)
} }
@@ -418,7 +413,7 @@ impl Index {
schema, schema,
tokenizers: TokenizerManager::default(), tokenizers: TokenizerManager::default(),
fast_field_tokenizers: TokenizerManager::default(), fast_field_tokenizers: TokenizerManager::default(),
executor: Arc::new(Executor::single_thread()), executor: Executor::single_thread(),
inventory, inventory,
} }
} }
@@ -621,7 +616,7 @@ impl Index {
&self, &self,
memory_budget_in_bytes: usize, memory_budget_in_bytes: usize,
) -> crate::Result<IndexWriter<D>> { ) -> crate::Result<IndexWriter<D>> {
let mut num_threads = std::cmp::min(num_cpus::get(), MAX_NUM_THREAD); let mut num_threads = std::cmp::min(available_parallelism()?.get(), MAX_NUM_THREAD);
let memory_budget_num_bytes_per_thread = memory_budget_in_bytes / num_threads; let memory_budget_num_bytes_per_thread = memory_budget_in_bytes / num_threads;
if memory_budget_num_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN { if memory_budget_num_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN {
num_threads = (memory_budget_in_bytes / MEMORY_BUDGET_NUM_BYTES_MIN).max(1); num_threads = (memory_budget_in_bytes / MEMORY_BUDGET_NUM_BYTES_MIN).max(1);

View File

@@ -1,5 +1,3 @@
//! # Index Module
//!
//! The `index` module in Tantivy contains core components to read and write indexes. //! The `index` module in Tantivy contains core components to read and write indexes.
//! //!
//! It contains `Index` and `Segment`, where a `Index` consists of one or more `Segment`s. //! It contains `Index` and `Segment`, where a `Index` consists of one or more `Segment`s.

View File

@@ -318,14 +318,14 @@ impl SegmentReader {
if create_canonical { if create_canonical {
// Without expand dots enabled dots need to be escaped. // Without expand dots enabled dots need to be escaped.
let escaped_json_path = json_path.replace('.', "\\."); let escaped_json_path = json_path.replace('.', "\\.");
let full_path = format!("{}.{}", field_name, escaped_json_path); let full_path = format!("{field_name}.{escaped_json_path}");
let full_path_unescaped = format!("{}.{}", field_name, &json_path); let full_path_unescaped = format!("{}.{}", field_name, &json_path);
map_to_canonical.insert(full_path_unescaped, full_path.to_string()); map_to_canonical.insert(full_path_unescaped, full_path.to_string());
full_path full_path
} else { } else {
// With expand dots enabled, we can use '.' instead of '\u{1}'. // With expand dots enabled, we can use '.' instead of '\u{1}'.
json_path_sep_to_dot(&mut json_path); json_path_sep_to_dot(&mut json_path);
format!("{}.{}", field_name, json_path) format!("{field_name}.{json_path}")
} }
}; };
indexed_fields.extend( indexed_fields.extend(

View File

@@ -246,8 +246,9 @@ impl DeleteCursor {
mod tests { mod tests {
use super::{DeleteOperation, DeleteQueue}; use super::{DeleteOperation, DeleteQueue};
use crate::index::SegmentReader;
use crate::query::{Explanation, Scorer, Weight}; use crate::query::{Explanation, Scorer, Weight};
use crate::{DocId, Score, SegmentReader}; use crate::{DocId, Score};
struct DummyWeight; struct DummyWeight;
impl Weight for DummyWeight { impl Weight for DummyWeight {

View File

@@ -306,12 +306,10 @@ mod tests_indexsorting {
let my_string_field = index.schema().get_field("string_field").unwrap(); let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
{ {
assert_eq!( assert!(searcher
searcher .doc::<TantivyDocument>(DocAddress::new(0, 0))?
.doc::<TantivyDocument>(DocAddress::new(0, 0))? .get_first(my_string_field)
.get_first(my_string_field), .is_none());
None
);
assert_eq!( assert_eq!(
searcher searcher
.doc::<TantivyDocument>(DocAddress::new(0, 3))? .doc::<TantivyDocument>(DocAddress::new(0, 3))?
@@ -344,7 +342,7 @@ mod tests_indexsorting {
Some("blublub") Some("blublub")
); );
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?; let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
assert_eq!(doc.get_first(my_string_field), None); assert!(doc.get_first(my_string_field).is_none());
} }
// sort by field desc // sort by field desc
let index = create_test_index( let index = create_test_index(

View File

@@ -814,10 +814,9 @@ mod tests {
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN; use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::indexer::NoMergePolicy; use crate::indexer::NoMergePolicy;
use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery}; use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery};
use crate::schema::document::Value;
use crate::schema::{ use crate::schema::{
self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, Schema, self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, Schema,
TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT, TextFieldIndexing, TextOptions, Value, FAST, INDEXED, STORED, STRING, TEXT,
}; };
use crate::store::DOCSTORE_CACHE_CAPACITY; use crate::store::DOCSTORE_CACHE_CAPACITY;
use crate::{ use crate::{
@@ -1980,7 +1979,13 @@ mod tests {
.unwrap(); .unwrap();
// test store iterator // test store iterator
for doc in store_reader.iter::<TantivyDocument>(segment_reader.alive_bitset()) { for doc in store_reader.iter::<TantivyDocument>(segment_reader.alive_bitset()) {
let id = doc.unwrap().get_first(id_field).unwrap().as_u64().unwrap(); let id = doc
.unwrap()
.get_first(id_field)
.unwrap()
.as_value()
.as_u64()
.unwrap();
assert!(expected_ids_and_num_occurrences.contains_key(&id)); assert!(expected_ids_and_num_occurrences.contains_key(&id));
} }
// test store random access // test store random access
@@ -2013,7 +2018,7 @@ mod tests {
let mut bool2 = doc.get_all(multi_bools); let mut bool2 = doc.get_all(multi_bools);
assert_eq!(bool, bool2.next().unwrap().as_bool().unwrap()); assert_eq!(bool, bool2.next().unwrap().as_bool().unwrap());
assert_ne!(bool, bool2.next().unwrap().as_bool().unwrap()); assert_ne!(bool, bool2.next().unwrap().as_bool().unwrap());
assert_eq!(None, bool2.next()) assert!(bool2.next().is_none())
} }
} }
} }

View File

@@ -144,9 +144,9 @@ mod tests {
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use super::*; use super::*;
use crate::index::SegmentMetaInventory; use crate::index::{SegmentId, SegmentMetaInventory};
use crate::schema;
use crate::schema::INDEXED; use crate::schema::INDEXED;
use crate::{schema, SegmentId};
static INVENTORY: Lazy<SegmentMetaInventory> = Lazy::new(SegmentMetaInventory::default); static INVENTORY: Lazy<SegmentMetaInventory> = Lazy::new(SegmentMetaInventory::default);

View File

@@ -1,7 +1,8 @@
use std::collections::HashSet; use std::collections::HashSet;
use std::ops::Deref; use std::ops::Deref;
use crate::{Inventory, Opstamp, SegmentId, TrackedObject}; use crate::index::SegmentId;
use crate::{Inventory, Opstamp, TrackedObject};
#[derive(Default)] #[derive(Default)]
pub(crate) struct MergeOperationInventory(Inventory<InnerMergeOperation>); pub(crate) struct MergeOperationInventory(Inventory<InnerMergeOperation>);

View File

@@ -13,7 +13,7 @@ use crate::docset::{DocSet, TERMINATED};
use crate::error::DataCorruption; use crate::error::DataCorruption;
use crate::fastfield::{AliveBitSet, FastFieldNotAvailableError}; use crate::fastfield::{AliveBitSet, FastFieldNotAvailableError};
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter}; use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
use crate::index::{Segment, SegmentReader}; use crate::index::{Segment, SegmentComponent, SegmentReader};
use crate::indexer::doc_id_mapping::{MappingType, SegmentDocIdMapping}; use crate::indexer::doc_id_mapping::{MappingType, SegmentDocIdMapping};
use crate::indexer::SegmentSerializer; use crate::indexer::SegmentSerializer;
use crate::postings::{InvertedIndexSerializer, Postings, SegmentPostings}; use crate::postings::{InvertedIndexSerializer, Postings, SegmentPostings};
@@ -21,8 +21,7 @@ use crate::schema::{value_type_to_column_type, Field, FieldType, Schema};
use crate::store::StoreWriter; use crate::store::StoreWriter;
use crate::termdict::{TermMerger, TermOrdinal}; use crate::termdict::{TermMerger, TermOrdinal};
use crate::{ use crate::{
DocAddress, DocId, IndexSettings, IndexSortByField, InvertedIndexReader, Order, DocAddress, DocId, IndexSettings, IndexSortByField, InvertedIndexReader, Order, SegmentOrdinal,
SegmentComponent, SegmentOrdinal,
}; };
/// Segment's max doc must be `< MAX_DOC_LIMIT`. /// Segment's max doc must be `< MAX_DOC_LIMIT`.
@@ -794,17 +793,16 @@ mod tests {
BytesFastFieldTestCollector, FastFieldTestCollector, TEST_COLLECTOR_WITH_SCORE, BytesFastFieldTestCollector, FastFieldTestCollector, TEST_COLLECTOR_WITH_SCORE,
}; };
use crate::collector::{Count, FacetCollector}; use crate::collector::{Count, FacetCollector};
use crate::index::Index; use crate::index::{Index, SegmentId};
use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery}; use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
use crate::schema::document::Value;
use crate::schema::{ use crate::schema::{
Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term, Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term,
TextFieldIndexing, INDEXED, TEXT, TextFieldIndexing, Value, INDEXED, TEXT,
}; };
use crate::time::OffsetDateTime; use crate::time::OffsetDateTime;
use crate::{ use crate::{
assert_nearly_equals, schema, DateTime, DocAddress, DocId, DocSet, IndexSettings, assert_nearly_equals, schema, DateTime, DocAddress, DocId, DocSet, IndexSettings,
IndexSortByField, IndexWriter, Order, Searcher, SegmentId, IndexSortByField, IndexWriter, Order, Searcher,
}; };
#[test] #[test]
@@ -911,15 +909,24 @@ mod tests {
} }
{ {
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 0))?; let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 0))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("af b")); assert_eq!(
doc.get_first(text_field).unwrap().as_value().as_str(),
Some("af b")
);
} }
{ {
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 1))?; let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 1))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c")); assert_eq!(
doc.get_first(text_field).unwrap().as_value().as_str(),
Some("a b c")
);
} }
{ {
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 2))?; let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 2))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c d")); assert_eq!(
doc.get_first(text_field).unwrap().as_value().as_str(),
Some("a b c d")
);
} }
{ {
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 3))?; let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 3))?;

View File

@@ -3,15 +3,15 @@ mod tests {
use crate::collector::TopDocs; use crate::collector::TopDocs;
use crate::fastfield::AliveBitSet; use crate::fastfield::AliveBitSet;
use crate::index::Index; use crate::index::Index;
use crate::postings::Postings;
use crate::query::QueryParser; use crate::query::QueryParser;
use crate::schema::document::Value;
use crate::schema::{ use crate::schema::{
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions, self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
TextFieldIndexing, TextOptions, TextFieldIndexing, TextOptions, Value,
}; };
use crate::{ use crate::{
DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, Postings, DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, TantivyDocument,
TantivyDocument, Term, Term,
}; };
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index { fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
@@ -280,13 +280,16 @@ mod tests {
.doc::<TantivyDocument>(DocAddress::new(0, blubber_pos)) .doc::<TantivyDocument>(DocAddress::new(0, blubber_pos))
.unwrap(); .unwrap();
assert_eq!( assert_eq!(
doc.get_first(my_text_field).unwrap().as_str(), doc.get_first(my_text_field).unwrap().as_value().as_str(),
Some("blubber") Some("blubber")
); );
let doc = searcher let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0)) .doc::<TantivyDocument>(DocAddress::new(0, 0))
.unwrap(); .unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1000)); assert_eq!(
doc.get_first(int_field).unwrap().as_value().as_u64(),
Some(1000)
);
} }
} }

View File

@@ -182,7 +182,7 @@ mod tests_mmap {
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer
.add_document(doc!(field=>json!({format!("{field_name_in}"): "test1"}))) .add_document(doc!(field=>json!({format!("{field_name_in}"): "test1", format!("num{field_name_in}"): 10})))
.unwrap(); .unwrap();
index_writer index_writer
.add_document(doc!(field=>json!({format!("a{field_name_in}"): "test2"}))) .add_document(doc!(field=>json!({format!("a{field_name_in}"): "test2"})))
@@ -216,7 +216,7 @@ mod tests_mmap {
let test_query = |query_str: &str| { let test_query = |query_str: &str| {
let query = parse_query.parse_query(query_str).unwrap(); let query = parse_query.parse_query(query_str).unwrap();
let num_docs = searcher.search(&query, &Count).unwrap(); let num_docs = searcher.search(&query, &Count).unwrap();
assert_eq!(num_docs, 1, "{}", query_str); assert_eq!(num_docs, 1, "{query_str}");
}; };
test_query(format!("json.{field_name_out}:test1").as_str()); test_query(format!("json.{field_name_out}:test1").as_str());
test_query(format!("json.a{field_name_out}:test2").as_str()); test_query(format!("json.a{field_name_out}:test2").as_str());
@@ -260,6 +260,64 @@ mod tests_mmap {
"test6", "test6",
); );
test_agg(format!("json.{field_name_out}a").as_str(), "test7"); test_agg(format!("json.{field_name_out}a").as_str(), "test7");
// `.` is stored as `\u{0001}` internally in tantivy
let field_name_out_internal = if field_name_out == "." {
"\u{0001}"
} else {
field_name_out
};
let mut fields = reader.searcher().segment_readers()[0]
.inverted_index(field)
.unwrap()
.list_encoded_fields()
.unwrap();
assert_eq!(fields.len(), 8);
fields.sort();
let mut expected_fields = vec![
(format!("a{field_name_out_internal}"), Type::Str),
(format!("a{field_name_out_internal}a"), Type::Str),
(
format!("a{field_name_out_internal}a{field_name_out_internal}"),
Type::Str,
),
(
format!("a{field_name_out_internal}\u{1}ab{field_name_out_internal}"),
Type::Str,
),
(
format!("a{field_name_out_internal}\u{1}a{field_name_out_internal}"),
Type::Str,
),
(format!("{field_name_out_internal}a"), Type::Str),
(format!("{field_name_out_internal}"), Type::Str),
(format!("num{field_name_out_internal}"), Type::I64),
];
expected_fields.sort();
assert_eq!(fields, expected_fields);
// Check columnar reader
let mut columns = reader.searcher().segment_readers()[0]
.fast_fields()
.columnar()
.list_columns()
.unwrap()
.into_iter()
.map(|(name, _)| name)
.collect::<Vec<_>>();
let mut expected_columns = vec![
format!("json\u{1}{field_name_out_internal}"),
format!("json\u{1}{field_name_out_internal}a"),
format!("json\u{1}a{field_name_out_internal}"),
format!("json\u{1}a{field_name_out_internal}a"),
format!("json\u{1}a{field_name_out_internal}a{field_name_out_internal}"),
format!("json\u{1}a{field_name_out_internal}\u{1}ab{field_name_out_internal}"),
format!("json\u{1}a{field_name_out_internal}\u{1}a{field_name_out_internal}"),
format!("json\u{1}num{field_name_out_internal}"),
];
columns.sort();
expected_columns.sort();
assert_eq!(columns, expected_columns);
} }
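The listing of encoded fields and fast-field columns above relies on tantivy's internal JSON path encoding: path segments are joined with the `\u{1}` byte, a literal `.` in a key maps to the same separator when expand-dots is enabled, and column names are additionally prefixed with the JSON field name. A small hedged sketch of assembling such a column name (the helper is hypothetical; only the `\u{1}` separator is taken from the test above):

/// Hypothetical helper: build the internal fast-field column name for a JSON path,
/// assuming `\u{1}` is the path separator exercised by the test above.
fn json_column_name(json_field_name: &str, path_segments: &[&str]) -> String {
    let mut name = String::from(json_field_name);
    for segment in path_segments {
        name.push('\u{1}');
        name.push_str(segment);
    }
    name
}

fn main() {
    assert_eq!(
        json_column_name("json", &["attributes", "color"]),
        "json\u{1}attributes\u{1}color"
    );
}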
#[test] #[test]
@@ -532,10 +590,10 @@ mod tests_mmap {
let query_parser = QueryParser::for_index(&index, vec![]); let query_parser = QueryParser::for_index(&index, vec![]);
// Test if field name can be queried // Test if field name can be queried
for (indexed_field, val) in fields_and_vals.iter() { for (indexed_field, val) in fields_and_vals.iter() {
let query_str = &format!("{}:{}", indexed_field, val); let query_str = &format!("{indexed_field}:{val}");
let query = query_parser.parse_query(query_str).unwrap(); let query = query_parser.parse_query(query_str).unwrap();
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap(); let count_docs = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap();
assert!(!count_docs.is_empty(), "{}:{}", indexed_field, val); assert!(!count_docs.is_empty(), "{indexed_field}:{val}");
} }
// Test if field name can be used for aggregation // Test if field name can be used for aggregation
for (field_name, val) in fields_and_vals.iter() { for (field_name, val) in fields_and_vals.iter() {


@@ -5,20 +5,20 @@ use tokenizer_api::BoxTokenStream;
 use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};
 use super::operation::AddOperation;
-use crate::core::json_utils::index_json_values;
 use crate::fastfield::FastFieldsWriter;
 use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
-use crate::index::Segment;
+use crate::index::{Segment, SegmentComponent};
 use crate::indexer::segment_serializer::SegmentSerializer;
+use crate::json_utils::{index_json_value, IndexingPositionsPerPath};
 use crate::postings::{
     compute_table_memory_size, serialize_postings, IndexingContext, IndexingPosition,
     PerFieldPostingsWriter, PostingsWriter,
 };
-use crate::schema::document::{Document, ReferenceValue, Value};
+use crate::schema::document::{Document, Value};
 use crate::schema::{FieldEntry, FieldType, Schema, Term, DATE_TIME_PRECISION_INDEXED};
 use crate::store::{StoreReader, StoreWriter};
 use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
-use crate::{DocId, Opstamp, SegmentComponent, TantivyError};
+use crate::{DocId, Opstamp, TantivyError};
/// Computes the initial size of the hash table. /// Computes the initial size of the hash table.
/// ///
@@ -68,6 +68,7 @@ pub struct SegmentWriter {
pub(crate) fast_field_writers: FastFieldsWriter, pub(crate) fast_field_writers: FastFieldsWriter,
pub(crate) fieldnorms_writer: FieldNormsWriter, pub(crate) fieldnorms_writer: FieldNormsWriter,
pub(crate) json_path_writer: JsonPathWriter, pub(crate) json_path_writer: JsonPathWriter,
pub(crate) json_positions_per_path: IndexingPositionsPerPath,
pub(crate) doc_opstamps: Vec<Opstamp>, pub(crate) doc_opstamps: Vec<Opstamp>,
per_field_text_analyzers: Vec<TextAnalyzer>, per_field_text_analyzers: Vec<TextAnalyzer>,
term_buffer: Term, term_buffer: Term,
@@ -119,6 +120,7 @@ impl SegmentWriter {
per_field_postings_writers, per_field_postings_writers,
fieldnorms_writer: FieldNormsWriter::for_schema(&schema), fieldnorms_writer: FieldNormsWriter::for_schema(&schema),
json_path_writer: JsonPathWriter::default(), json_path_writer: JsonPathWriter::default(),
json_positions_per_path: IndexingPositionsPerPath::default(),
segment_serializer, segment_serializer,
fast_field_writers: FastFieldsWriter::from_schema_and_tokenizer_manager( fast_field_writers: FastFieldsWriter::from_schema_and_tokenizer_manager(
&schema, &schema,
@@ -204,8 +206,7 @@ impl SegmentWriter {
// Used to help with linting and type checking. // Used to help with linting and type checking.
let value = value_access as D::Value<'_>; let value = value_access as D::Value<'_>;
let facet = value.as_facet().ok_or_else(make_schema_error)?; let facet_str = value.as_facet().ok_or_else(make_schema_error)?;
let facet_str = facet.encoded_str();
let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str); let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
let mut indexing_position = IndexingPosition::default(); let mut indexing_position = IndexingPosition::default();
postings_writer.index_text( postings_writer.index_text(
@@ -228,7 +229,7 @@ impl SegmentWriter {
&mut self.per_field_text_analyzers[field.field_id() as usize]; &mut self.per_field_text_analyzers[field.field_id() as usize];
text_analyzer.token_stream(text) text_analyzer.token_stream(text)
} else if let Some(tok_str) = value.as_pre_tokenized_text() { } else if let Some(tok_str) = value.as_pre_tokenized_text() {
BoxTokenStream::new(PreTokenizedStream::from(tok_str.clone())) BoxTokenStream::new(PreTokenizedStream::from(*tok_str.clone()))
} else { } else {
continue; continue;
}; };
@@ -342,26 +343,24 @@ impl SegmentWriter {
             FieldType::JsonObject(json_options) => {
                 let text_analyzer =
                     &mut self.per_field_text_analyzers[field.field_id() as usize];
-                let json_values_it = values.map(|value_access| {
-                    // Used to help with linting and type checking.
-                    let value_access = value_access as D::Value<'_>;
-                    let value = value_access.as_value();
-                    match value {
-                        ReferenceValue::Object(object_iter) => Ok(object_iter),
-                        _ => Err(make_schema_error()),
-                    }
-                });
-                index_json_values::<D::Value<'_>>(
-                    doc_id,
-                    json_values_it,
-                    text_analyzer,
-                    json_options.is_expand_dots_enabled(),
-                    term_buffer,
-                    postings_writer,
-                    &mut self.json_path_writer,
-                    ctx,
-                )?;
+                self.json_positions_per_path.clear();
+                self.json_path_writer
+                    .set_expand_dots(json_options.is_expand_dots_enabled());
+                for json_value in values {
+                    self.json_path_writer.clear();
+                    index_json_value(
+                        doc_id,
+                        json_value,
+                        text_analyzer,
+                        term_buffer,
+                        &mut self.json_path_writer,
+                        postings_writer,
+                        ctx,
+                        &mut self.json_positions_per_path,
+                    );
+                }
             }
             FieldType::IpAddr(_) => {
                 let mut num_vals = 0;
@@ -498,19 +497,19 @@ mod tests {
use crate::collector::{Count, TopDocs}; use crate::collector::{Count, TopDocs};
use crate::directory::RamDirectory; use crate::directory::RamDirectory;
use crate::fastfield::FastValue; use crate::fastfield::FastValue;
use crate::postings::TermInfo; use crate::postings::{Postings, TermInfo};
use crate::query::{PhraseQuery, QueryParser}; use crate::query::{PhraseQuery, QueryParser};
use crate::schema::document::Value;
use crate::schema::{ use crate::schema::{
Document, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, STORED, STRING, TEXT, Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, Value,
STORED, STRING, TEXT,
}; };
use crate::store::{Compressor, StoreReader, StoreWriter}; use crate::store::{Compressor, StoreReader, StoreWriter};
use crate::time::format_description::well_known::Rfc3339; use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime; use crate::time::OffsetDateTime;
use crate::tokenizer::{PreTokenizedString, Token}; use crate::tokenizer::{PreTokenizedString, Token};
use crate::{ use crate::{
DateTime, Directory, DocAddress, DocSet, Index, IndexWriter, Postings, TantivyDocument, DateTime, Directory, DocAddress, DocSet, Index, IndexWriter, TantivyDocument, Term,
Term, TERMINATED, TERMINATED,
}; };
#[test] #[test]
@@ -555,9 +554,15 @@ mod tests {
let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap(); let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap();
let doc = reader.get::<TantivyDocument>(0).unwrap(); let doc = reader.get::<TantivyDocument>(0).unwrap();
assert_eq!(doc.field_values().len(), 2); assert_eq!(doc.field_values().count(), 2);
assert_eq!(doc.field_values()[0].value().as_str(), Some("A")); assert_eq!(
assert_eq!(doc.field_values()[1].value().as_str(), Some("title")); doc.get_all(text_field).next().unwrap().as_value().as_str(),
Some("A")
);
assert_eq!(
doc.get_all(text_field).nth(1).unwrap().as_value().as_str(),
Some("title")
);
} }
#[test] #[test]
fn test_simple_json_indexing() { fn test_simple_json_indexing() {
@@ -597,12 +602,51 @@ mod tests {
assert_eq!(score_docs.len(), 2); assert_eq!(score_docs.len(), 2);
} }
#[test]
fn test_flat_json_indexing() {
// A JSON Object that contains mixed values on the first level
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STORED | STRING);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let mut writer = index.writer_for_tests().unwrap();
// Text, i64, u64
writer.add_document(doc!(json_field=>"b")).unwrap();
writer
.add_document(doc!(json_field=>OwnedValue::I64(10i64)))
.unwrap();
writer
.add_document(doc!(json_field=>OwnedValue::U64(55u64)))
.unwrap();
writer
.add_document(doc!(json_field=>json!({"my_field": "a"})))
.unwrap();
writer.commit().unwrap();
let search_and_expect = |query| {
let query_parser = QueryParser::for_index(&index, vec![json_field]);
let text_query = query_parser.parse_query(query).unwrap();
let score_docs: Vec<(_, DocAddress)> = index
.reader()
.unwrap()
.searcher()
.search(&text_query, &TopDocs::with_limit(4))
.unwrap();
assert_eq!(score_docs.len(), 1);
};
search_and_expect("my_field:a");
search_and_expect("b");
search_and_expect("10");
search_and_expect("55");
}
#[test] #[test]
fn test_json_indexing() { fn test_json_indexing() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STORED | TEXT); let json_field = schema_builder.add_json_field("json", STORED | TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str( let json_val: serde_json::Value = serde_json::from_str(
r#"{ r#"{
"toto": "titi", "toto": "titi",
"float": -0.2, "float": -0.2,
@@ -630,14 +674,10 @@ mod tests {
doc_id: 0u32, doc_id: 0u32,
}) })
.unwrap(); .unwrap();
-    let serdeser_json_val = serde_json::from_str::<serde_json::Map<String, serde_json::Value>>(
-        &doc.to_json(&schema),
-    )
-    .unwrap()
-    .get("json")
-    .unwrap()[0]
-        .as_object()
-        .unwrap()
-        .clone();
+    let serdeser_json_val = serde_json::from_str::<serde_json::Value>(&doc.to_json(&schema))
+        .unwrap()
+        .get("json")
+        .unwrap()[0]
+        .clone();
assert_eq!(json_val, serdeser_json_val); assert_eq!(json_val, serdeser_json_val);
let segment_reader = searcher.segment_reader(0u32); let segment_reader = searcher.segment_reader(0u32);
@@ -801,7 +841,7 @@ mod tests {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STRING); let json_field = schema_builder.add_json_field("json", STRING);
let schema = schema_builder.build(); let schema = schema_builder.build();
let json_val: serde_json::Map<String, serde_json::Value> = let json_val: serde_json::Value =
serde_json::from_str(r#"{"mykey": "two tokens"}"#).unwrap(); serde_json::from_str(r#"{"mykey": "two tokens"}"#).unwrap();
let doc = doc!(json_field=>json_val); let doc = doc!(json_field=>json_val);
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
@@ -841,7 +881,7 @@ mod tests {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT); let json_field = schema_builder.add_json_field("json", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str( let json_val: serde_json::Value = serde_json::from_str(
r#"{"mykey": [{"field": "hello happy tax payer"}, {"field": "nothello"}]}"#, r#"{"mykey": [{"field": "hello happy tax payer"}, {"field": "nothello"}]}"#,
) )
.unwrap(); .unwrap();


@@ -216,11 +216,6 @@ use once_cell::sync::Lazy;
 use serde::{Deserialize, Serialize};
 pub use self::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED};
-#[deprecated(
-    since = "0.22.0",
-    note = "Will be removed in tantivy 0.23. Use export from snippet module instead"
-)]
-pub use self::snippet::{Snippet, SnippetGenerator};
 #[doc(hidden)]
 pub use crate::core::json_utils;
 pub use crate::core::{Executor, Searcher, SearcherGeneration};
@@ -228,16 +223,10 @@ pub use crate::directory::Directory;
 #[allow(deprecated)] // Remove with index sorting
 pub use crate::index::{
     Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader, Order,
-    Segment, SegmentComponent, SegmentId, SegmentMeta, SegmentReader,
+    Segment, SegmentMeta, SegmentReader,
 };
-#[deprecated(
-    since = "0.22.0",
-    note = "Will be removed in tantivy 0.23. Use export from indexer module instead"
-)]
-pub use crate::indexer::PreparedCommit;
 pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
-pub use crate::postings::Postings;
-pub use crate::schema::{DateOptions, DateTimePrecision, Document, TantivyDocument, Term};
+pub use crate::schema::{Document, TantivyDocument, Term};
 /// Index format version.
 const INDEX_FORMAT_VERSION: u32 = 6;
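With the deprecated root re-exports gone, downstream code has to import these items from their home modules. A hedged sketch of the updated import paths (module locations as indicated by the deprecation notes removed above; double-check against your tantivy version):

// Previously available at the crate root, now imported from their modules:
use tantivy::index::{SegmentComponent, SegmentId};
use tantivy::indexer::PreparedCommit;
use tantivy::postings::Postings;
use tantivy::snippet::{Snippet, SnippetGenerator};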
@@ -392,9 +381,10 @@ pub mod tests {
use crate::docset::{DocSet, TERMINATED}; use crate::docset::{DocSet, TERMINATED};
use crate::index::SegmentReader; use crate::index::SegmentReader;
use crate::merge_policy::NoMergePolicy; use crate::merge_policy::NoMergePolicy;
use crate::postings::Postings;
use crate::query::BooleanQuery; use crate::query::BooleanQuery;
use crate::schema::*; use crate::schema::*;
use crate::{DateTime, DocAddress, Index, IndexWriter, Postings, ReloadPolicy}; use crate::{DateTime, DocAddress, Index, IndexWriter, ReloadPolicy};
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() { pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new(); let mut buffer = Vec::new();
@@ -446,7 +436,6 @@ pub mod tests {
} }
#[test] #[test]
#[cfg(not(feature = "lz4"))]
fn test_version_string() { fn test_version_string() {
use regex::Regex; use regex::Regex;
let regex_ptn = Regex::new( let regex_ptn = Regex::new(
@@ -946,7 +935,7 @@ pub mod tests {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STORED | TEXT); let json_field = schema_builder.add_json_field("json", STORED | TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str( let json_val: serde_json::Value = serde_json::from_str(
r#"{ r#"{
"signed": 2, "signed": 2,
"float": 2.0, "float": 2.0,
@@ -1036,13 +1025,16 @@ pub mod tests {
             text_field => "some other value",
             other_text_field => "short");
         assert_eq!(document.len(), 3);
-        let values: Vec<&OwnedValue> = document.get_all(text_field).collect();
+        let values: Vec<OwnedValue> = document.get_all(text_field).map(OwnedValue::from).collect();
         assert_eq!(values.len(), 2);
-        assert_eq!(values[0].as_str(), Some("tantivy"));
-        assert_eq!(values[1].as_str(), Some("some other value"));
-        let values: Vec<&OwnedValue> = document.get_all(other_text_field).collect();
+        assert_eq!(values[0].as_ref().as_str(), Some("tantivy"));
+        assert_eq!(values[1].as_ref().as_str(), Some("some other value"));
+        let values: Vec<OwnedValue> = document
+            .get_all(other_text_field)
+            .map(OwnedValue::from)
+            .collect();
         assert_eq!(values.len(), 1);
-        assert_eq!(values[0].as_str(), Some("short"));
+        assert_eq!(values[0].as_ref().as_str(), Some("short"));
     }
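Since `get_all` now yields borrowed `CompactDocValue`s instead of `&OwnedValue`, callers that want owned, comparable values convert them explicitly, as in the test above. A minimal sketch of that conversion (the helper function is hypothetical):

use tantivy::schema::{Field, OwnedValue};
use tantivy::TantivyDocument;

/// Hypothetical helper: collect owned copies of every value stored under `field`.
fn owned_values(doc: &TantivyDocument, field: Field) -> Vec<OwnedValue> {
    doc.get_all(field).map(OwnedValue::from).collect()
}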
#[test] #[test]
@@ -1109,9 +1101,9 @@ pub mod tests {
#[test] #[test]
fn test_update_via_delete_insert() -> crate::Result<()> { fn test_update_via_delete_insert() -> crate::Result<()> {
use crate::collector::Count; use crate::collector::Count;
use crate::index::SegmentId;
use crate::indexer::NoMergePolicy; use crate::indexer::NoMergePolicy;
use crate::query::AllQuery; use crate::query::AllQuery;
use crate::SegmentId;
const DOC_COUNT: u64 = 2u64; const DOC_COUNT: u64 = 2u64;


@@ -41,6 +41,7 @@
 /// );
 /// # }
 /// ```
 #[macro_export]
 macro_rules! doc(
     () => {
@@ -52,7 +53,7 @@ macro_rules! doc(
         {
             let mut document = $crate::TantivyDocument::default();
             $(
-                document.add_field_value($field, $value);
+                document.add_field_value($field, &$value);
             )*
             document
         }
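Because the macro now passes `&$value` and `add_field_value` accepts any `V: Value<'_>`, plain literals and owned values keep working unchanged from the caller's perspective. A hedged usage sketch (schema and field names invented for illustration):

use tantivy::doc;
use tantivy::schema::{Schema, STORED, TEXT};

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let count = schema_builder.add_u64_field("count", STORED);
    let _schema = schema_builder.build();

    // The macro borrows the values internally (`&$value`), so literals still work.
    let document = doc!(
        title => "Of Mice and Men",
        count => 42u64
    );
    assert_eq!(document.len(), 2);
}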


@@ -56,7 +56,7 @@ pub struct InvertedIndexSerializer {
impl InvertedIndexSerializer { impl InvertedIndexSerializer {
/// Open a new `InvertedIndexSerializer` for the given segment /// Open a new `InvertedIndexSerializer` for the given segment
pub fn open(segment: &mut Segment) -> crate::Result<InvertedIndexSerializer> { pub fn open(segment: &mut Segment) -> crate::Result<InvertedIndexSerializer> {
use crate::SegmentComponent::{Positions, Postings, Terms}; use crate::index::SegmentComponent::{Positions, Postings, Terms};
let inv_index_serializer = InvertedIndexSerializer { let inv_index_serializer = InvertedIndexSerializer {
terms_write: CompositeWrite::wrap(segment.open_write(Terms)?), terms_write: CompositeWrite::wrap(segment.open_write(Terms)?),
postings_write: CompositeWrite::wrap(segment.open_write(Postings)?), postings_write: CompositeWrite::wrap(segment.open_write(Postings)?),


@@ -1,8 +1,9 @@
use super::Scorer; use super::Scorer;
use crate::docset::TERMINATED; use crate::docset::TERMINATED;
use crate::index::SegmentReader;
use crate::query::explanation::does_not_match; use crate::query::explanation::does_not_match;
use crate::query::{EnableScoring, Explanation, Query, Weight}; use crate::query::{EnableScoring, Explanation, Query, Weight};
use crate::{DocId, DocSet, Score, Searcher, SegmentReader}; use crate::{DocId, DocSet, Score, Searcher};
/// `EmptyQuery` is a dummy `Query` in which no document matches. /// `EmptyQuery` is a dummy `Query` in which no document matches.
/// ///


@@ -138,8 +138,7 @@ impl FuzzyTermQuery {
if json_path_type != Type::Str { if json_path_type != Type::Str {
return Err(InvalidArgument(format!( return Err(InvalidArgument(format!(
"The fuzzy term query requires a string path type for a json term. Found \ "The fuzzy term query requires a string path type for a json term. Found \
{:?}", {json_path_type:?}"
json_path_type
))); )));
} }
} }


@@ -180,7 +180,7 @@ impl MoreLikeThis {
let facets: Vec<&str> = values let facets: Vec<&str> = values
.iter() .iter()
.map(|value| { .map(|value| {
value.as_facet().map(|f| f.encoded_str()).ok_or_else(|| { value.as_facet().ok_or_else(|| {
TantivyError::InvalidArgument("invalid field value".to_string()) TantivyError::InvalidArgument("invalid field value".to_string())
}) })
}) })
@@ -220,7 +220,7 @@ impl MoreLikeThis {
let mut token_stream = tokenizer.token_stream(text); let mut token_stream = tokenizer.token_stream(text);
token_stream.process(sink); token_stream.process(sink);
} else if let Some(tok_str) = value.as_pre_tokenized_text() { } else if let Some(tok_str) = value.as_pre_tokenized_text() {
let mut token_stream = PreTokenizedStream::from(tok_str.clone()); let mut token_stream = PreTokenizedStream::from(*tok_str.clone());
token_stream.process(sink); token_stream.process(sink);
} }
} }


@@ -174,7 +174,7 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
     }
     fn size_hint(&self) -> u32 {
-        0 // heuristic possible by checking number of hits when fetching a block
+        self.column.num_docs()
     }
 }


@@ -185,7 +185,7 @@ mod test {
Err(crate::TantivyError::InvalidArgument(msg)) => { Err(crate::TantivyError::InvalidArgument(msg)) => {
assert!(msg.contains("error: unclosed group")) assert!(msg.contains("error: unclosed group"))
} }
res => panic!("unexpected result: {:?}", res), res => panic!("unexpected result: {res:?}"),
} }
} }
} }


@@ -127,6 +127,7 @@ impl Scorer for TermScorer {
mod tests { mod tests {
use proptest::prelude::*; use proptest::prelude::*;
use crate::index::SegmentId;
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN; use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::merge_policy::NoMergePolicy; use crate::merge_policy::NoMergePolicy;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE; use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
@@ -134,8 +135,7 @@ mod tests {
use crate::query::{Bm25Weight, EnableScoring, Scorer, TermQuery}; use crate::query::{Bm25Weight, EnableScoring, Scorer, TermQuery};
use crate::schema::{IndexRecordOption, Schema, TEXT}; use crate::schema::{IndexRecordOption, Schema, TEXT};
use crate::{ use crate::{
assert_nearly_equals, DocId, DocSet, Index, IndexWriter, Score, Searcher, SegmentId, Term, assert_nearly_equals, DocId, DocSet, Index, IndexWriter, Score, Searcher, Term, TERMINATED,
TERMINATED,
}; };
#[test] #[test]


@@ -179,9 +179,10 @@ mod tests {
use super::Warmer; use super::Warmer;
use crate::core::searcher::SearcherGeneration; use crate::core::searcher::SearcherGeneration;
use crate::directory::RamDirectory; use crate::directory::RamDirectory;
use crate::index::SegmentId;
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN; use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::schema::{Schema, INDEXED}; use crate::schema::{Schema, INDEXED};
use crate::{Index, IndexSettings, ReloadPolicy, Searcher, SegmentId}; use crate::{Index, IndexSettings, ReloadPolicy, Searcher};
#[derive(Default)] #[derive(Default)]
struct TestWarmer { struct TestWarmer {


@@ -873,7 +873,7 @@ mod tests {
         );
         let facet = Facet::from_text("/hello/world").unwrap();
-        let result = serialize_value(ReferenceValueLeaf::Facet(&facet).into());
+        let result = serialize_value(ReferenceValueLeaf::Facet(facet.encoded_str()).into());
         let value = deserialize_value(result);
         assert_eq!(value, crate::schema::OwnedValue::Facet(facet));
@@ -881,7 +881,8 @@
             text: "hello, world".to_string(),
             tokens: vec![Token::default(), Token::default()],
         };
-        let result = serialize_value(ReferenceValueLeaf::PreTokStr(&pre_tok_str).into());
+        let result =
+            serialize_value(ReferenceValueLeaf::PreTokStr(pre_tok_str.clone().into()).into());
         let value = deserialize_value(result);
         assert_eq!(value, crate::schema::OwnedValue::PreTokStr(pre_tok_str));
     }


@@ -1,93 +1,64 @@
 use std::collections::{BTreeMap, HashMap, HashSet};
+use std::io::{self, Read, Write};
 use std::net::Ipv6Addr;
-use common::DateTime;
+use columnar::MonotonicallyMappableToU128;
+use common::{read_u32_vint_no_advance, serialize_vint_u32, BinarySerializable, DateTime, VInt};
 use serde_json::Map;
+pub use CompactDoc as TantivyDocument;
+use super::{ReferenceValue, ReferenceValueLeaf, Value};
 use crate::schema::document::{
     DeserializeError, Document, DocumentDeserialize, DocumentDeserializer,
 };
 use crate::schema::field_type::ValueParsingError;
-use crate::schema::field_value::FieldValueIter;
-use crate::schema::{Facet, Field, FieldValue, NamedFieldDocument, OwnedValue, Schema};
+use crate::schema::{Facet, Field, NamedFieldDocument, OwnedValue, Schema};
 use crate::tokenizer::PreTokenizedString;
-/// TantivyDocument provides a default implementation of the `Document` trait.
-/// It is the object that can be indexed and then searched for.
-///
-/// Documents are fundamentally a collection of unordered couples `(field, value)`.
-/// In this list, one field may appear more than once.
-#[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Default)]
-pub struct TantivyDocument {
-    field_values: Vec<FieldValue>,
-}
-impl Document for TantivyDocument {
-    type Value<'a> = &'a OwnedValue;
-    type FieldsValuesIter<'a> = FieldValueIter<'a>;
-    fn iter_fields_and_values(&self) -> Self::FieldsValuesIter<'_> {
-        FieldValueIter(self.field_values.iter())
-    }
-}
+#[repr(packed)]
+#[derive(Debug, Clone)]
+/// A field value pair in the compact tantivy document
+struct FieldValueAddr {
+    pub field: u16,
+    pub value_addr: ValueAddr,
+}
+#[derive(Debug, Clone)]
+/// The default document in tantivy. It encodes data in a compact form.
+pub struct CompactDoc {
+    /// `node_data` is a vec of bytes, where each value is serialized into bytes and stored. It
+    /// includes all the data of the document and also metadata like where the nodes are located
+    /// in an object or array.
+    pub node_data: Vec<u8>,
+    /// The root (Field, Value) pairs
+    field_values: Vec<FieldValueAddr>,
+}
+impl Default for CompactDoc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
impl DocumentDeserialize for TantivyDocument { impl CompactDoc {
fn deserialize<'de, D>(mut deserializer: D) -> Result<Self, DeserializeError>
where D: DocumentDeserializer<'de> {
let mut field_values = Vec::with_capacity(deserializer.size_hint());
while let Some((field, value)) = deserializer.next_field()? {
field_values.push(FieldValue::new(field, value));
}
Ok(Self { field_values })
}
}
impl From<Vec<FieldValue>> for TantivyDocument {
fn from(field_values: Vec<FieldValue>) -> Self {
Self { field_values }
}
}
impl PartialEq for TantivyDocument {
fn eq(&self, other: &Self) -> bool {
// super slow, but only here for tests
let convert_to_comparable_map = |field_values: &[FieldValue]| {
let mut field_value_set: HashMap<Field, HashSet<String>> = Default::default();
for field_value in field_values.iter() {
let value = serde_json::to_string(field_value.value()).unwrap();
field_value_set
.entry(field_value.field())
.or_default()
.insert(value);
}
field_value_set
};
let self_field_values: HashMap<Field, HashSet<String>> =
convert_to_comparable_map(&self.field_values);
let other_field_values: HashMap<Field, HashSet<String>> =
convert_to_comparable_map(&other.field_values);
self_field_values.eq(&other_field_values)
}
}
impl Eq for TantivyDocument {}
impl IntoIterator for TantivyDocument {
type Item = FieldValue;
type IntoIter = std::vec::IntoIter<FieldValue>;
fn into_iter(self) -> Self::IntoIter {
self.field_values.into_iter()
}
}
impl TantivyDocument {
/// Creates a new, empty document object /// Creates a new, empty document object
pub fn new() -> TantivyDocument { /// The reserved capacity is for the total serialized data
TantivyDocument::default() pub fn with_capacity(bytes: usize) -> CompactDoc {
CompactDoc {
node_data: Vec::with_capacity(bytes),
field_values: Vec::with_capacity(4),
}
}
/// Creates a new, empty document object
pub fn new() -> CompactDoc {
CompactDoc::with_capacity(1024)
}
/// Shrinks the capacity of the document to fit the data
pub fn shrink_to_fit(&mut self) {
self.node_data.shrink_to_fit();
self.field_values.shrink_to_fit();
} }
/// Returns the length of the document. /// Returns the length of the document.
@@ -99,83 +70,111 @@ impl TantivyDocument {
pub fn add_facet<F>(&mut self, field: Field, path: F) pub fn add_facet<F>(&mut self, field: Field, path: F)
where Facet: From<F> { where Facet: From<F> {
let facet = Facet::from(path); let facet = Facet::from(path);
let value = OwnedValue::Facet(facet); self.add_leaf_field_value(field, ReferenceValueLeaf::Facet(facet.encoded_str()));
self.add_field_value(field, value);
} }
/// Add a text field. /// Add a text field.
pub fn add_text<S: ToString>(&mut self, field: Field, text: S) { pub fn add_text<S: AsRef<str>>(&mut self, field: Field, text: S) {
let value = OwnedValue::Str(text.to_string()); self.add_leaf_field_value(field, ReferenceValueLeaf::Str(text.as_ref()));
self.add_field_value(field, value);
} }
/// Add a pre-tokenized text field. /// Add a pre-tokenized text field.
pub fn add_pre_tokenized_text(&mut self, field: Field, pre_tokenized_text: PreTokenizedString) { pub fn add_pre_tokenized_text(&mut self, field: Field, pre_tokenized_text: PreTokenizedString) {
self.add_field_value(field, pre_tokenized_text); self.add_leaf_field_value(field, pre_tokenized_text);
} }
/// Add a u64 field /// Add a u64 field
pub fn add_u64(&mut self, field: Field, value: u64) { pub fn add_u64(&mut self, field: Field, value: u64) {
self.add_field_value(field, value); self.add_leaf_field_value(field, value);
} }
/// Add an IP address field. Internally only Ipv6Addr is used.
pub fn add_ip_addr(&mut self, field: Field, value: Ipv6Addr) { pub fn add_ip_addr(&mut self, field: Field, value: Ipv6Addr) {
self.add_field_value(field, value); self.add_leaf_field_value(field, value);
} }
/// Add a i64 field /// Add a i64 field
pub fn add_i64(&mut self, field: Field, value: i64) { pub fn add_i64(&mut self, field: Field, value: i64) {
self.add_field_value(field, value); self.add_leaf_field_value(field, value);
} }
/// Add a f64 field /// Add a f64 field
pub fn add_f64(&mut self, field: Field, value: f64) { pub fn add_f64(&mut self, field: Field, value: f64) {
self.add_field_value(field, value); self.add_leaf_field_value(field, value);
} }
/// Add a bool field /// Add a bool field
pub fn add_bool(&mut self, field: Field, value: bool) { pub fn add_bool(&mut self, field: Field, value: bool) {
self.add_field_value(field, value); self.add_leaf_field_value(field, value);
} }
/// Add a date field with unspecified time zone offset /// Add a date field with unspecified time zone offset
pub fn add_date(&mut self, field: Field, value: DateTime) { pub fn add_date(&mut self, field: Field, value: DateTime) {
self.add_field_value(field, value); self.add_leaf_field_value(field, value);
} }
/// Add a bytes field /// Add a bytes field
pub fn add_bytes<T: Into<Vec<u8>>>(&mut self, field: Field, value: T) { pub fn add_bytes(&mut self, field: Field, value: &[u8]) {
self.add_field_value(field, value.into()); self.add_leaf_field_value(field, value);
} }
/// Add a dynamic object field /// Add a dynamic object field
pub fn add_object(&mut self, field: Field, object: BTreeMap<String, OwnedValue>) { pub fn add_object(&mut self, field: Field, object: BTreeMap<String, OwnedValue>) {
self.add_field_value(field, object); self.add_field_value(field, &OwnedValue::from(object));
} }
/// Add a (field, value) to the document. /// Add a (field, value) to the document.
pub fn add_field_value<T: Into<OwnedValue>>(&mut self, field: Field, typed_val: T) { ///
/// `OwnedValue` implements Value, which should be easiest to use, but is not the most
/// performant.
pub fn add_field_value<'a, V: Value<'a>>(&mut self, field: Field, value: V) {
let field_value = FieldValueAddr {
field: field
.field_id()
.try_into()
.expect("support only up to u16::MAX field ids"),
value_addr: self.add_value(value),
};
self.field_values.push(field_value);
}
/// Add a (field, leaf value) to the document.
/// Leaf values don't have nested values.
pub fn add_leaf_field_value<'a, T: Into<ReferenceValueLeaf<'a>>>(
&mut self,
field: Field,
typed_val: T,
) {
let value = typed_val.into(); let value = typed_val.into();
let field_value = FieldValue { field, value }; let field_value = FieldValueAddr {
field: field
.field_id()
.try_into()
.expect("support only up to u16::MAX field ids"),
value_addr: self.add_value_leaf(value),
};
self.field_values.push(field_value); self.field_values.push(field_value);
} }
/// field_values accessor /// field_values accessor
pub fn field_values(&self) -> &[FieldValue] { pub fn field_values(&self) -> impl Iterator<Item = (Field, CompactDocValue<'_>)> {
&self.field_values self.field_values.iter().map(|field_val| {
let field = Field::from_field_id(field_val.field as u32);
let val = self.get_compact_doc_value(field_val.value_addr);
(field, val)
})
} }
/// Returns all of the `FieldValue`s associated the given field /// Returns all of the `ReferenceValue`s associated the given field
pub fn get_all(&self, field: Field) -> impl Iterator<Item = &OwnedValue> { pub fn get_all(&self, field: Field) -> impl Iterator<Item = CompactDocValue<'_>> + '_ {
self.field_values self.field_values
.iter() .iter()
.filter(move |field_value| field_value.field() == field) .filter(move |field_value| Field::from_field_id(field_value.field as u32) == field)
.map(FieldValue::value) .map(|val| self.get_compact_doc_value(val.value_addr))
} }
/// Returns the first `FieldValue` associated the given field /// Returns the first `ReferenceValue` associated the given field
pub fn get_first(&self, field: Field) -> Option<&OwnedValue> { pub fn get_first(&self, field: Field) -> Option<CompactDocValue<'_>> {
self.get_all(field).next() self.get_all(field).next()
} }
@@ -183,12 +182,12 @@ impl TantivyDocument {
pub fn convert_named_doc( pub fn convert_named_doc(
schema: &Schema, schema: &Schema,
named_doc: NamedFieldDocument, named_doc: NamedFieldDocument,
) -> Result<TantivyDocument, DocParsingError> { ) -> Result<Self, DocParsingError> {
let mut document = TantivyDocument::new(); let mut document = Self::new();
for (field_name, values) in named_doc.0 { for (field_name, values) in named_doc.0 {
if let Ok(field) = schema.get_field(&field_name) { if let Ok(field) = schema.get_field(&field_name) {
for value in values { for value in values {
document.add_field_value(field, value); document.add_field_value(field, &value);
} }
} }
} }
@@ -196,7 +195,7 @@ impl TantivyDocument {
} }
/// Build a document object from a json-object. /// Build a document object from a json-object.
pub fn parse_json(schema: &Schema, doc_json: &str) -> Result<TantivyDocument, DocParsingError> { pub fn parse_json(schema: &Schema, doc_json: &str) -> Result<Self, DocParsingError> {
let json_obj: Map<String, serde_json::Value> = let json_obj: Map<String, serde_json::Value> =
serde_json::from_str(doc_json).map_err(|_| DocParsingError::invalid_json(doc_json))?; serde_json::from_str(doc_json).map_err(|_| DocParsingError::invalid_json(doc_json))?;
Self::from_json_object(schema, json_obj) Self::from_json_object(schema, json_obj)
@@ -206,8 +205,8 @@ impl TantivyDocument {
pub fn from_json_object( pub fn from_json_object(
schema: &Schema, schema: &Schema,
json_obj: Map<String, serde_json::Value>, json_obj: Map<String, serde_json::Value>,
) -> Result<TantivyDocument, DocParsingError> { ) -> Result<Self, DocParsingError> {
let mut doc = TantivyDocument::default(); let mut doc = Self::default();
for (field_name, json_value) in json_obj { for (field_name, json_value) in json_obj {
if let Ok(field) = schema.get_field(&field_name) { if let Ok(field) = schema.get_field(&field_name) {
let field_entry = schema.get_field_entry(field); let field_entry = schema.get_field_entry(field);
@@ -218,20 +217,482 @@ impl TantivyDocument {
let value = field_type let value = field_type
.value_from_json(json_item) .value_from_json(json_item)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?; .map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add_field_value(field, value); doc.add_field_value(field, &value);
} }
} }
_ => { _ => {
let value = field_type let value = field_type
.value_from_json(json_value) .value_from_json(json_value)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?; .map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add_field_value(field, value); doc.add_field_value(field, &value);
} }
} }
} }
} }
Ok(doc) Ok(doc)
} }
fn add_value_leaf(&mut self, leaf: ReferenceValueLeaf) -> ValueAddr {
let type_id = ValueType::from(&leaf);
// Write into `node_data` and return u32 position as its address
// Null and bool are inlined into the address
let val_addr = match leaf {
ReferenceValueLeaf::Null => 0,
ReferenceValueLeaf::Str(bytes) => {
write_bytes_into(&mut self.node_data, bytes.as_bytes())
}
ReferenceValueLeaf::Facet(bytes) => {
write_bytes_into(&mut self.node_data, bytes.as_bytes())
}
ReferenceValueLeaf::Bytes(bytes) => write_bytes_into(&mut self.node_data, bytes),
ReferenceValueLeaf::U64(num) => write_into(&mut self.node_data, num),
ReferenceValueLeaf::I64(num) => write_into(&mut self.node_data, num),
ReferenceValueLeaf::F64(num) => write_into(&mut self.node_data, num),
ReferenceValueLeaf::Bool(b) => b as u32,
ReferenceValueLeaf::Date(date) => {
write_into(&mut self.node_data, date.into_timestamp_nanos())
}
ReferenceValueLeaf::IpAddr(num) => write_into(&mut self.node_data, num.to_u128()),
ReferenceValueLeaf::PreTokStr(pre_tok) => write_into(&mut self.node_data, *pre_tok),
};
ValueAddr { type_id, val_addr }
}
/// Adds a value and returns its address in the document's `node_data`
fn add_value<'a, V: Value<'a>>(&mut self, value: V) -> ValueAddr {
let value = value.as_value();
let type_id = ValueType::from(&value);
match value {
ReferenceValue::Leaf(leaf) => self.add_value_leaf(leaf),
ReferenceValue::Array(elements) => {
// addresses of the elements in node_data
// Reusing a vec would be nicer, but it's not easy because of the recursion
// A global vec would work if every writer gets its own discriminator
let mut addresses = Vec::new();
for elem in elements {
let value_addr = self.add_value(elem);
write_into(&mut addresses, value_addr);
}
ValueAddr {
type_id,
val_addr: write_bytes_into(&mut self.node_data, &addresses),
}
}
ReferenceValue::Object(entries) => {
// addresses of the elements in node_data
let mut addresses = Vec::new();
for (key, value) in entries {
let key_addr = self.add_value_leaf(ReferenceValueLeaf::Str(key));
let value_addr = self.add_value(value);
write_into(&mut addresses, key_addr);
write_into(&mut addresses, value_addr);
}
ValueAddr {
type_id,
val_addr: write_bytes_into(&mut self.node_data, &addresses),
}
}
}
}
/// Get CompactDocValue for address
fn get_compact_doc_value(&self, value_addr: ValueAddr) -> CompactDocValue<'_> {
CompactDocValue {
container: self,
value_addr,
}
}
/// get &[u8] reference from node_data
fn extract_bytes(&self, addr: Addr) -> &[u8] {
binary_deserialize_bytes(self.get_slice(addr))
}
/// get &str reference from node_data
fn extract_str(&self, addr: Addr) -> &str {
let data = self.extract_bytes(addr);
// Utf-8 checks would have a noticeable performance overhead here
unsafe { std::str::from_utf8_unchecked(data) }
}
/// deserialized owned value from node_data
fn read_from<T: BinarySerializable>(&self, addr: Addr) -> io::Result<T> {
let data_slice = &self.node_data[addr as usize..];
let mut cursor = std::io::Cursor::new(data_slice);
T::deserialize(&mut cursor)
}
/// get slice from address. The returned slice is open ended
fn get_slice(&self, addr: Addr) -> &[u8] {
&self.node_data[addr as usize..]
}
}
/// BinarySerializable alternative to read references
fn binary_deserialize_bytes(data: &[u8]) -> &[u8] {
let (len, bytes_read) = read_u32_vint_no_advance(data);
&data[bytes_read..bytes_read + len as usize]
}
/// Write bytes and return the position of the written data.
///
/// BinarySerializable alternative to write references
fn write_bytes_into(vec: &mut Vec<u8>, data: &[u8]) -> u32 {
let pos = vec.len() as u32;
let mut buf = [0u8; 8];
let len_vint_bytes = serialize_vint_u32(data.len() as u32, &mut buf);
vec.extend_from_slice(len_vint_bytes);
vec.extend_from_slice(data);
pos
}
/// Serialize and return the position
fn write_into<T: BinarySerializable>(vec: &mut Vec<u8>, value: T) -> u32 {
let pos = vec.len() as u32;
value.serialize(vec).unwrap();
pos
}
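All payloads end up in the single `node_data` buffer: strings, facets, and byte blobs are written as a vint length prefix followed by the bytes, numbers use their fixed binary encoding, and bool/null are inlined into the address itself; the returned `u32` offset is the value's address. A standalone, simplified sketch of that length-prefixed layout (not the tantivy API; a one-byte length stands in for the real vint):

/// Simplified stand-in for `write_bytes_into`: append a length-prefixed blob
/// and return its starting offset, which acts as the value "address".
fn append_blob(buf: &mut Vec<u8>, data: &[u8]) -> u32 {
    let addr = buf.len() as u32;
    assert!(data.len() < 128, "sketch only: single-byte length");
    buf.push(data.len() as u8); // the real code uses a vint here
    buf.extend_from_slice(data);
    addr
}

/// Simplified stand-in for `extract_bytes`: read a blob back from its address.
fn read_blob(buf: &[u8], addr: u32) -> &[u8] {
    let start = addr as usize;
    let len = buf[start] as usize;
    &buf[start + 1..start + 1 + len]
}

fn main() {
    let mut node_data = Vec::new();
    let a = append_blob(&mut node_data, b"hello");
    let b = append_blob(&mut node_data, b"world");
    assert_eq!(read_blob(&node_data, a), b"hello");
    assert_eq!(read_blob(&node_data, b), b"world");
}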
impl PartialEq for CompactDoc {
fn eq(&self, other: &Self) -> bool {
// super slow, but only here for tests
let convert_to_comparable_map = |doc: &CompactDoc| {
let mut field_value_set: HashMap<Field, HashSet<String>> = Default::default();
for field_value in doc.field_values.iter() {
let value: OwnedValue = doc.get_compact_doc_value(field_value.value_addr).into();
let value = serde_json::to_string(&value).unwrap();
field_value_set
.entry(Field::from_field_id(field_value.field as u32))
.or_default()
.insert(value);
}
field_value_set
};
let self_field_values: HashMap<Field, HashSet<String>> = convert_to_comparable_map(self);
let other_field_values: HashMap<Field, HashSet<String>> = convert_to_comparable_map(other);
self_field_values.eq(&other_field_values)
}
}
impl Eq for CompactDoc {}
impl DocumentDeserialize for CompactDoc {
fn deserialize<'de, D>(mut deserializer: D) -> Result<Self, DeserializeError>
where D: DocumentDeserializer<'de> {
let mut doc = CompactDoc::default();
// TODO: Deserializing into OwnedValue is wasteful. The deserializer should be able to work
// on slices and referenced data.
while let Some((field, value)) = deserializer.next_field::<OwnedValue>()? {
doc.add_field_value(field, &value);
}
Ok(doc)
}
}
/// A value in a `CompactDoc`; it needs a reference to its container to extract its payload
#[derive(Debug, Clone, Copy)]
pub struct CompactDocValue<'a> {
container: &'a CompactDoc,
value_addr: ValueAddr,
}
impl PartialEq for CompactDocValue<'_> {
fn eq(&self, other: &Self) -> bool {
let value1: OwnedValue = (*self).into();
let value2: OwnedValue = (*other).into();
value1 == value2
}
}
impl<'a> From<CompactDocValue<'a>> for OwnedValue {
fn from(value: CompactDocValue) -> Self {
value.as_value().into()
}
}
impl<'a> Value<'a> for CompactDocValue<'a> {
type ArrayIter = CompactDocArrayIter<'a>;
type ObjectIter = CompactDocObjectIter<'a>;
fn as_value(&self) -> ReferenceValue<'a, Self> {
self.get_ref_value().unwrap()
}
}
impl<'a> CompactDocValue<'a> {
fn get_ref_value(&self) -> io::Result<ReferenceValue<'a, CompactDocValue<'a>>> {
let addr = self.value_addr.val_addr;
match self.value_addr.type_id {
ValueType::Null => Ok(ReferenceValueLeaf::Null.into()),
ValueType::Str => {
let str_ref = self.container.extract_str(addr);
Ok(ReferenceValueLeaf::Str(str_ref).into())
}
ValueType::Facet => {
let str_ref = self.container.extract_str(addr);
Ok(ReferenceValueLeaf::Facet(str_ref).into())
}
ValueType::Bytes => {
let data = self.container.extract_bytes(addr);
Ok(ReferenceValueLeaf::Bytes(data).into())
}
ValueType::U64 => self
.container
.read_from::<u64>(addr)
.map(ReferenceValueLeaf::U64)
.map(Into::into),
ValueType::I64 => self
.container
.read_from::<i64>(addr)
.map(ReferenceValueLeaf::I64)
.map(Into::into),
ValueType::F64 => self
.container
.read_from::<f64>(addr)
.map(ReferenceValueLeaf::F64)
.map(Into::into),
ValueType::Bool => Ok(ReferenceValueLeaf::Bool(addr != 0).into()),
ValueType::Date => self
.container
.read_from::<i64>(addr)
.map(|ts| ReferenceValueLeaf::Date(DateTime::from_timestamp_nanos(ts)))
.map(Into::into),
ValueType::IpAddr => self
.container
.read_from::<u128>(addr)
.map(|num| ReferenceValueLeaf::IpAddr(Ipv6Addr::from_u128(num)))
.map(Into::into),
ValueType::PreTokStr => self
.container
.read_from::<PreTokenizedString>(addr)
.map(Into::into)
.map(ReferenceValueLeaf::PreTokStr)
.map(Into::into),
ValueType::Object => Ok(ReferenceValue::Object(CompactDocObjectIter::new(
self.container,
addr,
)?)),
ValueType::Array => Ok(ReferenceValue::Array(CompactDocArrayIter::new(
self.container,
addr,
)?)),
}
}
}
/// The address in the vec
type Addr = u32;
#[derive(Clone, Copy, Default)]
#[repr(packed)]
/// The value type and the address to its payload in the container.
struct ValueAddr {
type_id: ValueType,
/// This is the address to the value in the vec, except for bool and null, which are inlined
val_addr: Addr,
}
impl BinarySerializable for ValueAddr {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
self.type_id.serialize(writer)?;
VInt(self.val_addr as u64).serialize(writer)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let type_id = ValueType::deserialize(reader)?;
let val_addr = VInt::deserialize(reader)?.0 as u32;
Ok(ValueAddr { type_id, val_addr })
}
}
impl std::fmt::Debug for ValueAddr {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let val_addr = self.val_addr;
f.write_fmt(format_args!("{:?} at {:?}", self.type_id, val_addr))
}
}
/// An enum representing a value for tantivy to index.
///
/// Any changes need to be reflected in `BinarySerializable` for `ValueType`
///
/// We can't use [schema::Type] or [columnar::ColumnType] here, because they are missing
/// some items like Array and PreTokStr.
#[derive(Default, Clone, Copy, Debug, PartialEq)]
#[repr(u8)]
pub enum ValueType {
/// A null value.
#[default]
Null = 0,
/// The str type is used for any text information.
Str = 1,
/// Unsigned 64-bits Integer `u64`
U64 = 2,
/// Signed 64-bits Integer `i64`
I64 = 3,
/// 64-bits Float `f64`
F64 = 4,
/// Date/time with nanoseconds precision
Date = 5,
/// Facet
Facet = 6,
/// Arbitrarily sized byte array
Bytes = 7,
/// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`.
IpAddr = 8,
/// Bool value
Bool = 9,
/// Pre-tokenized str type,
PreTokStr = 10,
/// Object
Object = 11,
/// Array of values
Array = 12,
}
impl BinarySerializable for ValueType {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
(*self as u8).serialize(writer)?;
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let num = u8::deserialize(reader)?;
let type_id = if (0..=12).contains(&num) {
unsafe { std::mem::transmute(num) }
} else {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!("Invalid value type id: {num}"),
));
};
Ok(type_id)
}
}
impl<'a, V: Value<'a>> From<&ReferenceValue<'a, V>> for ValueType {
fn from(value: &ReferenceValue<'a, V>) -> Self {
match value {
ReferenceValue::Leaf(leaf) => leaf.into(),
ReferenceValue::Array(_) => ValueType::Array,
ReferenceValue::Object(_) => ValueType::Object,
}
}
}
impl<'a> From<&ReferenceValueLeaf<'a>> for ValueType {
fn from(value: &ReferenceValueLeaf<'a>) -> Self {
match value {
ReferenceValueLeaf::Null => ValueType::Null,
ReferenceValueLeaf::Str(_) => ValueType::Str,
ReferenceValueLeaf::U64(_) => ValueType::U64,
ReferenceValueLeaf::I64(_) => ValueType::I64,
ReferenceValueLeaf::F64(_) => ValueType::F64,
ReferenceValueLeaf::Bool(_) => ValueType::Bool,
ReferenceValueLeaf::Date(_) => ValueType::Date,
ReferenceValueLeaf::IpAddr(_) => ValueType::IpAddr,
ReferenceValueLeaf::PreTokStr(_) => ValueType::PreTokStr,
ReferenceValueLeaf::Facet(_) => ValueType::Facet,
ReferenceValueLeaf::Bytes(_) => ValueType::Bytes,
}
}
}
#[derive(Debug, Clone)]
/// The Iterator for the object values in the compact document
pub struct CompactDocObjectIter<'a> {
container: &'a CompactDoc,
node_addresses_slice: &'a [u8],
}
impl<'a> CompactDocObjectIter<'a> {
fn new(container: &'a CompactDoc, addr: Addr) -> io::Result<Self> {
// Objects are `&[ValueAddr]` serialized into bytes
let node_addresses_slice = container.extract_bytes(addr);
Ok(Self {
container,
node_addresses_slice,
})
}
}
impl<'a> Iterator for CompactDocObjectIter<'a> {
type Item = (&'a str, CompactDocValue<'a>);
fn next(&mut self) -> Option<Self::Item> {
if self.node_addresses_slice.is_empty() {
return None;
}
let key_addr = ValueAddr::deserialize(&mut self.node_addresses_slice).ok()?;
let key = self.container.extract_str(key_addr.val_addr);
let value = ValueAddr::deserialize(&mut self.node_addresses_slice).ok()?;
let value = CompactDocValue {
container: self.container,
value_addr: value,
};
Some((key, value))
}
}
#[derive(Debug, Clone)]
/// The Iterator for the array values in the compact document
pub struct CompactDocArrayIter<'a> {
container: &'a CompactDoc,
node_addresses_slice: &'a [u8],
}
impl<'a> CompactDocArrayIter<'a> {
fn new(container: &'a CompactDoc, addr: Addr) -> io::Result<Self> {
// Arrays are &[ValueAddr] serialized into bytes
let node_addresses_slice = container.extract_bytes(addr);
Ok(Self {
container,
node_addresses_slice,
})
}
}
impl<'a> Iterator for CompactDocArrayIter<'a> {
type Item = CompactDocValue<'a>;
fn next(&mut self) -> Option<Self::Item> {
if self.node_addresses_slice.is_empty() {
return None;
}
let value = ValueAddr::deserialize(&mut self.node_addresses_slice).ok()?;
let value = CompactDocValue {
container: self.container,
value_addr: value,
};
Some(value)
}
}
impl Document for CompactDoc {
type Value<'a> = CompactDocValue<'a>;
type FieldsValuesIter<'a> = FieldValueIterRef<'a>;
fn iter_fields_and_values(&self) -> Self::FieldsValuesIter<'_> {
FieldValueIterRef {
slice: self.field_values.iter(),
container: self,
}
}
}
/// A helper wrapper for creating an iterator over the field values
pub struct FieldValueIterRef<'a> {
slice: std::slice::Iter<'a, FieldValueAddr>,
container: &'a CompactDoc,
}
impl<'a> Iterator for FieldValueIterRef<'a> {
type Item = (Field, CompactDocValue<'a>);
fn next(&mut self) -> Option<Self::Item> {
self.slice.next().map(|field_value| {
(
Field::from_field_id(field_value.field as u32),
CompactDocValue::<'a> {
container: self.container,
value_addr: field_value.value_addr,
},
)
})
}
} }
/// Error that may happen when deserializing /// Error that may happen when deserializing
@@ -264,7 +725,40 @@ mod tests {
let text_field = schema_builder.add_text_field("title", TEXT); let text_field = schema_builder.add_text_field("title", TEXT);
let mut doc = TantivyDocument::default(); let mut doc = TantivyDocument::default();
doc.add_text(text_field, "My title"); doc.add_text(text_field, "My title");
assert_eq!(doc.field_values().len(), 1); assert_eq!(doc.field_values().count(), 1);
let schema = schema_builder.build();
let _val = doc.get_first(text_field).unwrap();
let _json = doc.to_named_doc(&schema);
}
#[test]
fn test_json_value() {
let json_str = r#"{
"toto": "titi",
"float": -0.2,
"bool": true,
"unsigned": 1,
"signed": -2,
"complexobject": {
"field.with.dot": 1
},
"date": "1985-04-12T23:20:50.52Z",
"my_arr": [2, 3, {"my_key": "two tokens"}, 4, {"nested_array": [2, 5, 6, [7, 8, {"a": [{"d": {"e":[99]}}, 9000]}, 9, 10], [5, 5]]}]
}"#;
let json_val: std::collections::BTreeMap<String, OwnedValue> =
serde_json::from_str(json_str).unwrap();
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT);
let mut doc = TantivyDocument::default();
doc.add_object(json_field, json_val);
let schema = schema_builder.build();
let json = doc.to_json(&schema);
let actual_json: serde_json::Value = serde_json::from_str(&json).unwrap();
let expected_json: serde_json::Value = serde_json::from_str(json_str).unwrap();
assert_eq!(actual_json["json"][0], expected_json);
} }
// TODO: Should this be re-added with the serialize method // TODO: Should this be re-added with the serialize method


@@ -5,21 +5,39 @@
//! and don't care about some of the more specialised types or only want to customise //! and don't care about some of the more specialised types or only want to customise
//! part of the document structure. //! part of the document structure.
use std::collections::{btree_map, hash_map, BTreeMap, HashMap}; use std::collections::{btree_map, hash_map, BTreeMap, HashMap};
use std::iter::Empty;
use std::net::Ipv6Addr;
use common::DateTime;
use serde_json::Number; use serde_json::Number;
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;
use super::facet::Facet;
use super::ReferenceValueLeaf; use super::ReferenceValueLeaf;
use crate::schema::document::{ use crate::schema::document::{
ArrayAccess, DeserializeError, Document, DocumentDeserialize, DocumentDeserializer, ArrayAccess, DeserializeError, Document, DocumentDeserialize, DocumentDeserializer,
ObjectAccess, ReferenceValue, Value, ValueDeserialize, ValueDeserializer, ValueVisitor, ObjectAccess, ReferenceValue, Value, ValueDeserialize, ValueDeserializer, ValueVisitor,
}; };
use crate::schema::Field; use crate::schema::Field;
use crate::tokenizer::PreTokenizedString;
// Serde compatibility support. // Serde compatibility support.
pub fn can_be_rfc3339_date_time(text: &str) -> bool {
if let Some(&first_byte) = text.as_bytes().first() {
if first_byte.is_ascii_digit() {
return true;
}
}
false
}
impl<'a> Value<'a> for &'a serde_json::Value { impl<'a> Value<'a> for &'a serde_json::Value {
type ArrayIter = std::slice::Iter<'a, serde_json::Value>; type ArrayIter = std::slice::Iter<'a, serde_json::Value>;
type ObjectIter = JsonObjectIter<'a>; type ObjectIter = JsonObjectIter<'a>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> { fn as_value(&self) -> ReferenceValue<'a, Self> {
match self { match self {
serde_json::Value::Null => ReferenceValueLeaf::Null.into(), serde_json::Value::Null => ReferenceValueLeaf::Null.into(),
@@ -35,7 +53,19 @@ impl<'a> Value<'a> for &'a serde_json::Value {
panic!("Unsupported serde_json number {number}"); panic!("Unsupported serde_json number {number}");
} }
} }
-            serde_json::Value::String(val) => ReferenceValueLeaf::Str(val).into(),
+            serde_json::Value::String(text) => {
+                if can_be_rfc3339_date_time(text) {
+                    match OffsetDateTime::parse(text, &Rfc3339) {
+                        Ok(dt) => {
+                            let dt_utc = dt.to_offset(time::UtcOffset::UTC);
+                            ReferenceValueLeaf::Date(DateTime::from_utc(dt_utc)).into()
+                        }
+                        Err(_) => ReferenceValueLeaf::Str(text).into(),
+                    }
+                } else {
+                    ReferenceValueLeaf::Str(text).into()
+                }
+            }
serde_json::Value::Array(elements) => ReferenceValue::Array(elements.iter()), serde_json::Value::Array(elements) => ReferenceValue::Array(elements.iter()),
serde_json::Value::Object(object) => { serde_json::Value::Object(object) => {
ReferenceValue::Object(JsonObjectIter(object.iter())) ReferenceValue::Object(JsonObjectIter(object.iter()))
@@ -44,6 +74,126 @@ impl<'a> Value<'a> for &'a serde_json::Value {
} }
} }
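JSON string values are now sniffed for RFC 3339 timestamps: `can_be_rfc3339_date_time` only checks for a leading ASCII digit, and the actual decision is made by `OffsetDateTime::parse`, falling back to a plain string leaf on failure. A small sketch of the same check using the `time` crate directly:

use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;

fn main() {
    // Parses as RFC 3339: such a JSON string would be indexed as a date leaf.
    assert!(OffsetDateTime::parse("1985-04-12T23:20:50.52Z", &Rfc3339).is_ok());
    // Starts with a digit but fails to parse: it stays a plain string leaf.
    assert!(OffsetDateTime::parse("1234 main street", &Rfc3339).is_err());
}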
impl<'a> Value<'a> for &'a String {
type ArrayIter = Empty<&'a String>;
type ObjectIter = Empty<(&'a str, &'a String)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Str(self))
}
}
impl<'a> Value<'a> for &'a Facet {
type ArrayIter = Empty<&'a Facet>;
type ObjectIter = Empty<(&'a str, &'a Facet)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Facet(self.encoded_str()))
}
}
impl<'a> Value<'a> for &'a u64 {
type ArrayIter = Empty<&'a u64>;
type ObjectIter = Empty<(&'a str, &'a u64)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::U64(**self))
}
}
impl<'a> Value<'a> for &'a i64 {
type ArrayIter = Empty<&'a i64>;
type ObjectIter = Empty<(&'a str, &'a i64)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::I64(**self))
}
}
impl<'a> Value<'a> for &'a f64 {
type ArrayIter = Empty<&'a f64>;
type ObjectIter = Empty<(&'a str, &'a f64)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::F64(**self))
}
}
impl<'a> Value<'a> for &'a bool {
type ArrayIter = Empty<&'a bool>;
type ObjectIter = Empty<(&'a str, &'a bool)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Bool(**self))
}
}
impl<'a> Value<'a> for &'a str {
type ArrayIter = Empty<&'a str>;
type ObjectIter = Empty<(&'a str, &'a str)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Str(self))
}
}
impl<'a> Value<'a> for &'a &'a str {
type ArrayIter = Empty<&'a &'a str>;
type ObjectIter = Empty<(&'a str, &'a &'a str)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Str(self))
}
}
impl<'a> Value<'a> for &'a [u8] {
type ArrayIter = Empty<&'a [u8]>;
type ObjectIter = Empty<(&'a str, &'a [u8])>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Bytes(self))
}
}
impl<'a> Value<'a> for &'a &'a [u8] {
type ArrayIter = Empty<&'a &'a [u8]>;
type ObjectIter = Empty<(&'a str, &'a &'a [u8])>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Bytes(self))
}
}
impl<'a> Value<'a> for &'a Vec<u8> {
type ArrayIter = Empty<&'a Vec<u8>>;
type ObjectIter = Empty<(&'a str, &'a Vec<u8>)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Bytes(self))
}
}
impl<'a> Value<'a> for &'a DateTime {
type ArrayIter = Empty<&'a DateTime>;
type ObjectIter = Empty<(&'a str, &'a DateTime)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Date(**self))
}
}
impl<'a> Value<'a> for &'a Ipv6Addr {
type ArrayIter = Empty<&'a Ipv6Addr>;
type ObjectIter = Empty<(&'a str, &'a Ipv6Addr)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::IpAddr(**self))
}
}
impl<'a> Value<'a> for &'a PreTokenizedString {
type ArrayIter = Empty<&'a PreTokenizedString>;
type ObjectIter = Empty<(&'a str, &'a PreTokenizedString)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::PreTokStr(Box::new((*self).clone())))
}
}
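With the impls above, references to common Rust types can be viewed through the Value trait directly. A hedged sketch of what that enables; the import paths assume the re-exports from tantivy::schema::document and the assertions are illustrative:

use tantivy::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};

fn main() {
    let num: u64 = 42;
    // &u64 maps to a U64 leaf via the impl above.
    match (&num).as_value() {
        ReferenceValue::Leaf(ReferenceValueLeaf::U64(v)) => assert_eq!(v, 42),
        _ => unreachable!("&u64 always maps to a U64 leaf"),
    }

    let text: &str = "hello";
    // &str also implements Value, so the convenience accessors work on it.
    assert_eq!(text.as_value().as_str(), Some("hello"));
}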
impl ValueDeserialize for serde_json::Value {
fn deserialize<'de, D>(deserializer: D) -> Result<Self, DeserializeError>
where D: ValueDeserializer<'de> {

View File

@@ -172,7 +172,9 @@ pub use self::de::{
ArrayAccess, DeserializeError, DocumentDeserialize, DocumentDeserializer, ObjectAccess,
ValueDeserialize, ValueDeserializer, ValueType, ValueVisitor,
};
pub use self::default_document::{
CompactDocArrayIter, CompactDocObjectIter, CompactDocValue, DocParsingError, TantivyDocument,
};
pub use self::owned_value::OwnedValue;
pub(crate) use self::se::BinaryDocumentSerializer;
pub use self::value::{ReferenceValue, ReferenceValueLeaf, Value};
@@ -233,7 +235,7 @@ pub trait Document: Send + Sync + 'static {
let field_name = schema.get_field_name(field);
let values: Vec<OwnedValue> = field_values
.into_iter()
.map(|val| OwnedValue::from(val.as_value()))
.collect();
field_map.insert(field_name.to_string(), values);
}
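The hunk above belongs to the Document trait's to_named_doc, which now converts each stored value into an OwnedValue explicitly. A hedged usage sketch of that trait method via to_json, which is built on top of it; import paths and the exact JSON assertion are assumptions, not part of the diff:

use tantivy::schema::document::Document;
use tantivy::schema::{Schema, TantivyDocument, STORED, TEXT};

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();

    let mut doc = TantivyDocument::default();
    doc.add_text(title, "hello");

    // to_named_doc / to_json walk field_values() and convert every stored value into an
    // OwnedValue, as in the `.map(|val| OwnedValue::from(val.as_value()))` line above.
    let json = doc.to_json(&schema);
    assert_eq!(json, r#"{"title":["hello"]}"#);
}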

View File

@@ -8,6 +8,7 @@ use serde::de::{MapAccess, SeqAccess};
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;
use super::existing_type_impls::can_be_rfc3339_date_time;
use super::ReferenceValueLeaf;
use crate::schema::document::{
ArrayAccess, DeserializeError, ObjectAccess, ReferenceValue, Value, ValueDeserialize,
@@ -65,13 +66,13 @@ impl<'a> Value<'a> for &'a OwnedValue {
match self {
OwnedValue::Null => ReferenceValueLeaf::Null.into(),
OwnedValue::Str(val) => ReferenceValueLeaf::Str(val).into(),
OwnedValue::PreTokStr(val) => ReferenceValueLeaf::PreTokStr(val.clone().into()).into(),
OwnedValue::U64(val) => ReferenceValueLeaf::U64(*val).into(),
OwnedValue::I64(val) => ReferenceValueLeaf::I64(*val).into(),
OwnedValue::F64(val) => ReferenceValueLeaf::F64(*val).into(),
OwnedValue::Bool(val) => ReferenceValueLeaf::Bool(*val).into(),
OwnedValue::Date(val) => ReferenceValueLeaf::Date(*val).into(),
OwnedValue::Facet(val) => ReferenceValueLeaf::Facet(val.encoded_str()).into(),
OwnedValue::Bytes(val) => ReferenceValueLeaf::Bytes(val).into(),
OwnedValue::IpAddr(val) => ReferenceValueLeaf::IpAddr(*val).into(),
OwnedValue::Array(array) => ReferenceValue::Array(array.iter()),
@@ -183,7 +184,7 @@ impl serde::Serialize for OwnedValue {
OwnedValue::Bytes(ref bytes) => serializer.serialize_str(&BASE64.encode(bytes)),
OwnedValue::Object(ref obj) => {
let mut map = serializer.serialize_map(Some(obj.len()))?;
for (k, v) in obj {
map.serialize_entry(k, v)?;
}
map.end()
@@ -277,11 +278,13 @@ impl<'a, V: Value<'a>> From<ReferenceValue<'a, V>> for OwnedValue {
ReferenceValueLeaf::I64(val) => OwnedValue::I64(val),
ReferenceValueLeaf::F64(val) => OwnedValue::F64(val),
ReferenceValueLeaf::Date(val) => OwnedValue::Date(val),
ReferenceValueLeaf::Facet(val) => {
OwnedValue::Facet(Facet::from_encoded_string(val.to_string()))
}
ReferenceValueLeaf::Bytes(val) => OwnedValue::Bytes(val.to_vec()),
ReferenceValueLeaf::IpAddr(val) => OwnedValue::IpAddr(val),
ReferenceValueLeaf::Bool(val) => OwnedValue::Bool(val),
ReferenceValueLeaf::PreTokStr(val) => OwnedValue::PreTokStr(*val.clone()),
},
ReferenceValue::Array(val) => {
OwnedValue::Array(val.map(|v| v.as_value().into()).collect())
@@ -373,16 +376,6 @@ impl From<BTreeMap<String, OwnedValue>> for OwnedValue {
}
}
fn can_be_rfc3339_date_time(text: &str) -> bool {
if let Some(&first_byte) = text.as_bytes().first() {
if first_byte.is_ascii_digit() {
return true;
}
}
false
}
impl From<serde_json::Value> for OwnedValue {
fn from(value: serde_json::Value) -> Self {
match value {
@@ -470,6 +463,7 @@ mod tests {
let mut doc = TantivyDocument::default();
doc.add_bytes(bytes_field, "".as_bytes());
let json_string = doc.to_json(&schema);
assert_eq!(json_string, r#"{"my_bytes":[""]}"#);
}

View File

@@ -25,6 +25,7 @@ where W: Write
/// Attempts to serialize a given document and write the output
/// to the writer.
#[inline]
pub(crate) fn serialize_doc<D>(&mut self, doc: &D) -> io::Result<()>
where D: Document {
let stored_field_values = || {
@@ -57,9 +58,8 @@ where W: Write
return Err(io::Error::new(
io::ErrorKind::Other,
format!(
"Unexpected number of entries written to serializer, expected \
{num_field_values} entries, got {actual_length} entries",
),
));
}
@@ -121,7 +121,7 @@ where W: Write
ReferenceValueLeaf::Facet(val) => {
self.write_type_code(type_codes::HIERARCHICAL_FACET_CODE)?;
Cow::Borrowed(val).serialize(self.writer)
}
ReferenceValueLeaf::Bytes(val) => {
self.write_type_code(type_codes::BYTES_CODE)?;
@@ -428,7 +428,7 @@ mod tests {
);
let facet = Facet::from_text("/hello/world").unwrap();
let result = serialize_value(ReferenceValueLeaf::Facet(facet.encoded_str()).into());
let expected = binary_repr!(
type_codes::HIERARCHICAL_FACET_CODE => Facet::from_text("/hello/world").unwrap(),
);
@@ -441,7 +441,8 @@ mod tests {
text: "hello, world".to_string(), text: "hello, world".to_string(),
tokens: vec![Token::default(), Token::default()], tokens: vec![Token::default(), Token::default()],
}; };
let result = serialize_value(ReferenceValueLeaf::PreTokStr(&pre_tok_str).into()); let result =
serialize_value(ReferenceValueLeaf::PreTokStr(pre_tok_str.clone().into()).into());
let expected = binary_repr!( let expected = binary_repr!(
type_codes::EXT_CODE, type_codes::TOK_STR_EXT_CODE => pre_tok_str, type_codes::EXT_CODE, type_codes::TOK_STR_EXT_CODE => pre_tok_str,
); );
@@ -678,6 +679,7 @@ mod tests {
);
}
#[inline]
fn serialize_doc<D: Document>(doc: &D, schema: &Schema) -> Vec<u8> {
let mut writer = Vec::new();

View File

@@ -3,7 +3,6 @@ use std::net::Ipv6Addr;
use common::DateTime;
use crate::schema::Facet;
use crate::tokenizer::PreTokenizedString;
/// A single field value.
@@ -28,7 +27,7 @@ pub trait Value<'a>: Send + Sync + Debug {
}
#[inline]
/// If the Value is a leaf, returns the associated leaf. Returns None otherwise.
fn as_leaf(&self) -> Option<ReferenceValueLeaf<'a>> {
if let ReferenceValue::Leaf(val) = self.as_value() {
Some(val)
@@ -82,8 +81,9 @@ pub trait Value<'a>: Send + Sync + Debug {
#[inline]
/// If the Value is a pre-tokenized string, returns the associated string. Returns None
/// otherwise.
fn as_pre_tokenized_text(&self) -> Option<Box<PreTokenizedString>> {
self.as_leaf()
.and_then(|leaf| leaf.into_pre_tokenized_text())
}
#[inline]
@@ -94,7 +94,7 @@ pub trait Value<'a>: Send + Sync + Debug {
#[inline]
/// If the Value is a facet, returns the associated facet. Returns None otherwise.
fn as_facet(&self) -> Option<&'a str> {
self.as_leaf().and_then(|leaf| leaf.as_facet())
}
@@ -132,7 +132,7 @@ pub trait Value<'a>: Send + Sync + Debug {
}
/// A enum representing a leaf value for tantivy to index.
#[derive(Clone, Debug, PartialEq)]
pub enum ReferenceValueLeaf<'a> {
/// A null value.
Null,
@@ -146,8 +146,9 @@ pub enum ReferenceValueLeaf<'a> {
F64(f64),
/// Date/time with nanoseconds precision
Date(DateTime),
/// Facet string needs to match the format of
/// [Facet::encoded_str](crate::schema::Facet::encoded_str).
Facet(&'a str),
/// Arbitrarily sized byte array
Bytes(&'a [u8]),
/// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`.
@@ -155,7 +156,70 @@ pub enum ReferenceValueLeaf<'a> {
/// Bool value
Bool(bool),
/// Pre-tokenized str type,
PreTokStr(Box<PreTokenizedString>),
}
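The Facet leaf now borrows the encoded facet string, as produced by Facet::encoded_str, instead of holding a &Facet. A hedged round-trip sketch using only calls that appear in this diff; import paths and the assertions are illustrative:

use tantivy::schema::document::ReferenceValueLeaf;
use tantivy::schema::Facet;

fn main() {
    let facet = Facet::from("/hello/world");
    // The leaf stores the encoded form, i.e. path segments separated by '\0' rather than '/'.
    let leaf = ReferenceValueLeaf::Facet(facet.encoded_str());
    assert_eq!(leaf.as_facet(), Some(facet.encoded_str()));

    // Converting back, as the OwnedValue conversion earlier in this diff does,
    // rebuilds the Facet from the encoded string.
    let roundtrip = Facet::from_encoded_string(facet.encoded_str().to_string());
    assert_eq!(roundtrip, facet);
}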
impl From<u64> for ReferenceValueLeaf<'_> {
#[inline]
fn from(value: u64) -> Self {
ReferenceValueLeaf::U64(value)
}
}
impl From<i64> for ReferenceValueLeaf<'_> {
#[inline]
fn from(value: i64) -> Self {
ReferenceValueLeaf::I64(value)
}
}
impl From<f64> for ReferenceValueLeaf<'_> {
#[inline]
fn from(value: f64) -> Self {
ReferenceValueLeaf::F64(value)
}
}
impl From<bool> for ReferenceValueLeaf<'_> {
#[inline]
fn from(value: bool) -> Self {
ReferenceValueLeaf::Bool(value)
}
}
impl<'a> From<&'a str> for ReferenceValueLeaf<'a> {
#[inline]
fn from(value: &'a str) -> Self {
ReferenceValueLeaf::Str(value)
}
}
impl<'a> From<&'a [u8]> for ReferenceValueLeaf<'a> {
#[inline]
fn from(value: &'a [u8]) -> Self {
ReferenceValueLeaf::Bytes(value)
}
}
impl From<DateTime> for ReferenceValueLeaf<'_> {
#[inline]
fn from(value: DateTime) -> Self {
ReferenceValueLeaf::Date(value)
}
}
impl From<Ipv6Addr> for ReferenceValueLeaf<'_> {
#[inline]
fn from(value: Ipv6Addr) -> Self {
ReferenceValueLeaf::IpAddr(value)
}
}
impl From<PreTokenizedString> for ReferenceValueLeaf<'_> {
#[inline]
fn from(val: PreTokenizedString) -> Self {
ReferenceValueLeaf::PreTokStr(Box::new(val))
}
}
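The From conversions added above allow building a ReferenceValueLeaf from plain Rust values without naming the variant. A small hedged sketch; the import path assumes the tantivy::schema::document re-exports and the assertions are illustrative:

use tantivy::schema::document::ReferenceValueLeaf;

fn main() {
    assert_eq!(ReferenceValueLeaf::from(42u64), ReferenceValueLeaf::U64(42));
    assert_eq!(ReferenceValueLeaf::from(-7i64), ReferenceValueLeaf::I64(-7));
    assert_eq!(ReferenceValueLeaf::from(true), ReferenceValueLeaf::Bool(true));
    assert_eq!(ReferenceValueLeaf::from("abc"), ReferenceValueLeaf::Str("abc"));
    // Byte slices map to the Bytes variant; PreTokenizedString values are boxed by their From impl.
    let bytes: &[u8] = b"raw";
    assert_eq!(ReferenceValueLeaf::from(bytes), ReferenceValueLeaf::Bytes(bytes));
}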
impl<'a, T: Value<'a> + ?Sized> From<ReferenceValueLeaf<'a>> for ReferenceValue<'a, T> {
@@ -259,9 +323,9 @@ impl<'a> ReferenceValueLeaf<'a> {
}
#[inline]
/// If the Value is a pre-tokenized string, consumes it and returns the string.
/// Returns None otherwise.
pub fn into_pre_tokenized_text(self) -> Option<Box<PreTokenizedString>> {
if let Self::PreTokStr(val) = self {
Some(val)
} else {
@@ -281,7 +345,7 @@ impl<'a> ReferenceValueLeaf<'a> {
#[inline]
/// If the Value is a facet, returns the associated facet. Returns None otherwise.
pub fn as_facet(&self) -> Option<&'a str> {
if let Self::Facet(val) = self {
Some(val)
} else {
@@ -322,6 +386,16 @@ where V: Value<'a>
}
}
#[inline]
/// If the Value is a leaf, consume it and return the leaf. Returns None otherwise.
pub fn into_leaf(self) -> Option<ReferenceValueLeaf<'a>> {
if let Self::Leaf(val) = self {
Some(val)
} else {
None
}
}
#[inline]
/// If the Value is a String, returns the associated str. Returns None otherwise.
pub fn as_str(&self) -> Option<&'a str> {
@@ -365,10 +439,11 @@ where V: Value<'a>
}
#[inline]
/// If the Value is a pre-tokenized string, consumes it and returns the string.
/// Returns None otherwise.
pub fn into_pre_tokenized_text(self) -> Option<Box<PreTokenizedString>> {
self.into_leaf()
.and_then(|leaf| leaf.into_pre_tokenized_text())
}
#[inline]
@@ -379,7 +454,7 @@ where V: Value<'a>
#[inline]
/// If the Value is a facet, returns the associated facet. Returns None otherwise.
pub fn as_facet(&self) -> Option<&'a str> {
self.as_leaf().and_then(|leaf| leaf.as_facet())
}

View File

@@ -568,21 +568,21 @@ mod tests {
let schema = schema_builder.build();
let doc = TantivyDocument::parse_json(&schema, r#"{"id": 100}"#).unwrap();
assert_eq!(
OwnedValue::Str("100".to_string()),
doc.get_first(text_field).unwrap().into()
);
let doc = TantivyDocument::parse_json(&schema, r#"{"id": true}"#).unwrap();
assert_eq!(
OwnedValue::Str("true".to_string()),
doc.get_first(text_field).unwrap().into()
);
// Not sure if this null coercion is the best approach
let doc = TantivyDocument::parse_json(&schema, r#"{"id": null}"#).unwrap();
assert_eq!(
OwnedValue::Str("null".to_string()),
doc.get_first(text_field).unwrap().into()
);
}
@@ -595,9 +595,18 @@ mod tests {
let schema = schema_builder.build();
let doc_json = r#"{"i64": "100", "u64": "100", "f64": "100"}"#;
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
assert_eq!(
OwnedValue::I64(100),
doc.get_first(i64_field).unwrap().into()
);
assert_eq!(
OwnedValue::U64(100),
doc.get_first(u64_field).unwrap().into()
);
assert_eq!(
OwnedValue::F64(100.0),
doc.get_first(f64_field).unwrap().into()
);
}
#[test] #[test]
@@ -607,11 +616,17 @@ mod tests {
let schema = schema_builder.build();
let doc_json = r#"{"bool": "true"}"#;
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
assert_eq!(
OwnedValue::Bool(true),
doc.get_first(bool_field).unwrap().into()
);
let doc_json = r#"{"bool": "false"}"#;
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
assert_eq!(
OwnedValue::Bool(false),
doc.get_first(bool_field).unwrap().into()
);
}
#[test]
@@ -644,7 +659,7 @@ mod tests {
let schema = schema_builder.build();
let doc_json = r#"{"date": "2019-10-12T07:20:50.52+02:00"}"#;
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
let date = OwnedValue::from(doc.get_first(date_field).unwrap());
// Time zone is converted to UTC
assert_eq!("Date(2019-10-12T05:20:50.52Z)", format!("{date:?}"));
}
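These test updates reflect that get_first now yields a compact value handle rather than &OwnedValue, so comparisons convert it into an OwnedValue first. A hedged sketch of the same pattern outside the test module; field names and import paths are illustrative:

use tantivy::schema::document::OwnedValue;
use tantivy::schema::{Schema, TantivyDocument, STORED, TEXT};

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();

    let doc = TantivyDocument::parse_json(&schema, r#"{"title": "hello"}"#).unwrap();
    // get_first returns a lightweight value view (CompactDocValue);
    // convert it to an OwnedValue to compare or keep it around.
    let first: OwnedValue = doc.get_first(title).unwrap().into();
    assert_eq!(first, OwnedValue::Str("hello".to_string()));
}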

View File

@@ -1,46 +0,0 @@
use crate::schema::{Field, OwnedValue};
/// `FieldValue` holds together a `Field` and its `Value`.
#[allow(missing_docs)]
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct FieldValue {
pub field: Field,
pub value: OwnedValue,
}
impl FieldValue {
/// Constructor
pub fn new(field: Field, value: OwnedValue) -> FieldValue {
FieldValue { field, value }
}
/// Field accessor
pub fn field(&self) -> Field {
self.field
}
/// Value accessor
pub fn value(&self) -> &OwnedValue {
&self.value
}
}
impl From<FieldValue> for OwnedValue {
fn from(field_value: FieldValue) -> Self {
field_value.value
}
}
/// A helper wrapper for creating standard iterators
/// out of the fields iterator trait.
pub struct FieldValueIter<'a>(pub(crate) std::slice::Iter<'a, FieldValue>);
impl<'a> Iterator for FieldValueIter<'a> {
type Item = (Field, &'a OwnedValue);
fn next(&mut self) -> Option<Self::Item> {
self.0
.next()
.map(|field_value| (field_value.field, &field_value.value))
}
}

View File

@@ -1,7 +1,6 @@
use std::ops::BitOr;
use crate::schema::{DateOptions, NumericOptions, TextOptions};
use crate::DateOptions;
#[derive(Clone)]
pub struct StoredFlag;

View File

@@ -114,7 +114,6 @@ pub(crate) mod term;
mod field_entry;
mod field_type;
mod field_value;
mod bytes_options;
mod date_time_options;
@@ -138,7 +137,6 @@ pub use self::facet_options::FacetOptions;
pub use self::field::Field;
pub use self::field_entry::FieldEntry;
pub use self::field_type::{FieldType, Type};
pub use self::field_value::FieldValue;
pub use self::flags::{COERCE, FAST, INDEXED, STORED};
pub use self::index_record_option::IndexRecordOption;
pub use self::ip_options::{IntoIpv6Addr, IpAddrOptions};

View File

@@ -645,15 +645,15 @@ mod tests {
let doc =
TantivyDocument::convert_named_doc(&schema, NamedFieldDocument(named_doc_map)).unwrap();
assert_eq!(
doc.get_all(title).map(OwnedValue::from).collect::<Vec<_>>(),
vec![
OwnedValue::from("title1".to_string()),
OwnedValue::from("title2".to_string())
]
);
assert_eq!(
doc.get_all(val).map(OwnedValue::from).collect::<Vec<_>>(),
vec![OwnedValue::from(14u64), OwnedValue::from(-1i64)]
);
}
@@ -682,7 +682,7 @@ mod tests {
let schema = schema_builder.build();
{
let doc = TantivyDocument::parse_json(&schema, "{}").unwrap();
assert!(doc.field_values().next().is_none());
}
{
let doc = TantivyDocument::parse_json(

View File

@@ -12,8 +12,8 @@ use std::collections::HashMap;
use common::ByteCount;
use serde::{Deserialize, Serialize};
use crate::index::SegmentComponent;
use crate::schema::Field;
use crate::SegmentComponent;
/// Enum containing any of the possible space usage results for segment components.
pub enum ComponentSpaceUsage {
@@ -115,7 +115,7 @@ impl SegmentSpaceUsage {
/// Use the components directly if this is somehow in performance critical code.
pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage {
use self::ComponentSpaceUsage::*;
use crate::index::SegmentComponent::*;
match component {
Postings => PerField(self.postings().clone()),
Positions => PerField(self.positions().clone()),

View File

@@ -59,9 +59,8 @@ pub mod tests {
use super::*;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fastfield::AliveBitSet;
use crate::schema::document::Value;
use crate::schema::{
self, Schema, TantivyDocument, TextFieldIndexing, TextOptions, Value, STORED, TEXT,
};
use crate::{Index, IndexWriter, Term};
@@ -92,8 +91,8 @@ pub mod tests {
StoreWriter::new(writer, compressor, blocksize, separate_thread).unwrap();
for i in 0..num_docs {
let mut doc = TantivyDocument::default();
doc.add_text(field_body, LOREM);
doc.add_text(field_title, format!("Doc {i}"));
store_writer.store(&doc, &schema).unwrap();
}
store_writer.close().unwrap();
@@ -119,10 +118,11 @@ pub mod tests {
let store = StoreReader::open(store_file, 10)?;
for i in 0..NUM_DOCS as u32 {
assert_eq!(
store
.get::<TantivyDocument>(i)?
.get_first(field_title)
.unwrap()
.as_value()
.as_str()
.unwrap(),
format!("Doc {i}")
@@ -131,7 +131,13 @@ pub mod tests {
for doc in store.iter::<TantivyDocument>(Some(&alive_bitset)) {
let doc = doc?;
let title_content = doc
.get_first(field_title)
.unwrap()
.as_value()
.as_str()
.unwrap()
.to_string();
if !title_content.starts_with("Doc ") {
panic!("unexpected title_content {title_content}");
}

View File

@@ -18,6 +18,8 @@ use crate::schema::document::{BinaryDocumentDeserializer, DocumentDeserialize};
use crate::space_usage::StoreSpaceUsage;
use crate::store::index::Checkpoint;
use crate::DocId;
#[cfg(feature = "quickwit")]
use crate::Executor;
pub(crate) const DOCSTORE_CACHE_CAPACITY: usize = 100;
@@ -341,7 +343,11 @@ impl StoreReader {
/// In most cases use [`get_async`](Self::get_async)
///
/// Loads and decompresses a block asynchronously.
async fn read_block_async(
&self,
checkpoint: &Checkpoint,
executor: &Executor,
) -> io::Result<Block> {
let cache_key = checkpoint.byte_range.start;
if let Some(block) = self.cache.get_from_cache(checkpoint.byte_range.start) {
return Ok(block);
@@ -353,8 +359,12 @@ impl StoreReader {
.read_bytes_async()
.await?;
let decompressor = self.decompressor;
let maybe_decompressed_block = executor
.spawn_blocking(move || decompressor.decompress(compressed_block.as_ref()))
.await
.expect("decompression panicked");
let decompressed_block = OwnedBytes::new(maybe_decompressed_block?);
self.cache
.put_into_cache(cache_key, decompressed_block.clone());
@@ -363,15 +373,23 @@ impl StoreReader {
}
/// Reads raw bytes of a given document asynchronously.
pub async fn get_document_bytes_async(
&self,
doc_id: DocId,
executor: &Executor,
) -> crate::Result<OwnedBytes> {
let checkpoint = self.block_checkpoint(doc_id)?;
let block = self.read_block_async(&checkpoint, executor).await?;
Self::get_document_bytes_from_block(block, doc_id, &checkpoint)
}
/// Fetches a document asynchronously. Async version of [`get`](Self::get).
pub async fn get_async<D: DocumentDeserialize>(
&self,
doc_id: DocId,
executor: &Executor,
) -> crate::Result<D> {
let mut doc_bytes = self.get_document_bytes_async(doc_id, executor).await?;
let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
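The async doc-store accessors above now take an Executor so block decompression can be pushed onto spawn_blocking instead of running on the async runtime. A hedged call-site sketch; it assumes a build with the quickwit feature (which gates these async methods and the Executor import above), and the function name and executor choice are illustrative:

use tantivy::schema::TantivyDocument;
use tantivy::store::StoreReader;
use tantivy::{DocId, Executor};

// Hedged sketch, not part of the diff: fetch one stored document, letting the
// executor run the blocking decompression off the async runtime.
async fn fetch_doc(reader: &StoreReader, doc_id: DocId) -> tantivy::Result<TantivyDocument> {
    let executor = Executor::single_thread();
    reader.get_async::<TantivyDocument>(doc_id, &executor).await
}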
@@ -385,8 +403,7 @@ mod tests {
use super::*;
use crate::directory::RamDirectory;
use crate::schema::{Field, TantivyDocument, Value};
use crate::schema::{Field, TantivyDocument};
use crate::store::tests::write_lorem_ipsum_store;
use crate::store::Compressor;
use crate::Directory;
@@ -394,7 +411,7 @@ mod tests {
const BLOCK_SIZE: usize = 16_384;
fn get_text_field<'a>(doc: &'a TantivyDocument, field: &'a Field) -> Option<&'a str> {
doc.get_first(*field).and_then(|f| f.as_value().as_str())
}
#[test] #[test]

View File

@@ -93,7 +93,7 @@ fn open_fst_index(fst_file: FileSlice) -> io::Result<tantivy_fst::Map<OwnedBytes
let fst = Fst::new(bytes).map_err(|err| {
io::Error::new(
io::ErrorKind::InvalidData,
format!("Fst data is corrupted: {err:?}"),
)
})?;
Ok(tantivy_fst::Map::from(fst))

View File

@@ -95,7 +95,7 @@ fn test_term_dictionary_simple() -> crate::Result<()> {
#[test]
fn test_term_dictionary_stream() -> crate::Result<()> {
let ids: Vec<_> = (0u32..10_000u32)
.map(|i| (format!("doc{i:0>6}"), i))
.collect();
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
@@ -156,7 +156,7 @@ fn test_stream_high_range_prefix_suffix() -> crate::Result<()> {
#[test]
fn test_stream_range() -> crate::Result<()> {
let ids: Vec<_> = (0u32..10_000u32)
.map(|i| (format!("doc{i:0>6}"), i))
.collect();
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();

View File

@@ -96,7 +96,7 @@ mod tests {
{
let mut add_token = |token: &Token| {
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
tokens.push(format!("{facet}"));
};
FacetTokenizer::default()
.token_stream(facet.encoded_str())
@@ -116,7 +116,7 @@ mod tests {
{
let mut add_token = |token: &Token| {
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
tokens.push(format!("{facet}"));
};
FacetTokenizer::default()
.token_stream(facet.encoded_str()) // ok test