mirror of https://github.com/quickwit-oss/tantivy.git
synced 2026-01-06 01:02:55 +00:00

Compare commits: 0.18 ... termmap_pe

4 commits:
- 54fc557a6d
- 7f5a409d6f
- 50ee43ab79
- 2d1beeb6be
.github/workflows/test.yml (vendored): 2 changes

@@ -33,7 +33,7 @@ jobs:
components: rustfmt, clippy

- name: Run tests
run: cargo +stable test --features mmap,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints --verbose --workspace
run: cargo +stable test --features mmap,brotli-compression,lz4-compression,snappy-compression,failpoints --verbose --workspace

- name: Run tests quickwit feature
run: cargo +stable test --features mmap,quickwit,failpoints --verbose --workspace
@@ -1,4 +1,4 @@
Tantivy 0.18
Unreleased
================================
- For date values `chrono` has been replaced with `time` (@uklotzde) #1304 :
  - The `time` crate is re-exported as `tantivy::time` instead of `tantivy::chrono`.

@@ -11,7 +11,6 @@ Tantivy 0.18
- Add [histogram](https://github.com/quickwit-oss/tantivy/pull/1306) aggregation (@PSeitz)
- Add support for fastfield on text fields (@PSeitz)
- Add terms aggregation (@PSeitz)
- Add support for zstd compression (@kryesh)

Tantivy 0.17
================================
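The `chrono` to `time` migration noted in the 0.18 changelog above shows up later in this diff, where `convert_to_fast_value_and_get_term` parses RFC 3339 strings through the re-exported crate. A minimal sketch of that usage, assuming the `tantivy::time` re-export and the `DateTime::from_utc` constructor named in this diff:

```rust
use tantivy::time::format_description::well_known::Rfc3339;
use tantivy::time::{OffsetDateTime, UtcOffset};
use tantivy::DateTime;

// Parse an RFC 3339 timestamp with the re-exported `time` crate (instead of
// the old `tantivy::chrono`) and normalize it to UTC before building a value.
fn parse_rfc3339(text: &str) -> Option<DateTime> {
    let dt = OffsetDateTime::parse(text, &Rfc3339).ok()?;
    Some(DateTime::from_utc(dt.to_offset(UtcOffset::UTC)))
}
```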
Cargo.toml: 96 changes

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.18.0"
version = "0.17.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]

@@ -10,72 +10,71 @@ homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2021"
edition = "2018"

[dependencies]
oneshot = "0.1.3"
base64 = "0.13.0"
oneshot = "0.1"
base64 = "0.13"
byteorder = "1.4.3"
crc32fast = "1.3.2"
once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
tantivy-fst = "0.3.0"
memmap2 = { version = "0.5.3", optional = true }
lz4_flex = { version = "0.9.2", default-features = false, features = ["checked-decode"], optional = true }
brotli = { version = "3.3.4", optional = true }
zstd = { version = "0.11", optional = true }
crc32fast = "1.2.1"
once_cell = "1.7.2"
regex ={ version = "1.5.4", default-features = false, features = ["std"] }
tantivy-fst = "0.3"
memmap2 = {version = "0.5", optional=true}
lz4_flex = { version = "0.9", default-features = false, features = ["checked-decode"], optional = true }
brotli = { version = "3.3", optional = true }
snap = { version = "1.0.5", optional = true }
tempfile = { version = "3.3.0", optional = true }
log = "0.4.16"
serde = { version = "1.0.136", features = ["derive"] }
serde_json = "1.0.79"
num_cpus = "1.13.1"
tempfile = { version = "3.2", optional = true }
log = "0.4.14"
serde = { version = "1.0.126", features = ["derive"] }
serde_json = "1.0.64"
num_cpus = "1.13"
fs2={ version = "0.4.3", optional = true }
levenshtein_automata = "0.2.1"
levenshtein_automata = "0.2"
uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4"
tantivy-query-grammar = { version="0.18.0", path="./query-grammar" }
tantivy-bitpacker = { version="0.2", path="./bitpacker" }
common = { version = "0.3", path = "./common/", package = "tantivy-common" }
fastfield_codecs = { version="0.2", path="./fastfield_codecs", default-features = false }
ownedbytes = { version="0.3", path="./ownedbytes" }
stable_deref_trait = "1.2.0"
rust-stemmers = "1.2.0"
downcast-rs = "1.2.0"
crossbeam = "0.8.1"
tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
tantivy-bitpacker = { version="0.1", path="./bitpacker" }
common = { version = "0.2", path = "./common/", package = "tantivy-common" }
fastfield_codecs = { version="0.1", path="./fastfield_codecs", default-features = false }
ownedbytes = { version="0.2", path="./ownedbytes" }
stable_deref_trait = "1.2"
rust-stemmers = "1.2"
downcast-rs = "1.2"
bitpacking = { version = "0.8.4", default-features = false, features = ["bitpacker4x"] }
census = "0.4.0"
census = "0.4"
fnv = "1.0.7"
thiserror = "1.0.30"
thiserror = "1.0.24"
htmlescape = "0.3.1"
fail = "0.5.0"
murmurhash32 = "0.2.0"
time = { version = "0.3.9", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.7.5"
fastdivide = "0.4.0"
itertools = "0.10.3"
measure_time = "0.8.2"
pretty_assertions = "1.2.1"
serde_cbor = { version = "0.11.2", optional = true }
async-trait = "0.1.53"
fail = "0.5"
murmurhash32 = "0.2"
time = { version = "0.3.7", features = ["serde-well-known"] }
smallvec = "1.6.1"
rayon = "1.5"
lru = "0.7.0"
fastdivide = "0.4"
itertools = "0.10.0"
measure_time = "0.8.0"
pretty_assertions = "1.1.0"
serde_cbor = {version="0.11", optional=true}
async-trait = "0.1"

[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"

[dev-dependencies]
rand = "0.8.5"
rand = "0.8.3"
maplit = "1.0.2"
matches = "0.1.9"
proptest = "1.0.0"
matches = "0.1.8"
proptest = "1.0"
criterion = "0.3.5"
test-log = "0.2.10"
test-log = "0.2.8"
env_logger = "0.9.0"
pprof = { version = "0.9.0", features = ["flamegraph", "criterion"] }
futures = "0.3.21"
pprof = {version= "0.8", features=["flamegraph", "criterion"]}
futures = "0.3.15"

[dev-dependencies.fail]
version = "0.5.0"
version = "0.5"
features = ["failpoints"]

[profile.release]

@@ -94,7 +93,6 @@ mmap = ["fs2", "tempfile", "memmap2"]
brotli-compression = ["brotli"]
lz4-compression = ["lz4_flex"]
snappy-compression = ["snap"]
zstd-compression = ["zstd"]

failpoints = ["fail/failpoints"]
unstable = [] # useful for benches.
@@ -1,6 +1,6 @@
[package]
name = "tantivy-bitpacker"
version = "0.2.0"
version = "0.1.1"
edition = "2018"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"

@@ -1,6 +1,6 @@
[package]
name = "tantivy-common"
version = "0.3.0"
version = "0.2.0"
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2018"

@@ -10,7 +10,7 @@ description = "common traits and utility functions used by multiple tantivy subc
[dependencies]
byteorder = "1.4.3"
ownedbytes = { version="0.3", path="../ownedbytes" }
ownedbytes = { version="0.2", path="../ownedbytes" }

[dev-dependencies]
proptest = "1.0.0"

@@ -1,8 +1,7 @@
// # Json field example
//
// This example shows how the json field can be used
// to make tantivy partially schemaless by setting it as
// default query parser field.
// to make tantivy partially schemaless.

use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;

@@ -11,6 +10,10 @@ use tantivy::Index;
fn main() -> tantivy::Result<()> {
// # Defining the schema
//
// We need two fields:
// - a timestamp
// - a json object field
let mut schema_builder = Schema::builder();
schema_builder.add_date_field("timestamp", FAST | STORED);
let event_type = schema_builder.add_text_field("event_type", STRING | STORED);

@@ -40,8 +43,7 @@ fn main() -> tantivy::Result<()> {
"attributes": {
"target": "submit-button",
"cart": {"product_id": 133},
"description": "das keyboard",
"event_type": "holiday-sale"
"description": "das keyboard"
}
}"#,
)?;

@@ -51,9 +53,6 @@ fn main() -> tantivy::Result<()> {
let reader = index.reader()?;
let searcher = reader.searcher();

// # Default fields: event_type and attributes
// By setting attributes as a default field it allows omitting attributes itself, e.g. "target",
// instead of "attributes.target"
let query_parser = QueryParser::for_index(&index, vec![event_type, attributes]);
{
let query = query_parser.parse_query("target:submit-button")?;

@@ -71,34 +70,10 @@ fn main() -> tantivy::Result<()> {
assert_eq!(count_docs, 1);
}
{
let query = query_parser.parse_query("click AND cart.product_id:133")?;
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(hits.len(), 1);
}
{
// The sub-fields in the json field marked as default field still need to be explicitly
// addressed
let query = query_parser.parse_query("click AND 133")?;
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(hits.len(), 0);
}
{
// Default json fields are ignored if they collide with the schema
let query = query_parser.parse_query("event_type:holiday-sale")?;
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(hits.len(), 0);
}
// # Query via full attribute path
{
// This only searches in our schema's `event_type` field
let query = query_parser.parse_query("event_type:click")?;
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(hits.len(), 2);
}
{
// Default json fields can still be accessed by full path
let query = query_parser.parse_query("attributes.event_type:holiday-sale")?;
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
let query = query_parser
.parse_query("event_type:click AND cart.product_id:133")
.unwrap();
let hits = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap();
assert_eq!(hits.len(), 1);
}
Ok(())
@@ -1,6 +1,6 @@
[package]
name = "fastfield_codecs"
version = "0.2.0"
version = "0.1.0"
authors = ["Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2018"

@@ -9,8 +9,8 @@ description = "Fast field codecs used by tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
common = { version = "0.3", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version="0.2", path = "../bitpacker/" }
common = { version = "0.2", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version="0.1.1", path = "../bitpacker/" }
prettytable-rs = {version="0.8.0", optional= true}
rand = {version="0.8.3", optional= true}

@@ -1,7 +1,7 @@
[package]
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
name = "ownedbytes"
version = "0.3.0"
version = "0.2.0"
edition = "2018"
description = "Expose data as static slice"
license = "MIT"

@@ -1,6 +1,6 @@
[package]
name = "tantivy-query-grammar"
version = "0.18.0"
version = "0.15.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]

@@ -18,7 +18,7 @@ use crate::Occur;
const SPECIAL_CHARS: &[char] = &[
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '~', '!', '\\', '*', ' ',
];
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|\~|!|\\|\*|\s)"#;
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|\~|!|\\|\*| )"#;

/// Parses a field_name
/// A field name must have at least one character and be followed by a colon.

@@ -34,8 +34,7 @@ fn field_name<'a>() -> impl Parser<&'a str, Output = String> {
take_while(|c| !SPECIAL_CHARS.contains(&c)),
),
'\\',
satisfy(|_| true), /* if the next character is not a special char, the \ will be treated
* as the \ character. */
satisfy(|c| SPECIAL_CHARS.contains(&c)),
))
.skip(char(':'))
.map(|s| ESCAPED_SPECIAL_CHARS_RE.replace_all(&s, "$1").to_string())

@@ -517,27 +516,15 @@ mod test {
}

#[test]
fn test_field_name() {
fn test_field_name() -> TestParseResult {
assert_eq!(
super::field_name().parse(".my.field.name:a"),
Ok((".my.field.name".to_string(), "a"))
);
assert_eq!(
super::field_name().parse(r#"my\ field:a"#),
Ok(("my field".to_string(), "a"))
);
assert_eq!(
super::field_name().parse(r#"にんじん:a"#),
Ok(("にんじん".to_string(), "a"))
);
assert_eq!(
super::field_name().parse("my\\ field\\ name:a"),
Ok(("my field name".to_string(), "a"))
);
assert_eq!(
super::field_name().parse(r#"my\field:a"#),
Ok((r#"my\field"#.to_string(), "a"))
);
assert!(super::field_name().parse("my field:a").is_err());
assert_eq!(
super::field_name().parse("\\(1\\+1\\):2"),

@@ -547,21 +534,14 @@ mod test {
super::field_name().parse("my_field_name:a"),
Ok(("my_field_name".to_string(), "a"))
);
assert_eq!(
super::field_name().parse("myfield.b:hello").unwrap(),
("myfield.b".to_string(), "hello")
);
assert_eq!(
super::field_name().parse(r#"myfield\.b:hello"#).unwrap(),
(r#"myfield\.b"#.to_string(), "hello")
);
assert!(super::field_name().parse("my_field_name").is_err());
assert!(super::field_name().parse(":a").is_err());
assert!(super::field_name().parse("-my_field:a").is_err());
assert_eq!(
super::field_name().parse("_my_field:a"),
Ok(("_my_field".to_string(), "a"))
super::field_name().parse("_my_field:a")?,
("_my_field".to_string(), "a")
);
Ok(())
}

#[test]
@@ -230,7 +230,8 @@ pub enum BucketResult {
impl BucketResult {
pub(crate) fn empty_from_req(req: &BucketAggregationInternal) -> crate::Result<Self> {
let empty_bucket = IntermediateBucketResult::empty_from_req(&req.bucket_agg);
BucketResult::from_intermediate_and_req(empty_bucket, req)
Ok(BucketResult::from_intermediate_and_req(empty_bucket, req)?)
}

fn from_intermediate_and_req(

@@ -1364,29 +1364,4 @@ mod tests {
Ok(())
}

#[test]
fn histogram_invalid_request() -> crate::Result<()> {
let index = get_test_index_2_segments(true)?;

let agg_req: Aggregations = vec![(
"histogram".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Histogram(HistogramAggregation {
field: "score_f64".to_string(),
interval: 0.0,
..Default::default()
}),
sub_aggregation: Default::default(),
}),
)]
.into_iter()
.collect();

let agg_res = exec_request(agg_req, &index);

assert!(agg_res.is_err());

Ok(())
}
}

@@ -81,8 +81,7 @@ pub struct TermsAggregation {
///
/// Should never be smaller than size.
#[serde(skip_serializing_if = "Option::is_none", default)]
#[serde(alias = "shard_size")]
pub split_size: Option<u32>,
pub shard_size: Option<u32>,

/// The get more accurate results, we fetch more than `size` from each segment.
///

@@ -97,11 +96,11 @@ pub struct TermsAggregation {
/// doc_count returned by each shard. It’s the sum of the size of the largest bucket on
/// each segment that didn’t fit into `shard_size`.
///
/// Defaults to true when ordering by count desc.
/// Defaults to true when ordering by counts desc.
#[serde(skip_serializing_if = "Option::is_none", default)]
pub show_term_doc_count_error: Option<bool>,

/// Filter all terms that are lower than `min_doc_count`. Defaults to 1.
/// Filter all terms than are lower `min_doc_count`. Defaults to 1.
///
/// **Expensive**: When set to 0, this will return all terms in the field.
#[serde(skip_serializing_if = "Option::is_none", default)]

@@ -144,7 +143,7 @@ pub(crate) struct TermsAggregationInternal {
/// Increasing this value is will increase the cost for more accuracy.
pub segment_size: u32,

/// Filter all terms that are lower than `min_doc_count`. Defaults to 1.
/// Filter all terms than are lower `min_doc_count`. Defaults to 1.
///
/// *Expensive*: When set to 0, this will return all terms in the field.
pub min_doc_count: u64,

@@ -573,7 +572,7 @@ mod tests {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "string_id".to_string(),
size: Some(2),
split_size: Some(2),
shard_size: Some(2),
..Default::default()
}),
sub_aggregation: Default::default(),

@@ -1211,51 +1210,6 @@ mod tests {
.unwrap();
assert_eq!(agg_req, agg_req_deser);

let elasticsearch_compatible_json = json!(
{
"term_agg_test":{
"terms": {
"field": "string_id",
"split_size": 2u64,
}
}
});

// test alias shard_size, split_size
let agg_req: Aggregations = vec![(
"term_agg_test".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "string_id".to_string(),
split_size: Some(2),
..Default::default()
}),
sub_aggregation: Default::default(),
}),
)]
.into_iter()
.collect();

let agg_req_deser: Aggregations =
serde_json::from_str(&serde_json::to_string(&elasticsearch_compatible_json).unwrap())
.unwrap();
assert_eq!(agg_req, agg_req_deser);

let elasticsearch_compatible_json = json!(
{
"term_agg_test":{
"terms": {
"field": "string_id",
"shard_size": 2u64,
}
}
});

let agg_req_deser: Aggregations =
serde_json::from_str(&serde_json::to_string(&elasticsearch_compatible_json).unwrap())
.unwrap();
assert_eq!(agg_req, agg_req_deser);

Ok(())
}
}

@@ -24,9 +24,7 @@ use crate::aggregation::bucket::TermsAggregationInternal;
/// intermediate results.
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct IntermediateAggregationResults {
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) metrics: Option<VecWithNames<IntermediateMetricResult>>,
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) buckets: Option<VecWithNames<IntermediateBucketResult>>,
}

@@ -20,8 +20,7 @@
//!
//! #### Limitations
//!
//! Currently aggregations work only on single value fast fields of type u64, f64, i64 and
//! fast fields on text fields.
//! Currently aggregations work only on single value fast fields of type u64, f64 and i64.
//!
//! # JSON Format
//! Aggregations request and result structures de/serialize into elasticsearch compatible JSON.
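Since the module documentation above points at the elasticsearch-compatible JSON format, here is a minimal sketch of building a terms aggregation request that way, modeled on the test code in this diff. The module path `tantivy::aggregation::agg_req::Aggregations` is assumed from the crate layout and may differ between versions.

```rust
use serde_json::json;
use tantivy::aggregation::agg_req::Aggregations;

// Build an elasticsearch-compatible terms aggregation request and deserialize
// it into the crate's request structure, as the tests in this diff do.
fn terms_agg_request() -> Aggregations {
    let request = json!({
        "term_agg_test": {
            "terms": {
                "field": "string_id",
                "shard_size": 2u64
            }
        }
    });
    serde_json::from_value(request).expect("valid aggregation request")
}
```

The resulting request is then handed to an aggregation collector, and the aggregation result serializes back into the same JSON shape.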
@@ -92,7 +92,7 @@ mod histogram_collector;
pub use histogram_collector::HistogramCollector;

mod multi_collector;
pub use self::multi_collector::{FruitHandle, MultiCollector, MultiFruit};
pub use self::multi_collector::MultiCollector;

mod top_collector;

@@ -5,7 +5,6 @@ use super::{Collector, SegmentCollector};
use crate::collector::Fruit;
use crate::{DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};

/// MultiFruit keeps Fruits from every nested Collector
pub struct MultiFruit {
sub_fruits: Vec<Option<Box<dyn Fruit>>>,
}

@@ -80,17 +79,12 @@ impl<TSegmentCollector: SegmentCollector> BoxableSegmentCollector
}
}

/// FruitHandle stores reference to the corresponding collector inside MultiCollector
pub struct FruitHandle<TFruit: Fruit> {
pos: usize,
_phantom: PhantomData<TFruit>,
}

impl<TFruit: Fruit> FruitHandle<TFruit> {
/// Extract a typed fruit off a multifruit.
///
/// This function involves downcasting and can panic if the multifruit was
/// created using faulty code.
pub fn extract(self, fruits: &mut MultiFruit) -> TFruit {
let boxed_fruit = fruits.sub_fruits[self.pos].take().expect("");
*boxed_fruit
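The `FruitHandle` docs above describe the extract-and-downcast flow; a rough usage sketch follows. The method names (`MultiCollector::new`, `add_collector`, `extract`) are taken from tantivy's collector documentation rather than from this diff, so treat them as assumptions.

```rust
use tantivy::collector::{Count, MultiCollector, TopDocs};
use tantivy::query::Query;
use tantivy::Searcher;

// Run two collectors in one search pass and pull each typed fruit back out.
fn count_and_top_docs(searcher: &Searcher, query: &dyn Query) -> tantivy::Result<usize> {
    let mut collectors = MultiCollector::new();
    let count_handle = collectors.add_collector(Count);
    let top_docs_handle = collectors.add_collector(TopDocs::with_limit(10));
    let mut multi_fruit = searcher.search(query, &collectors)?;
    // `extract` downcasts the boxed fruit back to its concrete type and can
    // panic if the handle is used against the wrong `MultiFruit`.
    let _top_docs = top_docs_handle.extract(&mut multi_fruit);
    let count = count_handle.extract(&mut multi_fruit);
    Ok(count)
}
```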
@@ -1,7 +1,6 @@
use crossbeam::channel;
use rayon::{ThreadPool, ThreadPoolBuilder};

use crate::TantivyError;

/// Search executor whether search request are single thread or multithread.
///
/// We don't expose Rayon thread pool directly here for several reasons.

@@ -48,19 +47,16 @@ impl Executor {
match self {
Executor::SingleThread => args.map(f).collect::<crate::Result<_>>(),
Executor::ThreadPool(pool) => {
let args: Vec<A> = args.collect();
let num_fruits = args.len();
let args_with_indices: Vec<(usize, A)> = args.enumerate().collect();
let num_fruits = args_with_indices.len();
let fruit_receiver = {
let (fruit_sender, fruit_receiver) = crossbeam_channel::unbounded();
let (fruit_sender, fruit_receiver) = channel::unbounded();
pool.scope(|scope| {
for (idx, arg) in args.into_iter().enumerate() {
// We name references for f and fruit_sender_ref because we do not
// want these two to be moved into the closure.
let f_ref = &f;
let fruit_sender_ref = &fruit_sender;
scope.spawn(move |_| {
let fruit = f_ref(arg);
if let Err(err) = fruit_sender_ref.send((idx, fruit)) {
for arg_with_idx in args_with_indices {
scope.spawn(|_| {
let (idx, arg) = arg_with_idx;
let fruit = f(arg);
if let Err(err) = fruit_sender.send((idx, fruit)) {
error!(
"Failed to send search task. It probably means all search \
threads have panicked. {:?}",

@@ -75,19 +71,18 @@ impl Executor {
// This is important as it makes it possible for the fruit_receiver iteration to
// terminate.
};
let mut result_placeholders: Vec<Option<R>> =
std::iter::repeat_with(|| None).take(num_fruits).collect();
// This is lame, but safe.
let mut results_with_position = Vec::with_capacity(num_fruits);
for (pos, fruit_res) in fruit_receiver {
let fruit = fruit_res?;
result_placeholders[pos] = Some(fruit);
results_with_position.push((pos, fruit));
}
let results: Vec<R> = result_placeholders.into_iter().flatten().collect();
if results.len() != num_fruits {
return Err(TantivyError::InternalError(
"One of the mapped execution failed.".to_string(),
));
}
Ok(results)
results_with_position.sort_by_key(|(pos, _)| *pos);
assert_eq!(results_with_position.len(), num_fruits);
Ok(results_with_position
.into_iter()
.map(|(_, fruit)| fruit)
.collect::<Vec<_>>())
}
}
}
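Both versions of `Executor::map` shown above tag every task's output with its input index so the original order can be restored, either through a placeholder vector or by sorting on the index. A standalone sketch of the sort-based variant, using only the standard library (illustrative names, not tantivy APIs):

```rust
use std::sync::mpsc;
use std::thread;

// Map `f` over `items` on worker threads, then restore the input order by
// sorting on the index that was sent along with each result.
fn parallel_map_in_order<T: Send + 'static, R: Send + 'static>(
    items: Vec<T>,
    f: fn(T) -> R,
) -> Vec<R> {
    let num_items = items.len();
    let (tx, rx) = mpsc::channel();
    let handles: Vec<_> = items
        .into_iter()
        .enumerate()
        .map(|(idx, item)| {
            let tx = tx.clone();
            thread::spawn(move || {
                // Tag the result with its original position.
                tx.send((idx, f(item))).expect("receiver is alive");
            })
        })
        .collect();
    drop(tx); // lets the receiving iteration terminate once workers finish
    let mut results_with_position: Vec<(usize, R)> = rx.into_iter().collect();
    results_with_position.sort_by_key(|(pos, _)| *pos);
    assert_eq!(results_with_position.len(), num_items);
    for handle in handles {
        handle.join().expect("worker thread panicked");
    }
    results_with_position.into_iter().map(|(_, r)| r).collect()
}
```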
@@ -74,7 +74,6 @@ fn load_metas(
pub struct IndexBuilder {
schema: Option<Schema>,
index_settings: IndexSettings,
tokenizer_manager: TokenizerManager,
}
impl Default for IndexBuilder {
fn default() -> Self {

@@ -87,7 +86,6 @@ impl IndexBuilder {
Self {
schema: None,
index_settings: IndexSettings::default(),
tokenizer_manager: TokenizerManager::default(),
}
}

@@ -105,12 +103,6 @@ impl IndexBuilder {
self
}

/// Set the tokenizers .
pub fn tokenizers(mut self, tokenizers: TokenizerManager) -> Self {
self.tokenizer_manager = tokenizers;
self
}

/// Creates a new index using the `RAMDirectory`.
///
/// The index will be allocated in anonymous memory.

@@ -162,8 +154,7 @@ impl IndexBuilder {
if !Index::exists(&*dir)? {
return self.create(dir);
}
let mut index = Index::open(dir)?;
index.set_tokenizers(self.tokenizer_manager.clone());
let index = Index::open(dir)?;
if index.schema() == self.get_expect_schema()? {
Ok(index)
} else {

@@ -185,8 +176,7 @@ impl IndexBuilder {
)?;
let mut metas = IndexMeta::with_schema(self.get_expect_schema()?);
metas.index_settings = self.index_settings;
let mut index = Index::open_from_metas(directory, &metas, SegmentMetaInventory::default());
index.set_tokenizers(self.tokenizer_manager);
let index = Index::open_from_metas(directory, &metas, SegmentMetaInventory::default());
Ok(index)
}
}

@@ -314,11 +304,6 @@ impl Index {
}
}

/// Setter for the tokenizer manager.
pub fn set_tokenizers(&mut self, tokenizers: TokenizerManager) {
self.tokenizers = tokenizers;
}

/// Accessor for the tokenizer manager.
pub fn tokenizers(&self) -> &TokenizerManager {
&self.tokenizers

@@ -329,31 +314,20 @@ impl Index {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
let tokenizer_manager: &TokenizerManager = self.tokenizers();
let indexing_options_opt = match field_type {
FieldType::JsonObject(options) => options.get_text_indexing_options(),
FieldType::Str(options) => options.get_indexing_options(),
_ => {
return Err(TantivyError::SchemaError(format!(
"{:?} is not a text field.",
field_entry.name()
)))
}
let tokenizer_name_opt: Option<TextAnalyzer> = match field_type {
FieldType::Str(text_options) => text_options
.get_indexing_options()
.map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
.and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name)),
_ => None,
};
let indexing_options = indexing_options_opt.ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"No indexing options set for field {:?}",
field_entry
))
})?;

tokenizer_manager
.get(indexing_options.tokenizer())
.ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"No Tokenizer found for field {:?}",
field_entry
))
})
match tokenizer_name_opt {
Some(tokenizer) => Ok(tokenizer),
None => Err(TantivyError::SchemaError(format!(
"{:?} is not a text field.",
field_entry.name()
))),
}
}

/// Create a default `IndexReader` for the given index.

@@ -583,8 +557,7 @@ impl fmt::Debug for Index {
mod tests {
use crate::directory::{RamDirectory, WatchCallback};
use crate::schema::{Field, Schema, INDEXED, TEXT};
use crate::tokenizer::TokenizerManager;
use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy};
use crate::{Directory, Index, IndexReader, IndexSettings, ReloadPolicy};

#[test]
fn test_indexer_for_field() {

@@ -600,21 +573,6 @@ mod tests {
);
}

#[test]
fn test_set_tokenizer_manager() {
let mut schema_builder = Schema::builder();
schema_builder.add_u64_field("num_likes", INDEXED);
schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
let index = IndexBuilder::new()
// set empty tokenizer manager
.tokenizers(TokenizerManager::new())
.schema(schema)
.create_in_ram()
.unwrap();
assert!(index.tokenizers().get("raw").is_none());
}

#[test]
fn test_index_exists() {
let directory: Box<dyn Directory> = Box::new(RamDirectory::create());

@@ -744,7 +702,7 @@ mod tests {
.try_into()?;
assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64))?;
let (sender, receiver) = crossbeam_channel::unbounded();
let (sender, receiver) = crossbeam::channel::unbounded();
let _handle = index.directory_mut().watch(WatchCallback::new(move || {
let _ = sender.send(());
}));

@@ -779,7 +737,7 @@ mod tests {
reader: &IndexReader,
) -> crate::Result<()> {
let mut reader_index = reader.index();
let (sender, receiver) = crossbeam_channel::unbounded();
let (sender, receiver) = crossbeam::channel::unbounded();
let _watch_handle = reader_index
.directory_mut()
.watch(WatchCallback::new(move || {

@@ -239,7 +239,7 @@ impl InnerSegmentMeta {
///
/// Contains settings which are applied on the whole
/// index, like presort documents.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
#[derive(Clone, Debug, Default, Serialize, Deserialize, Eq, PartialEq)]
pub struct IndexSettings {
/// Sorts the documents by information
/// provided in `IndexSortByField`

@@ -248,26 +248,7 @@ pub struct IndexSettings {
/// The `Compressor` used to compress the doc store.
#[serde(default)]
pub docstore_compression: Compressor,
#[serde(default = "default_docstore_blocksize")]
/// The size of each block that will be compressed and written to disk
pub docstore_blocksize: usize,
}

/// Must be a function to be compatible with serde defaults
fn default_docstore_blocksize() -> usize {
16_384
}

impl Default for IndexSettings {
fn default() -> Self {
Self {
sort_by_field: None,
docstore_compression: Compressor::default(),
docstore_blocksize: default_docstore_blocksize(),
}
}
}

/// Settings to presort the documents in an index
///
/// Presorting documents can greatly performance

@@ -420,7 +401,7 @@ mod tests {
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
assert_eq!(
json,
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
);

let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
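The `default_docstore_blocksize` function above exists because serde's `#[serde(default = "...")]` attribute takes the path of a function returning the field's value; that is what lets the older metadata JSON (without `docstore_blocksize`) in the test still deserialize. A generic illustration of the pattern, not tantivy code:

```rust
use serde::Deserialize;

fn default_blocksize() -> usize {
    16_384
}

#[derive(Debug, Deserialize)]
struct StoreSettings {
    compression: String,
    // A missing field falls back to the function's return value instead of failing.
    #[serde(default = "default_blocksize")]
    blocksize: usize,
}

fn main() {
    let settings: StoreSettings =
        serde_json::from_str(r#"{ "compression": "lz4" }"#).expect("valid JSON");
    assert_eq!(settings.blocksize, 16_384);
}
```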
@@ -110,7 +110,7 @@ mod tests {
let tmp_file = tmp_dir.path().join("watched.txt");

let counter: Arc<AtomicUsize> = Default::default();
let (tx, rx) = crossbeam_channel::unbounded();
let (tx, rx) = crossbeam::channel::unbounded();
let timeout = Duration::from_millis(100);

let watcher = FileWatcher::new(&tmp_file);

@@ -153,7 +153,7 @@ mod tests {
let tmp_file = tmp_dir.path().join("watched.txt");

let counter: Arc<AtomicUsize> = Default::default();
let (tx, rx) = crossbeam_channel::unbounded();
let (tx, rx) = crossbeam::channel::unbounded();
let timeout = Duration::from_millis(100);

let watcher = FileWatcher::new(&tmp_file);

@@ -181,7 +181,7 @@ fn test_directory_delete(directory: &dyn Directory) -> crate::Result<()> {

fn test_watch(directory: &dyn Directory) {
let counter: Arc<AtomicUsize> = Default::default();
let (tx, rx) = crossbeam_channel::unbounded();
let (tx, rx) = crossbeam::channel::unbounded();
let timeout = Duration::from_millis(500);

let handle = directory

@@ -300,7 +300,7 @@ impl IntFastFieldWriter {
/// If the document has more than one value for the given field,
/// only the first one is taken in account.
///
/// Values on text fast fields are skipped.
/// Values for string fast fields are skipped.
pub fn add_document(&mut self, doc: &Document) {
match doc.get_first(self.field) {
Some(v) => {

@@ -4,6 +4,7 @@ use std::thread;
use std::thread::JoinHandle;

use common::BitSet;
use crossbeam::channel;
use smallvec::smallvec;

use super::operation::{AddOperation, UserOperation};

@@ -288,7 +289,7 @@ impl IndexWriter {
return Err(TantivyError::InvalidArgument(err_msg));
}
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
crossbeam_channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);

let delete_queue = DeleteQueue::new();

@@ -325,7 +326,7 @@ impl IndexWriter {
}

fn drop_sender(&mut self) {
let (sender, _receiver) = crossbeam_channel::bounded(1);
let (sender, _receiver) = channel::bounded(1);
self.operation_sender = sender;
}

@@ -531,7 +532,7 @@ impl IndexWriter {
/// Returns the former segment_ready channel.
fn recreate_document_channel(&mut self) {
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
crossbeam_channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
self.operation_sender = document_sender;
self.index_writer_status = IndexWriterStatus::from(document_receiver);
}

@@ -92,7 +92,7 @@ impl Drop for IndexWriterBomb {
mod tests {
use std::mem;

use crossbeam_channel as channel;
use crossbeam::channel;

use super::IndexWriterStatus;
@@ -4,7 +4,7 @@ use murmurhash32::murmurhash2;
use crate::fastfield::FastValue;
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
use crate::schema::term::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
use crate::schema::{Field, Type};
use crate::schema::Type;
use crate::time::format_description::well_known::Rfc3339;
use crate::time::{OffsetDateTime, UtcOffset};
use crate::tokenizer::TextAnalyzer;

@@ -57,7 +57,7 @@ struct IndexingPositionsPerPath {
impl IndexingPositionsPerPath {
fn get_position(&mut self, term: &Term) -> &mut IndexingPosition {
self.positions_per_path
.entry(murmurhash2(term.as_slice()))
.entry(murmurhash2(term.value_bytes()))
.or_insert_with(Default::default)
}
}

@@ -199,81 +199,16 @@ fn infer_type_from_str(text: &str) -> TextOrDateTime {
}
}

// Tries to infer a JSON type from a string
pub(crate) fn convert_to_fast_value_and_get_term(
json_term_writer: &mut JsonTermWriter,
phrase: &str,
) -> Option<Term> {
if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
let dt_utc = dt.to_offset(UtcOffset::UTC);
return Some(set_fastvalue_and_get_term(
json_term_writer,
DateTime::from_utc(dt_utc),
));
}
if let Ok(u64_val) = str::parse::<u64>(phrase) {
return Some(set_fastvalue_and_get_term(json_term_writer, u64_val));
}
if let Ok(i64_val) = str::parse::<i64>(phrase) {
return Some(set_fastvalue_and_get_term(json_term_writer, i64_val));
}
if let Ok(f64_val) = str::parse::<f64>(phrase) {
return Some(set_fastvalue_and_get_term(json_term_writer, f64_val));
}
None
}

// helper function to generate a Term from a json fastvalue
pub(crate) fn set_fastvalue_and_get_term<T: FastValue>(
json_term_writer: &mut JsonTermWriter,
value: T,
) -> Term {
json_term_writer.set_fast_value(value);
json_term_writer.term().clone()
}

// helper function to generate a list of terms with their positions from a textual json value
pub(crate) fn set_string_and_get_terms(
json_term_writer: &mut JsonTermWriter,
value: &str,
text_analyzer: &TextAnalyzer,
) -> Vec<(usize, Term)> {
let mut positions_and_terms = Vec::<(usize, Term)>::new();
json_term_writer.close_path_and_set_type(Type::Str);
let term_num_bytes = json_term_writer.term_buffer.as_slice().len();
let mut token_stream = text_analyzer.token_stream(value);
token_stream.process(&mut |token| {
json_term_writer.term_buffer.truncate(term_num_bytes);
json_term_writer
.term_buffer
.append_bytes(token.text.as_bytes());
positions_and_terms.push((token.position, json_term_writer.term().clone()));
});
positions_and_terms
}

pub struct JsonTermWriter<'a> {
term_buffer: &'a mut Term,
path_stack: Vec<usize>,
}

impl<'a> JsonTermWriter<'a> {
pub fn from_field_and_json_path(
field: Field,
json_path: &str,
term_buffer: &'a mut Term,
) -> Self {
term_buffer.set_field(Type::Json, field);
let mut json_term_writer = Self::wrap(term_buffer);
for segment in json_path.split('.') {
json_term_writer.push_path_segment(segment);
}
json_term_writer
}

pub fn wrap(term_buffer: &'a mut Term) -> Self {
term_buffer.clear_with_type(Type::Json);
let mut path_stack = Vec::with_capacity(10);
path_stack.push(5);
path_stack.push(5); // magic number?
Self {
term_buffer,
path_stack,

@@ -315,8 +250,8 @@ impl<'a> JsonTermWriter<'a> {
/// Returns the json path of the term being currently built.
#[cfg(test)]
pub(crate) fn path(&self) -> &[u8] {
let end_of_path = self.path_stack.last().cloned().unwrap_or(6);
&self.term().as_slice()[5..end_of_path - 1]
let end_of_path = self.path_stack.last().cloned().unwrap_or(6); // TODO remove magic number
&self.term().value_bytes()[..end_of_path - 1]
}

pub fn set_fast_value<T: FastValue>(&mut self, val: T) {

@@ -386,10 +321,7 @@ mod tests {
let mut json_writer = JsonTermWriter::wrap(&mut term);
json_writer.push_path_segment("color");
json_writer.set_str("red");
assert_eq!(
json_writer.term().as_slice(),
b"\x00\x00\x00\x01jcolor\x00sred"
)
assert_eq!(json_writer.term().value_bytes(), b"color\x00sred")
}

#[test]

@@ -401,8 +333,8 @@ mod tests {
json_writer.push_path_segment("color");
json_writer.set_fast_value(-4i64);
assert_eq!(
json_writer.term().as_slice(),
b"\x00\x00\x00\x01jcolor\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc"
json_writer.term().value_bytes(),
b"color\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc"
)
}

@@ -415,8 +347,8 @@ mod tests {
json_writer.push_path_segment("color");
json_writer.set_fast_value(4u64);
assert_eq!(
json_writer.term().as_slice(),
b"\x00\x00\x00\x01jcolor\x00u\x00\x00\x00\x00\x00\x00\x00\x04"
json_writer.term().value_bytes(),
b"color\x00u\x00\x00\x00\x00\x00\x00\x00\x04"
)
}

@@ -429,8 +361,8 @@ mod tests {
json_writer.push_path_segment("color");
json_writer.set_fast_value(4.0f64);
assert_eq!(
json_writer.term().as_slice(),
b"\x00\x00\x00\x01jcolor\x00f\xc0\x10\x00\x00\x00\x00\x00\x00"
json_writer.term().value_bytes(),
b"color\x00f\xc0\x10\x00\x00\x00\x00\x00\x00"
)
}

@@ -445,8 +377,8 @@ mod tests {
json_writer.push_path_segment("color");
json_writer.set_str("red");
assert_eq!(
json_writer.term().as_slice(),
b"\x00\x00\x00\x01jattribute\x01color\x00sred"
json_writer.term().value_bytes(),
b"attribute\x01color\x00sred"
)
}

@@ -460,10 +392,7 @@ mod tests {
json_writer.push_path_segment("hue");
json_writer.pop_path_segment();
json_writer.set_str("red");
assert_eq!(
json_writer.term().as_slice(),
b"\x00\x00\x00\x01jcolor\x00sred"
)
assert_eq!(json_writer.term().value_bytes(), b"color\x00sred")
}

#[test]
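The byte strings asserted in the tests above all follow the same layout: path segments separated by `\x01` (`JSON_PATH_SEGMENT_SEP`), a `\x00` end-of-path marker (`JSON_END_OF_PATH`), a one-byte type tag (`s`, `i`, `u`, `f`), then the encoded value. The separator values here are read off the test expectations, so treat them as inferred. A standalone sketch that reproduces one of those expectations:

```rust
// Assemble the value bytes of a JSON term the way the tests above expect them.
fn json_term_value_bytes(path_segments: &[&str], type_tag: u8, value: &[u8]) -> Vec<u8> {
    let mut bytes = Vec::new();
    for (i, segment) in path_segments.iter().enumerate() {
        if i > 0 {
            bytes.push(0x01); // JSON_PATH_SEGMENT_SEP (inferred from the tests)
        }
        bytes.extend_from_slice(segment.as_bytes());
    }
    bytes.push(0x00); // JSON_END_OF_PATH (inferred from the tests)
    bytes.push(type_tag);
    bytes.extend_from_slice(value);
    bytes
}

fn main() {
    assert_eq!(
        json_term_value_bytes(&["attribute", "color"], b's', b"red"),
        b"attribute\x01color\x00sred".to_vec()
    );
}
```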
@@ -21,13 +21,11 @@ pub mod segment_updater;
mod segment_writer;
mod stamper;

use crossbeam_channel as channel;
use crossbeam::channel;
use smallvec::SmallVec;

pub use self::index_writer::IndexWriter;
pub(crate) use self::json_term_writer::{
convert_to_fast_value_and_get_term, set_string_and_get_terms, JsonTermWriter,
};
pub(crate) use self::json_term_writer::JsonTermWriter;
pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_operation::MergeOperation;
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};

@@ -39,10 +39,9 @@ impl SegmentSerializer {
let postings_serializer = InvertedIndexSerializer::open(&mut segment)?;
let compressor = segment.index().settings().docstore_compression;
let blocksize = segment.index().settings().docstore_blocksize;
Ok(SegmentSerializer {
segment,
store_writer: StoreWriter::new(store_write, compressor, blocksize),
store_writer: StoreWriter::new(store_write, compressor),
fast_field_serializer,
fieldnorms_serializer: Some(fieldnorms_serializer),
postings_serializer,

@@ -1,5 +1,6 @@
use std::borrow::BorrowMut;
use std::collections::HashSet;
use std::io;
use std::io::Write;
use std::ops::Deref;
use std::path::PathBuf;

@@ -26,7 +27,7 @@ use crate::indexer::{
SegmentSerializer,
};
use crate::schema::Schema;
use crate::{FutureResult, Opstamp};
use crate::{FutureResult, Opstamp, TantivyError};

const NUM_MERGE_THREADS: usize = 4;

@@ -72,12 +73,10 @@ fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()>
let mut buffer = serde_json::to_vec_pretty(metas)?;
// Just adding a new line at the end of the buffer.
writeln!(&mut buffer)?;
fail_point!("save_metas", |msg| Err(crate::TantivyError::from(
std::io::Error::new(
std::io::ErrorKind::Other,
msg.unwrap_or_else(|| "Undefined".to_string())
)
)));
fail_point!("save_metas", |msg| Err(TantivyError::from(io::Error::new(
io::ErrorKind::Other,
msg.unwrap_or_else(|| "Undefined".to_string())
))));
directory.sync_directory()?;
directory.atomic_write(&META_FILEPATH, &buffer[..])?;
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));

@@ -6,8 +6,7 @@ use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
use crate::indexer::json_term_writer::index_json_values;
use crate::indexer::segment_serializer::SegmentSerializer;
use crate::postings::{
compute_table_size, serialize_postings, IndexingContext, IndexingPosition,
PerFieldPostingsWriter, PostingsWriter,
serialize_postings, IndexingContext, IndexingPosition, PerFieldPostingsWriter, PostingsWriter,
};
use crate::schema::{FieldEntry, FieldType, FieldValue, Schema, Term, Value};
use crate::store::{StoreReader, StoreWriter};

@@ -16,25 +15,6 @@ use crate::tokenizer::{
};
use crate::{DocId, Document, Opstamp, SegmentComponent};

/// Computes the initial size of the hash table.
///
/// Returns the recommended initial table size as a power of 2.
///
/// Note this is a very dumb way to compute log2, but it is easier to proofread that way.
fn compute_initial_table_size(per_thread_memory_budget: usize) -> crate::Result<usize> {
let table_memory_upper_bound = per_thread_memory_budget / 3;
(10..20) // We cap it at 2^19 = 512K capacity.
.map(|power| 1 << power)
.take_while(|capacity| compute_table_size(*capacity) < table_memory_upper_bound)
.last()
.ok_or_else(|| {
crate::TantivyError::InvalidArgument(format!(
"per thread memory budget (={per_thread_memory_budget}) is too small. Raise the \
memory budget or lower the number of threads."
))
})
}

fn remap_doc_opstamps(
opstamps: Vec<Opstamp>,
doc_id_mapping_opt: Option<&DocIdMapping>,

@@ -78,12 +58,11 @@ impl SegmentWriter {
/// - segment: The segment being written
/// - schema
pub fn for_segment(
memory_budget_in_bytes: usize,
_memory_budget_in_bytes: usize,
segment: Segment,
schema: Schema,
) -> crate::Result<SegmentWriter> {
let tokenizer_manager = segment.index().tokenizers().clone();
let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
let segment_serializer = SegmentSerializer::for_segment(segment, false)?;
let per_field_postings_writers = PerFieldPostingsWriter::for_schema(&schema);
let per_field_text_analyzers = schema

@@ -106,7 +85,7 @@ impl SegmentWriter {
.collect();
Ok(SegmentWriter {
max_doc: 0,
ctx: IndexingContext::new(table_size),
ctx: IndexingContext::new(),
per_field_postings_writers,
fieldnorms_writer: FieldNormsWriter::for_schema(&schema),
segment_serializer,

@@ -149,6 +128,7 @@ impl SegmentWriter {
pub fn mem_usage(&self) -> usize {
self.ctx.mem_usage()
+ self.fieldnorms_writer.mem_usage()
+ self.per_field_postings_writers.mem_usage()
+ self.fast_field_writers.mem_usage()
+ self.segment_serializer.mem_usage()
}

@@ -223,7 +203,7 @@ impl SegmentWriter {
let mut indexing_position = IndexingPosition::default();

for mut token_stream in token_streams {
assert_eq!(term_buffer.as_slice().len(), 5);
// assert_eq!(term_buffer.as_slice().len(), 5);
postings_writer.index_text(
doc_id,
&mut *token_stream,

@@ -372,10 +352,9 @@ fn remap_and_write(
.segment_mut()
.open_write(SegmentComponent::Store)?;
let compressor = serializer.segment().index().settings().docstore_compression;
let block_size = serializer.segment().index().settings().docstore_blocksize;
let old_store_writer = std::mem::replace(
&mut serializer.store_writer,
StoreWriter::new(store_write, compressor, block_size),
StoreWriter::new(store_write, compressor),
);
old_store_writer.close()?;
let store_read = StoreReader::open(

@@ -419,7 +398,6 @@ pub fn prepare_doc_for_store(doc: Document, schema: &Schema) -> Document {

#[cfg(test)]
mod tests {
use super::compute_initial_table_size;
use crate::collector::Count;
use crate::indexer::json_term_writer::JsonTermWriter;
use crate::postings::TermInfo;

@@ -430,15 +408,6 @@ mod tests {
use crate::tokenizer::{PreTokenizedString, Token};
use crate::{DateTime, DocAddress, DocSet, Document, Index, Postings, Term, TERMINATED};

#[test]
fn test_hashmap_size() {
assert_eq!(compute_initial_table_size(100_000).unwrap(), 1 << 11);
assert_eq!(compute_initial_table_size(1_000_000).unwrap(), 1 << 14);
assert_eq!(compute_initial_table_size(10_000_000).unwrap(), 1 << 17);
assert_eq!(compute_initial_table_size(1_000_000_000).unwrap(), 1 << 19);
assert_eq!(compute_initial_table_size(4_000_000_000).unwrap(), 1 << 19);
}

#[test]
fn test_prepare_for_store() {
let mut schema_builder = Schema::builder();
@@ -1,27 +1,24 @@
use crate::postings::stacker::{MemoryArena, TermHashMap};
use crate::postings::stacker::MemoryArena;

/// IndexingContext contains all of the transient memory arenas
/// required for building the inverted index.
pub(crate) struct IndexingContext {
/// The term index is an adhoc hashmap,
/// itself backed by a dedicated memory arena.
pub term_index: TermHashMap,
/// Arena is a memory arena that stores posting lists / term frequencies / positions.
pub arena: MemoryArena,
pub arena_terms: MemoryArena,
}

impl IndexingContext {
/// Create a new IndexingContext given the size of the term hash map.
pub(crate) fn new(table_size: usize) -> IndexingContext {
let term_index = TermHashMap::new(table_size);
pub(crate) fn new() -> IndexingContext {
IndexingContext {
arena: MemoryArena::new(),
term_index,
arena_terms: MemoryArena::new(),
}
}

/// Returns the memory usage for the inverted index memory arenas, in bytes.
pub(crate) fn mem_usage(&self) -> usize {
self.term_index.mem_usage() + self.arena.mem_usage()
self.arena.mem_usage() + self.arena_terms.mem_usage()
}
}

@@ -1,5 +1,6 @@
use std::io;

use super::stacker::TermHashMap;
use crate::fastfield::MultiValuedFastFieldWriter;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::postings_writer::SpecializedPostingsWriter;

@@ -26,6 +27,14 @@ impl<Rec: Recorder> From<JsonPostingsWriter<Rec>> for Box<dyn PostingsWriter> {
}

impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
fn mem_usage(&self) -> usize {
self.str_posting_writer.mem_usage() + self.non_str_posting_writer.mem_usage()
}

fn term_map(&self) -> &TermHashMap {
self.str_posting_writer.term_map()
}

fn subscribe(
&mut self,
doc: crate::DocId,

@@ -74,6 +83,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
doc_id_map,
&mut buffer_lender,
ctx,
&self.str_posting_writer.term_map,
serializer,
)?;
} else {

@@ -83,6 +93,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
doc_id_map,
&mut buffer_lender,
ctx,
&self.str_posting_writer.term_map,
serializer,
)?;
}

@@ -26,7 +26,6 @@ pub(crate) use self::postings_writer::{serialize_postings, IndexingPosition, Pos
pub use self::segment_postings::SegmentPostings;
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
pub(crate) use self::skip::{BlockInfo, SkipReader};
pub(crate) use self::stacker::compute_table_size;
pub use self::term_info::TermInfo;

pub(crate) type UnorderedTermId = u64;

@@ -10,9 +10,10 @@ pub(crate) struct PerFieldPostingsWriter {

impl PerFieldPostingsWriter {
pub fn for_schema(schema: &Schema) -> Self {
let num_fields = schema.num_fields();
let per_field_postings_writers = schema
.fields()
.map(|(_, field_entry)| posting_writer_from_field_entry(field_entry))
.map(|(_, field_entry)| posting_writer_from_field_entry(field_entry, num_fields))
.collect();
PerFieldPostingsWriter {
per_field_postings_writers,

@@ -26,9 +27,19 @@ impl PerFieldPostingsWriter {
pub(crate) fn get_for_field_mut(&mut self, field: Field) -> &mut dyn PostingsWriter {
self.per_field_postings_writers[field.field_id() as usize].as_mut()
}

pub(crate) fn mem_usage(&self) -> usize {
self.per_field_postings_writers
.iter()
.map(|postings_writer| postings_writer.mem_usage())
.sum()
}
}

fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box<dyn PostingsWriter> {
fn posting_writer_from_field_entry(
field_entry: &FieldEntry,
_num_fields: usize,
) -> Box<dyn PostingsWriter> {
match *field_entry.field_type() {
FieldType::Str(ref text_options) => text_options
.get_indexing_options()
@@ -1,11 +1,10 @@
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::Range;
|
||||
|
||||
use fnv::FnvHashMap;
|
||||
|
||||
use super::stacker::Addr;
|
||||
use super::stacker::{Addr, TermHashMap};
|
||||
use crate::fastfield::MultiValuedFastFieldWriter;
|
||||
use crate::fieldnorm::FieldNormReaders;
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
@@ -21,31 +20,6 @@ use crate::DocId;
|
||||
|
||||
const POSITION_GAP: u32 = 1;
|
||||
|
||||
fn make_field_partition(
|
||||
term_offsets: &[(Term<&[u8]>, Addr, UnorderedTermId)],
|
||||
) -> Vec<(Field, Range<usize>)> {
|
||||
let term_offsets_it = term_offsets
|
||||
.iter()
|
||||
.map(|(term, _, _)| term.field())
|
||||
.enumerate();
|
||||
let mut prev_field_opt = None;
|
||||
let mut fields = vec![];
|
||||
let mut offsets = vec![];
|
||||
for (offset, field) in term_offsets_it {
|
||||
if Some(field) != prev_field_opt {
|
||||
prev_field_opt = Some(field);
|
||||
fields.push(field);
|
||||
offsets.push(offset);
|
||||
}
|
||||
}
|
||||
offsets.push(term_offsets.len());
|
||||
let mut field_offsets = vec![];
|
||||
for i in 0..fields.len() {
|
||||
field_offsets.push((fields[i], offsets[i]..offsets[i + 1]));
|
||||
}
|
||||
field_offsets
|
||||
}
|
||||
|
||||
/// Serialize the inverted index.
|
||||
/// It pushes all term, one field at a time, towards the
|
||||
/// postings serializer.
|
||||
@@ -57,23 +31,23 @@ pub(crate) fn serialize_postings(
|
||||
schema: &Schema,
|
||||
serializer: &mut InvertedIndexSerializer,
|
||||
) -> crate::Result<HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>> {
|
||||
let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> =
|
||||
Vec::with_capacity(ctx.term_index.len());
|
||||
term_offsets.extend(ctx.term_index.iter());
|
||||
term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone());
|
||||
let mut unordered_term_mappings: HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>> =
|
||||
HashMap::new();
|
||||
|
||||
let field_offsets = make_field_partition(&term_offsets);
|
||||
for (field, byte_offsets) in field_offsets {
|
||||
for (field, _) in schema.fields() {
|
||||
let postings_writer = per_field_postings_writers.get_for_field(field);
|
||||
|
||||
let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> =
|
||||
Vec::with_capacity(postings_writer.term_map().len());
|
||||
term_offsets.extend(postings_writer.term_map().iter(&ctx.arena_terms));
|
||||
term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone());
|
||||
|
||||
let field_entry = schema.get_field_entry(field);
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Str(_) | FieldType::Facet(_) => {
|
||||
// populating the (unordered term ord) -> (ordered term ord) mapping
|
||||
// for the field.
|
||||
let unordered_term_ids = term_offsets[byte_offsets.clone()]
|
||||
.iter()
|
||||
.map(|&(_, _, bucket)| bucket);
|
||||
let unordered_term_ids = term_offsets.iter().map(|&(_, _, bucket)| bucket);
|
||||
let mapping: FnvHashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
|
||||
.enumerate()
|
||||
.map(|(term_ord, unord_term_id)| {
|
||||
@@ -87,16 +61,10 @@ pub(crate) fn serialize_postings(
            FieldType::JsonObject(_) => {}
        }

        let postings_writer = per_field_postings_writers.get_for_field(field);
        let fieldnorm_reader = fieldnorm_readers.get_field(field)?;
        let mut field_serializer =
            serializer.new_field(field, postings_writer.total_num_tokens(), fieldnorm_reader)?;
        postings_writer.serialize(
            &term_offsets[byte_offsets],
            doc_id_map,
            &ctx,
            &mut field_serializer,
        )?;
        postings_writer.serialize(&term_offsets, doc_id_map, &ctx, &mut field_serializer)?;
        field_serializer.close()?;
    }
    Ok(unordered_term_mappings)
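// A standalone sketch (not part of this change): the unordered-term-id -> term-ordinal
// mapping built above simply sorts the terms and uses the rank after sorting as the
// ordinal. `ordered_mapping` below is a hypothetical, simplified version of that step.
use std::collections::HashMap;

fn ordered_mapping(mut terms: Vec<(Vec<u8>, u64)>) -> HashMap<u64, u64> {
    terms.sort_unstable_by(|(a, _), (b, _)| a.cmp(b));
    terms
        .into_iter()
        .enumerate()
        .map(|(ordinal, (_, unordered_id))| (unordered_id, ordinal as u64))
        .collect()
}

#[test]
fn ordinal_follows_lexicographic_term_order() {
    // "b" was seen first (unordered id 0) but sorts after "a".
    let mapping = ordered_mapping(vec![(b"b".to_vec(), 0), (b"a".to_vec(), 1)]);
    assert_eq!(mapping[&1], 0); // "a" -> ordinal 0
    assert_eq!(mapping[&0], 1); // "b" -> ordinal 1
}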
@@ -128,6 +96,10 @@ pub(crate) trait PostingsWriter {
|
||||
ctx: &mut IndexingContext,
|
||||
) -> UnorderedTermId;
|
||||
|
||||
fn mem_usage(&self) -> usize;
|
||||
|
||||
fn term_map(&self) -> &TermHashMap;
|
||||
|
||||
/// Serializes the postings on disk.
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(
|
||||
@@ -148,7 +120,7 @@ pub(crate) trait PostingsWriter {
|
||||
indexing_position: &mut IndexingPosition,
|
||||
mut term_id_fast_field_writer_opt: Option<&mut MultiValuedFastFieldWriter>,
|
||||
) {
|
||||
let end_of_path_idx = term_buffer.as_slice().len();
|
||||
let end_of_path_idx = term_buffer.value_bytes().len();
|
||||
let mut num_tokens = 0;
|
||||
let mut end_position = 0;
|
||||
token_stream.process(&mut |token: &Token| {
|
||||
@@ -188,6 +160,7 @@ pub(crate) trait PostingsWriter {
|
||||
pub(crate) struct SpecializedPostingsWriter<Rec: Recorder> {
|
||||
total_num_tokens: u64,
|
||||
_recorder_type: PhantomData<Rec>,
|
||||
pub(crate) term_map: TermHashMap,
|
||||
}
|
||||
|
||||
impl<Rec: Recorder> From<SpecializedPostingsWriter<Rec>> for Box<dyn PostingsWriter> {
|
||||
@@ -206,9 +179,10 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
buffer_lender: &mut BufferLender,
|
||||
ctx: &IndexingContext,
|
||||
term_index: &TermHashMap,
|
||||
serializer: &mut FieldSerializer,
|
||||
) -> io::Result<()> {
|
||||
let recorder: Rec = ctx.term_index.read(addr);
|
||||
let recorder: Rec = term_index.read(addr, &ctx.arena_terms);
|
||||
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
|
||||
serializer.new_term(term.value_bytes(), term_doc_freq)?;
|
||||
recorder.serialize(&ctx.arena, doc_id_map, serializer, buffer_lender);
|
||||
@@ -218,6 +192,14 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
|
||||
}
|
||||
|
||||
impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
|
||||
fn mem_usage(&self) -> usize {
|
||||
self.term_map.mem_usage()
|
||||
}
|
||||
|
||||
fn term_map(&self) -> &TermHashMap {
|
||||
&self.term_map
|
||||
}
|
||||
|
||||
fn subscribe(
|
||||
&mut self,
|
||||
doc: DocId,
|
||||
@@ -225,25 +207,30 @@ impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
|
||||
term: &Term,
|
||||
ctx: &mut IndexingContext,
|
||||
) -> UnorderedTermId {
|
||||
debug_assert!(term.as_slice().len() >= 4);
|
||||
//debug_assert!(term.value_bytes().len() >= 1);
|
||||
self.total_num_tokens += 1;
|
||||
let (term_index, arena) = (&mut ctx.term_index, &mut ctx.arena);
|
||||
term_index.mutate_or_create(term.as_slice(), |opt_recorder: Option<Rec>| {
|
||||
if let Some(mut recorder) = opt_recorder {
|
||||
let current_doc = recorder.current_doc();
|
||||
if current_doc != doc {
|
||||
recorder.close_doc(arena);
|
||||
let arena = &mut ctx.arena;
|
||||
let arena_terms = &mut ctx.arena_terms;
|
||||
self.term_map.mutate_or_create(
|
||||
term.value_bytes(),
|
||||
arena_terms,
|
||||
|opt_recorder: Option<Rec>| {
|
||||
if let Some(mut recorder) = opt_recorder {
|
||||
let current_doc = recorder.current_doc();
|
||||
if current_doc != doc {
|
||||
recorder.close_doc(arena);
|
||||
recorder.new_doc(doc, arena);
|
||||
}
|
||||
recorder.record_position(position, arena);
|
||||
recorder
|
||||
} else {
|
||||
let mut recorder = Rec::default();
|
||||
recorder.new_doc(doc, arena);
|
||||
recorder.record_position(position, arena);
|
||||
recorder
|
||||
}
|
||||
recorder.record_position(position, arena);
|
||||
recorder
|
||||
} else {
|
||||
let mut recorder = Rec::default();
|
||||
recorder.new_doc(doc, arena);
|
||||
recorder.record_position(position, arena);
|
||||
recorder
|
||||
}
|
||||
}) as UnorderedTermId
|
||||
},
|
||||
) as UnorderedTermId
|
||||
}
|
||||
|
||||
fn serialize(
|
||||
@@ -255,7 +242,15 @@ impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
|
||||
) -> io::Result<()> {
|
||||
let mut buffer_lender = BufferLender::default();
|
||||
for (term, addr, _) in term_addrs {
|
||||
Self::serialize_one_term(term, *addr, doc_id_map, &mut buffer_lender, ctx, serializer)?;
|
||||
Self::serialize_one_term(
|
||||
term,
|
||||
*addr,
|
||||
doc_id_map,
|
||||
&mut buffer_lender,
|
||||
ctx,
|
||||
&self.term_map,
|
||||
serializer,
|
||||
)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -46,6 +46,7 @@ impl Addr {
|
||||
}
|
||||
|
||||
/// Returns the `Addr` object for `addr + offset`
|
||||
#[inline]
|
||||
pub fn offset(self, offset: u32) -> Addr {
|
||||
Addr(self.0.wrapping_add(offset))
|
||||
}
|
||||
@@ -54,20 +55,24 @@ impl Addr {
|
||||
Addr((page_id << NUM_BITS_PAGE_ADDR | local_addr) as u32)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn page_id(self) -> usize {
|
||||
(self.0 as usize) >> NUM_BITS_PAGE_ADDR
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn page_local_addr(self) -> usize {
|
||||
(self.0 as usize) & (PAGE_SIZE - 1)
|
||||
}
|
||||
|
||||
/// Returns true if and only if the `Addr` is null.
|
||||
#[inline]
|
||||
pub fn is_null(self) -> bool {
|
||||
self.0 == u32::max_value()
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn store<Item: Copy + 'static>(dest: &mut [u8], val: Item) {
|
||||
assert_eq!(dest.len(), std::mem::size_of::<Item>());
|
||||
unsafe {
|
||||
@@ -75,6 +80,7 @@ pub fn store<Item: Copy + 'static>(dest: &mut [u8], val: Item) {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn load<Item: Copy + 'static>(data: &[u8]) -> Item {
|
||||
assert_eq!(data.len(), std::mem::size_of::<Item>());
|
||||
unsafe { ptr::read_unaligned(data.as_ptr() as *const Item) }
|
||||
@@ -110,6 +116,7 @@ impl MemoryArena {
|
||||
self.pages.len() * PAGE_SIZE
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn write_at<Item: Copy + 'static>(&mut self, addr: Addr, val: Item) {
|
||||
let dest = self.slice_mut(addr, std::mem::size_of::<Item>());
|
||||
store(dest, val);
|
||||
@@ -120,6 +127,7 @@ impl MemoryArena {
|
||||
/// # Panics
|
||||
///
|
||||
/// If the address is erroneous
|
||||
#[inline]
|
||||
pub fn read<Item: Copy + 'static>(&self, addr: Addr) -> Item {
|
||||
load(self.slice(addr, mem::size_of::<Item>()))
|
||||
}
|
||||
@@ -128,6 +136,7 @@ impl MemoryArena {
|
||||
self.pages[addr.page_id()].slice(addr.page_local_addr(), len)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn slice_from(&self, addr: Addr) -> &[u8] {
|
||||
self.pages[addr.page_id()].slice_from(addr.page_local_addr())
|
||||
}
|
||||
|
||||
@@ -4,4 +4,4 @@ mod term_hashmap;
|
||||
|
||||
pub(crate) use self::expull::ExpUnrolledLinkedList;
|
||||
pub(crate) use self::memory_arena::{Addr, MemoryArena};
|
||||
pub(crate) use self::term_hashmap::{compute_table_size, TermHashMap};
|
||||
pub(crate) use self::term_hashmap::TermHashMap;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::convert::TryInto;
|
||||
use std::{iter, mem, slice};
|
||||
|
||||
use byteorder::{ByteOrder, NativeEndian};
|
||||
use murmurhash32::murmurhash2;
|
||||
|
||||
use super::{Addr, MemoryArena};
|
||||
@@ -8,13 +8,6 @@ use crate::postings::stacker::memory_arena::store;
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::Term;
|
||||
|
||||
/// Returns the actual memory size in bytes
|
||||
/// required to create a table with a given capacity.
|
||||
/// required to create a table of size
|
||||
pub(crate) fn compute_table_size(capacity: usize) -> usize {
|
||||
capacity * mem::size_of::<KeyValue>()
|
||||
}
|
||||
|
||||
/// `KeyValue` is the item stored in the hash table.
|
||||
/// The key is actually a `BytesRef` object stored in an external memory arena.
|
||||
/// The `value_addr` also points to an address in the memory arena.
|
||||
@@ -36,6 +29,7 @@ impl Default for KeyValue {
|
||||
}
|
||||
|
||||
impl KeyValue {
|
||||
#[inline]
|
||||
fn is_empty(self) -> bool {
|
||||
self.key_value_addr.is_null()
|
||||
}
|
||||
@@ -51,12 +45,17 @@ impl KeyValue {
|
||||
/// or copying the key as long as there is no insert.
|
||||
pub struct TermHashMap {
|
||||
table: Box<[KeyValue]>,
|
||||
memory_arena: MemoryArena,
|
||||
mask: usize,
|
||||
occupied: Vec<usize>,
|
||||
len: usize,
|
||||
}
|
||||
|
||||
impl Default for TermHashMap {
|
||||
fn default() -> Self {
|
||||
Self::new(1 << 10)
|
||||
}
|
||||
}
|
||||
|
||||
struct QuadraticProbing {
|
||||
hash: usize,
|
||||
i: usize,
|
||||
@@ -75,18 +74,21 @@ impl QuadraticProbing {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Iter<'a> {
|
||||
pub struct Iter<'a, 'm> {
|
||||
hashmap: &'a TermHashMap,
|
||||
memory_arena: &'m MemoryArena,
|
||||
inner: slice::Iter<'a, usize>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for Iter<'a> {
|
||||
type Item = (Term<&'a [u8]>, Addr, UnorderedTermId);
|
||||
impl<'a, 'm> Iterator for Iter<'a, 'm> {
|
||||
type Item = (Term<&'m [u8]>, Addr, UnorderedTermId);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.inner.next().cloned().map(move |bucket: usize| {
|
||||
let kv = self.hashmap.table[bucket];
|
||||
let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr);
|
||||
let (key, offset): (&'m [u8], Addr) = self
|
||||
.hashmap
|
||||
.get_key_value(kv.key_value_addr, self.memory_arena);
|
||||
(Term::wrap(key), offset, kv.unordered_term_id)
|
||||
})
|
||||
}
|
||||
@@ -106,21 +108,19 @@ impl TermHashMap {
|
||||
pub(crate) fn new(table_size: usize) -> TermHashMap {
|
||||
assert!(table_size > 0);
|
||||
let table_size_power_of_2 = compute_previous_power_of_two(table_size);
|
||||
let memory_arena = MemoryArena::new();
|
||||
let table: Vec<KeyValue> = iter::repeat(KeyValue::default())
|
||||
.take(table_size_power_of_2)
|
||||
.collect();
|
||||
TermHashMap {
|
||||
table: table.into_boxed_slice(),
|
||||
memory_arena,
|
||||
mask: table_size_power_of_2 - 1,
|
||||
occupied: Vec::with_capacity(table_size_power_of_2 / 2),
|
||||
len: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn read<Item: Copy + 'static>(&self, addr: Addr) -> Item {
|
||||
self.memory_arena.read(addr)
|
||||
pub fn read<Item: Copy + 'static>(&self, addr: Addr, memory_arena: &MemoryArena) -> Item {
|
||||
memory_arena.read(addr)
|
||||
}
|
||||
|
||||
fn probe(&self, hash: u32) -> QuadraticProbing {
|
||||
@@ -129,6 +129,8 @@ impl TermHashMap {
|
||||
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.table.len() * mem::size_of::<KeyValue>()
|
||||
+ self.occupied.len()
|
||||
* std::mem::size_of_val(&self.occupied.get(0).cloned().unwrap_or_default())
|
||||
}
|
||||
|
||||
fn is_saturated(&self) -> bool {
|
||||
@@ -136,16 +138,22 @@ impl TermHashMap {
|
||||
}
|
||||
|
||||
    #[inline]
    fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
        let data = self.memory_arena.slice_from(addr);
        let key_bytes_len = NativeEndian::read_u16(data) as usize;
        let key_bytes: &[u8] = &data[2..][..key_bytes_len];
    fn get_key_value<'m>(&self, addr: Addr, memory_arena: &'m MemoryArena) -> (&'m [u8], Addr) {
        let data = memory_arena.slice_from(addr);
        let (key_bytes_len_enc, data) = data.split_at(2);
        let key_bytes_len: u16 = u16::from_ne_bytes(key_bytes_len_enc.try_into().unwrap());
        let key_bytes: &[u8] = &data[..key_bytes_len as usize];
        (key_bytes, addr.offset(2u32 + key_bytes_len as u32))
    }
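// A standalone sketch (not part of this change): each hash-table entry is laid out in
// the arena as [u16 key length][key bytes][value bytes]; `get_key_value` above decodes
// that layout. (`mutate_or_create` further down writes the length with `to_le_bytes`
// while the reader here uses `from_ne_bytes`; the sketch sticks to little-endian on
// both sides.) The two helpers below are illustrative only.
use std::convert::TryInto;

fn encode_entry(key: &[u8], value: &[u8]) -> Vec<u8> {
    let mut buf = Vec::with_capacity(2 + key.len() + value.len());
    buf.extend_from_slice(&(key.len() as u16).to_le_bytes());
    buf.extend_from_slice(key);
    buf.extend_from_slice(value);
    buf
}

fn decode_entry(data: &[u8]) -> (&[u8], &[u8]) {
    let (len_bytes, rest) = data.split_at(2);
    let key_len = u16::from_le_bytes(len_bytes.try_into().unwrap()) as usize;
    (&rest[..key_len], &rest[key_len..])
}

#[test]
fn entry_round_trip() {
    let encoded = encode_entry(b"term", &42u32.to_ne_bytes());
    let (key, value) = decode_entry(&encoded);
    assert_eq!(key, b"term");
    assert_eq!(u32::from_ne_bytes(value.try_into().unwrap()), 42);
}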
|
||||
#[inline]
|
||||
fn get_value_addr_if_key_match(&self, target_key: &[u8], addr: Addr) -> Option<Addr> {
|
||||
let (stored_key, value_addr) = self.get_key_value(addr);
|
||||
fn get_value_addr_if_key_match(
|
||||
&self,
|
||||
target_key: &[u8],
|
||||
addr: Addr,
|
||||
memory_arena: &mut MemoryArena,
|
||||
) -> Option<Addr> {
|
||||
let (stored_key, value_addr) = self.get_key_value(addr, memory_arena);
|
||||
if stored_key == target_key {
|
||||
Some(value_addr)
|
||||
} else {
|
||||
@@ -169,10 +177,11 @@ impl TermHashMap {
|
||||
self.len
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> Iter<'_> {
|
||||
pub fn iter<'a, 'm>(&'a self, memory_arena: &'m MemoryArena) -> Iter<'a, 'm> {
|
||||
Iter {
|
||||
inner: self.occupied.iter(),
|
||||
hashmap: self,
|
||||
memory_arena,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -209,6 +218,7 @@ impl TermHashMap {
|
||||
pub fn mutate_or_create<V, TMutator>(
|
||||
&mut self,
|
||||
key: &[u8],
|
||||
memory_arena: &mut MemoryArena,
|
||||
mut updater: TMutator,
|
||||
) -> UnorderedTermId
|
||||
where
|
||||
@@ -219,28 +229,33 @@ impl TermHashMap {
|
||||
self.resize();
|
||||
}
|
||||
let hash = murmurhash2(key);
|
||||
|
||||
let mut probe = self.probe(hash);
|
||||
loop {
|
||||
let bucket = probe.next_probe();
|
||||
let kv: KeyValue = self.table[bucket];
|
||||
|
||||
if kv.is_empty() {
|
||||
// The key does not exists yet.
|
||||
let val = updater(None);
|
||||
let num_bytes = std::mem::size_of::<u16>() + key.len() + std::mem::size_of::<V>();
|
||||
let key_addr = self.memory_arena.allocate_space(num_bytes);
|
||||
let key_addr = memory_arena.allocate_space(num_bytes);
|
||||
{
|
||||
let data = self.memory_arena.slice_mut(key_addr, num_bytes);
|
||||
NativeEndian::write_u16(data, key.len() as u16);
|
||||
let stop = 2 + key.len();
|
||||
data[2..stop].copy_from_slice(key);
|
||||
let data = memory_arena.slice_mut(key_addr, num_bytes);
|
||||
let (key_len, data) = data.split_at_mut(2);
|
||||
key_len.copy_from_slice(&(key.len() as u16).to_le_bytes());
|
||||
let stop = key.len();
|
||||
data[..key.len()].copy_from_slice(key);
|
||||
store(&mut data[stop..], val);
|
||||
}
|
||||
return self.set_bucket(hash, key_addr, bucket);
|
||||
} else if kv.hash == hash {
|
||||
if let Some(val_addr) = self.get_value_addr_if_key_match(key, kv.key_value_addr) {
|
||||
let v = self.memory_arena.read(val_addr);
|
||||
if let Some(val_addr) =
|
||||
self.get_value_addr_if_key_match(key, kv.key_value_addr, memory_arena)
|
||||
{
|
||||
let v = memory_arena.read(val_addr);
|
||||
let new_v = updater(Some(v));
|
||||
self.memory_arena.write_at(val_addr, new_v);
|
||||
memory_arena.write_at(val_addr, new_v);
|
||||
return kv.unordered_term_id;
|
||||
}
|
||||
}
|
||||
@@ -254,26 +269,28 @@ mod tests {
|
||||
use std::collections::HashMap;
|
||||
|
||||
use super::{compute_previous_power_of_two, TermHashMap};
|
||||
use crate::postings::stacker::MemoryArena;
|
||||
|
||||
#[test]
|
||||
fn test_hash_map() {
|
||||
let mut arena = MemoryArena::new();
|
||||
let mut hash_map: TermHashMap = TermHashMap::new(1 << 18);
|
||||
hash_map.mutate_or_create(b"abc", |opt_val: Option<u32>| {
|
||||
hash_map.mutate_or_create(b"abc", &mut arena, |opt_val: Option<u32>| {
|
||||
assert_eq!(opt_val, None);
|
||||
3u32
|
||||
});
|
||||
hash_map.mutate_or_create(b"abcd", |opt_val: Option<u32>| {
|
||||
hash_map.mutate_or_create(b"abcd", &mut arena, |opt_val: Option<u32>| {
|
||||
assert_eq!(opt_val, None);
|
||||
4u32
|
||||
});
|
||||
hash_map.mutate_or_create(b"abc", |opt_val: Option<u32>| {
|
||||
hash_map.mutate_or_create(b"abc", &mut arena, |opt_val: Option<u32>| {
|
||||
assert_eq!(opt_val, Some(3u32));
|
||||
5u32
|
||||
});
|
||||
let mut vanilla_hash_map = HashMap::new();
|
||||
let iter_values = hash_map.iter();
|
||||
let iter_values = hash_map.iter(&arena);
|
||||
for (key, addr, _) in iter_values {
|
||||
let val: u32 = hash_map.memory_arena.read(addr);
|
||||
let val: u32 = arena.read(addr);
|
||||
vanilla_hash_map.insert(key.to_owned(), val);
|
||||
}
|
||||
assert_eq!(vanilla_hash_map.len(), 2);
|
||||
|
||||
@@ -184,66 +184,6 @@ fn intersection_with_slop(left: &mut [u32], right: &[u32], slop: u32) -> usize {
|
||||
count
|
||||
}
|
||||
|
||||
fn intersection_count_with_slop(left: &[u32], right: &[u32], slop: u32) -> usize {
|
||||
let mut left_index = 0;
|
||||
let mut right_index = 0;
|
||||
let mut count = 0;
|
||||
let left_len = left.len();
|
||||
let right_len = right.len();
|
||||
while left_index < left_len && right_index < right_len {
|
||||
let left_val = left[left_index];
|
||||
let right_val = right[right_index];
|
||||
let right_slop = if right_val >= slop {
|
||||
right_val - slop
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
if left_val < right_slop {
|
||||
left_index += 1;
|
||||
} else if right_slop <= left_val && left_val <= right_val {
|
||||
while left_index + 1 < left_len {
|
||||
let next_left_val = left[left_index + 1];
|
||||
if next_left_val > right_val {
|
||||
break;
|
||||
}
|
||||
left_index += 1;
|
||||
}
|
||||
count += 1;
|
||||
left_index += 1;
|
||||
right_index += 1;
|
||||
} else if left_val > right_val {
|
||||
right_index += 1;
|
||||
}
|
||||
}
|
||||
count
|
||||
}
|
||||
|
||||
fn intersection_exists_with_slop(left: &[u32], right: &[u32], slop: u32) -> bool {
    let mut left_index = 0;
    let mut right_index = 0;
    let left_len = left.len();
    let right_len = right.len();
    while left_index < left_len && right_index < right_len {
        let left_val = left[left_index];
        let right_val = right[right_index];
        let right_slop = if right_val >= slop {
            right_val - slop
        } else {
            0
        };

        if left_val < right_slop {
            left_index += 1;
        } else if right_slop <= left_val && left_val <= right_val {
            return true;
        } else if left_val > right_val {
            right_index += 1;
        }
    }
    false
}
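// A standalone sketch (not part of this change): both slop helpers above accept a left
// position when it falls in the window [right_val - slop, right_val]. `within_slop`
// below is a hypothetical helper that isolates that acceptance test.
fn within_slop(left_pos: u32, right_pos: u32, slop: u32) -> bool {
    let lower_bound = right_pos.saturating_sub(slop);
    lower_bound <= left_pos && left_pos <= right_pos
}

#[test]
fn slop_window_accepts_nearby_positions() {
    // With slop = 1, a left position one step before the right position matches.
    assert!(within_slop(3, 4, 1));
    // With slop = 0, only equal positions match.
    assert!(!within_slop(3, 4, 0));
    // The left position may not come after the right position.
    assert!(!within_slop(5, 4, 1));
}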
|
||||
impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
pub fn new(
|
||||
term_postings: Vec<(usize, TPostings)>,
|
||||
@@ -297,25 +237,11 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
|
||||
fn phrase_exists(&mut self) -> bool {
|
||||
let intersection_len = self.compute_phrase_match();
|
||||
if self.has_slop() {
|
||||
return intersection_exists_with_slop(
|
||||
&self.left[..intersection_len],
|
||||
&self.right[..],
|
||||
self.slop,
|
||||
);
|
||||
}
|
||||
intersection_exists(&self.left[..intersection_len], &self.right[..])
|
||||
}
|
||||
|
||||
fn compute_phrase_count(&mut self) -> u32 {
|
||||
let intersection_len = self.compute_phrase_match();
|
||||
if self.has_slop() {
|
||||
return intersection_count_with_slop(
|
||||
&self.left[..intersection_len],
|
||||
&self.right[..],
|
||||
self.slop,
|
||||
) as u32;
|
||||
}
|
||||
intersection_count(&self.left[..intersection_len], &self.right[..]) as u32
|
||||
}
|
||||
|
||||
@@ -326,7 +252,12 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
.positions(&mut self.left);
|
||||
}
|
||||
let mut intersection_len = self.left.len();
|
||||
for i in 1..self.num_terms - 1 {
|
||||
let end_term = if self.has_slop() {
|
||||
self.num_terms
|
||||
} else {
|
||||
self.num_terms - 1
|
||||
};
|
||||
for i in 1..end_term {
|
||||
{
|
||||
self.intersection_docset
|
||||
.docset_mut_specialized(i)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{BTreeSet, HashMap};
|
||||
use std::num::{ParseFloatError, ParseIntError};
|
||||
use std::ops::Bound;
|
||||
use std::str::FromStr;
|
||||
@@ -7,9 +7,7 @@ use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInp
|
||||
|
||||
use super::logical_ast::*;
|
||||
use crate::core::Index;
|
||||
use crate::indexer::{
|
||||
convert_to_fast_value_and_get_term, set_string_and_get_terms, JsonTermWriter,
|
||||
};
|
||||
use crate::indexer::JsonTermWriter;
|
||||
use crate::query::{
|
||||
AllQuery, BooleanQuery, BoostQuery, EmptyQuery, Occur, PhraseQuery, Query, RangeQuery,
|
||||
TermQuery,
|
||||
@@ -18,7 +16,7 @@ use crate::schema::{
|
||||
Facet, FacetParseError, Field, FieldType, IndexRecordOption, Schema, Term, Type,
|
||||
};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::time::{OffsetDateTime, UtcOffset};
|
||||
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
|
||||
use crate::{DateTime, Score};
|
||||
|
||||
@@ -32,7 +30,7 @@ pub enum QueryParserError {
|
||||
#[error("Unsupported query: {0}")]
|
||||
UnsupportedQuery(String),
|
||||
/// The query references a field that is not in the schema
|
||||
#[error("Field does not exists: '{0}'")]
|
||||
#[error("Field does not exists: '{0:?}'")]
|
||||
FieldDoesNotExist(String),
|
||||
/// The query contains a term for a `u64` or `i64`-field, but the value
|
||||
/// is neither.
|
||||
@@ -55,11 +53,11 @@ pub enum QueryParserError {
|
||||
NoDefaultFieldDeclared,
|
||||
/// The field searched for is not declared
|
||||
/// as indexed in the schema.
|
||||
#[error("The field '{0}' is not declared as indexed")]
|
||||
#[error("The field '{0:?}' is not declared as indexed")]
|
||||
FieldNotIndexed(String),
|
||||
/// A phrase query was requested for a field that does not
|
||||
/// have any positions indexed.
|
||||
#[error("The field '{0}' does not have positions indexed")]
|
||||
#[error("The field '{0:?}' does not have positions indexed")]
|
||||
FieldDoesNotHavePositionsIndexed(String),
|
||||
/// The tokenizer for the given field is unknown
|
||||
/// The two argument strings are the name of the field, the name of the tokenizer
|
||||
@@ -171,7 +169,7 @@ pub struct QueryParser {
|
||||
conjunction_by_default: bool,
|
||||
tokenizer_manager: TokenizerManager,
|
||||
boost: HashMap<Field, Score>,
|
||||
field_names: HashMap<String, Field>,
|
||||
field_names: BTreeSet<String>,
|
||||
}
|
||||
|
||||
fn all_negative(ast: &LogicalAst) -> bool {
|
||||
@@ -184,31 +182,6 @@ fn all_negative(ast: &LogicalAst) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the position (in byte offsets) of the unescaped '.' in the `field_path`.
//
// This function operates directly on bytes (as opposed to codepoint), relying
// on a encoding property of utf-8 for its correctness.
fn locate_splitting_dots(field_path: &str) -> Vec<usize> {
    let mut splitting_dots_pos = Vec::new();
    let mut escape_state = false;
    for (pos, b) in field_path.bytes().enumerate() {
        if escape_state {
            escape_state = false;
            continue;
        }
        match b {
            b'\\' => {
                escape_state = true;
            }
            b'.' => {
                splitting_dots_pos.push(pos);
            }
            _ => {}
        }
    }
    splitting_dots_pos
}
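// A standalone sketch (not part of this change): the escape handling above skips the
// character following a backslash, so an escaped dot never becomes a split point. The
// expectations mirror the removed `test_locate_splitting_dots` test further down.
fn splitting_dots(field_path: &str) -> Vec<usize> {
    let mut positions = Vec::new();
    let mut escaped = false;
    for (pos, byte) in field_path.bytes().enumerate() {
        if escaped {
            escaped = false;
            continue;
        }
        match byte {
            b'\\' => escaped = true,
            b'.' => positions.push(pos),
            _ => {}
        }
    }
    positions
}

#[test]
fn escaped_dots_are_not_split_points() {
    assert_eq!(splitting_dots("a.b.c"), vec![1, 3]);
    assert_eq!(splitting_dots(r"a\.b.c"), vec![4]);
    assert_eq!(splitting_dots(r"a\..b.c"), vec![3, 5]);
}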
|
||||
impl QueryParser {
|
||||
/// Creates a `QueryParser`, given
|
||||
/// * schema - index Schema
|
||||
@@ -220,7 +193,7 @@ impl QueryParser {
|
||||
) -> QueryParser {
|
||||
let field_names = schema
|
||||
.fields()
|
||||
.map(|(field, field_entry)| (field_entry.name().to_string(), field))
|
||||
.map(|(_, field_entry)| field_entry.name().to_string())
|
||||
.collect();
|
||||
QueryParser {
|
||||
schema,
|
||||
@@ -234,18 +207,25 @@ impl QueryParser {
|
||||
|
||||
    // Splits a full_path as written in a query, into a field name and a
    // json path.
    pub(crate) fn split_full_path<'a>(&self, full_path: &'a str) -> Option<(Field, &'a str)> {
        if let Some(field) = self.field_names.get(full_path) {
            return Some((*field, ""));
    pub(crate) fn split_full_path<'a>(&self, full_path: &'a str) -> (&'a str, &'a str) {
        if full_path.is_empty() {
            return ("", "");
        }
        let mut splitting_period_pos: Vec<usize> = locate_splitting_dots(full_path);
        while let Some(pos) = splitting_period_pos.pop() {
            let (prefix, suffix) = full_path.split_at(pos);
            if let Some(field) = self.field_names.get(prefix) {
                return Some((*field, &suffix[1..]));
        if self.field_names.contains(full_path) {
            return (full_path, "");
        }
        let mut result = ("", full_path);
        let mut cursor = 0;
        while let Some(pos) = full_path[cursor..].find('.') {
            cursor += pos;
            let prefix = &full_path[..cursor];
            let suffix = &full_path[cursor + 1..];
            if self.field_names.contains(prefix) {
                result = (prefix, suffix);
            }
            cursor += 1;
        }
        None
        result
    }
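// A standalone sketch (not part of this change): the rewritten `split_full_path` keeps
// overwriting `result` as longer known prefixes are found, so the longest field name
// wins. `split` below replays that logic over a plain `BTreeSet<String>` of field names.
use std::collections::BTreeSet;

fn split(field_names: &BTreeSet<String>, full_path: &str) -> (String, String) {
    if field_names.contains(full_path) {
        return (full_path.to_string(), String::new());
    }
    let mut result = (String::new(), full_path.to_string());
    let mut cursor = 0;
    while let Some(pos) = full_path[cursor..].find('.') {
        cursor += pos;
        let (prefix, suffix) = (&full_path[..cursor], &full_path[cursor + 1..]);
        if field_names.contains(prefix) {
            result = (prefix.to_string(), suffix.to_string());
        }
        cursor += 1;
    }
    result
}

#[test]
fn longest_field_name_wins() {
    let field_names: BTreeSet<String> =
        ["first", "first.toto"].iter().map(|s| s.to_string()).collect();
    assert_eq!(
        split(&field_names, "first.toto.bubu"),
        ("first.toto".to_string(), "bubu".to_string())
    );
    assert_eq!(
        split(&field_names, "unknown"),
        ("".to_string(), "unknown".to_string())
    );
}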
|
||||
/// Creates a `QueryParser`, given
|
||||
@@ -298,6 +278,12 @@ impl QueryParser {
|
||||
self.compute_logical_ast(user_input_ast)
|
||||
}
|
||||
|
||||
fn resolve_field_name(&self, field_name: &str) -> Result<Field, QueryParserError> {
|
||||
self.schema
|
||||
.get_field(field_name)
|
||||
.ok_or_else(|| QueryParserError::FieldDoesNotExist(String::from(field_name)))
|
||||
}
|
||||
|
||||
fn compute_logical_ast(
|
||||
&self,
|
||||
user_input_ast: UserInputAst,
|
||||
@@ -404,12 +390,6 @@ impl QueryParser {
|
||||
if !field_type.is_indexed() {
|
||||
return Err(QueryParserError::FieldNotIndexed(field_name.to_string()));
|
||||
}
|
||||
if field_type.value_type() != Type::Json && !json_path.is_empty() {
|
||||
let field_name = self.schema.get_field_name(field);
|
||||
return Err(QueryParserError::FieldDoesNotExist(format!(
|
||||
"{field_name}.{json_path}"
|
||||
)));
|
||||
}
|
||||
match *field_type {
|
||||
FieldType::U64(_) => {
|
||||
let val: u64 = u64::from_str(phrase)?;
|
||||
@@ -551,56 +531,37 @@ impl QueryParser {
|
||||
})
|
||||
}
|
||||
|
||||
/// Given a literal, returns the list of terms that should be searched.
|
||||
///
|
||||
/// The terms are identified by a triplet:
|
||||
/// - tantivy field
|
||||
/// - field_path: tantivy has JSON fields. It is possible to target a member of a JSON
|
||||
/// object by naturally extending the json field name with a "." separated field_path
|
||||
/// - field_phrase: the phrase that is being searched.
|
||||
///
|
||||
/// The literal identifies the targetted field by a so-called *full field path*,
|
||||
/// specified before the ":". (e.g. identity.username:fulmicoton).
|
||||
///
|
||||
/// The way we split the full field path into (field_name, field_path) can be ambiguous,
|
||||
/// because field_names can contain "." themselves.
|
||||
// For instance if a field is named `one.two` and another one is named `one`,
|
||||
/// should `one.two:three` target `one.two` with field path `` or or `one` with
|
||||
/// the field path `two`.
|
||||
///
|
||||
/// In this case tantivy, just picks the solution with the longest field name.
|
||||
///
|
||||
/// Quirk: As a hack for quickwit, we do not split over a dot that appear escaped '\.'.
|
||||
fn compute_path_triplets_for_literal<'a>(
|
||||
fn compute_path_triplet_for_literal<'a>(
|
||||
&self,
|
||||
literal: &'a UserInputLiteral,
|
||||
) -> Result<Vec<(Field, &'a str, &'a str)>, QueryParserError> {
|
||||
let full_path = if let Some(full_path) = &literal.field_name {
|
||||
full_path
|
||||
} else {
|
||||
// The user did not specify any path...
|
||||
// We simply target default fields.
|
||||
if self.default_fields.is_empty() {
|
||||
return Err(QueryParserError::NoDefaultFieldDeclared);
|
||||
match &literal.field_name {
|
||||
Some(ref full_path) => {
|
||||
// We need to add terms associated to json default fields.
|
||||
let (field_name, path) = self.split_full_path(full_path);
|
||||
if let Ok(field) = self.resolve_field_name(field_name) {
|
||||
return Ok(vec![(field, path, literal.phrase.as_str())]);
|
||||
}
|
||||
let triplets: Vec<(Field, &str, &str)> = self
|
||||
.default_indexed_json_fields()
|
||||
.map(|json_field| (json_field, full_path.as_str(), literal.phrase.as_str()))
|
||||
.collect();
|
||||
if triplets.is_empty() {
|
||||
return Err(QueryParserError::FieldDoesNotExist(full_path.to_string()));
|
||||
}
|
||||
Ok(triplets)
|
||||
}
|
||||
None => {
|
||||
if self.default_fields.is_empty() {
|
||||
return Err(QueryParserError::NoDefaultFieldDeclared);
|
||||
}
|
||||
Ok(self
|
||||
.default_fields
|
||||
.iter()
|
||||
.map(|default_field| (*default_field, "", literal.phrase.as_str()))
|
||||
.collect::<Vec<(Field, &str, &str)>>())
|
||||
}
|
||||
return Ok(self
|
||||
.default_fields
|
||||
.iter()
|
||||
.map(|default_field| (*default_field, "", literal.phrase.as_str()))
|
||||
.collect::<Vec<(Field, &str, &str)>>());
|
||||
};
|
||||
if let Some((field, path)) = self.split_full_path(full_path) {
|
||||
return Ok(vec![(field, path, literal.phrase.as_str())]);
|
||||
}
|
||||
// We need to add terms associated to json default fields.
|
||||
let triplets: Vec<(Field, &str, &str)> = self
|
||||
.default_indexed_json_fields()
|
||||
.map(|json_field| (json_field, full_path.as_str(), literal.phrase.as_str()))
|
||||
.collect();
|
||||
if triplets.is_empty() {
|
||||
return Err(QueryParserError::FieldDoesNotExist(full_path.to_string()));
|
||||
}
|
||||
Ok(triplets)
|
||||
}
|
||||
|
||||
fn compute_logical_ast_from_leaf(
|
||||
@@ -610,7 +571,7 @@ impl QueryParser {
|
||||
match leaf {
|
||||
UserInputLeaf::Literal(literal) => {
|
||||
let term_phrases: Vec<(Field, &str, &str)> =
|
||||
self.compute_path_triplets_for_literal(&literal)?;
|
||||
self.compute_path_triplet_for_literal(&literal)?;
|
||||
let mut asts: Vec<LogicalAst> = Vec::new();
|
||||
for (field, json_path, phrase) in term_phrases {
|
||||
for ast in self.compute_logical_ast_for_leaf(field, json_path, phrase)? {
|
||||
@@ -637,9 +598,8 @@ impl QueryParser {
|
||||
"Range query need to target a specific field.".to_string(),
|
||||
)
|
||||
})?;
|
||||
let (field, json_path) = self
|
||||
.split_full_path(&full_path)
|
||||
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone()))?;
|
||||
let (field_name, json_path) = self.split_full_path(&full_path);
|
||||
let field = self.resolve_field_name(field_name)?;
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let value_type = field_entry.field_type().value_type();
|
||||
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Range {
|
||||
@@ -700,6 +660,30 @@ fn generate_literals_for_str(
|
||||
Ok(Some(LogicalLiteral::Phrase(terms)))
|
||||
}
|
||||
|
||||
enum NumValue {
    U64(u64),
    I64(i64),
    F64(f64),
    DateTime(OffsetDateTime),
}

fn infer_type_num(phrase: &str) -> Option<NumValue> {
    if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
        let dt_utc = dt.to_offset(UtcOffset::UTC);
        return Some(NumValue::DateTime(dt_utc));
    }
    if let Ok(u64_val) = str::parse::<u64>(phrase) {
        return Some(NumValue::U64(u64_val));
    }
    if let Ok(i64_val) = str::parse::<i64>(phrase) {
        return Some(NumValue::I64(i64_val));
    }
    if let Ok(f64_val) = str::parse::<f64>(phrase) {
        return Some(NumValue::F64(f64_val));
    }
    None
}
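// A standalone sketch (not part of this change): `infer_type_num` tries the most
// specific parse first (an RFC 3339 datetime, then u64, i64, f64); the order matters
// because e.g. "12" parses as all three numeric types. The datetime branch, which
// needs the `time` crate, is omitted below.
#[derive(Debug, PartialEq)]
enum Num {
    U64(u64),
    I64(i64),
    F64(f64),
}

fn infer_num(phrase: &str) -> Option<Num> {
    if let Ok(v) = phrase.parse::<u64>() {
        return Some(Num::U64(v));
    }
    if let Ok(v) = phrase.parse::<i64>() {
        return Some(Num::I64(v));
    }
    if let Ok(v) = phrase.parse::<f64>() {
        return Some(Num::F64(v));
    }
    None
}

#[test]
fn most_specific_numeric_type_wins() {
    assert_eq!(infer_num("12"), Some(Num::U64(12)));
    assert_eq!(infer_num("-3"), Some(Num::I64(-3)));
    assert_eq!(infer_num("1.5"), Some(Num::F64(1.5)));
    assert_eq!(infer_num("hello"), None);
}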
|
||||
fn generate_literals_for_json_object(
|
||||
field_name: &str,
|
||||
field: Field,
|
||||
@@ -710,13 +694,38 @@ fn generate_literals_for_json_object(
|
||||
) -> Result<Vec<LogicalLiteral>, QueryParserError> {
|
||||
let mut logical_literals = Vec::new();
|
||||
let mut term = Term::new();
|
||||
let mut json_term_writer =
|
||||
JsonTermWriter::from_field_and_json_path(field, json_path, &mut term);
|
||||
if let Some(term) = convert_to_fast_value_and_get_term(&mut json_term_writer, phrase) {
|
||||
logical_literals.push(LogicalLiteral::Term(term));
|
||||
term.set_field(Type::Json, field);
|
||||
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
|
||||
for segment in json_path.split('.') {
|
||||
json_term_writer.push_path_segment(segment);
|
||||
}
|
||||
let terms = set_string_and_get_terms(&mut json_term_writer, phrase, text_analyzer);
|
||||
if let Some(num_value) = infer_type_num(phrase) {
|
||||
match num_value {
|
||||
NumValue::U64(u64_val) => {
|
||||
json_term_writer.set_fast_value(u64_val);
|
||||
}
|
||||
NumValue::I64(i64_val) => {
|
||||
json_term_writer.set_fast_value(i64_val);
|
||||
}
|
||||
NumValue::F64(f64_val) => {
|
||||
json_term_writer.set_fast_value(f64_val);
|
||||
}
|
||||
NumValue::DateTime(dt_val) => {
|
||||
json_term_writer.set_fast_value(DateTime::from_utc(dt_val));
|
||||
}
|
||||
}
|
||||
logical_literals.push(LogicalLiteral::Term(json_term_writer.term().clone()));
|
||||
}
|
||||
json_term_writer.close_path_and_set_type(Type::Str);
|
||||
drop(json_term_writer);
|
||||
let term_num_bytes = term.value_bytes().len();
|
||||
let mut token_stream = text_analyzer.token_stream(phrase);
|
||||
let mut terms: Vec<(usize, Term)> = Vec::new();
|
||||
token_stream.process(&mut |token| {
|
||||
term.truncate(term_num_bytes);
|
||||
term.append_bytes(token.text.as_bytes());
|
||||
terms.push((token.position, term.clone()));
|
||||
});
|
||||
if terms.len() <= 1 {
|
||||
for (_, term) in terms {
|
||||
logical_literals.push(LogicalLiteral::Term(term));
|
||||
@@ -1390,56 +1399,29 @@ mod test {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_escaped_field() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_text_field(r#"a\.b"#, STRING);
|
||||
let schema = schema_builder.build();
|
||||
let query_parser = QueryParser::new(schema, Vec::new(), TokenizerManager::default());
|
||||
let query = query_parser.parse_query(r#"a\.b:hello"#).unwrap();
|
||||
assert_eq!(
|
||||
format!("{:?}", query),
|
||||
"TermQuery(Term(type=Str, field=0, \"hello\"))"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_split_full_path() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_text_field("second", STRING);
|
||||
schema_builder.add_text_field("first", STRING);
|
||||
schema_builder.add_text_field("first.toto", STRING);
|
||||
schema_builder.add_text_field("first.toto.titi", STRING);
|
||||
schema_builder.add_text_field("third.a.b.c", STRING);
|
||||
let schema = schema_builder.build();
|
||||
let query_parser =
|
||||
QueryParser::new(schema.clone(), Vec::new(), TokenizerManager::default());
|
||||
let query_parser = QueryParser::new(schema, Vec::new(), TokenizerManager::default());
|
||||
assert_eq!(
|
||||
query_parser.split_full_path("first.toto"),
|
||||
Some((schema.get_field("first.toto").unwrap(), ""))
|
||||
);
|
||||
assert_eq!(
|
||||
query_parser.split_full_path("first.toto.bubu"),
|
||||
Some((schema.get_field("first.toto").unwrap(), "bubu"))
|
||||
);
|
||||
assert_eq!(
|
||||
query_parser.split_full_path("first.toto.titi"),
|
||||
Some((schema.get_field("first.toto.titi").unwrap(), ""))
|
||||
("first.toto", "")
|
||||
);
|
||||
assert_eq!(
|
||||
query_parser.split_full_path("first.titi"),
|
||||
Some((schema.get_field("first").unwrap(), "titi"))
|
||||
("first", "titi")
|
||||
);
|
||||
assert_eq!(query_parser.split_full_path("third"), None);
|
||||
assert_eq!(query_parser.split_full_path("hello.toto"), None);
|
||||
assert_eq!(query_parser.split_full_path(""), None);
|
||||
assert_eq!(query_parser.split_full_path("firsty"), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_locate_splitting_dots() {
|
||||
assert_eq!(&super::locate_splitting_dots("a.b.c"), &[1, 3]);
|
||||
assert_eq!(&super::locate_splitting_dots(r#"a\.b.c"#), &[4]);
|
||||
assert_eq!(&super::locate_splitting_dots(r#"a\..b.c"#), &[3, 5]);
|
||||
assert_eq!(query_parser.split_full_path("third"), ("", "third"));
|
||||
assert_eq!(
|
||||
query_parser.split_full_path("hello.toto"),
|
||||
("", "hello.toto")
|
||||
);
|
||||
assert_eq!(query_parser.split_full_path(""), ("", ""));
|
||||
assert_eq!(query_parser.split_full_path("firsty"), ("", "firsty"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::ops::{Deref, DerefMut};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use crossbeam_channel::{unbounded, Receiver, RecvError, Sender};
|
||||
use crossbeam::channel::{unbounded, Receiver, RecvError, Sender};
|
||||
|
||||
pub struct GenerationItem<T> {
|
||||
generation: usize,
|
||||
@@ -197,7 +197,7 @@ mod tests {
|
||||
|
||||
use std::{iter, mem};
|
||||
|
||||
use crossbeam_channel as channel;
|
||||
use crossbeam::channel;
|
||||
|
||||
use super::{Pool, Queue};
|
||||
|
||||
|
||||
@@ -147,7 +147,7 @@ impl WarmingStateInner {
|
||||
/// Every [GC_INTERVAL] attempt to GC, with panics caught and logged using
|
||||
/// [std::panic::catch_unwind].
|
||||
fn gc_loop(inner: Weak<Mutex<WarmingStateInner>>) {
|
||||
for _ in crossbeam_channel::tick(GC_INTERVAL) {
|
||||
for _ in crossbeam::channel::tick(GC_INTERVAL) {
|
||||
if let Some(inner) = inner.upgrade() {
|
||||
// rely on deterministic gc in tests
|
||||
#[cfg(not(test))]
|
||||
|
||||
@@ -213,8 +213,6 @@ impl BinarySerializable for Document {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use common::BinarySerializable;
|
||||
|
||||
use crate::schema::*;
|
||||
|
||||
#[test]
|
||||
@@ -225,22 +223,4 @@ mod tests {
|
||||
doc.add_text(text_field, "My title");
|
||||
assert_eq!(doc.field_values().len(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_doc_serialization_issue() {
|
||||
let mut doc = Document::default();
|
||||
doc.add_json_object(
|
||||
Field::from_field_id(0),
|
||||
serde_json::json!({"key": 2u64})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
doc.add_text(Field::from_field_id(1), "hello");
|
||||
assert_eq!(doc.field_values().len(), 2);
|
||||
let mut payload: Vec<u8> = Vec::new();
|
||||
doc.serialize(&mut payload).unwrap();
|
||||
assert_eq!(payload.len(), 26);
|
||||
Document::deserialize(&mut &payload[..]).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,7 +17,7 @@ use crate::DateTime;
|
||||
///
|
||||
/// - <value> is, if this is not the json term, a binary representation specific to the type.
|
||||
/// If it is a JSON Term, then it is preprended with the path that leads to this leaf value.
|
||||
const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8;
|
||||
const FAST_VALUE_TERM_LEN: usize = 8;
|
||||
|
||||
/// Separates the different segments of
|
||||
/// the json path.
|
||||
@@ -33,22 +33,33 @@ pub const JSON_END_OF_PATH: u8 = 0u8;
|
||||
///
|
||||
/// It actually wraps a `Vec<u8>`.
|
||||
#[derive(Clone)]
|
||||
pub struct Term<B = Vec<u8>>(B)
|
||||
where B: AsRef<[u8]>;
|
||||
pub struct Term<B = Vec<u8>> {
|
||||
data: B,
|
||||
field: Field,
|
||||
field_type: Type,
|
||||
}
|
||||
|
||||
impl AsMut<Vec<u8>> for Term {
|
||||
fn as_mut(&mut self) -> &mut Vec<u8> {
|
||||
&mut self.0
|
||||
&mut self.data
|
||||
}
|
||||
}
|
||||
|
||||
impl Term {
|
||||
pub(crate) fn new() -> Term {
|
||||
Term(Vec::with_capacity(100))
|
||||
Self::with_capacity(32)
|
||||
}
|
||||
|
||||
pub(crate) fn with_capacity(cap: usize) -> Term {
|
||||
Term {
|
||||
data: Vec::with_capacity(cap),
|
||||
field: Field::from_field_id(0),
|
||||
field_type: Type::Str,
|
||||
}
|
||||
}
|
||||
|
||||
fn from_fast_value<T: FastValue>(field: Field, val: &T) -> Term {
|
||||
let mut term = Term(vec![0u8; FAST_VALUE_TERM_LEN]);
|
||||
let mut term = Term::with_capacity(FAST_VALUE_TERM_LEN);
|
||||
term.set_field(T::to_type(), field);
|
||||
term.set_u64(val.to_u64());
|
||||
term
|
||||
@@ -86,9 +97,9 @@ impl Term {
|
||||
}
|
||||
|
||||
fn create_bytes_term(typ: Type, field: Field, bytes: &[u8]) -> Term {
|
||||
let mut term = Term(vec![0u8; 5 + bytes.len()]);
|
||||
let mut term = Term::with_capacity(bytes.len());
|
||||
term.set_field(typ, field);
|
||||
term.0.extend_from_slice(bytes);
|
||||
term.data.extend_from_slice(bytes);
|
||||
term
|
||||
}
|
||||
|
||||
@@ -98,10 +109,9 @@ impl Term {
|
||||
}
|
||||
|
||||
pub(crate) fn set_field(&mut self, typ: Type, field: Field) {
|
||||
self.0.clear();
|
||||
self.0
|
||||
.extend_from_slice(field.field_id().to_be_bytes().as_ref());
|
||||
self.0.push(typ.to_code());
|
||||
self.field = field;
|
||||
self.field_type = typ;
|
||||
self.data.clear();
|
||||
}
|
||||
|
||||
    /// Sets a u64 value in the term.
@@ -112,11 +122,9 @@ impl Term {
    /// the natural order of the values.
    pub fn set_u64(&mut self, val: u64) {
        self.set_fast_value(val);
        self.set_bytes(val.to_be_bytes().as_ref());
    }

    fn set_fast_value<T: FastValue>(&mut self, val: T) {
        self.0.resize(FAST_VALUE_TERM_LEN, 0u8);
        self.set_bytes(val.to_u64().to_be_bytes().as_ref());
    }
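// A standalone check (not part of this change): big-endian encoding is the monotonic
// mapping mentioned above; comparing the encoded bytes lexicographically gives the
// same order as comparing the u64 values themselves.
#[test]
fn big_endian_bytes_preserve_u64_order() {
    let pairs = [(1u64, 256u64), (0, u64::MAX), (42, 43)];
    for &(small, large) in &pairs {
        assert!(small < large);
        assert!(small.to_be_bytes() < large.to_be_bytes());
    }
    // Little-endian does not have this property: 256 encodes below 1.
    assert!(256u64.to_le_bytes() < 1u64.to_le_bytes());
}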
|
||||
@@ -137,8 +145,8 @@ impl Term {
|
||||
|
||||
/// Sets the value of a `Bytes` field.
|
||||
pub fn set_bytes(&mut self, bytes: &[u8]) {
|
||||
self.0.resize(5, 0u8);
|
||||
self.0.extend(bytes);
|
||||
self.data.clear();
|
||||
self.data.extend(bytes);
|
||||
}
|
||||
|
||||
/// Set the texts only, keeping the field untouched.
|
||||
@@ -148,18 +156,18 @@ impl Term {
|
||||
|
||||
/// Removes the value_bytes and set the type code.
|
||||
pub fn clear_with_type(&mut self, typ: Type) {
|
||||
self.truncate(5);
|
||||
self.0[4] = typ.to_code();
|
||||
self.data.clear();
|
||||
self.field_type = typ;
|
||||
}
|
||||
|
||||
/// Truncate the term right after the field and the type code.
|
||||
pub fn truncate(&mut self, len: usize) {
|
||||
self.0.truncate(len);
|
||||
self.data.truncate(len);
|
||||
}
|
||||
|
||||
/// Truncate the term right after the field and the type code.
|
||||
pub fn append_bytes(&mut self, bytes: &[u8]) {
|
||||
self.0.extend_from_slice(bytes);
|
||||
self.data.extend_from_slice(bytes);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -167,7 +175,7 @@ impl<B> Ord for Term<B>
|
||||
where B: AsRef<[u8]>
|
||||
{
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
self.as_slice().cmp(other.as_slice())
|
||||
self.value_bytes().cmp(other.value_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -183,7 +191,7 @@ impl<B> PartialEq for Term<B>
|
||||
where B: AsRef<[u8]>
|
||||
{
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.as_slice() == other.as_slice()
|
||||
self.value_bytes() == other.value_bytes()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -193,7 +201,7 @@ impl<B> Hash for Term<B>
|
||||
where B: AsRef<[u8]>
|
||||
{
|
||||
fn hash<H: Hasher>(&self, state: &mut H) {
|
||||
self.0.as_ref().hash(state)
|
||||
self.data.as_ref().hash(state)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -202,14 +210,15 @@ where B: AsRef<[u8]>
|
||||
{
|
||||
/// Wraps a object holding bytes
|
||||
pub fn wrap(data: B) -> Term<B> {
|
||||
Term(data)
|
||||
Term {
|
||||
data,
|
||||
field: Field::from_field_id(0),
|
||||
field_type: Type::Str,
|
||||
}
|
||||
}
|
||||
|
||||
fn typ_code(&self) -> u8 {
|
||||
*self
|
||||
.as_slice()
|
||||
.get(4)
|
||||
.expect("the byte representation is too short")
|
||||
self.field_type as u8
|
||||
}
|
||||
|
||||
/// Return the type of the term.
|
||||
@@ -219,55 +228,7 @@ where B: AsRef<[u8]>
|
||||
|
||||
/// Returns the field.
|
||||
pub fn field(&self) -> Field {
|
||||
let mut field_id_bytes = [0u8; 4];
|
||||
field_id_bytes.copy_from_slice(&self.0.as_ref()[..4]);
|
||||
Field::from_field_id(u32::from_be_bytes(field_id_bytes))
|
||||
}
|
||||
|
||||
/// Returns the `u64` value stored in a term.
|
||||
///
|
||||
/// Returns None if the term is not of the u64 type, or if the term byte representation
|
||||
/// is invalid.
|
||||
pub fn as_u64(&self) -> Option<u64> {
|
||||
self.get_fast_type::<u64>()
|
||||
}
|
||||
|
||||
fn get_fast_type<T: FastValue>(&self) -> Option<T> {
|
||||
if self.typ() != T::to_type() {
|
||||
return None;
|
||||
}
|
||||
let mut value_bytes = [0u8; 8];
|
||||
let bytes = self.value_bytes();
|
||||
if bytes.len() != 8 {
|
||||
return None;
|
||||
}
|
||||
value_bytes.copy_from_slice(self.value_bytes());
|
||||
let value_u64 = u64::from_be_bytes(value_bytes);
|
||||
Some(FastValue::from_u64(value_u64))
|
||||
}
|
||||
|
||||
/// Returns the `i64` value stored in a term.
|
||||
///
|
||||
/// Returns None if the term is not of the i64 type, or if the term byte representation
|
||||
/// is invalid.
|
||||
pub fn as_i64(&self) -> Option<i64> {
|
||||
self.get_fast_type::<i64>()
|
||||
}
|
||||
|
||||
/// Returns the `f64` value stored in a term.
|
||||
///
|
||||
/// Returns None if the term is not of the f64 type, or if the term byte representation
|
||||
/// is invalid.
|
||||
pub fn as_f64(&self) -> Option<f64> {
|
||||
self.get_fast_type::<f64>()
|
||||
}
|
||||
|
||||
/// Returns the `Date` value stored in a term.
|
||||
///
|
||||
/// Returns None if the term is not of the Date type, or if the term byte representation
|
||||
/// is invalid.
|
||||
pub fn as_date(&self) -> Option<DateTime> {
|
||||
self.get_fast_type::<DateTime>()
|
||||
self.field
|
||||
}
|
||||
|
||||
/// Returns the text associated with the term.
|
||||
@@ -275,43 +236,12 @@ where B: AsRef<[u8]>
|
||||
/// Returns None if the field is not of string type
|
||||
/// or if the bytes are not valid utf-8.
|
||||
pub fn as_str(&self) -> Option<&str> {
|
||||
if self.as_slice().len() < 5 {
|
||||
return None;
|
||||
}
|
||||
if self.typ() != Type::Str {
|
||||
return None;
|
||||
}
|
||||
str::from_utf8(self.value_bytes()).ok()
|
||||
}
|
||||
|
||||
/// Returns the facet associated with the term.
|
||||
///
|
||||
/// Returns None if the field is not of facet type
|
||||
/// or if the bytes are not valid utf-8.
|
||||
pub fn as_facet(&self) -> Option<Facet> {
|
||||
if self.as_slice().len() < 5 {
|
||||
return None;
|
||||
}
|
||||
if self.typ() != Type::Facet {
|
||||
return None;
|
||||
}
|
||||
let facet_encode_str = str::from_utf8(self.value_bytes()).ok()?;
|
||||
Some(Facet::from_encoded_string(facet_encode_str.to_string()))
|
||||
}
|
||||
|
||||
/// Returns the bytes associated with the term.
|
||||
///
|
||||
/// Returns None if the field is not of bytes type.
|
||||
pub fn as_bytes(&self) -> Option<&[u8]> {
|
||||
if self.as_slice().len() < 5 {
|
||||
return None;
|
||||
}
|
||||
if self.typ() != Type::Bytes {
|
||||
return None;
|
||||
}
|
||||
Some(self.value_bytes())
|
||||
}
|
||||
|
||||
/// Returns the serialized value of the term.
|
||||
/// (this does not include the field.)
|
||||
///
|
||||
@@ -319,15 +249,7 @@ where B: AsRef<[u8]>
|
||||
/// If the term is a u64, its value is encoded according
|
||||
/// to `byteorder::LittleEndian`.
|
||||
pub fn value_bytes(&self) -> &[u8] {
|
||||
&self.0.as_ref()[5..]
|
||||
}
|
||||
|
||||
/// Returns the underlying `&[u8]`.
|
||||
///
|
||||
/// Do NOT rely on this byte representation in the index.
|
||||
/// This value is likely to change in the future.
|
||||
pub(crate) fn as_slice(&self) -> &[u8] {
|
||||
self.0.as_ref()
|
||||
&self.data.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -434,7 +356,6 @@ mod tests {
|
||||
let term = Term::from_field_u64(count_field, 983u64);
|
||||
assert_eq!(term.field(), count_field);
|
||||
assert_eq!(term.typ(), Type::U64);
|
||||
assert_eq!(term.as_slice().len(), super::FAST_VALUE_TERM_LEN);
|
||||
assert_eq!(term.as_u64(), Some(983u64))
|
||||
assert_eq!(term.value_bytes().len(), super::FAST_VALUE_TERM_LEN);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -42,11 +42,6 @@ impl TextOptions {
|
||||
/// Text fast fields will have the term ids stored in the fast field.
|
||||
/// The fast field will be a multivalued fast field.
|
||||
///
|
||||
/// The effective cardinality depends on the tokenizer. When creating fast fields on text
|
||||
/// fields it is recommended to use the "raw" tokenizer, since it will store the original text
|
||||
/// unchanged. The "default" tokenizer will store the terms as lower case and this will be
|
||||
/// reflected in the dictionary.
|
||||
///
|
||||
/// The original text can be retrieved via `ord_to_term` from the dictionary.
|
||||
#[must_use]
|
||||
pub fn set_fast(mut self) -> TextOptions {
|
||||
|
||||
@@ -388,16 +388,8 @@ mod binary_serialize {
|
||||
}
|
||||
}
|
||||
JSON_OBJ_CODE => {
|
||||
// As explained in
|
||||
// https://docs.serde.rs/serde_json/fn.from_reader.html
|
||||
//
|
||||
// `T::from_reader(..)` expects EOF after reading the object,
|
||||
// which is not what we want here.
|
||||
//
|
||||
// For this reason we need to create our own `Deserializer`.
|
||||
let mut de = serde_json::Deserializer::from_reader(reader);
|
||||
let json_map = <serde_json::Map::<String, serde_json::Value> as serde::Deserialize>::deserialize(&mut de)?;
|
||||
Ok(Value::JsonObject(json_map))
|
||||
let map = serde_json::from_reader(reader)?;
|
||||
Ok(Value::JsonObject(map))
|
||||
}
|
||||
_ => Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
|
||||
@@ -1,50 +0,0 @@
|
||||
use std::io;
|
||||
|
||||
use zstd::bulk::{compress_to_buffer, decompress_to_buffer};
|
||||
use zstd::DEFAULT_COMPRESSION_LEVEL;
|
||||
|
||||
#[inline]
|
||||
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
|
||||
let count_size = std::mem::size_of::<u32>();
|
||||
let max_size = zstd::zstd_safe::compress_bound(uncompressed.len()) + count_size;
|
||||
|
||||
compressed.clear();
|
||||
compressed.resize(max_size, 0);
|
||||
|
||||
let compressed_size = compress_to_buffer(
|
||||
uncompressed,
|
||||
&mut compressed[count_size..],
|
||||
DEFAULT_COMPRESSION_LEVEL,
|
||||
)?;
|
||||
|
||||
compressed[0..count_size].copy_from_slice(&(uncompressed.len() as u32).to_le_bytes());
|
||||
compressed.resize(compressed_size + count_size, 0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
    let count_size = std::mem::size_of::<u32>();
    let uncompressed_size = u32::from_le_bytes(
        compressed
            .get(..count_size)
            .ok_or(io::ErrorKind::InvalidData)?
            .try_into()
            .unwrap(),
    ) as usize;

    decompressed.clear();
    decompressed.resize(uncompressed_size, 0);

    let decompressed_size = decompress_to_buffer(&compressed[count_size..], decompressed)?;

    if decompressed_size != uncompressed_size {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "doc store block not completely decompressed, data corruption".to_string(),
        ));
    }

    Ok(())
}
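// A standalone sketch (not part of this change): the removed helpers above prepend the
// uncompressed length as a little-endian u32 so decompression can size its buffer up
// front. The round trip below assumes those helpers and the zstd-compression feature.
#[cfg(test)]
mod zstd_block_tests {
    use std::convert::TryInto;

    use super::{compress, decompress};

    #[test]
    fn round_trip_restores_original_bytes() {
        let payload: Vec<u8> = (0..1024u32).flat_map(|i| i.to_le_bytes()).collect();
        let mut compressed = Vec::new();
        compress(&payload, &mut compressed).unwrap();
        // The first four bytes store the uncompressed length, little-endian.
        let stored_len = u32::from_le_bytes(compressed[..4].try_into().unwrap()) as usize;
        assert_eq!(stored_len, payload.len());
        let mut decompressed = Vec::new();
        decompress(&compressed, &mut decompressed).unwrap();
        assert_eq!(decompressed, payload);
    }
}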
@@ -26,9 +26,6 @@ pub enum Compressor {
|
||||
#[serde(rename = "snappy")]
|
||||
/// Use the snap compressor
|
||||
Snappy,
|
||||
#[serde(rename = "zstd")]
|
||||
/// Use the zstd compressor
|
||||
Zstd,
|
||||
}
|
||||
|
||||
impl Default for Compressor {
|
||||
@@ -39,8 +36,6 @@ impl Default for Compressor {
|
||||
Compressor::Brotli
|
||||
} else if cfg!(feature = "snappy-compression") {
|
||||
Compressor::Snappy
|
||||
} else if cfg!(feature = "zstd-compression") {
|
||||
Compressor::Zstd
|
||||
} else {
|
||||
Compressor::None
|
||||
}
|
||||
@@ -54,7 +49,6 @@ impl Compressor {
|
||||
1 => Compressor::Lz4,
|
||||
2 => Compressor::Brotli,
|
||||
3 => Compressor::Snappy,
|
||||
4 => Compressor::Zstd,
|
||||
_ => panic!("unknown compressor id {:?}", id),
|
||||
}
|
||||
}
|
||||
@@ -64,7 +58,6 @@ impl Compressor {
|
||||
Self::Lz4 => 1,
|
||||
Self::Brotli => 2,
|
||||
Self::Snappy => 3,
|
||||
Self::Zstd => 4,
|
||||
}
|
||||
}
|
||||
#[inline]
|
||||
@@ -105,16 +98,6 @@ impl Compressor {
|
||||
panic!("snappy-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
Self::Zstd => {
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
{
|
||||
super::compression_zstd_block::compress(uncompressed, compressed)
|
||||
}
|
||||
#[cfg(not(feature = "zstd-compression"))]
|
||||
{
|
||||
panic!("zstd-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -160,16 +143,6 @@ impl Compressor {
|
||||
panic!("snappy-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
Self::Zstd => {
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
{
|
||||
super::compression_zstd_block::decompress(compressed, decompressed)
|
||||
}
|
||||
#[cfg(not(feature = "zstd-compression"))]
|
||||
{
|
||||
panic!("zstd-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,9 +50,6 @@ mod compression_brotli;
|
||||
#[cfg(feature = "snappy-compression")]
|
||||
mod compression_snap;
|
||||
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
mod compression_zstd_block;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
@@ -72,13 +69,10 @@ pub mod tests {
|
||||
sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt \
|
||||
mollit anim id est laborum.";
|
||||
|
||||
const BLOCK_SIZE: usize = 16_384;
|
||||
|
||||
pub fn write_lorem_ipsum_store(
|
||||
writer: WritePtr,
|
||||
num_docs: usize,
|
||||
compressor: Compressor,
|
||||
blocksize: usize,
|
||||
) -> Schema {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored());
|
||||
@@ -86,7 +80,7 @@ pub mod tests {
|
||||
schema_builder.add_text_field("title", TextOptions::default().set_stored());
|
||||
let schema = schema_builder.build();
|
||||
{
|
||||
let mut store_writer = StoreWriter::new(writer, compressor, blocksize);
|
||||
let mut store_writer = StoreWriter::new(writer, compressor);
|
||||
for i in 0..num_docs {
|
||||
let mut doc = Document::default();
|
||||
doc.add_field_value(field_body, LOREM.to_string());
|
||||
@@ -109,7 +103,7 @@ pub mod tests {
|
||||
let path = Path::new("store");
|
||||
let directory = RamDirectory::create();
|
||||
let store_wrt = directory.open_write(path)?;
|
||||
let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, Compressor::Lz4, BLOCK_SIZE);
|
||||
let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, Compressor::Lz4);
|
||||
let field_title = schema.get_field("title").unwrap();
|
||||
let store_file = directory.open_read(path)?;
|
||||
let store = StoreReader::open(store_file)?;
|
||||
@@ -145,11 +139,11 @@ pub mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn test_store(compressor: Compressor, blocksize: usize) -> crate::Result<()> {
|
||||
fn test_store(compressor: Compressor) -> crate::Result<()> {
|
||||
let path = Path::new("store");
|
||||
let directory = RamDirectory::create();
|
||||
let store_wrt = directory.open_write(path)?;
|
||||
let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, compressor, blocksize);
|
||||
let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, compressor);
|
||||
let field_title = schema.get_field("title").unwrap();
|
||||
let store_file = directory.open_read(path)?;
|
||||
let store = StoreReader::open(store_file)?;
|
||||
@@ -175,28 +169,22 @@ pub mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_store_noop() -> crate::Result<()> {
|
||||
test_store(Compressor::None, BLOCK_SIZE)
|
||||
test_store(Compressor::None)
|
||||
}
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
#[test]
|
||||
fn test_store_lz4_block() -> crate::Result<()> {
|
||||
test_store(Compressor::Lz4, BLOCK_SIZE)
|
||||
test_store(Compressor::Lz4)
|
||||
}
|
||||
#[cfg(feature = "snappy-compression")]
|
||||
#[test]
|
||||
fn test_store_snap() -> crate::Result<()> {
|
||||
test_store(Compressor::Snappy, BLOCK_SIZE)
|
||||
test_store(Compressor::Snappy)
|
||||
}
|
||||
#[cfg(feature = "brotli-compression")]
|
||||
#[test]
|
||||
fn test_store_brotli() -> crate::Result<()> {
|
||||
test_store(Compressor::Brotli, BLOCK_SIZE)
|
||||
}
|
||||
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
#[test]
|
||||
fn test_store_zstd() -> crate::Result<()> {
|
||||
test_store(Compressor::Zstd, BLOCK_SIZE)
|
||||
test_store(Compressor::Brotli)
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -360,7 +348,6 @@ mod bench {
|
||||
directory.open_write(path).unwrap(),
|
||||
1_000,
|
||||
Compressor::default(),
|
||||
16_384,
|
||||
);
|
||||
directory.delete(path).unwrap();
|
||||
});
|
||||
@@ -374,7 +361,6 @@ mod bench {
|
||||
directory.open_write(path).unwrap(),
|
||||
1_000,
|
||||
Compressor::default(),
|
||||
16_384,
|
||||
);
|
||||
let store_file = directory.open_read(path).unwrap();
|
||||
let store = StoreReader::open(store_file).unwrap();
|
||||
|
||||
@@ -304,8 +304,6 @@ mod tests {
     use crate::store::tests::write_lorem_ipsum_store;
     use crate::Directory;

-    const BLOCK_SIZE: usize = 16_384;
-
     fn get_text_field<'a>(doc: &'a Document, field: &'a Field) -> Option<&'a str> {
         doc.get_first(*field).and_then(|f| f.as_text())
     }
@@ -315,7 +313,7 @@ mod tests {
         let directory = RamDirectory::create();
         let path = Path::new("store");
         let writer = directory.open_write(path)?;
-        let schema = write_lorem_ipsum_store(writer, 500, Compressor::default(), BLOCK_SIZE);
+        let schema = write_lorem_ipsum_store(writer, 500, Compressor::default());
         let title = schema.get_field("title").unwrap();
         let store_file = directory.open_read(path)?;
         let store = StoreReader::open(store_file)?;

@@ -11,6 +11,8 @@ use crate::schema::Document;
 use crate::store::index::Checkpoint;
 use crate::DocId;

+const BLOCK_SIZE: usize = 16_384;
+
 /// Write tantivy's [`Store`](./index.html)
 ///
 /// Contrary to the other components of `tantivy`,
@@ -20,7 +22,6 @@ use crate::DocId;
 /// The skip list index on the other hand, is built in memory.
 pub struct StoreWriter {
     compressor: Compressor,
-    block_size: usize,
     doc: DocId,
     first_doc_in_block: DocId,
     offset_index_writer: SkipIndexBuilder,
@@ -34,10 +35,9 @@ impl StoreWriter {
     ///
     /// The store writer will writes blocks on disc as
     /// document are added.
-    pub fn new(writer: WritePtr, compressor: Compressor, block_size: usize) -> StoreWriter {
+    pub fn new(writer: WritePtr, compressor: Compressor) -> StoreWriter {
         StoreWriter {
             compressor,
-            block_size,
             doc: 0,
             first_doc_in_block: 0,
             offset_index_writer: SkipIndexBuilder::new(),
@@ -65,7 +65,7 @@ impl StoreWriter {
         VInt(doc_num_bytes as u64).serialize(&mut self.current_block)?;
         self.current_block.write_all(serialized_document)?;
         self.doc += 1;
-        if self.current_block.len() > self.block_size {
+        if self.current_block.len() > BLOCK_SIZE {
             self.write_and_compress_block()?;
         }
         Ok(())
@@ -86,7 +86,7 @@ impl StoreWriter {
         self.current_block
             .write_all(&self.intermediary_buffer[..])?;
         self.doc += 1;
-        if self.current_block.len() > self.block_size {
+        if self.current_block.len() > BLOCK_SIZE {
             self.write_and_compress_block()?;
         }
         Ok(())

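The hunks above trade the per-writer `block_size` field for the module-level `BLOCK_SIZE` constant. To make the doc comment's block-buffered scheme concrete, here is a minimal, self-contained sketch of the idea: documents are length-prefixed into an in-memory block, and the block is flushed once it grows past `BLOCK_SIZE`. This is not tantivy's actual `StoreWriter`; `BlockWriter`, `store_bytes` and `flush_block` are illustrative names, and compression plus the skip index are only hinted at in comments.

```rust
use std::io::{self, Write};

const BLOCK_SIZE: usize = 16_384;

/// Illustrative stand-in for a block-buffered store writer.
struct BlockWriter<W: Write> {
    writer: W,
    current_block: Vec<u8>,
}

impl<W: Write> BlockWriter<W> {
    fn new(writer: W) -> Self {
        BlockWriter { writer, current_block: Vec::new() }
    }

    /// Append one serialized document; flush once the block exceeds BLOCK_SIZE.
    fn store_bytes(&mut self, serialized_document: &[u8]) -> io::Result<()> {
        // Length-prefix the document so block decoding can find document boundaries
        // (the real code uses a VInt, a fixed u64 keeps the sketch short).
        self.current_block
            .extend_from_slice(&(serialized_document.len() as u64).to_le_bytes());
        self.current_block.extend_from_slice(serialized_document);
        if self.current_block.len() > BLOCK_SIZE {
            self.flush_block()?;
        }
        Ok(())
    }

    fn flush_block(&mut self) -> io::Result<()> {
        // Real code would compress `current_block` here and record a checkpoint
        // (first doc id, byte offset) in the in-memory skip index.
        self.writer.write_all(&self.current_block)?;
        self.current_block.clear();
        Ok(())
    }
}

fn main() -> io::Result<()> {
    let mut w = BlockWriter::new(Vec::new());
    for _ in 0..1_000 {
        w.store_bytes(b"some serialized document")?;
    }
    w.flush_block()?;
    println!("wrote {} bytes", w.writer.len());
    Ok(())
}
```
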
@@ -28,6 +28,7 @@ use fst_termdict as termdict;
 mod sstable_termdict;
 #[cfg(feature = "quickwit")]
 use sstable_termdict as termdict;
+use tantivy_fst::automaton::AlwaysMatch;

 #[cfg(test)]
 mod tests;
@@ -35,4 +36,24 @@ mod tests;
 /// Position of the term in the sorted list of terms.
 pub type TermOrdinal = u64;

-pub use self::termdict::{TermDictionary, TermDictionaryBuilder, TermMerger, TermStreamer};
+/// The term dictionary contains all of the terms in
+/// `tantivy index` in a sorted manner.
+pub type TermDictionary = self::termdict::TermDictionary;
+
+/// Builder for the new term dictionary.
+///
+/// Inserting must be done in the order of the `keys`.
+pub type TermDictionaryBuilder<W> = self::termdict::TermDictionaryBuilder<W>;
+
+/// Given a list of sorted term streams,
+/// returns an iterator over sorted unique terms.
+///
+/// The item yield is actually a pair with
+/// - the term
+/// - a slice with the ordinal of the segments containing
+/// the terms.
+pub type TermMerger<'a> = self::termdict::TermMerger<'a>;
+
+/// `TermStreamer` acts as a cursor over a range of terms of a segment.
+/// Terms are guaranteed to be sorted.
+pub type TermStreamer<'a, A = AlwaysMatch> = self::termdict::TermStreamer<'a, A>;

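The `TermMerger` alias above is the k-way merge primitive: given several sorted term streams, it yields each distinct term once, together with the ordinals of the segments that contain it. Below is a minimal sketch of that merge idea over plain sorted slices; it is illustrative only, does not use tantivy's `TermStreamer` API, and assumes each input list is already sorted and deduplicated.

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

/// Merge sorted, deduplicated key lists and report, for every unique key,
/// the ordinals of the lists that contain it.
fn merge_sorted<'a>(segments: &[Vec<&'a [u8]>]) -> Vec<(&'a [u8], Vec<usize>)> {
    // Min-heap of (key, segment ordinal, position inside that segment).
    let mut heap = BinaryHeap::new();
    for (ord, keys) in segments.iter().enumerate() {
        if let Some(&key) = keys.first() {
            heap.push(Reverse((key, ord, 0usize)));
        }
    }
    let mut merged: Vec<(&[u8], Vec<usize>)> = Vec::new();
    while let Some(Reverse((key, ord, pos))) = heap.pop() {
        // Group equal keys: equal keys pop consecutively from the heap.
        let same_as_last = matches!(merged.last(), Some((last, _)) if *last == key);
        if same_as_last {
            merged.last_mut().unwrap().1.push(ord);
        } else {
            merged.push((key, vec![ord]));
        }
        // Advance the stream the popped key came from.
        if let Some(&next) = segments[ord].get(pos + 1) {
            heap.push(Reverse((next, ord, pos + 1)));
        }
    }
    merged
}

fn main() {
    let seg0: Vec<&[u8]> = vec![&b"apple"[..], &b"banana"[..]];
    let seg1: Vec<&[u8]> = vec![&b"banana"[..], &b"cherry"[..]];
    for (term, ords) in merge_sorted(&[seg0, seg1]) {
        println!("{:?} -> segments {:?}", std::str::from_utf8(term).unwrap(), ords);
    }
}
```
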
@@ -145,12 +145,6 @@
     }

     pub fn write_key(&mut self, key: &[u8]) {
-        // If this is the first key in the block, we use it to
-        // shorten the last term in the last block.
-        if self.first_ordinal_of_the_block == self.num_terms {
-            self.index_builder
-                .shorten_last_block_key_given_next_key(key);
-        }
         let keep_len = common_prefix_len(&self.previous_key, key);
         let add_len = key.len() - keep_len;
         let increasing_keys = add_len > 0 && (self.previous_key.len() == keep_len)
@@ -279,12 +273,11 @@ mod test {
                 33u8, 18u8, 19u8, // keep 1 push 1 | 20
                 17u8, 20u8, 0u8, 0u8, 0u8, 0u8, // no more blocks
                 // index
-                161, 102, 98, 108, 111, 99, 107, 115, 129, 162, 115, 108, 97, 115, 116, 95, 107,
-                101, 121, 95, 111, 114, 95, 103, 114, 101, 97, 116, 101, 114, 130, 17, 20, 106, 98,
-                108, 111, 99, 107, 95, 97, 100, 100, 114, 162, 106, 98, 121, 116, 101, 95, 114, 97,
-                110, 103, 101, 162, 101, 115, 116, 97, 114, 116, 0, 99, 101, 110, 100, 11, 109,
-                102, 105, 114, 115, 116, 95, 111, 114, 100, 105, 110, 97, 108, 0, 15, 0, 0, 0, 0,
-                0, 0, 0, // offset for the index
+                161, 102, 98, 108, 111, 99, 107, 115, 129, 162, 104, 108, 97, 115, 116, 95, 107,
+                101, 121, 130, 17, 20, 106, 98, 108, 111, 99, 107, 95, 97, 100, 100, 114, 162, 106,
+                98, 121, 116, 101, 95, 114, 97, 110, 103, 101, 162, 101, 115, 116, 97, 114, 116, 0,
+                99, 101, 110, 100, 11, 109, 102, 105, 114, 115, 116, 95, 111, 114, 100, 105, 110,
+                97, 108, 0, 15, 0, 0, 0, 0, 0, 0, 0, // offset for the index
                 3u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8 // num terms
             ]
         );

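The `keep_len`/`add_len` bookkeeping in `write_key` above is plain front coding: each key is stored as the length of the prefix it shares with the previous key plus the remaining suffix bytes. The sketch below restates that encoding on its own; it is independent of tantivy's actual block format and value writers.

```rust
/// Length of the common prefix of two byte strings.
fn common_prefix_len(left: &[u8], right: &[u8]) -> usize {
    left.iter().zip(right).take_while(|(a, b)| a == b).count()
}

/// Front-code a sorted list of keys into (keep_len, suffix) pairs.
fn front_code<'a>(keys: &[&'a [u8]]) -> Vec<(usize, &'a [u8])> {
    let mut previous: &[u8] = b"";
    let mut out = Vec::new();
    for &key in keys {
        let keep_len = common_prefix_len(previous, key);
        out.push((keep_len, &key[keep_len..]));
        previous = key;
    }
    out
}

fn main() {
    let keys: Vec<&[u8]> = vec![&b"block"[..], &b"blocked"[..], &b"blocks"[..], &b"bloom"[..]];
    for (keep, suffix) in front_code(&keys) {
        println!("keep {} push {:?}", keep, std::str::from_utf8(suffix).unwrap());
    }
    // Prints: keep 0 push "block", keep 5 push "ed", keep 5 push "s", keep 3 push "om"
}
```
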
@@ -4,7 +4,6 @@ use std::ops::Range;
 use serde::{Deserialize, Serialize};

 use crate::error::DataCorruption;
-use crate::termdict::sstable_termdict::sstable::common_prefix_len;

 #[derive(Default, Debug, Serialize, Deserialize)]
 pub struct SSTableIndex {
@@ -20,7 +19,7 @@ impl SSTableIndex {
     pub fn search(&self, key: &[u8]) -> Option<BlockAddr> {
         self.blocks
             .iter()
-            .find(|block| &block.last_key_or_greater[..] >= key)
+            .find(|block| &block.last_key[..] >= key)
             .map(|block| block.block_addr.clone())
     }
 }
@@ -33,10 +32,7 @@ pub struct BlockAddr {

 #[derive(Debug, Serialize, Deserialize)]
 struct BlockMeta {
-    /// Any byte string that is lexicographically greater or equal to
-    /// the last key in the block,
-    /// and yet stricly smaller than the first key in the next block.
-    pub last_key_or_greater: Vec<u8>,
+    pub last_key: Vec<u8>,
     pub block_addr: BlockAddr,
 }

@@ -45,39 +41,10 @@ pub struct SSTableIndexBuilder {
     index: SSTableIndex,
 }

-/// Given that left < right,
-/// mutates `left into a shorter byte string left'` that
-/// matches `left <= left' < right`.
-fn find_shorter_str_in_between(left: &mut Vec<u8>, right: &[u8]) {
-    assert!(&left[..] < right);
-    let common_len = common_prefix_len(&left, right);
-    if left.len() == common_len {
-        return;
-    }
-    // It is possible to do one character shorter in some case,
-    // but it is not worth the extra complexity
-    for pos in (common_len + 1)..left.len() {
-        if left[pos] != u8::MAX {
-            left[pos] += 1;
-            left.truncate(pos + 1);
-            return;
-        }
-    }
-}
-
 impl SSTableIndexBuilder {
-    /// In order to make the index as light as possible, we
-    /// try to find a shorter alternative to the last key of the last block
-    /// that is still smaller than the next key.
-    pub(crate) fn shorten_last_block_key_given_next_key(&mut self, next_key: &[u8]) {
-        if let Some(last_block) = self.index.blocks.last_mut() {
-            find_shorter_str_in_between(&mut last_block.last_key_or_greater, next_key);
-        }
-    }
-
     pub fn add_block(&mut self, last_key: &[u8], byte_range: Range<usize>, first_ordinal: u64) {
         self.index.blocks.push(BlockMeta {
-            last_key_or_greater: last_key.to_vec(),
+            last_key: last_key.to_vec(),
             block_addr: BlockAddr {
                 byte_range,
                 first_ordinal,
@@ -130,35 +97,4 @@ mod tests {
             "Data corruption: SSTable index is corrupted."
         );
     }
-
-    #[track_caller]
-    fn test_find_shorter_str_in_between_aux(left: &[u8], right: &[u8]) {
-        let mut left_buf = left.to_vec();
-        super::find_shorter_str_in_between(&mut left_buf, right);
-        assert!(left_buf.len() <= left.len());
-        assert!(left <= &left_buf);
-        assert!(&left_buf[..] < &right);
-    }
-
-    #[test]
-    fn test_find_shorter_str_in_between() {
-        test_find_shorter_str_in_between_aux(b"", b"hello");
-        test_find_shorter_str_in_between_aux(b"abc", b"abcd");
-        test_find_shorter_str_in_between_aux(b"abcd", b"abd");
-        test_find_shorter_str_in_between_aux(&[0, 0, 0], &[1]);
-        test_find_shorter_str_in_between_aux(&[0, 0, 0], &[0, 0, 1]);
-        test_find_shorter_str_in_between_aux(&[0, 0, 255, 255, 255, 0u8], &[0, 1]);
-    }
-
-    use proptest::prelude::*;
-
-    proptest! {
-        #![proptest_config(ProptestConfig::with_cases(100))]
-        #[test]
-        fn test_proptest_find_shorter_str(left in any::<Vec<u8>>(), right in any::<Vec<u8>>()) {
-            if left < right {
-                test_find_shorter_str_in_between_aux(&left, &right);
-            }
-        }
-    }
 }

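The helper removed above is what lets the index store `last_key_or_greater` instead of the full last key of a block: any `left'` with `left <= left' < right` still routes `search` to the correct block while taking fewer bytes. The sketch below restates the deleted routine with a worked example; it mirrors the removed `find_shorter_str_in_between` and is not part of the code that remains on this branch.

```rust
fn common_prefix_len(left: &[u8], right: &[u8]) -> usize {
    left.iter().zip(right).take_while(|(a, b)| a == b).count()
}

/// Given `left < right`, rewrite `left` into some `left'` with
/// `left <= left' < right`, preferring a shorter byte string.
fn find_shorter_str_in_between(left: &mut Vec<u8>, right: &[u8]) {
    assert!(&left[..] < right);
    let common_len = common_prefix_len(left.as_slice(), right);
    if left.len() == common_len {
        return;
    }
    for pos in (common_len + 1)..left.len() {
        if left[pos] != u8::MAX {
            left[pos] += 1;
            left.truncate(pos + 1);
            return;
        }
    }
}

fn main() {
    // A block ends with key "abcdef" and the next block starts at "abd".
    let mut last_key_or_greater = b"abcdef".to_vec();
    find_shorter_str_in_between(&mut last_key_or_greater, b"abd");
    // The index now stores "abce": "abcdef" <= "abce" < "abd", so a lookup
    // still lands on the right block while the entry shrinks from 6 to 4 bytes.
    assert_eq!(last_key_or_greater, b"abce");
}
```
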
@@ -25,13 +25,6 @@ pub struct TokenizerManager {
 }

 impl TokenizerManager {
-    /// Creates an empty tokenizer manager.
-    pub fn new() -> Self {
-        Self {
-            tokenizers: Arc::new(RwLock::new(HashMap::new())),
-        }
-    }
-
     /// Registers a new tokenizer associated with a given name.
     pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
     where TextAnalyzer: From<T> {
@@ -59,7 +52,9 @@ impl Default for TokenizerManager {
     /// - en_stem
     /// - ja
     fn default() -> TokenizerManager {
-        let manager = TokenizerManager::new();
+        let manager = TokenizerManager {
+            tokenizers: Arc::new(RwLock::new(HashMap::new())),
+        };
         manager.register("raw", RawTokenizer);
         manager.register(
             "default",
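For reference, a typical use of the API visible in these hunks (`TokenizerManager::default()`, `register`, and the bundled `RawTokenizer`) looks like the hedged sketch below. The `"my_raw"` name is made up for illustration, and the snippet assumes tantivy's public `tantivy::tokenizer` exports.

```rust
use tantivy::tokenizer::{RawTokenizer, TokenizerManager};

fn main() {
    // The default manager already ships "raw", "default", "en_stem", etc.
    let manager = TokenizerManager::default();
    // Register an additional analyzer under a custom name; `register` takes
    // `&self`, so no mutable binding is needed.
    manager.register("my_raw", RawTokenizer);
}
```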