Compare commits

...

10 Commits

Author SHA1 Message Date
PSeitz
8199aa7de7 bump version to 0.20.2 (#2089) 2023-06-12 18:56:54 +08:00
PSeitz
657f0cd3bd add missing Bytes validation to term_agg (#2077)
returns empty for now instead of failing like before
2023-06-12 16:38:07 +08:00
Adam Reichold
3a82ef2560 Fix is_child_of function not considering the root facet. (#2086) 2023-06-12 08:35:18 +02:00
PSeitz
3546e7fc63 small agg limit docs improvement (#2073)
small docs improvement as follow up on bug https://github.com/quickwit-oss/quickwit/issues/3503
2023-06-12 10:55:24 +09:00
PSeitz
862f367f9e release without Alice in Wonderland, bump version to 0.20.1 (#2087)
* Release without Alice in Wonderland

* bump version to 0.20.1
2023-06-12 10:54:03 +09:00
PSeitz
14137d91c4 Update CHANGELOG.md (#2081) 2023-06-12 10:53:40 +09:00
François Massot
924fc70cb5 Merge pull request #2088 from quickwit-oss/fmassot/align-type-priorities-for-json-numbers
Align numerical type priority order on the search side.
2023-06-11 22:04:54 +02:00
François Massot
07023948aa Add test that indexes and searches a JSON field. 2023-06-11 21:47:52 +02:00
François Massot
0cb53207ec Fix tests. 2023-06-11 12:13:35 +02:00
François Massot
17c783b4db Align numerical type priority order on the search side. 2023-06-11 11:49:27 +02:00
9 changed files with 183 additions and 13 deletions

View File

@@ -27,6 +27,7 @@ Tantivy 0.20 [Unreleased]
- [**breaking**] Drop JSON support on intermediate agg result (we use postcard as format in `quickwit` to send intermediate results) [#1992](https://github.com/quickwit-oss/tantivy/issues/1992) (@PSeitz)
- Set memory limit in bytes for aggregations after which they abort (Previously there was only the bucket limit) [#1942](https://github.com/quickwit-oss/tantivy/issues/1942)[#1957](https://github.com/quickwit-oss/tantivy/issues/1957)(@PSeitz)
- Add support for u64,i64,f64 fields in term aggregation [#1883](https://github.com/quickwit-oss/tantivy/issues/1883) (@PSeitz)
- Allow histogram bounds to be passed as Rfc3339 [#2076](https://github.com/quickwit-oss/tantivy/issues/2076) (@PSeitz)
- Add count, min, max, and sum aggregations [#1794](https://github.com/quickwit-oss/tantivy/issues/1794) (@guilload)
- Switch to Aggregation without serde_untagged => better deserialization errors. [#2003](https://github.com/quickwit-oss/tantivy/issues/2003) (@PSeitz)
- Switch to ms in histogram for date type (ES compatibility) [#2045](https://github.com/quickwit-oss/tantivy/issues/2045) (@PSeitz)
@@ -39,10 +40,10 @@ Tantivy 0.20 [Unreleased]
- Perf: Fetch blocks of vals in aggregation for all cardinality [#1950](https://github.com/quickwit-oss/tantivy/issues/1950) (@PSeitz)
- `Searcher` with disabled scoring via `EnableScoring::Disabled` [#1780](https://github.com/quickwit-oss/tantivy/issues/1780) (@shikhar)
- Enable tokenizer on json fields [#2053](https://github.com/quickwit-oss/tantivy/issues/2053) (@PSeitz)
- Enforcing "NOT" and "-" queries consistency in UserInputAst [#1609](https://github.com/quickwit-oss/tantivy/issues/1609) (@Denis Bazhenov)
- Enforcing "NOT" and "-" queries consistency in UserInputAst [#1609](https://github.com/quickwit-oss/tantivy/issues/1609) (@bazhenov)
- Faster indexing
- Refactor tokenization pipeline to use GATs [#1924](https://github.com/quickwit-oss/tantivy/issues/1924) (@trinity-1686a)
- Faster term hash map [#1940](https://github.com/quickwit-oss/tantivy/issues/1940) (@PSeitz)
- Faster term hash map [#2058](https://github.com/quickwit-oss/tantivy/issues/2058)[#1940](https://github.com/quickwit-oss/tantivy/issues/1940) (@PSeitz)
- Refactor vint [#2010](https://github.com/quickwit-oss/tantivy/issues/2010) (@PSeitz)
- Faster search
- Work in batches of docs on the SegmentCollector (Only for cases without score for now) [#1937](https://github.com/quickwit-oss/tantivy/issues/1937) (@PSeitz)
@@ -51,7 +52,8 @@ Tantivy 0.20 [Unreleased]
- Make BM25 scoring more flexible [#1855](https://github.com/quickwit-oss/tantivy/issues/1855) (@alexcole)
- Switch fs2 to fs4 as it is now unmaintained and does not support illumos [#1944](https://github.com/quickwit-oss/tantivy/issues/1944) (@Toasterson)
- Made BooleanWeight and BoostWeight public [#1991](https://github.com/quickwit-oss/tantivy/issues/1991) (@fulmicoton)
- Make index compatible with virtual drives on Windows [#1843](https://github.com/quickwit-oss/tantivy/issues/1843) (@Yukun Guo)
- Make index compatible with virtual drives on Windows [#1843](https://github.com/quickwit-oss/tantivy/issues/1843) (@gyk)
- Add stop words for Hungarian language [#2069](https://github.com/quickwit-oss/tantivy/issues/2069) (@tnxbutno)
- Auto downgrade index record option, instead of vint error [#1857](https://github.com/quickwit-oss/tantivy/issues/1857) (@PSeitz)
- Enable range query on fast field for u64 compatible types [#1762](https://github.com/quickwit-oss/tantivy/issues/1762) (@PSeitz) [#1876]
- sstable

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.20.0"
version = "0.20.2"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -12,6 +12,7 @@ readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2021"
rust-version = "1.62"
exclude = ["benches/*.json", "benches/*.txt"]
[dependencies]
oneshot = "0.1.5"

View File

@@ -60,6 +60,8 @@ impl AggregationLimits {
/// *bucket_limit*
/// Limits the maximum number of buckets returned from an aggregation request.
/// bucket_limit will default to `DEFAULT_BUCKET_LIMIT` (65000)
///
/// Note: The returned instance contains a Arc shared counter to track memory consumption.
pub fn new(memory_limit: Option<u64>, bucket_limit: Option<u32>) -> Self {
Self {
memory_consumption: Default::default(),

View File

@@ -74,14 +74,14 @@ impl AggregationWithAccessor {
ColumnType::I64,
ColumnType::U64,
ColumnType::F64,
ColumnType::Bytes,
ColumnType::Str,
// ColumnType::Bytes Unsupported
// ColumnType::Bool Unsupported
// ColumnType::IpAddr Unsupported
// ColumnType::DateTime Unsupported
];
let mut columns =
get_all_ff_reader(reader, field_name, Some(&allowed_column_types))?;
get_all_ff_reader_or_empty(reader, field_name, Some(&allowed_column_types))?;
let first = columns.pop().unwrap();
accessor2 = columns.pop();
first
@@ -177,7 +177,7 @@ fn get_ff_reader(
/// Get all fast field reader or empty as default.
///
/// Is guaranteed to return at least one column.
fn get_all_ff_reader(
fn get_all_ff_reader_or_empty(
reader: &SegmentReader,
field_name: &str,
allowed_column_types: Option<&[ColumnType]>,

View File

@@ -428,6 +428,12 @@ impl SegmentTermCollector {
field_type: ColumnType,
accessor_idx: usize,
) -> crate::Result<Self> {
if field_type == ColumnType::Bytes || field_type == ColumnType::Bool {
return Err(TantivyError::InvalidArgument(format!(
"terms aggregation is not supported for column type {:?}",
field_type
)));
}
let term_buckets = TermBuckets::default();
if let Some(custom_order) = req.order.as_ref() {
@@ -1500,4 +1506,41 @@ mod tests {
Ok(())
}
#[test]
fn terms_aggregation_bytes() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let bytes_field = schema_builder.add_bytes_field("bytes", FAST);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
bytes_field => vec![1,2,3],
))?;
index_writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": {
"terms": {
"field": "bytes"
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// TODO: Returning an error would be better instead of an empty result, since this is not a
// JSON field
assert_eq!(
res["my_texts"]["buckets"][0]["key"],
serde_json::Value::Null
);
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);
Ok(())
}
}

View File

@@ -161,6 +161,21 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// ]);
/// }
///
/// {
/// let mut facet_collector = FacetCollector::for_field("facet");
/// facet_collector.add_facet("/");
/// let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
///
/// // This lists all of the facet counts
/// let facets: Vec<(&Facet, u64)> = facet_counts
/// .get("/")
/// .collect();
/// assert_eq!(facets, vec![
/// (&Facet::from("/category"), 4),
/// (&Facet::from("/lang"), 4)
/// ]);
/// }
///
/// Ok(())
/// }
/// # assert!(example().is_ok());
@@ -285,6 +300,9 @@ fn is_child_facet(parent_facet: &[u8], possible_child_facet: &[u8]) -> bool {
if !possible_child_facet.starts_with(parent_facet) {
return false;
}
if parent_facet.is_empty() {
return true;
}
possible_child_facet.get(parent_facet.len()).copied() == Some(0u8)
}
@@ -789,6 +807,15 @@ mod tests {
);
Ok(())
}
#[test]
fn is_child_facet() {
assert!(super::is_child_facet(&b"foo"[..], &b"foo\0bar"[..]));
assert!(super::is_child_facet(&b""[..], &b"foo\0bar"[..]));
assert!(super::is_child_facet(&b""[..], &b"foo"[..]));
assert!(!super::is_child_facet(&b"foo\0bar"[..], &b"foo"[..]));
assert!(!super::is_child_facet(&b"foo"[..], &b"foobar\0baz"[..]));
}
}
#[cfg(all(test, feature = "unstable"))]

View File

@@ -212,12 +212,12 @@ pub fn convert_to_fast_value_and_get_term(
DateTime::from_utc(dt_utc),
));
}
if let Ok(u64_val) = str::parse::<u64>(phrase) {
return Some(set_fastvalue_and_get_term(json_term_writer, u64_val));
}
if let Ok(i64_val) = str::parse::<i64>(phrase) {
return Some(set_fastvalue_and_get_term(json_term_writer, i64_val));
}
if let Ok(u64_val) = str::parse::<u64>(phrase) {
return Some(set_fastvalue_and_get_term(json_term_writer, u64_val));
}
if let Ok(f64_val) = str::parse::<f64>(phrase) {
return Some(set_fastvalue_and_get_term(json_term_writer, f64_val));
}

View File

@@ -302,6 +302,7 @@ pub struct DocAddress {
#[cfg(test)]
pub mod tests {
use common::{BinarySerializable, FixedSize};
use query_grammar::{UserInputAst, UserInputLeaf, UserInputLiteral};
use rand::distributions::{Bernoulli, Uniform};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
@@ -857,6 +858,95 @@ pub mod tests {
Ok(())
}
#[test]
fn test_searcher_on_json_field_with_type_inference() {
// When indexing and searching a json value, we infer its type.
// This tests aims to check the type infereence is consistent between indexing and search.
// Inference order is date, i64, u64, f64, bool.
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STORED | TEXT);
let schema = schema_builder.build();
let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
r#"{
"signed": 2,
"float": 2.0,
"unsigned": 10000000000000,
"date": "1985-04-12T23:20:50.52Z",
"bool": true
}"#,
)
.unwrap();
let doc = doc!(json_field=>json_val.clone());
let index = Index::create_in_ram(schema.clone());
let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc).unwrap();
writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let get_doc_ids = |user_input_literal: UserInputLiteral| {
let query_parser = crate::query::QueryParser::for_index(&index, Vec::new());
let query = query_parser
.build_query_from_user_input_ast(UserInputAst::from(UserInputLeaf::Literal(
user_input_literal,
)))
.unwrap();
searcher
.search(&query, &TEST_COLLECTOR_WITH_SCORE)
.map(|topdocs| topdocs.docs().to_vec())
.unwrap()
};
{
let user_input_literal = UserInputLiteral {
field_name: Some("json.signed".to_string()),
phrase: "2".to_string(),
delimiter: crate::query_grammar::Delimiter::None,
slop: 0,
prefix: false,
};
assert_eq!(get_doc_ids(user_input_literal), vec![DocAddress::new(0, 0)]);
}
{
let user_input_literal = UserInputLiteral {
field_name: Some("json.float".to_string()),
phrase: "2.0".to_string(),
delimiter: crate::query_grammar::Delimiter::None,
slop: 0,
prefix: false,
};
assert_eq!(get_doc_ids(user_input_literal), vec![DocAddress::new(0, 0)]);
}
{
let user_input_literal = UserInputLiteral {
field_name: Some("json.date".to_string()),
phrase: "1985-04-12T23:20:50.52Z".to_string(),
delimiter: crate::query_grammar::Delimiter::None,
slop: 0,
prefix: false,
};
assert_eq!(get_doc_ids(user_input_literal), vec![DocAddress::new(0, 0)]);
}
{
let user_input_literal = UserInputLiteral {
field_name: Some("json.unsigned".to_string()),
phrase: "10000000000000".to_string(),
delimiter: crate::query_grammar::Delimiter::None,
slop: 0,
prefix: false,
};
assert_eq!(get_doc_ids(user_input_literal), vec![DocAddress::new(0, 0)]);
}
{
let user_input_literal = UserInputLiteral {
field_name: Some("json.bool".to_string()),
phrase: "true".to_string(),
delimiter: crate::query_grammar::Delimiter::None,
slop: 0,
prefix: false,
};
assert_eq!(get_doc_ids(user_input_literal), vec![DocAddress::new(0, 0)]);
}
}
#[test]
fn test_doc_macro() {
let mut schema_builder = Schema::builder();

View File

@@ -1203,7 +1203,7 @@ mod test {
fn test_json_field_possibly_a_number() {
test_parse_query_to_logical_ast_helper(
"json.titi:5",
r#"(Term(field=14, type=Json, path=titi, type=U64, 5) Term(field=14, type=Json, path=titi, type=Str, "5"))"#,
r#"(Term(field=14, type=Json, path=titi, type=I64, 5) Term(field=14, type=Json, path=titi, type=Str, "5"))"#,
true,
);
test_parse_query_to_logical_ast_helper(
@@ -1211,6 +1211,11 @@ mod test {
r#"(Term(field=14, type=Json, path=titi, type=I64, -5) Term(field=14, type=Json, path=titi, type=Str, "5"))"#, //< Yes this is a bit weird after going through the tokenizer we lose the "-".
true,
);
test_parse_query_to_logical_ast_helper(
"json.titi:10000000000000000000",
r#"(Term(field=14, type=Json, path=titi, type=U64, 10000000000000000000) Term(field=14, type=Json, path=titi, type=Str, "10000000000000000000"))"#,
true,
);
test_parse_query_to_logical_ast_helper(
"json.titi:-5.2",
r#"(Term(field=14, type=Json, path=titi, type=F64, -5.2) "[(0, Term(field=14, type=Json, path=titi, type=Str, "5")), (1, Term(field=14, type=Json, path=titi, type=Str, "2"))]")"#,
@@ -1260,7 +1265,7 @@ mod test {
fn test_json_default() {
test_query_to_logical_ast_with_default_json(
"titi:4",
"(Term(field=14, type=Json, path=titi, type=U64, 4) Term(field=14, type=Json, \
"(Term(field=14, type=Json, path=titi, type=I64, 4) Term(field=14, type=Json, \
path=titi, type=Str, \"4\"))",
false,
);
@@ -1282,7 +1287,7 @@ mod test {
for conjunction in [false, true] {
test_query_to_logical_ast_with_default_json(
"json:4",
r#"(Term(field=14, type=Json, path=, type=U64, 4) Term(field=14, type=Json, path=, type=Str, "4"))"#,
r#"(Term(field=14, type=Json, path=, type=I64, 4) Term(field=14, type=Json, path=, type=Str, "4"))"#,
conjunction,
);
}