mirror of https://github.com/quickwit-oss/tantivy.git
synced 2026-01-06 09:12:55 +00:00

Compare commits: `unit-test-...qw-airmail` (6 commits)

| SHA1 |
|---|
| eb5f51f3c4 |
| 7963b0b4aa |
| d5eefca11d |
| 5d6c8de23e |
| a06365f39f |
| f4b374110f |

CHANGELOG.md (20 lines changed)
@@ -14,6 +14,18 @@ Tantivy 0.25
- Support mixed field types in query parser [#2676](https://github.com/quickwit-oss/tantivy/pull/2676) (@trinity-1686a)
- Add per-field size details [#2679](https://github.com/quickwit-oss/tantivy/pull/2679) (@fulmicoton)

Tantivy 0.24.2
================================
- Fix TopNComputer for reverse order. [#2672](https://github.com/quickwit-oss/tantivy/pull/2672) (@stuhood @PSeitz)

  Affected queries are [order_by_fast_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_fast_field) and
  [order_by_u64_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_u64_field)
  for `Order::Asc`.

Tantivy 0.24.1
================================
- Fix: bump required rust version to 1.81

Tantivy 0.24
================================
Tantivy 0.24 will be backwards compatible with indices created with v0.22 and v0.21. The new minimum rust version will be 1.75. Tantivy 0.23 will be skipped.

@@ -96,6 +108,14 @@ This will slightly increase space and access time. [#2439](https://github.com/qu
- Fix trait bound of StoreReader::iter [#2360](https://github.com/quickwit-oss/tantivy/pull/2360) (@adamreichold)
- Remove read_postings_no_deletes [#2526](https://github.com/quickwit-oss/tantivy/pull/2526) (@PSeitz)

Tantivy 0.22.1
================================
- Fix TopNComputer for reverse order. [#2672](https://github.com/quickwit-oss/tantivy/pull/2672) (@stuhood @PSeitz)

  Affected queries are [order_by_fast_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_fast_field) and
  [order_by_u64_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_u64_field)
  for `Order::Asc`.

Tantivy 0.22
================================
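For reference, the collector shape affected by the TopNComputer fix called out above; a minimal sketch (the `rating` field and the surrounding query are illustrative assumptions, not part of the changeset):

```rust
use tantivy::collector::TopDocs;
use tantivy::query::Query;
use tantivy::{DocAddress, Order, Searcher};

// Ascending top-N over a u64 fast field -- the `Order::Asc` case fixed by #2672.
fn ten_lowest_ratings(
    searcher: &Searcher,
    query: &dyn Query,
) -> tantivy::Result<Vec<(u64, DocAddress)>> {
    let collector = TopDocs::with_limit(10).order_by_fast_field::<u64>("rating", Order::Asc);
    searcher.search(query, &collector)
}
```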
Cargo.toml

@@ -167,3 +167,7 @@ harness = false
[[bench]]
name = "agg_bench"
harness = false

[[bench]]
name = "exists_json"
harness = false
benches/exists_json.rs (new file, 69 lines)

@@ -0,0 +1,69 @@
use binggan::plugins::PeakMemAllocPlugin;
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use serde_json::json;
use tantivy::collector::Count;
use tantivy::query::ExistsQuery;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{doc, Index};

#[global_allocator]
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;

fn main() {
    let doc_count: usize = 500_000;
    let subfield_counts: &[usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 16, 256, 4096, 65536, 262144];

    let indices: Vec<(String, Index)> = subfield_counts
        .iter()
        .map(|&sub_fields| {
            (
                format!("subfields={sub_fields}"),
                build_index_with_json_subfields(doc_count, sub_fields),
            )
        })
        .collect();

    let mut group = InputGroup::new_with_inputs(indices);
    group.add_plugin(PeakMemAllocPlugin::new(GLOBAL));

    group.config().num_iter_group = Some(1);
    group.config().num_iter_bench = Some(1);
    group.register("exists_json", exists_json_union);

    group.run();
}

fn exists_json_union(index: &Index) {
    let reader = index.reader().expect("reader");
    let searcher = reader.searcher();
    let query = ExistsQuery::new("json".to_string(), true);
    let count = searcher.search(&query, &Count).expect("exists search");
    // Prevents optimizer from eliding the search
    black_box(count);
}

fn build_index_with_json_subfields(num_docs: usize, num_subfields: usize) -> Index {
    // Schema: single JSON field stored as FAST to support ExistsQuery.
    let mut schema_builder = Schema::builder();
    let json_field = schema_builder.add_json_field("json", TEXT | FAST);
    let schema = schema_builder.build();

    let index = Index::create_from_tempdir(schema).expect("create index");
    {
        let mut index_writer = index
            .writer_with_num_threads(1, 200_000_000)
            .expect("writer");
        for i in 0..num_docs {
            let sub = i % num_subfields;
            // Only one subpath set per document; rotate subpaths so that
            // no single subpath is full, but the union covers all docs.
            let v = json!({ format!("field_{sub}"): i as u64 });
            index_writer
                .add_document(doc!(json_field => v))
                .expect("add_document");
        }
        index_writer.commit().expect("commit");
    }

    index
}
@@ -56,7 +56,7 @@ fn get_doc_ids_with_values<'a>(
        ColumnIndex::Full => Box::new(doc_range),
        ColumnIndex::Optional(optional_index) => Box::new(
            optional_index
-                .iter_docs()
+                .iter_non_null_docs()
                .map(move |row| row + doc_range.start),
        ),
        ColumnIndex::Multivalued(multivalued_index) => match multivalued_index {
@@ -73,7 +73,7 @@ fn get_doc_ids_with_values<'a>(
            MultiValueIndex::MultiValueIndexV2(multivalued_index) => Box::new(
                multivalued_index
                    .optional_index
-                    .iter_docs()
+                    .iter_non_null_docs()
                    .map(move |row| row + doc_range.start),
            ),
        },
@@ -177,7 +177,7 @@ impl<'a> Iterable<RowId> for StackedOptionalIndex<'a> {
            ColumnIndex::Full => Box::new(columnar_row_range),
            ColumnIndex::Optional(optional_index) => Box::new(
                optional_index
-                    .iter_docs()
+                    .iter_non_null_docs()
                    .map(move |row_id: RowId| columnar_row_range.start + row_id),
            ),
            ColumnIndex::Multivalued(_) => {
@@ -215,6 +232,32 @@ impl MultiValueIndex {
        }
    }

+    /// Returns an iterator over document ids that have at least one value.
+    pub fn iter_non_null_docs(&self) -> Box<dyn Iterator<Item = DocId> + '_> {
+        match self {
+            MultiValueIndex::MultiValueIndexV1(idx) => {
+                let mut doc: DocId = 0u32;
+                let num_docs = idx.num_docs();
+                Box::new(std::iter::from_fn(move || {
+                    // This is not the most efficient way to do this, but it's legacy code.
+                    while doc < num_docs {
+                        let cur = doc;
+                        doc += 1;
+                        let start = idx.start_index_column.get_val(cur);
+                        let end = idx.start_index_column.get_val(cur + 1);
+                        if end > start {
+                            return Some(cur);
+                        }
+                    }
+                    None
+                }))
+            }
+            MultiValueIndex::MultiValueIndexV2(idx) => {
+                Box::new(idx.optional_index.iter_non_null_docs())
+            }
+        }
+    }
+
    /// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
    /// docids. Positions are converted inplace to docids.
    ///
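For intuition, the V1 "has at least one value" test above in a self-contained toy (assumed layout, not the actual tantivy types): a start-offset column where `starts[doc]..starts[doc + 1]` is the value range of `doc`, so a doc is non-null exactly when that range is non-empty.

```rust
/// Toy start-offset index: the value range of `doc` is `starts[doc]..starts[doc + 1]`.
struct StartOffsets {
    starts: Vec<u32>, // length = num_docs + 1, monotonically non-decreasing
}

impl StartOffsets {
    /// A doc has at least one value iff its range is non-empty (end > start).
    fn iter_non_null_docs(&self) -> impl Iterator<Item = u32> + '_ {
        (0..(self.starts.len() - 1) as u32)
            .filter(move |&doc| self.starts[doc as usize + 1] > self.starts[doc as usize])
    }
}

fn main() {
    // doc 0 has 2 values, doc 1 has none, doc 2 has 1 value.
    let idx = StartOffsets { starts: vec![0, 2, 2, 3] };
    assert_eq!(idx.iter_non_null_docs().collect::<Vec<_>>(), vec![0, 2]);
}
```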
@@ -88,7 +88,7 @@ pub struct OptionalIndex {

impl Iterable<u32> for &OptionalIndex {
    fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
-        Box::new(self.iter_docs())
+        Box::new(self.iter_non_null_docs())
    }
}

@@ -280,8 +280,9 @@ impl OptionalIndex {
        self.num_non_null_docs
    }

-    pub fn iter_docs(&self) -> impl Iterator<Item = RowId> + '_ {
-        // TODO optimize
+    pub fn iter_non_null_docs(&self) -> impl Iterator<Item = RowId> + '_ {
+        // TODO optimize. We could iterate over the blocks directly.
        // We use the dense value ids and retrieve the doc ids via select.
        let mut select_batch = self.select_cursor();
        (0..self.num_non_null_docs).map(move |rank| select_batch.select(rank))
    }
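The rank/select trick used by `iter_non_null_docs` above, as a standalone toy (assumed model: the optional index conceptually stores the sorted set of non-null row ids, and `select(rank)` returns the rank-th of them):

```rust
/// Toy optional index: the sorted row ids that hold a value.
struct OptIndex {
    non_null_rows: Vec<u32>, // sorted, deduplicated
}

impl OptIndex {
    /// select(rank) = row id of the rank-th non-null row.
    fn select(&self, rank: u32) -> u32 {
        self.non_null_rows[rank as usize]
    }

    /// Selecting ranks 0..n yields the non-null rows in increasing order --
    /// the same pattern as `(0..num_non_null_docs).map(|rank| select(rank))`.
    fn iter_non_null_docs(&self) -> impl Iterator<Item = u32> + '_ {
        (0..self.non_null_rows.len() as u32).map(move |rank| self.select(rank))
    }
}

fn main() {
    let idx = OptIndex { non_null_rows: vec![2, 5, 9] };
    assert!(idx.iter_non_null_docs().eq([2u32, 5, 9]));
}
```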
@@ -164,7 +164,11 @@ fn test_optional_index_large() {
fn test_optional_index_iter_aux(row_ids: &[RowId], num_rows: RowId) {
    let optional_index = OptionalIndex::for_test(num_rows, row_ids);
    assert_eq!(optional_index.num_docs(), num_rows);
-    assert!(optional_index.iter_docs().eq(row_ids.iter().copied()));
+    assert!(
+        optional_index
+            .iter_non_null_docs()
+            .eq(row_ids.iter().copied())
+    );
}

#[test]
@@ -367,7 +367,7 @@ fn is_empty_after_merge(
        ColumnIndex::Empty { .. } => true,
        ColumnIndex::Full => alive_bitset.len() == 0,
        ColumnIndex::Optional(optional_index) => {
-            for doc in optional_index.iter_docs() {
+            for doc in optional_index.iter_non_null_docs() {
                if alive_bitset.contains(doc) {
                    return false;
                }
@@ -1,3 +1,5 @@
+use std::str::FromStr;
+
use common::DateTime;

use crate::InvalidData;
@@ -9,6 +11,23 @@ pub enum NumericalValue {
    F64(f64),
}

+impl FromStr for NumericalValue {
+    type Err = ();
+
+    fn from_str(s: &str) -> Result<Self, ()> {
+        if let Ok(val_i64) = s.parse::<i64>() {
+            return Ok(val_i64.into());
+        }
+        if let Ok(val_u64) = s.parse::<u64>() {
+            return Ok(val_u64.into());
+        }
+        if let Ok(val_f64) = s.parse::<f64>() {
+            return Ok(NumericalValue::from(val_f64).normalize());
+        }
+        Err(())
+    }
+}
+
impl NumericalValue {
    pub fn numerical_type(&self) -> NumericalType {
        match self {
@@ -26,7 +45,7 @@ impl NumericalValue {
                if val <= i64::MAX as u64 {
                    NumericalValue::I64(val as i64)
                } else {
-                    NumericalValue::F64(val as f64)
+                    NumericalValue::U64(val)
                }
            }
            NumericalValue::I64(val) => NumericalValue::I64(val),
@@ -141,6 +160,7 @@ impl Coerce for DateTime {
#[cfg(test)]
mod tests {
    use super::NumericalType;
+    use crate::NumericalValue;

    #[test]
    fn test_numerical_type_code() {
@@ -153,4 +173,58 @@ mod tests {
        }
        assert_eq!(num_numerical_type, 3);
    }
+
+    #[test]
+    fn test_parse_numerical() {
+        assert_eq!(
+            "123".parse::<NumericalValue>().unwrap(),
+            NumericalValue::I64(123)
+        );
+        assert_eq!(
+            "18446744073709551615".parse::<NumericalValue>().unwrap(),
+            NumericalValue::U64(18446744073709551615u64)
+        );
+        assert_eq!(
+            "1.0".parse::<NumericalValue>().unwrap(),
+            NumericalValue::I64(1i64)
+        );
+        assert_eq!(
+            "1.1".parse::<NumericalValue>().unwrap(),
+            NumericalValue::F64(1.1f64)
+        );
+        assert_eq!(
+            "-1.0".parse::<NumericalValue>().unwrap(),
+            NumericalValue::I64(-1i64)
+        );
+    }
+
+    #[test]
+    fn test_normalize_numerical() {
+        assert_eq!(
+            NumericalValue::from(1u64).normalize(),
+            NumericalValue::I64(1i64),
+        );
+        let limit_val = i64::MAX as u64 + 1u64;
+        assert_eq!(
+            NumericalValue::from(limit_val).normalize(),
+            NumericalValue::U64(limit_val),
+        );
+        assert_eq!(
+            NumericalValue::from(-1i64).normalize(),
+            NumericalValue::I64(-1i64),
+        );
+        assert_eq!(
+            NumericalValue::from(-2.0f64).normalize(),
+            NumericalValue::I64(-2i64),
+        );
+        assert_eq!(
+            NumericalValue::from(-2.1f64).normalize(),
+            NumericalValue::F64(-2.1f64),
+        );
+        let large_float = 2.0f64.powf(70.0f64);
+        assert_eq!(
+            NumericalValue::from(large_float).normalize(),
+            NumericalValue::F64(large_float),
+        );
+    }
}
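Why the `U64` branch replaces the old `F64` coercion above: a `u64` above `i64::MAX` is generally not exactly representable as an `f64` (53 significant bits), so the old coercion was lossy. A one-liner demonstrating the loss:

```rust
fn main() {
    let val: u64 = u64::MAX; // 18446744073709551615, needs 64 bits
    let coerced = val as f64; // rounds up to 2^64 = 18446744073709551616.0
    assert_ne!(coerced as u128, val as u128); // the old F64 path lost the exact value
}
```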
@@ -117,6 +117,22 @@ where F: nom::Parser<I, (O, ErrorList), Infallible> {
    }
}

+pub(crate) fn terminated_infallible<I, O1, O2, F, G>(
+    mut first: F,
+    mut second: G,
+) -> impl FnMut(I) -> JResult<I, O1>
+where
+    F: nom::Parser<I, (O1, ErrorList), Infallible>,
+    G: nom::Parser<I, (O2, ErrorList), Infallible>,
+{
+    move |input: I| {
+        let (input, (o1, mut err)) = first.parse(input)?;
+        let (input, (_, mut err2)) = second.parse(input)?;
+        err.append(&mut err2);
+        Ok((input, (o1, err)))
+    }
+}
+
pub(crate) fn delimited_infallible<I, O1, O2, O3, F, G, H>(
    mut first: F,
    mut second: G,
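A toy model of the "infallible" combinator contract assumed above (simplified away from nom): a parser always succeeds, returning the remaining input, a value, and a list of recoverable errors; `terminated` keeps the first value and concatenates both error lists.

```rust
type Errs = Vec<String>;

fn terminated<'a, A, B>(
    first: impl Fn(&'a str) -> (&'a str, A, Errs),
    second: impl Fn(&'a str) -> (&'a str, B, Errs),
) -> impl Fn(&'a str) -> (&'a str, A, Errs) {
    move |input| {
        let (rest, a, mut errs) = first(input);
        let (rest, _b, mut errs2) = second(rest);
        errs.append(&mut errs2); // keep the first value, merge both error lists
        (rest, a, errs)
    }
}

fn main() {
    // `digit` and `space` are stand-ins for `opt_i`/`opt_i_err`-wrapped parsers.
    let digit = |s: &'static str| match s.strip_prefix('1') {
        Some(r) => (r, Some('1'), vec![]),
        None => (s, None, vec!["expected digit".to_string()]),
    };
    let space = |s: &'static str| match s.strip_prefix(' ') {
        Some(r) => (r, (), vec![]),
        None => (s, (), vec!["missing space".to_string()]),
    };
    let parser = terminated(digit, space);
    let (_rest, value, errs) = parser("1x"); // parses '1', records the missing space
    assert_eq!(value, Some('1'));
    assert_eq!(errs, vec!["missing space".to_string()]);
}
```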
@@ -367,7 +367,10 @@ fn literal(inp: &str) -> IResult<&str, UserInputAst> {
    // something (a field name) got parsed before
    alt((
        map(
-            tuple((opt(field_name), alt((range, set, exists, term_or_phrase)))),
+            tuple((
+                opt(field_name),
+                alt((range, set, exists, regex, term_or_phrase)),
+            )),
            |(field_name, leaf): (Option<String>, UserInputLeaf)| leaf.set_field(field_name).into(),
        ),
        term_group,
@@ -389,6 +392,10 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
                (
                    value((), peek(one_of("{[><"))),
                    map(range_infallible, |(range, errs)| (Some(range), errs)),
                ),
+                (
+                    value((), peek(one_of("/"))),
+                    map(regex_infallible, |(regex, errs)| (Some(regex), errs)),
+                ),
            ),
            delimited_infallible(space0_infallible, term_or_phrase_infallible, nothing),
        ),
@@ -689,6 +696,61 @@ fn set_infallible(mut inp: &str) -> JResult<&str, UserInputLeaf> {
    }
}

+fn regex(inp: &str) -> IResult<&str, UserInputLeaf> {
+    map(
+        terminated(
+            delimited(
+                char('/'),
+                many1(alt((preceded(char('\\'), char('/')), none_of("/")))),
+                char('/'),
+            ),
+            peek(alt((multispace1, eof))),
+        ),
+        |elements| UserInputLeaf::Regex {
+            field: None,
+            pattern: elements.into_iter().collect::<String>(),
+        },
+    )(inp)
+}
+
+fn regex_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
+    match terminated_infallible(
+        delimited_infallible(
+            opt_i_err(char('/'), "missing delimiter /"),
+            opt_i(many1(alt((preceded(char('\\'), char('/')), none_of("/"))))),
+            opt_i_err(char('/'), "missing delimiter /"),
+        ),
+        opt_i_err(
+            peek(alt((multispace1, eof))),
+            "expected whitespace or end of input",
+        ),
+    )(inp)
+    {
+        Ok((rest, (elements_part, errors))) => {
+            let pattern = match elements_part {
+                Some(elements_part) => elements_part.into_iter().collect(),
+                None => String::new(),
+            };
+            let res = UserInputLeaf::Regex {
+                field: None,
+                pattern,
+            };
+            Ok((rest, (res, errors)))
+        }
+        Err(e) => {
+            let errs = vec![LenientErrorInternal {
+                pos: inp.len(),
+                message: e.to_string(),
+            }];
+            let res = UserInputLeaf::Regex {
+                field: None,
+                pattern: String::new(),
+            };
+            Ok((inp, (res, errs)))
+        }
+    }
+}
+
fn negate(expr: UserInputAst) -> UserInputAst {
    expr.unary(Occur::MustNot)
}
@@ -1694,6 +1756,63 @@ mod test {
        test_is_parse_err(r#"!bc:def"#, "!bc:def");
    }

+    #[test]
+    fn test_regex_parser() {
+        let r = parse_to_ast(r#"a:/joh?n(ath[oa]n)/"#);
+        assert!(r.is_ok(), "Failed to parse custom query: {r:?}");
+        let (_, input) = r.unwrap();
+        match input {
+            UserInputAst::Leaf(leaf) => match leaf.as_ref() {
+                UserInputLeaf::Regex { field, pattern } => {
+                    assert_eq!(field, &Some("a".to_string()));
+                    assert_eq!(pattern, "joh?n(ath[oa]n)");
+                }
+                _ => panic!("Expected a regex leaf, got {leaf:?}"),
+            },
+            _ => panic!("Expected a leaf"),
+        }
+        let r = parse_to_ast(r#"a:/\\/cgi-bin\\/luci.*/"#);
+        assert!(r.is_ok(), "Failed to parse custom query: {r:?}");
+        let (_, input) = r.unwrap();
+        match input {
+            UserInputAst::Leaf(leaf) => match leaf.as_ref() {
+                UserInputLeaf::Regex { field, pattern } => {
+                    assert_eq!(field, &Some("a".to_string()));
+                    assert_eq!(pattern, "\\/cgi-bin\\/luci.*");
+                }
+                _ => panic!("Expected a regex leaf, got {leaf:?}"),
+            },
+            _ => panic!("Expected a leaf"),
+        }
+    }
+
+    #[test]
+    fn test_regex_parser_lenient() {
+        let literal = |query| literal_infallible(query).unwrap().1;
+
+        let (res, errs) = literal(r#"a:/joh?n(ath[oa]n)/"#);
+        let expected = UserInputLeaf::Regex {
+            field: Some("a".to_string()),
+            pattern: "joh?n(ath[oa]n)".to_string(),
+        }
+        .into();
+        assert_eq!(res.unwrap(), expected);
+        assert!(errs.is_empty(), "Expected no errors, got: {errs:?}");
+
+        let (res, errs) = literal("title:/joh?n(ath[oa]n)");
+        let expected = UserInputLeaf::Regex {
+            field: Some("title".to_string()),
+            pattern: "joh?n(ath[oa]n)".to_string(),
+        }
+        .into();
+        assert_eq!(res.unwrap(), expected);
+        assert_eq!(errs.len(), 1, "Expected 1 error, got: {errs:?}");
+        assert_eq!(
+            errs[0].message, "missing delimiter /",
+            "Unexpected error message",
+        );
+    }
+
    #[test]
    fn test_space_before_value() {
        test_parse_query_to_ast_helper("field : a", r#""field":a"#);
@@ -23,6 +23,10 @@ pub enum UserInputLeaf {
    Exists {
        field: String,
    },
+    Regex {
+        field: Option<String>,
+        pattern: String,
+    },
}

impl UserInputLeaf {
@@ -46,6 +50,7 @@ impl UserInputLeaf {
            UserInputLeaf::Exists { field: _ } => UserInputLeaf::Exists {
                field: field.expect("Exist query without a field isn't allowed"),
            },
+            UserInputLeaf::Regex { field: _, pattern } => UserInputLeaf::Regex { field, pattern },
        }
    }

@@ -103,6 +108,14 @@ impl Debug for UserInputLeaf {
            UserInputLeaf::Exists { field } => {
                write!(formatter, "$exists(\"{field}\")")
            }
+            UserInputLeaf::Regex { field, pattern } => {
+                if let Some(field) = field {
+                    // TODO properly escape field (in case of \")
+                    write!(formatter, "\"{field}\":")?;
+                }
+                // TODO properly escape pattern (in case of \")
+                write!(formatter, "/{pattern}/")
+            }
        }
    }
}
@@ -155,7 +155,7 @@ fn test_aggregation_flushing(
        searcher.search(&AllQuery, &collector).unwrap()
    };

-    let res: Value = serde_json::to_value(&agg_res)?;
+    let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;

    assert_eq!(res["bucketsL1"]["buckets"][0]["doc_count"], 3);
    assert_eq!(
@@ -270,7 +270,7 @@ fn test_aggregation_level1_simple() -> crate::Result<()> {
    let searcher = reader.searcher();
    let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();

-    let res: Value = serde_json::to_value(&agg_res)?;
+    let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
    assert_eq!(res["average"]["value"], 12.142857142857142);
    assert_eq!(
        res["range"]["buckets"],
@@ -304,29 +304,6 @@ fn test_aggregation_level1_simple() -> crate::Result<()> {
    Ok(())
}

-#[test]
-fn test_aggregation_term_truncate_sum_other_doc_count() {
-    let index = get_test_index_2_segments(true).unwrap();
-    let reader = index.reader().unwrap();
-    let count_per_text: Aggregation = serde_json::from_value(json!({ "terms": { "field": "text", "size": 1 } })).unwrap();
-    let aggs: Aggregations = vec![("group_by_term_truncate".to_string(), count_per_text)]
-        .into_iter()
-        .collect();
-
-    let collector = get_collector(aggs);
-    let searcher = reader.searcher();
-    let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
-
-    let res: Value = serde_json::to_value(&agg_res).unwrap();
-    assert_eq!(res, serde_json::json!({
-        "group_by_term_truncate": {
-            "buckets": [{ "doc_count": 7, "key": "cool" }],
-            "doc_count_error_upper_bound": 0,
-            "sum_other_doc_count": 2,
-        },
-    }));
-}
-
#[test]
fn test_aggregation_level1() -> crate::Result<()> {
    let index = get_test_index_2_segments(true)?;
@@ -365,7 +342,7 @@ fn test_aggregation_level1() -> crate::Result<()> {
    let searcher = reader.searcher();
    let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();

-    let res: Value = serde_json::to_value(&agg_res)?;
+    let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
    assert_eq!(res["average"]["value"], 12.142857142857142);
    assert_eq!(res["average_f64"]["value"], 12.214285714285714);
    assert_eq!(res["average_i64"]["value"], 12.142857142857142);
@@ -420,7 +397,7 @@ fn test_aggregation_level2(
        IndexRecordOption::Basic,
    );

-    let elasticsearch_compatible_json_req = serde_json::json!(
+    let elasticsearch_compatible_json_req = r#"
    {
        "rangef64": {
            "range": {
@@ -473,8 +450,9 @@ fn test_aggregation_level2(
            "term_agg": { "terms": { "field": "text" } }
        }
    }
-    );
-    let agg_req: Aggregations = serde_json::from_value(elasticsearch_compatible_json_req).unwrap();
+    "#;
+    let agg_req: Aggregations = serde_json::from_str(elasticsearch_compatible_json_req).unwrap();

    let agg_res: AggregationResults = if use_distributed_collector {
        let collector =
@@ -491,7 +469,7 @@ fn test_aggregation_level2(
        searcher.search(&term_query, &collector).unwrap()
    };

-    let res: Value = serde_json::to_value(agg_res)?;
+    let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;

    assert_eq!(res["range"]["buckets"][1]["key"], "3-7");
    assert_eq!(res["range"]["buckets"][1]["doc_count"], 2u64);
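The recurring swap above, from `serde_json::to_value` to a to_string/from_str round-trip, makes the assertions exercise the JSON that is actually serialized rather than the in-memory `Value` conversion. The pattern isolated (hypothetical helper name; the diff inlines it):

```rust
use serde::Serialize;
use serde_json::Value;

/// Serialize to a JSON string and parse it back, so tests compare against
/// the bytes a client would really see.
fn json_via_string<T: Serialize>(value: &T) -> serde_json::Result<Value> {
    serde_json::from_str(&serde_json::to_string(value)?)
}

fn main() -> serde_json::Result<()> {
    let res = json_via_string(&vec![1u64, 2, 3])?;
    assert_eq!(res[0], 1);
    Ok(())
}
```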
@@ -1,3 +1,4 @@
+use columnar::NumericalValue;
use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
use common::{replace_in_place, JsonPathWriter};
use rustc_hash::FxHashMap;
@@ -152,7 +153,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
                if let Ok(i64_val) = val.try_into() {
                    term_buffer.append_type_and_fast_value::<i64>(i64_val);
                } else {
-                    term_buffer.append_type_and_fast_value(val);
+                    term_buffer.append_type_and_fast_value::<u64>(val);
                }
                postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
            }
@@ -166,12 +167,30 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
                postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
            }
            ReferenceValueLeaf::F64(val) => {
+                if !val.is_finite() {
+                    return;
+                };
                set_path_id(
                    term_buffer,
                    ctx.path_to_unordered_id
                        .get_or_allocate_unordered_id(json_path_writer.as_str()),
                );
-                term_buffer.append_type_and_fast_value(val);
+                // Normalize here is important.
+                // In the inverted index, we coerce all numerical values to their canonical
+                // representation.
+                //
+                // (We do the same thing on the query side)
+                match NumericalValue::F64(val).normalize() {
+                    NumericalValue::I64(val_i64) => {
+                        term_buffer.append_type_and_fast_value::<i64>(val_i64);
+                    }
+                    NumericalValue::U64(val_u64) => {
+                        term_buffer.append_type_and_fast_value::<u64>(val_u64);
+                    }
+                    NumericalValue::F64(val_f64) => {
+                        term_buffer.append_type_and_fast_value::<f64>(val_f64);
+                    }
+                }
                postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
            }
            ReferenceValueLeaf::Bool(val) => {
@@ -241,8 +260,8 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
///
/// The term must be json + JSON path.
pub fn convert_to_fast_value_and_append_to_json_term(
-    mut term: Term,
-    phrase: &str,
+    term: &Term,
+    text: &str,
    truncate_date_for_search: bool,
) -> Option<Term> {
    assert_eq!(
@@ -254,31 +273,50 @@ pub fn convert_to_fast_value_and_append_to_json_term(
        0,
        "JSON value bytes should be empty"
    );
-    if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
-        let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
-        if truncate_date_for_search {
-            dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
-        }
-        term.append_type_and_fast_value(dt);
-        return Some(term);
-    }
-    if let Ok(i64_val) = str::parse::<i64>(phrase) {
-        term.append_type_and_fast_value(i64_val);
-        return Some(term);
-    }
-    if let Ok(u64_val) = str::parse::<u64>(phrase) {
-        term.append_type_and_fast_value(u64_val);
-        return Some(term);
-    }
-    if let Ok(f64_val) = str::parse::<f64>(phrase) {
-        term.append_type_and_fast_value(f64_val);
-        return Some(term);
-    }
-    if let Ok(bool_val) = str::parse::<bool>(phrase) {
-        term.append_type_and_fast_value(bool_val);
-        return Some(term);
-    }
-    None
+    try_convert_to_datetime_and_append_to_json_term(term, text, truncate_date_for_search)
+        .or_else(|| try_convert_to_number_and_append_to_json_term(term, text))
+        .or_else(|| try_convert_to_bool_and_append_to_json_term_typed(term, text))
+}
+
+fn try_convert_to_datetime_and_append_to_json_term(
+    term: &Term,
+    text: &str,
+    truncate_date_for_search: bool,
+) -> Option<Term> {
+    let dt = OffsetDateTime::parse(text, &Rfc3339).ok()?;
+    let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
+    if truncate_date_for_search {
+        dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
+    }
+    let mut term_clone = term.clone();
+    term_clone.append_type_and_fast_value(dt);
+    Some(term_clone)
+}
+
+fn try_convert_to_number_and_append_to_json_term(term: &Term, text: &str) -> Option<Term> {
+    let numerical_value: NumericalValue = str::parse::<NumericalValue>(text).ok()?;
+    let mut term_clone = term.clone();
+    // Parse is actually returning normalized values already today, but let's
+    // not rely on that hidden contract.
+    match numerical_value.normalize() {
+        NumericalValue::I64(i64_value) => {
+            term_clone.append_type_and_fast_value::<i64>(i64_value);
+        }
+        NumericalValue::U64(u64_value) => {
+            term_clone.append_type_and_fast_value::<u64>(u64_value);
+        }
+        NumericalValue::F64(f64_value) => {
+            term_clone.append_type_and_fast_value::<f64>(f64_value);
+        }
+    }
+    Some(term_clone)
+}
+
+fn try_convert_to_bool_and_append_to_json_term_typed(term: &Term, text: &str) -> Option<Term> {
+    let val = str::parse::<bool>(text).ok()?;
+    let mut term_clone = term.clone();
+    term_clone.append_type_and_fast_value(val);
+    Some(term_clone)
+}

/// Splits a json path supplied to the query parser in such a way that
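The refactor above splits one long if/else cascade into focused `try_*` helpers chained with `Option::or_else`; the shape in miniature (stand-in helpers, not the real ones):

```rust
// First parser that succeeds wins; each helper bails out early with `?`.
fn convert(text: &str) -> Option<String> {
    try_number(text).or_else(|| try_bool(text))
}

fn try_number(text: &str) -> Option<String> {
    let v = text.parse::<f64>().ok()?;
    Some(format!("num:{v}"))
}

fn try_bool(text: &str) -> Option<String> {
    let v = text.parse::<bool>().ok()?;
    Some(format!("bool:{v}"))
}

fn main() {
    assert_eq!(convert("1.5"), Some("num:1.5".to_string()));
    assert_eq!(convert("true"), Some("bool:true".to_string()));
    assert_eq!(convert("abc"), None);
}
```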
src/lib.rs (49 lines changed)

@@ -370,6 +370,8 @@ macro_rules! fail_point {
/// Common test utilities.
#[cfg(test)]
pub mod tests {
+    use std::collections::BTreeMap;
+
    use common::{BinarySerializable, FixedSize};
    use query_grammar::{UserInputAst, UserInputLeaf, UserInputLiteral};
    use rand::distributions::{Bernoulli, Uniform};
@@ -382,7 +384,7 @@ pub mod tests {
    use crate::index::SegmentReader;
    use crate::merge_policy::NoMergePolicy;
    use crate::postings::Postings;
-    use crate::query::BooleanQuery;
+    use crate::query::{BooleanQuery, QueryParser};
    use crate::schema::*;
    use crate::{DateTime, DocAddress, Index, IndexWriter, ReloadPolicy};

@@ -1223,4 +1225,49 @@ pub mod tests {
        );
        assert_eq!(dt_from_ts_nanos.to_hms_micro(), offset_dt.to_hms_micro());
    }
+
+    #[test]
+    fn test_json_number_ambiguity() {
+        let mut schema_builder = Schema::builder();
+        let json_field = schema_builder.add_json_field("number", crate::schema::TEXT);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        let mut index_writer = index.writer_for_tests().unwrap();
+        {
+            let mut doc = TantivyDocument::new();
+            let mut obj = BTreeMap::default();
+            obj.insert("key".to_string(), OwnedValue::I64(1i64));
+            doc.add_object(json_field, obj);
+            index_writer.add_document(doc).unwrap();
+        }
+        {
+            let mut doc = TantivyDocument::new();
+            let mut obj = BTreeMap::default();
+            obj.insert("key".to_string(), OwnedValue::U64(1u64));
+            doc.add_object(json_field, obj);
+            index_writer.add_document(doc).unwrap();
+        }
+        {
+            let mut doc = TantivyDocument::new();
+            let mut obj = BTreeMap::default();
+            obj.insert("key".to_string(), OwnedValue::F64(1.0f64));
+            doc.add_object(json_field, obj);
+            index_writer.add_document(doc).unwrap();
+        }
+        index_writer.commit().unwrap();
+        let searcher = index.reader().unwrap().searcher();
+        assert_eq!(searcher.num_docs(), 3);
+        {
+            let parser = QueryParser::for_index(&index, vec![]);
+            let query = parser.parse_query("number.key:1").unwrap();
+            let count = searcher.search(&query, &crate::collector::Count).unwrap();
+            assert_eq!(count, 3);
+        }
+        {
+            let parser = QueryParser::for_index(&index, vec![]);
+            let query = parser.parse_query("number.key:1.0").unwrap();
+            let count = searcher.search(&query, &crate::collector::Count).unwrap();
+            assert_eq!(count, 3);
+        }
+    }
}
@@ -1,12 +1,15 @@
use core::fmt::Debug;

use columnar::{ColumnIndex, DynamicColumn};
+use common::BitSet;

use super::{ConstScorer, EmptyScorer};
use crate::docset::{DocSet, TERMINATED};
use crate::index::SegmentReader;
+use crate::query::all_query::AllScorer;
+use crate::query::boost_query::BoostScorer;
use crate::query::explanation::does_not_match;
-use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
+use crate::query::{BitSetDocSet, EnableScoring, Explanation, Query, Scorer, Weight};
use crate::schema::Type;
use crate::{DocId, Score, TantivyError};

@@ -113,13 +116,49 @@ impl Weight for ExistsWeight {
                non_empty_columns.push(column)
            }
        }
-        // TODO: we can optimize more here since in most cases we will have only one index
-        if !non_empty_columns.is_empty() {
-            let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc());
-            Ok(Box::new(ConstScorer::new(docset, boost)))
-        } else {
-            Ok(Box::new(EmptyScorer))
-        }
+        if non_empty_columns.is_empty() {
+            return Ok(Box::new(EmptyScorer));
+        }
+
+        // If any column is full, all docs match.
+        let max_doc = reader.max_doc();
+        if non_empty_columns
+            .iter()
+            .any(|col| matches!(col.column_index(), ColumnIndex::Full))
+        {
+            let all_scorer = AllScorer::new(max_doc);
+            return Ok(Box::new(BoostScorer::new(all_scorer, boost)));
+        }
+
+        // If we have a single dynamic column, use ExistsDocSet
+        // NOTE: A lower number may be better for very sparse columns
+        if non_empty_columns.len() < 4 {
+            let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc());
+            return Ok(Box::new(ConstScorer::new(docset, boost)));
+        }
+
+        // If we have many dynamic columns, precompute a bitset of matching docs
+        let mut doc_bitset = BitSet::with_max_value(max_doc);
+        for column in &non_empty_columns {
+            match column.column_index() {
+                ColumnIndex::Empty { .. } => {}
+                ColumnIndex::Full => {
+                    // Handled by AllScorer return above.
+                }
+                ColumnIndex::Optional(optional_index) => {
+                    for doc in optional_index.iter_non_null_docs() {
+                        doc_bitset.insert(doc);
+                    }
+                }
+                ColumnIndex::Multivalued(multi_idx) => {
+                    for doc in multi_idx.iter_non_null_docs() {
+                        doc_bitset.insert(doc);
+                    }
+                }
+            }
+        }
+        let docset = BitSetDocSet::from(doc_bitset);
+        Ok(Box::new(ConstScorer::new(docset, boost)))
    }

    fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
@@ -294,6 +333,43 @@ mod tests {
        Ok(())
    }

+    #[test]
+    fn test_exists_query_json_union_no_single_full_subpath() -> crate::Result<()> {
+        // Build docs where no single subpath exists for all docs, but the union does.
+        let mut schema_builder = Schema::builder();
+        let json = schema_builder.add_json_field("json", TEXT | FAST);
+        let schema = schema_builder.build();
+
+        let index = Index::create_in_ram(schema);
+        {
+            let mut index_writer = index.writer_for_tests()?;
+            for i in 0u64..100u64 {
+                if i % 2 == 0 {
+                    // only subpath `a`
+                    index_writer.add_document(doc!(json => json!({"a": i})))?;
+                } else {
+                    // only subpath `b`
+                    index_writer.add_document(doc!(json => json!({"b": i})))?;
+                }
+            }
+            index_writer.commit()?;
+        }
+        let reader = index.reader()?;
+        let searcher = reader.searcher();
+
+        // No single subpath is full
+        assert_eq!(count_existing_fields(&searcher, "json.a", false)?, 50);
+        assert_eq!(count_existing_fields(&searcher, "json.b", false)?, 50);
+
+        // Root exists with subpaths disabled is zero
+        assert_eq!(count_existing_fields(&searcher, "json", false)?, 0);
+
+        // Root exists with subpaths enabled should match all docs via union
+        assert_eq!(count_existing_fields(&searcher, "json", true)?, 100);
+
+        Ok(())
+    }
+
    #[test]
    fn test_exists_query_misc_supported_types() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
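Assuming the two-argument constructor shown in the new bench (field name plus a flag requesting the json-subpath union), the code paths chosen in `ExistsWeight` above are exercised like this:

```rust
use tantivy::collector::Count;
use tantivy::query::ExistsQuery;
use tantivy::Searcher;

// `true` asks for the union over all json subpaths; depending on how many
// subpath columns exist, the weight picks AllScorer, ExistsDocSet, or the
// precomputed BitSet route added above.
fn count_docs_with_any_json_value(searcher: &Searcher) -> tantivy::Result<usize> {
    let query = ExistsQuery::new("json".to_string(), true);
    searcher.search(&query, &Count)
}
```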
@@ -1,8 +1,11 @@
use std::fmt;
use std::ops::Bound;
+use std::sync::Arc;
+
+use tantivy_fst::Regex;

use crate::query::Occur;
-use crate::schema::Term;
+use crate::schema::{Field, Term};
use crate::Score;

#[derive(Clone)]
@@ -21,6 +24,10 @@ pub enum LogicalLiteral {
        elements: Vec<Term>,
    },
    All,
+    Regex {
+        pattern: Arc<Regex>,
+        field: Field,
+    },
}

pub enum LogicalAst {
@@ -147,6 +154,10 @@ impl fmt::Debug for LogicalLiteral {
                write!(formatter, "]")
            }
            LogicalLiteral::All => write!(formatter, "*"),
+            LogicalLiteral::Regex {
+                ref pattern,
+                ref field,
+            } => write!(formatter, "Regex({field:?}, {pattern:?})"),
        }
    }
}
@@ -2,12 +2,14 @@ use std::net::{AddrParseError, IpAddr};
use std::num::{ParseFloatError, ParseIntError};
use std::ops::Bound;
use std::str::{FromStr, ParseBoolError};
+use std::sync::Arc;

use base64::engine::general_purpose::STANDARD as BASE64;
use base64::Engine;
use itertools::Itertools;
use query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
use rustc_hash::FxHashMap;
+use tantivy_fst::Regex;

use super::logical_ast::*;
use crate::index::Index;
@@ -15,7 +17,7 @@ use crate::json_utils::convert_to_fast_value_and_append_to_json_term;
use crate::query::range_query::{is_type_valid_for_fastfield_range_query, RangeQuery};
use crate::query::{
    AllQuery, BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhrasePrefixQuery,
-    PhraseQuery, Query, TermQuery, TermSetQuery,
+    PhraseQuery, Query, RegexQuery, TermQuery, TermSetQuery,
};
use crate::schema::{
    Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
@@ -206,6 +208,7 @@ pub struct QueryParser {
    tokenizer_manager: TokenizerManager,
    boost: FxHashMap<Field, Score>,
    fuzzy: FxHashMap<Field, Fuzzy>,
+    regexes_allowed: bool,
}

#[derive(Clone)]
@@ -260,6 +263,7 @@ impl QueryParser {
            conjunction_by_default: false,
            boost: Default::default(),
            fuzzy: Default::default(),
+            regexes_allowed: false,
        }
    }

@@ -320,6 +324,11 @@ impl QueryParser {
        );
    }

+    /// Allow regexes in queries
+    pub fn allow_regexes(&mut self) {
+        self.regexes_allowed = true;
+    }
+
    /// Parse a query
    ///
    /// Note that `parse_query` returns an error if the input
@@ -486,24 +495,17 @@ impl QueryParser {
                Ok(terms.into_iter().next().unwrap())
            }
            FieldType::JsonObject(ref json_options) => {
-                let get_term_with_path = || {
-                    Term::from_field_json_path(
-                        field,
-                        json_path,
-                        json_options.is_expand_dots_enabled(),
-                    )
-                };
+                let mut term = Term::from_field_json_path(
+                    field,
+                    json_path,
+                    json_options.is_expand_dots_enabled(),
+                );
                if let Some(term) =
                    // Try to convert the phrase to a fast value
-                    convert_to_fast_value_and_append_to_json_term(
-                        get_term_with_path(),
-                        phrase,
-                        false,
-                    )
+                    convert_to_fast_value_and_append_to_json_term(&term, phrase, false)
                {
                    Ok(term)
                } else {
-                    let mut term = get_term_with_path();
                    term.append_type_and_str(phrase);
                    Ok(term)
                }
@@ -860,6 +862,51 @@ impl QueryParser {
                    "Range query need to target a specific field.".to_string(),
                )],
            ),
+            UserInputLeaf::Regex { field, pattern } => {
+                if !self.regexes_allowed {
+                    return (
+                        None,
+                        vec![QueryParserError::UnsupportedQuery(
+                            "Regex queries are not allowed.".to_string(),
+                        )],
+                    );
+                }
+                let full_path = try_tuple!(field.ok_or_else(|| {
+                    QueryParserError::UnsupportedQuery(
+                        "Regex query need to target a specific field.".to_string(),
+                    )
+                }));
+                let (field, json_path) = try_tuple!(self
+                    .split_full_path(&full_path)
+                    .ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
+                if !json_path.is_empty() {
+                    return (
+                        None,
+                        vec![QueryParserError::UnsupportedQuery(
+                            "Regex query does not support json paths.".to_string(),
+                        )],
+                    );
+                }
+                if !matches!(
+                    self.schema.get_field_entry(field).field_type(),
+                    FieldType::Str(_)
+                ) {
+                    return (
+                        None,
+                        vec![QueryParserError::UnsupportedQuery(
+                            "Regex query only supported on text fields".to_string(),
+                        )],
+                    );
+                }
+                let pattern = try_tuple!(Regex::new(&pattern).map_err(|e| {
+                    QueryParserError::UnsupportedQuery(format!("Invalid regex: {e}"))
+                }));
+                let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Regex {
+                    pattern: Arc::new(pattern),
+                    field,
+                }));
+                (Some(logical_ast), Vec::new())
+            }
        }
    }
}
@@ -902,6 +949,9 @@ fn convert_literal_to_query(
        LogicalLiteral::Range { lower, upper } => Box::new(RangeQuery::new(lower, upper)),
        LogicalLiteral::Set { elements, .. } => Box::new(TermSetQuery::new(elements)),
        LogicalLiteral::All => Box::new(AllQuery),
+        LogicalLiteral::Regex { pattern, field } => {
+            Box::new(RegexQuery::from_regex(pattern, field))
+        }
    }
}

@@ -971,7 +1021,7 @@ fn generate_literals_for_json_object(

    // Try to convert the phrase to a fast value
    if let Some(term) =
-        convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase, true)
+        convert_to_fast_value_and_append_to_json_term(&get_term_with_path(), phrase, true)
    {
        logical_literals.push(LogicalLiteral::Term(term));
    }
@@ -1100,11 +1150,15 @@ mod test {
        query: &str,
        default_conjunction: bool,
        default_fields: &[&'static str],
+        allow_regexes: bool,
    ) -> Result<LogicalAst, QueryParserError> {
        let mut query_parser = make_query_parser_with_default_fields(default_fields);
        if default_conjunction {
            query_parser.set_conjunction_by_default();
        }
+        if allow_regexes {
+            query_parser.allow_regexes();
+        }
        query_parser.parse_query_to_logical_ast(query)
    }

@@ -1116,6 +1170,7 @@ mod test {
            query,
            default_conjunction,
            &["title", "text"],
+            true,
        )
    }

@@ -1130,6 +1185,7 @@ mod test {
            query,
            default_conjunction,
            default_fields,
+            true,
        )
        .unwrap();
        let query_str = format!("{query:?}");
@@ -1993,4 +2049,56 @@ mod test {
            Err(QueryParserError::ExpectedInt(_))
        );
    }
+
+    #[test]
+    pub fn test_regex() {
+        let expected_regex = tantivy_fst::Regex::new(r".*b").unwrap();
+        test_parse_query_to_logical_ast_helper(
+            "title:/.*b/",
+            format!("Regex(Field(0), {:#?})", expected_regex).as_str(),
+            false,
+        );
+
+        // Invalid field
+        let err = parse_query_to_logical_ast("float:/.*b/", false).unwrap_err();
+        assert_eq!(
+            err.to_string(),
+            "Unsupported query: Regex query only supported on text fields"
+        );
+
+        // No field specified
+        let err = parse_query_to_logical_ast("/.*b/", false).unwrap_err();
+        assert_eq!(
+            err.to_string(),
+            "Unsupported query: Regex query need to target a specific field."
+        );
+
+        // Regex on a json path
+        let err = parse_query_to_logical_ast("title.subpath:/.*b/", false).unwrap_err();
+        assert_eq!(
+            err.to_string(),
+            "Unsupported query: Regex query does not support json paths."
+        );
+
+        // Invalid regex
+        let err = parse_query_to_logical_ast("title:/[A-Z*b/", false).unwrap_err();
+        assert_eq!(
+            err.to_string(),
+            "Unsupported query: Invalid regex: regex parse error:\n [A-Z*b\n ^\nerror: \
+             unclosed character class"
+        );

+        // Regexes not allowed
+        let err = parse_query_to_logical_ast_with_default_fields(
+            "title:/.*b/",
+            false,
+            &["title", "text"],
+            false,
+        )
+        .unwrap_err();
+        assert_eq!(
+            err.to_string(),
+            "Unsupported query: Regex queries are not allowed."
+        );
+    }
}
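End to end, the opt-in added above; a sketch of enabling and using regex queries through the parser (field setup assumed):

```rust
use tantivy::query::{QueryParser, QueryParserError};
use tantivy::schema::Field;
use tantivy::Index;

fn parse_regex_query(
    index: &Index,
    default_fields: Vec<Field>,
) -> Result<Box<dyn tantivy::query::Query>, QueryParserError> {
    let mut parser = QueryParser::for_index(index, default_fields);
    parser.allow_regexes(); // regex literals are rejected unless explicitly enabled
    parser.parse_query("title:/joh?n(ath[oa]n)/")
}
```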
@@ -12,10 +12,14 @@ pub use self::range_query_fastfield::*;
// TODO is this correct?
pub(crate) fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool {
    match typ {
-        Type::Str | Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date | Type::Json => {
-            true
-        }
-        Type::IpAddr => true,
+        Type::Str
+        | Type::U64
+        | Type::I64
+        | Type::F64
+        | Type::Bool
+        | Type::Date
+        | Type::Json
+        | Type::IpAddr => true,
        Type::Facet | Type::Bytes => false,
    }
}
@@ -11,7 +11,7 @@ mod tests {
    use crate::docset::DocSet;
    use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
    use crate::query::{EnableScoring, Query, QueryParser, Scorer, TermQuery};
-    use crate::schema::{Field, IndexRecordOption, Schema, STRING, TEXT};
+    use crate::schema::{Field, IndexRecordOption, Schema, FAST, STRING, TEXT};
    use crate::{assert_nearly_equals, DocAddress, Index, IndexWriter, Term, TERMINATED};

    #[test]
@@ -212,4 +212,232 @@ mod tests {
        }
        Ok(())
    }
+
+    #[test]
+    fn test_term_query_fallback_to_fastfield() -> crate::Result<()> {
+        use crate::collector::Count;
+        use crate::schema::FAST;
+
+        // Create a FAST-only numeric field (not indexed)
+        let mut schema_builder = Schema::builder();
+        let num_field = schema_builder.add_u64_field("num", FAST);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+
+        {
+            let mut index_writer: IndexWriter = index.writer_for_tests()?;
+            index_writer.add_document(doc!(num_field => 10u64))?;
+            index_writer.add_document(doc!(num_field => 20u64))?;
+            index_writer.add_document(doc!(num_field => 10u64))?;
+            index_writer.commit()?;
+        }
+
+        let reader = index.reader()?;
+        let searcher = reader.searcher();
+
+        // TermQuery should fall back to a fastfield range query and match correctly.
+        let tq_10 = TermQuery::new(
+            Term::from_field_u64(num_field, 10u64),
+            IndexRecordOption::Basic,
+        );
+        let tq_20 = TermQuery::new(
+            Term::from_field_u64(num_field, 20u64),
+            IndexRecordOption::Basic,
+        );
+        let tq_30 = TermQuery::new(
+            Term::from_field_u64(num_field, 30u64),
+            IndexRecordOption::Basic,
+        );
+
+        let count_10 = searcher.search(&tq_10, &Count)?;
+        let count_20 = searcher.search(&tq_20, &Count)?;
+        let count_30 = searcher.search(&tq_30, &Count)?;
+
+        assert_eq!(count_10, 2);
+        assert_eq!(count_20, 1);
+        assert_eq!(count_30, 0);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_term_query_fallback_text_fast_only() -> crate::Result<()> {
+        use crate::collector::Count;
+
+        // FAST-only text field (not indexed)
+        let mut schema_builder = Schema::builder();
+        let text_field = schema_builder.add_text_field("text", FAST);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+
+        {
+            let mut index_writer: IndexWriter = index.writer_for_tests()?;
+            index_writer.add_document(doc!(text_field => "hello"))?;
+            index_writer.add_document(doc!(text_field => "world"))?;
+            index_writer.add_document(doc!(text_field => "hello"))?;
+            index_writer.commit()?;
+        }
+
+        let searcher = index.reader()?.searcher();
+        let tq_hello = TermQuery::new(
+            Term::from_field_text(text_field, "hello"),
+            IndexRecordOption::Basic,
+        );
+        let tq_world = TermQuery::new(
+            Term::from_field_text(text_field, "world"),
+            IndexRecordOption::Basic,
+        );
+        let tq_missing = TermQuery::new(
+            Term::from_field_text(text_field, "nope"),
+            IndexRecordOption::Basic,
+        );
+
+        assert_eq!(searcher.search(&tq_hello, &Count)?, 2);
+        assert_eq!(searcher.search(&tq_world, &Count)?, 1);
+        assert_eq!(searcher.search(&tq_missing, &Count)?, 0);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_term_query_fallback_json_fast_only() -> crate::Result<()> {
+        use crate::collector::Count;
+        use crate::fastfield::FastValue;
+        use crate::schema::FAST;
+
+        let mut schema_builder = Schema::builder();
+        let json_field = schema_builder.add_json_field("json", FAST);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema.clone());
+
+        {
+            let mut index_writer: IndexWriter = index.writer_for_tests()?;
+            index_writer.add_document(doc!(json_field => json!({"a": 10, "b": "x"})))?;
+            index_writer.add_document(doc!(json_field => json!({"a": 20, "b": "y"})))?;
+            index_writer.add_document(doc!(json_field => json!({"a": 10, "b": "z"})))?;
+            index_writer.commit()?;
+        }
+
+        fn json_term_fast<T: FastValue>(field: Field, path: &str, v: T) -> Term {
+            let mut term = Term::from_field_json_path(field, path, true);
+            term.append_type_and_fast_value(v);
+            term
+        }
+        fn json_term_str(field: Field, path: &str, v: &str) -> Term {
+            let mut term = Term::from_field_json_path(field, path, true);
+            term.append_type_and_str(v);
+            term
+        }
+
+        let searcher = index.reader()?.searcher();
+        // numeric path match
+        let tq_a10 = TermQuery::new(
+            json_term_fast(json_field, "a", 10u64),
+            IndexRecordOption::Basic,
+        );
+        let tq_a20 = TermQuery::new(
+            json_term_fast(json_field, "a", 20u64),
+            IndexRecordOption::Basic,
+        );
+        let tq_a30 = TermQuery::new(
+            json_term_fast(json_field, "a", 30u64),
+            IndexRecordOption::Basic,
+        );
+        assert_eq!(searcher.search(&tq_a10, &Count)?, 2);
+        assert_eq!(searcher.search(&tq_a20, &Count)?, 1);
+        assert_eq!(searcher.search(&tq_a30, &Count)?, 0);
+
+        // string path match
+        let tq_bx = TermQuery::new(
+            json_term_str(json_field, "b", "x"),
+            IndexRecordOption::Basic,
+        );
+        let tq_by = TermQuery::new(
+            json_term_str(json_field, "b", "y"),
+            IndexRecordOption::Basic,
+        );
+        let tq_bm = TermQuery::new(
+            json_term_str(json_field, "b", "missing"),
+            IndexRecordOption::Basic,
+        );
+        assert_eq!(searcher.search(&tq_bx, &Count)?, 1);
+        assert_eq!(searcher.search(&tq_by, &Count)?, 1);
+        assert_eq!(searcher.search(&tq_bm, &Count)?, 0);
+        Ok(())
+    }
+
+    #[test]
+    fn test_term_query_fallback_ip_fast_only() -> crate::Result<()> {
+        use std::net::IpAddr;
+        use std::str::FromStr;
+
+        use crate::collector::Count;
+        use crate::schema::{IntoIpv6Addr, FAST};
+
+        let mut schema_builder = Schema::builder();
+        let ip_field = schema_builder.add_ip_addr_field("ip", FAST);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+
+        let ip1 = IpAddr::from_str("127.0.0.1").unwrap().into_ipv6_addr();
+        let ip2 = IpAddr::from_str("127.0.0.2").unwrap().into_ipv6_addr();
+        {
+            let mut index_writer: IndexWriter = index.writer_for_tests()?;
+            index_writer.add_document(doc!(ip_field => ip1))?;
+            index_writer.add_document(doc!(ip_field => ip2))?;
+            index_writer.add_document(doc!(ip_field => ip1))?;
+            index_writer.commit()?;
+        }
+
+        let searcher = index.reader()?.searcher();
+        let tq_ip1 = TermQuery::new(
+            Term::from_field_ip_addr(ip_field, ip1),
+            IndexRecordOption::Basic,
+        );
+        let tq_ip2 = TermQuery::new(
+            Term::from_field_ip_addr(ip_field, ip2),
+            IndexRecordOption::Basic,
+        );
+        let ip3 = IpAddr::from_str("127.0.0.3").unwrap().into_ipv6_addr();
+        let tq_ip3 = TermQuery::new(
+            Term::from_field_ip_addr(ip_field, ip3),
+            IndexRecordOption::Basic,
+        );
+
+        assert_eq!(searcher.search(&tq_ip1, &Count)?, 2);
+        assert_eq!(searcher.search(&tq_ip2, &Count)?, 1);
+        assert_eq!(searcher.search(&tq_ip3, &Count)?, 0);
+        Ok(())
+    }
+
+    #[test]
+    fn test_term_query_fallback_fastfield_with_scores_errors() -> crate::Result<()> {
+        use crate::collector::TopDocs;
+
+        // FAST-only numeric field (not indexed) should error when scoring is required
+        let mut schema_builder = Schema::builder();
+        let num_field = schema_builder.add_u64_field("num", FAST);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+
+        {
+            let mut index_writer: IndexWriter = index.writer_for_tests()?;
+            index_writer.add_document(doc!(num_field => 10u64))?;
+            index_writer.add_document(doc!(num_field => 20u64))?;
+            index_writer.commit()?;
+        }
+
+        let searcher = index.reader()?.searcher();
+        let tq = TermQuery::new(
+            Term::from_field_u64(num_field, 10u64),
+            IndexRecordOption::Basic,
+        );
+
+        // Using TopDocs requires scoring; since the field is not indexed,
+        // TermQuery cannot score and should return a SchemaError.
+        let res = searcher.search(&tq, &TopDocs::with_limit(1));
+        assert!(matches!(res, Err(crate::TantivyError::SchemaError(_))));
+
+        Ok(())
+    }
}
@@ -1,8 +1,10 @@
use std::fmt;
+use std::ops::Bound;

use super::term_weight::TermWeight;
use crate::query::bm25::Bm25Weight;
-use crate::query::{EnableScoring, Explanation, Query, Weight};
+use crate::query::range_query::is_type_valid_for_fastfield_range_query;
+use crate::query::{EnableScoring, Explanation, Query, RangeQuery, Weight};
use crate::schema::IndexRecordOption;
use crate::Term;

@@ -122,6 +124,24 @@ impl TermQuery {

impl Query for TermQuery {
    fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
+        // If the field is not indexed but is a suitable fast field, fall back to a range query
+        // on the fast field matching exactly this term.
+        //
+        // Note: This is considerably slower since it requires scanning the entire fast field.
+        // TODO: The range query would gain from having a single-value optimization
+        let schema = enable_scoring.schema();
+        let field_entry = schema.get_field_entry(self.term.field());
+        if !field_entry.is_indexed()
+            && field_entry.is_fast()
+            && is_type_valid_for_fastfield_range_query(self.term.typ())
+            && !enable_scoring.is_scoring_enabled()
+        {
+            let range_query = RangeQuery::new(
+                Bound::Included(self.term.clone()),
+                Bound::Included(self.term.clone()),
+            );
+            return range_query.weight(enable_scoring);
+        }
        Ok(Box::new(self.specialized_weight(enable_scoring)?))
    }
    fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {
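Two notes on the fallback above: it is gated on `!is_scoring_enabled()` because a fast column stores values but no postings or term statistics, so no BM25 score can be derived; and term equality is expressed as the degenerate inclusive range `[term, term]`. The construction in isolation (sketch):

```rust
use std::ops::Bound;
use tantivy::query::RangeQuery;
use tantivy::Term;

// An inclusive range whose two bounds are the same term matches exactly
// the documents holding that value -- equality as a range scan.
fn term_as_range_query(term: &Term) -> RangeQuery {
    RangeQuery::new(Bound::Included(term.clone()), Bound::Included(term.clone()))
}
```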