Compare commits


6 Commits

Author SHA1 Message Date
PSeitz-dd
eb5f51f3c4 Optimize ExistsQuery for a high number of dynamic columns (#2694)
* Optimize ExistsQuery for a high number of dynamic columns

The previous algorithm checked _each_ doc in _each_ column for
existence, which incurs a huge cost on JSON fields with e.g. 100k columns.
Instead, compute a bitset of matching docs when there is more than one column.

add `iter_docs` to the multivalued_index
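
A minimal standalone sketch of the bitset-union idea (a hypothetical helper using plain `Vec<u64>` words rather than tantivy's `BitSet`; the actual change unions the `iter_non_null_docs` iterators into a `BitSet`, as in the `ExistsWeight` diff further down):

// Sketch only: union each column's non-null doc ids into one bitset up front,
// instead of probing every column for every doc.
fn union_exists(columns_non_null_docs: &[Vec<u32>], max_doc: u32) -> Vec<u64> {
    let mut words = vec![0u64; (max_doc as usize + 63) / 64];
    for docs in columns_non_null_docs {
        for &doc in docs {
            words[(doc / 64) as usize] |= 1u64 << (doc % 64);
        }
    }
    words
}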

* add benchmark

subfields=1
exists_json_union    Memory: 89.3 KB (+2.01%)    Avg: 0.4865ms (-26.03%)    Median: 0.4865ms (-26.03%)    [0.4865ms .. 0.4865ms]
subfields=2
exists_json_union    Memory: 68.1 KB     Avg: 1.7048ms (-0.46%)    Median: 1.7048ms (-0.46%)    [1.7048ms .. 1.7048ms]
subfields=3
exists_json_union    Memory: 61.8 KB     Avg: 2.0742ms (-2.22%)    Median: 2.0742ms (-2.22%)    [2.0742ms .. 2.0742ms]
subfields=4
exists_json_union    Memory: 119.8 KB (+103.44%)    Avg: 3.9500ms (+42.62%)    Median: 3.9500ms (+42.62%)    [3.9500ms .. 3.9500ms]
subfields=5
exists_json_union    Memory: 120.4 KB (+107.65%)    Avg: 3.9610ms (+20.65%)    Median: 3.9610ms (+20.65%)    [3.9610ms .. 3.9610ms]
subfields=6
exists_json_union    Memory: 120.6 KB (+107.49%)    Avg: 3.8903ms (+3.11%)    Median: 3.8903ms (+3.11%)    [3.8903ms .. 3.8903ms]
subfields=7
exists_json_union    Memory: 120.9 KB (+106.93%)    Avg: 3.6220ms (-16.22%)    Median: 3.6220ms (-16.22%)    [3.6220ms .. 3.6220ms]
subfields=8
exists_json_union    Memory: 121.3 KB (+106.23%)    Avg: 4.0981ms (-15.97%)    Median: 4.0981ms (-15.97%)    [4.0981ms .. 4.0981ms]
subfields=16
exists_json_union    Memory: 123.1 KB (+103.09%)    Avg: 4.3483ms (-92.26%)    Median: 4.3483ms (-92.26%)    [4.3483ms .. 4.3483ms]
subfields=256
exists_json_union    Memory: 204.6 KB (+19.85%)    Avg: 3.8874ms (-99.01%)    Median: 3.8874ms (-99.01%)    [3.8874ms .. 3.8874ms]
subfields=4096
exists_json_union    Memory: 2.0 MB     Avg: 3.5571ms (-99.90%)    Median: 3.5571ms (-99.90%)    [3.5571ms .. 3.5571ms]
subfields=65536
exists_json_union    Memory: 28.3 MB     Avg: 14.4417ms (-99.97%)    Median: 14.4417ms (-99.97%)    [14.4417ms .. 14.4417ms]
subfields=262144
exists_json_union    Memory: 113.3 MB     Avg: 66.2860ms (-99.95%)    Median: 66.2860ms (-99.95%)    [66.2860ms .. 66.2860ms]

* rename methods
2025-09-16 12:57:57 -04:00
PSeitz-dd
7963b0b4aa Add fast field fallback for term query if not indexed (#2693)
* Add fast field fallback for term query if not indexed

* only fall back without scores
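
A usage sketch condensed from the tests added below (assumes a FAST-only, non-indexed u64 field; the `Count` collector needs no scores, which is what makes the fallback legal):

use tantivy::collector::Count;
use tantivy::query::TermQuery;
use tantivy::schema::{IndexRecordOption, Schema, FAST};
use tantivy::{doc, Index, IndexWriter, Term};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    // FAST-only: no inverted index, so TermQuery has no postings to read.
    let num_field = schema_builder.add_u64_field("num", FAST);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer: IndexWriter = index.writer(50_000_000)?;
    writer.add_document(doc!(num_field => 10u64))?;
    writer.add_document(doc!(num_field => 20u64))?;
    writer.commit()?;
    let searcher = index.reader()?.searcher();
    let query = TermQuery::new(
        Term::from_field_u64(num_field, 10u64),
        IndexRecordOption::Basic,
    );
    // Without scoring, the term query falls back to an equivalent fast field
    // range query instead of erroring on the missing inverted index.
    assert_eq!(searcher.search(&query, &Count)?, 1);
    Ok(())
}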
2025-09-12 14:58:21 +02:00
Paul Masurel
d5eefca11d Merge pull request #2692 from quickwit-oss/paul.masurel/coerce-floats-too-in-search-too
This PR changes the logic used when ingesting floats.
2025-09-10 09:46:54 +02:00
Paul Masurel
5d6c8de23e Align float search logic with the columnar coercion rules
It applies the same logic to floats as for u64 or i64.
In all cases, the idea is (for the inverted index) to coerce numbers
to their canonical representation, before indexing and before searching.

That way a document with the float 1.0 will be searchable when the user
searches for 1.

Note that, unlike the columnar storage, we do not attempt to coerce all of the
terms associated with a given json path to a single numerical type.
We simply rely on this "point-wise" canonicalization.
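
A condensed illustration using the `NumericalValue` parsing and normalization added in this PR (it mirrors the unit tests in the diff below; inside tantivy the columnar crate is imported as `columnar`):

use columnar::NumericalValue;

fn main() {
    // "1.0" and "1" canonicalize to the same I64(1) term, so a document
    // indexed with the float 1.0 matches a query for 1, and vice versa.
    assert_eq!("1.0".parse::<NumericalValue>().unwrap(), NumericalValue::I64(1));
    assert_eq!("1".parse::<NumericalValue>().unwrap(), NumericalValue::I64(1));
    // Non-integral floats keep their F64 representation.
    assert_eq!("1.1".parse::<NumericalValue>().unwrap(), NumericalValue::F64(1.1));
}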
2025-09-09 19:28:17 +02:00
PSeitz
a06365f39f Update CHANGELOG.md for bugfixes (#2674)
* Update CHANGELOG.md

* Update CHANGELOG.md
2025-09-04 11:51:00 +02:00
Raphaël Cohen
f4b374110f feat: Regex query grammar (#2677)
* feat: Regex query grammar

* feat: Disable regexes by default

* chore: Apply formatting
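
A usage sketch based on the parser changes in this PR (regex leaves are rejected until `allow_regexes` is called, and the grammar is `field:/pattern/` on text fields):

use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut parser = QueryParser::for_index(&index, vec![title]);
    // Disabled by default: the parser rejects the regex leaf.
    assert!(parser.parse_query("title:/joh?n/").is_err());
    // After opting in, /pattern/ parses into a RegexQuery.
    parser.allow_regexes();
    parser.parse_query("title:/joh?n/").expect("regex query should parse");
}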
2025-09-03 10:07:04 +02:00
21 changed files with 953 additions and 97 deletions


@@ -14,6 +14,18 @@ Tantivy 0.25
- Support mixed field types in query parser [#2676](https://github.com/quickwit-oss/tantivy/pull/2676)(@trinity-1686a)
- Add per-field size details [#2679](https://github.com/quickwit-oss/tantivy/pull/2679)(@fulmicoton)
Tantivy 0.24.2
================================
- Fix TopNComputer for reverse order. [#2672](https://github.com/quickwit-oss/tantivy/pull/2672)(@stuhood @PSeitz)
Affected queries are [order_by_fast_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_fast_field) and
[order_by_u64_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_u64_field)
for `Order::Asc`
Tantivy 0.24.1
================================
- Fix: bump required rust version to 1.81
Tantivy 0.24
================================
Tantivy 0.24 will be backwards compatible with indices created with v0.22 and v0.21. The new minimum rust version will be 1.75. Tantivy 0.23 will be skipped.
@@ -96,6 +108,14 @@ This will slightly increase space and access time. [#2439](https://github.com/qu
- Fix trait bound of StoreReader::iter [#2360](https://github.com/quickwit-oss/tantivy/pull/2360)(@adamreichold)
- remove read_postings_no_deletes [#2526](https://github.com/quickwit-oss/tantivy/pull/2526)(@PSeitz)
Tantivy 0.22.1
================================
- Fix TopNComputer for reverse order. [#2672](https://github.com/quickwit-oss/tantivy/pull/2672)(@stuhood @PSeitz)
Affected queries are [order_by_fast_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_fast_field) and
[order_by_u64_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_u64_field)
for `Order::Asc`
Tantivy 0.22
================================


@@ -167,3 +167,7 @@ harness = false
[[bench]]
name = "agg_bench"
harness = false
[[bench]]
name = "exists_json"
harness = false

benches/exists_json.rs (new file, 69 lines)

@@ -0,0 +1,69 @@
use binggan::plugins::PeakMemAllocPlugin;
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use serde_json::json;
use tantivy::collector::Count;
use tantivy::query::ExistsQuery;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{doc, Index};
#[global_allocator]
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
fn main() {
let doc_count: usize = 500_000;
let subfield_counts: &[usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 16, 256, 4096, 65536, 262144];
let indices: Vec<(String, Index)> = subfield_counts
.iter()
.map(|&sub_fields| {
(
format!("subfields={sub_fields}"),
build_index_with_json_subfields(doc_count, sub_fields),
)
})
.collect();
let mut group = InputGroup::new_with_inputs(indices);
group.add_plugin(PeakMemAllocPlugin::new(GLOBAL));
group.config().num_iter_group = Some(1);
group.config().num_iter_bench = Some(1);
group.register("exists_json", exists_json_union);
group.run();
}
fn exists_json_union(index: &Index) {
let reader = index.reader().expect("reader");
let searcher = reader.searcher();
let query = ExistsQuery::new("json".to_string(), true);
let count = searcher.search(&query, &Count).expect("exists search");
// Prevents optimizer from eliding the search
black_box(count);
}
fn build_index_with_json_subfields(num_docs: usize, num_subfields: usize) -> Index {
// Schema: single JSON field stored as FAST to support ExistsQuery.
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema).expect("create index");
{
let mut index_writer = index
.writer_with_num_threads(1, 200_000_000)
.expect("writer");
for i in 0..num_docs {
let sub = i % num_subfields;
// Only one subpath set per document; rotate subpaths so that
// no single subpath is full, but the union covers all docs.
let v = json!({ format!("field_{sub}"): i as u64 });
index_writer
.add_document(doc!(json_field => v))
.expect("add_document");
}
index_writer.commit().expect("commit");
}
index
}


@@ -56,7 +56,7 @@ fn get_doc_ids_with_values<'a>(
ColumnIndex::Full => Box::new(doc_range),
ColumnIndex::Optional(optional_index) => Box::new(
optional_index
.iter_docs()
.iter_non_null_docs()
.map(move |row| row + doc_range.start),
),
ColumnIndex::Multivalued(multivalued_index) => match multivalued_index {
@@ -73,7 +73,7 @@ fn get_doc_ids_with_values<'a>(
MultiValueIndex::MultiValueIndexV2(multivalued_index) => Box::new(
multivalued_index
.optional_index
.iter_docs()
.iter_non_null_docs()
.map(move |row| row + doc_range.start),
),
},
@@ -177,7 +177,7 @@ impl<'a> Iterable<RowId> for StackedOptionalIndex<'a> {
ColumnIndex::Full => Box::new(columnar_row_range),
ColumnIndex::Optional(optional_index) => Box::new(
optional_index
.iter_docs()
.iter_non_null_docs()
.map(move |row_id: RowId| columnar_row_range.start + row_id),
),
ColumnIndex::Multivalued(_) => {


@@ -215,6 +215,32 @@ impl MultiValueIndex {
}
}
/// Returns an iterator over document ids that have at least one value.
pub fn iter_non_null_docs(&self) -> Box<dyn Iterator<Item = DocId> + '_> {
match self {
MultiValueIndex::MultiValueIndexV1(idx) => {
let mut doc: DocId = 0u32;
let num_docs = idx.num_docs();
Box::new(std::iter::from_fn(move || {
// This is not the most efficient way to do this, but it's legacy code.
while doc < num_docs {
let cur = doc;
doc += 1;
let start = idx.start_index_column.get_val(cur);
let end = idx.start_index_column.get_val(cur + 1);
if end > start {
return Some(cur);
}
}
None
}))
}
MultiValueIndex::MultiValueIndexV2(idx) => {
Box::new(idx.optional_index.iter_non_null_docs())
}
}
}
/// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
/// docids. Positions are converted in place to docids.
///


@@ -88,7 +88,7 @@ pub struct OptionalIndex {
impl Iterable<u32> for &OptionalIndex {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
Box::new(self.iter_docs())
Box::new(self.iter_non_null_docs())
}
}
@@ -280,8 +280,9 @@ impl OptionalIndex {
self.num_non_null_docs
}
pub fn iter_docs(&self) -> impl Iterator<Item = RowId> + '_ {
// TODO optimize
pub fn iter_non_null_docs(&self) -> impl Iterator<Item = RowId> + '_ {
// TODO optimize. We could iterate over the blocks directly.
// We use the dense value ids and retrieve the doc ids via select.
let mut select_batch = self.select_cursor();
(0..self.num_non_null_docs).map(move |rank| select_batch.select(rank))
}


@@ -164,7 +164,11 @@ fn test_optional_index_large() {
fn test_optional_index_iter_aux(row_ids: &[RowId], num_rows: RowId) {
let optional_index = OptionalIndex::for_test(num_rows, row_ids);
assert_eq!(optional_index.num_docs(), num_rows);
assert!(optional_index.iter_docs().eq(row_ids.iter().copied()));
assert!(
optional_index
.iter_non_null_docs()
.eq(row_ids.iter().copied())
);
}
#[test]


@@ -367,7 +367,7 @@ fn is_empty_after_merge(
ColumnIndex::Empty { .. } => true,
ColumnIndex::Full => alive_bitset.len() == 0,
ColumnIndex::Optional(optional_index) => {
for doc in optional_index.iter_docs() {
for doc in optional_index.iter_non_null_docs() {
if alive_bitset.contains(doc) {
return false;
}


@@ -1,3 +1,5 @@
use std::str::FromStr;
use common::DateTime;
use crate::InvalidData;
@@ -9,6 +11,23 @@ pub enum NumericalValue {
F64(f64),
}
impl FromStr for NumericalValue {
type Err = ();
fn from_str(s: &str) -> Result<Self, ()> {
if let Ok(val_i64) = s.parse::<i64>() {
return Ok(val_i64.into());
}
if let Ok(val_u64) = s.parse::<u64>() {
return Ok(val_u64.into());
}
if let Ok(val_f64) = s.parse::<f64>() {
return Ok(NumericalValue::from(val_f64).normalize());
}
Err(())
}
}
impl NumericalValue {
pub fn numerical_type(&self) -> NumericalType {
match self {
@@ -26,7 +45,7 @@ impl NumericalValue {
if val <= i64::MAX as u64 {
NumericalValue::I64(val as i64)
} else {
NumericalValue::F64(val as f64)
NumericalValue::U64(val)
}
}
NumericalValue::I64(val) => NumericalValue::I64(val),
@@ -141,6 +160,7 @@ impl Coerce for DateTime {
#[cfg(test)]
mod tests {
use super::NumericalType;
use crate::NumericalValue;
#[test]
fn test_numerical_type_code() {
@@ -153,4 +173,58 @@ mod tests {
}
assert_eq!(num_numerical_type, 3);
}
#[test]
fn test_parse_numerical() {
assert_eq!(
"123".parse::<NumericalValue>().unwrap(),
NumericalValue::I64(123)
);
assert_eq!(
"18446744073709551615".parse::<NumericalValue>().unwrap(),
NumericalValue::U64(18446744073709551615u64)
);
assert_eq!(
"1.0".parse::<NumericalValue>().unwrap(),
NumericalValue::I64(1i64)
);
assert_eq!(
"1.1".parse::<NumericalValue>().unwrap(),
NumericalValue::F64(1.1f64)
);
assert_eq!(
"-1.0".parse::<NumericalValue>().unwrap(),
NumericalValue::I64(-1i64)
);
}
#[test]
fn test_normalize_numerical() {
assert_eq!(
NumericalValue::from(1u64).normalize(),
NumericalValue::I64(1i64),
);
let limit_val = i64::MAX as u64 + 1u64;
assert_eq!(
NumericalValue::from(limit_val).normalize(),
NumericalValue::U64(limit_val),
);
assert_eq!(
NumericalValue::from(-1i64).normalize(),
NumericalValue::I64(-1i64),
);
assert_eq!(
NumericalValue::from(-2.0f64).normalize(),
NumericalValue::I64(-2i64),
);
assert_eq!(
NumericalValue::from(-2.1f64).normalize(),
NumericalValue::F64(-2.1f64),
);
let large_float = 2.0f64.powf(70.0f64);
assert_eq!(
NumericalValue::from(large_float).normalize(),
NumericalValue::F64(large_float),
);
}
}


@@ -117,6 +117,22 @@ where F: nom::Parser<I, (O, ErrorList), Infallible> {
}
}
pub(crate) fn terminated_infallible<I, O1, O2, F, G>(
mut first: F,
mut second: G,
) -> impl FnMut(I) -> JResult<I, O1>
where
F: nom::Parser<I, (O1, ErrorList), Infallible>,
G: nom::Parser<I, (O2, ErrorList), Infallible>,
{
move |input: I| {
let (input, (o1, mut err)) = first.parse(input)?;
let (input, (_, mut err2)) = second.parse(input)?;
err.append(&mut err2);
Ok((input, (o1, err)))
}
}
pub(crate) fn delimited_infallible<I, O1, O2, O3, F, G, H>(
mut first: F,
mut second: G,


@@ -367,7 +367,10 @@ fn literal(inp: &str) -> IResult<&str, UserInputAst> {
// something (a field name) got parsed before
alt((
map(
tuple((opt(field_name), alt((range, set, exists, term_or_phrase)))),
tuple((
opt(field_name),
alt((range, set, exists, regex, term_or_phrase)),
)),
|(field_name, leaf): (Option<String>, UserInputLeaf)| leaf.set_field(field_name).into(),
),
term_group,
@@ -389,6 +392,10 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
value((), peek(one_of("{[><"))),
map(range_infallible, |(range, errs)| (Some(range), errs)),
),
(
value((), peek(one_of("/"))),
map(regex_infallible, |(regex, errs)| (Some(regex), errs)),
),
),
delimited_infallible(space0_infallible, term_or_phrase_infallible, nothing),
),
@@ -689,6 +696,61 @@ fn set_infallible(mut inp: &str) -> JResult<&str, UserInputLeaf> {
}
}
fn regex(inp: &str) -> IResult<&str, UserInputLeaf> {
map(
terminated(
delimited(
char('/'),
many1(alt((preceded(char('\\'), char('/')), none_of("/")))),
char('/'),
),
peek(alt((multispace1, eof))),
),
|elements| UserInputLeaf::Regex {
field: None,
pattern: elements.into_iter().collect::<String>(),
},
)(inp)
}
fn regex_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
match terminated_infallible(
delimited_infallible(
opt_i_err(char('/'), "missing delimiter /"),
opt_i(many1(alt((preceded(char('\\'), char('/')), none_of("/"))))),
opt_i_err(char('/'), "missing delimiter /"),
),
opt_i_err(
peek(alt((multispace1, eof))),
"expected whitespace or end of input",
),
)(inp)
{
Ok((rest, (elements_part, errors))) => {
let pattern = match elements_part {
Some(elements_part) => elements_part.into_iter().collect(),
None => String::new(),
};
let res = UserInputLeaf::Regex {
field: None,
pattern,
};
Ok((rest, (res, errors)))
}
Err(e) => {
let errs = vec![LenientErrorInternal {
pos: inp.len(),
message: e.to_string(),
}];
let res = UserInputLeaf::Regex {
field: None,
pattern: String::new(),
};
Ok((inp, (res, errs)))
}
}
}
fn negate(expr: UserInputAst) -> UserInputAst {
expr.unary(Occur::MustNot)
}
@@ -1694,6 +1756,63 @@ mod test {
test_is_parse_err(r#"!bc:def"#, "!bc:def");
}
#[test]
fn test_regex_parser() {
let r = parse_to_ast(r#"a:/joh?n(ath[oa]n)/"#);
assert!(r.is_ok(), "Failed to parse custom query: {r:?}");
let (_, input) = r.unwrap();
match input {
UserInputAst::Leaf(leaf) => match leaf.as_ref() {
UserInputLeaf::Regex { field, pattern } => {
assert_eq!(field, &Some("a".to_string()));
assert_eq!(pattern, "joh?n(ath[oa]n)");
}
_ => panic!("Expected a regex leaf, got {leaf:?}"),
},
_ => panic!("Expected a leaf"),
}
let r = parse_to_ast(r#"a:/\\/cgi-bin\\/luci.*/"#);
assert!(r.is_ok(), "Failed to parse custom query: {r:?}");
let (_, input) = r.unwrap();
match input {
UserInputAst::Leaf(leaf) => match leaf.as_ref() {
UserInputLeaf::Regex { field, pattern } => {
assert_eq!(field, &Some("a".to_string()));
assert_eq!(pattern, "\\/cgi-bin\\/luci.*");
}
_ => panic!("Expected a regex leaf, got {leaf:?}"),
},
_ => panic!("Expected a leaf"),
}
}
#[test]
fn test_regex_parser_lenient() {
let literal = |query| literal_infallible(query).unwrap().1;
let (res, errs) = literal(r#"a:/joh?n(ath[oa]n)/"#);
let expected = UserInputLeaf::Regex {
field: Some("a".to_string()),
pattern: "joh?n(ath[oa]n)".to_string(),
}
.into();
assert_eq!(res.unwrap(), expected);
assert!(errs.is_empty(), "Expected no errors, got: {errs:?}");
let (res, errs) = literal("title:/joh?n(ath[oa]n)");
let expected = UserInputLeaf::Regex {
field: Some("title".to_string()),
pattern: "joh?n(ath[oa]n)".to_string(),
}
.into();
assert_eq!(res.unwrap(), expected);
assert_eq!(errs.len(), 1, "Expected 1 error, got: {errs:?}");
assert_eq!(
errs[0].message, "missing delimiter /",
"Unexpected error message",
);
}
#[test]
fn test_space_before_value() {
test_parse_query_to_ast_helper("field : a", r#""field":a"#);


@@ -23,6 +23,10 @@ pub enum UserInputLeaf {
Exists {
field: String,
},
Regex {
field: Option<String>,
pattern: String,
},
}
impl UserInputLeaf {
@@ -46,6 +50,7 @@ impl UserInputLeaf {
UserInputLeaf::Exists { field: _ } => UserInputLeaf::Exists {
field: field.expect("Exist query without a field isn't allowed"),
},
UserInputLeaf::Regex { field: _, pattern } => UserInputLeaf::Regex { field, pattern },
}
}
@@ -103,6 +108,14 @@ impl Debug for UserInputLeaf {
UserInputLeaf::Exists { field } => {
write!(formatter, "$exists(\"{field}\")")
}
UserInputLeaf::Regex { field, pattern } => {
if let Some(field) = field {
// TODO properly escape field (in case of \")
write!(formatter, "\"{field}\":")?;
}
// TODO properly escape pattern (in case of \")
write!(formatter, "/{pattern}/")
}
}
}
}


@@ -155,7 +155,7 @@ fn test_aggregation_flushing(
searcher.search(&AllQuery, &collector).unwrap()
};
let res: Value = serde_json::to_value(&agg_res)?;
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
assert_eq!(res["bucketsL1"]["buckets"][0]["doc_count"], 3);
assert_eq!(
@@ -270,7 +270,7 @@ fn test_aggregation_level1_simple() -> crate::Result<()> {
let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
let res: Value = serde_json::to_value(&agg_res)?;
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
assert_eq!(res["average"]["value"], 12.142857142857142);
assert_eq!(
res["range"]["buckets"],
@@ -304,29 +304,6 @@ fn test_aggregation_level1_simple() -> crate::Result<()> {
Ok(())
}
#[test]
fn test_aggregation_term_truncate_sum_other_doc_count() {
let index = get_test_index_2_segments(true).unwrap();
let reader = index.reader().unwrap();
let count_per_text: Aggregation = serde_json::from_value(json!({ "terms": { "field": "text", "size": 1 } })).unwrap();
let aggs: Aggregations = vec![("group_by_term_truncate".to_string(), count_per_text)]
.into_iter()
.collect();
let collector = get_collector(aggs);
let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
let res: Value = serde_json::to_value(&agg_res).unwrap();
assert_eq!(res, serde_json::json!({
"group_by_term_truncate": {
"buckets": [{ "doc_count": 7, "key": "cool" }],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 2,
},
}));
}
#[test]
fn test_aggregation_level1() -> crate::Result<()> {
let index = get_test_index_2_segments(true)?;
@@ -365,7 +342,7 @@ fn test_aggregation_level1() -> crate::Result<()> {
let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
let res: Value = serde_json::to_value(&agg_res)?;
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
assert_eq!(res["average"]["value"], 12.142857142857142);
assert_eq!(res["average_f64"]["value"], 12.214285714285714);
assert_eq!(res["average_i64"]["value"], 12.142857142857142);
@@ -420,7 +397,7 @@ fn test_aggregation_level2(
IndexRecordOption::Basic,
);
let elasticsearch_compatible_json_req = serde_json::json!(
let elasticsearch_compatible_json_req = r#"
{
"rangef64": {
"range": {
@@ -473,8 +450,9 @@ fn test_aggregation_level2(
"term_agg": { "terms": { "field": "text" } }
}
}
});
let agg_req: Aggregations = serde_json::from_value(elasticsearch_compatible_json_req).unwrap();
}
"#;
let agg_req: Aggregations = serde_json::from_str(elasticsearch_compatible_json_req).unwrap();
let agg_res: AggregationResults = if use_distributed_collector {
let collector =
@@ -491,7 +469,7 @@ fn test_aggregation_level2(
searcher.search(&term_query, &collector).unwrap()
};
let res: Value = serde_json::to_value(agg_res)?;
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
assert_eq!(res["range"]["buckets"][1]["key"], "3-7");
assert_eq!(res["range"]["buckets"][1]["doc_count"], 2u64);


@@ -1,3 +1,4 @@
use columnar::NumericalValue;
use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
use common::{replace_in_place, JsonPathWriter};
use rustc_hash::FxHashMap;
@@ -152,7 +153,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
if let Ok(i64_val) = val.try_into() {
term_buffer.append_type_and_fast_value::<i64>(i64_val);
} else {
term_buffer.append_type_and_fast_value(val);
term_buffer.append_type_and_fast_value::<u64>(val);
}
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
@@ -166,12 +167,30 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::F64(val) => {
if !val.is_finite() {
return;
};
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
);
term_buffer.append_type_and_fast_value(val);
// Normalizing here is important.
// In the inverted index, we coerce all numerical values to their canonical
// representation.
//
// (We do the same thing on the query side)
match NumericalValue::F64(val).normalize() {
NumericalValue::I64(val_i64) => {
term_buffer.append_type_and_fast_value::<i64>(val_i64);
}
NumericalValue::U64(val_u64) => {
term_buffer.append_type_and_fast_value::<u64>(val_u64);
}
NumericalValue::F64(val_f64) => {
term_buffer.append_type_and_fast_value::<f64>(val_f64);
}
}
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::Bool(val) => {
@@ -241,8 +260,8 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
///
/// The term must be json + JSON path.
pub fn convert_to_fast_value_and_append_to_json_term(
mut term: Term,
phrase: &str,
term: &Term,
text: &str,
truncate_date_for_search: bool,
) -> Option<Term> {
assert_eq!(
@@ -254,31 +273,50 @@ pub fn convert_to_fast_value_and_append_to_json_term(
0,
"JSON value bytes should be empty"
);
if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
if truncate_date_for_search {
dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
try_convert_to_datetime_and_append_to_json_term(term, text, truncate_date_for_search)
.or_else(|| try_convert_to_number_and_append_to_json_term(term, text))
.or_else(|| try_convert_to_bool_and_append_to_json_term_typed(term, text))
}
fn try_convert_to_datetime_and_append_to_json_term(
term: &Term,
text: &str,
truncate_date_for_search: bool,
) -> Option<Term> {
let dt = OffsetDateTime::parse(text, &Rfc3339).ok()?;
let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
if truncate_date_for_search {
dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
}
let mut term_clone = term.clone();
term_clone.append_type_and_fast_value(dt);
Some(term_clone)
}
fn try_convert_to_number_and_append_to_json_term(term: &Term, text: &str) -> Option<Term> {
let numerical_value: NumericalValue = str::parse::<NumericalValue>(text).ok()?;
let mut term_clone = term.clone();
// Parse actually returns normalized values already today, but let's
// not rely on that hidden contract.
match numerical_value.normalize() {
NumericalValue::I64(i64_value) => {
term_clone.append_type_and_fast_value::<i64>(i64_value);
}
NumericalValue::U64(u64_value) => {
term_clone.append_type_and_fast_value::<u64>(u64_value);
}
NumericalValue::F64(f64_value) => {
term_clone.append_type_and_fast_value::<f64>(f64_value);
}
term.append_type_and_fast_value(dt);
return Some(term);
}
if let Ok(i64_val) = str::parse::<i64>(phrase) {
term.append_type_and_fast_value(i64_val);
return Some(term);
}
if let Ok(u64_val) = str::parse::<u64>(phrase) {
term.append_type_and_fast_value(u64_val);
return Some(term);
}
if let Ok(f64_val) = str::parse::<f64>(phrase) {
term.append_type_and_fast_value(f64_val);
return Some(term);
}
if let Ok(bool_val) = str::parse::<bool>(phrase) {
term.append_type_and_fast_value(bool_val);
return Some(term);
}
None
Some(term_clone)
}
fn try_convert_to_bool_and_append_to_json_term_typed(term: &Term, text: &str) -> Option<Term> {
let val = str::parse::<bool>(text).ok()?;
let mut term_clone = term.clone();
term_clone.append_type_and_fast_value(val);
Some(term_clone)
}
/// Splits a json path supplied to the query parser in such a way that


@@ -370,6 +370,8 @@ macro_rules! fail_point {
/// Common test utilities.
#[cfg(test)]
pub mod tests {
use std::collections::BTreeMap;
use common::{BinarySerializable, FixedSize};
use query_grammar::{UserInputAst, UserInputLeaf, UserInputLiteral};
use rand::distributions::{Bernoulli, Uniform};
@@ -382,7 +384,7 @@ pub mod tests {
use crate::index::SegmentReader;
use crate::merge_policy::NoMergePolicy;
use crate::postings::Postings;
use crate::query::BooleanQuery;
use crate::query::{BooleanQuery, QueryParser};
use crate::schema::*;
use crate::{DateTime, DocAddress, Index, IndexWriter, ReloadPolicy};
@@ -1223,4 +1225,49 @@ pub mod tests {
);
assert_eq!(dt_from_ts_nanos.to_hms_micro(), offset_dt.to_hms_micro());
}
#[test]
fn test_json_number_ambiguity() {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("number", crate::schema::TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
{
let mut doc = TantivyDocument::new();
let mut obj = BTreeMap::default();
obj.insert("key".to_string(), OwnedValue::I64(1i64));
doc.add_object(json_field, obj);
index_writer.add_document(doc).unwrap();
}
{
let mut doc = TantivyDocument::new();
let mut obj = BTreeMap::default();
obj.insert("key".to_string(), OwnedValue::U64(1u64));
doc.add_object(json_field, obj);
index_writer.add_document(doc).unwrap();
}
{
let mut doc = TantivyDocument::new();
let mut obj = BTreeMap::default();
obj.insert("key".to_string(), OwnedValue::F64(1.0f64));
doc.add_object(json_field, obj);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
assert_eq!(searcher.num_docs(), 3);
{
let parser = QueryParser::for_index(&index, vec![]);
let query = parser.parse_query("number.key:1").unwrap();
let count = searcher.search(&query, &crate::collector::Count).unwrap();
assert_eq!(count, 3);
}
{
let parser = QueryParser::for_index(&index, vec![]);
let query = parser.parse_query("number.key:1.0").unwrap();
let count = searcher.search(&query, &crate::collector::Count).unwrap();
assert_eq!(count, 3);
}
}
}


@@ -1,12 +1,15 @@
use core::fmt::Debug;
use columnar::{ColumnIndex, DynamicColumn};
use common::BitSet;
use super::{ConstScorer, EmptyScorer};
use crate::docset::{DocSet, TERMINATED};
use crate::index::SegmentReader;
use crate::query::all_query::AllScorer;
use crate::query::boost_query::BoostScorer;
use crate::query::explanation::does_not_match;
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
use crate::query::{BitSetDocSet, EnableScoring, Explanation, Query, Scorer, Weight};
use crate::schema::Type;
use crate::{DocId, Score, TantivyError};
@@ -113,13 +116,49 @@ impl Weight for ExistsWeight {
non_empty_columns.push(column)
}
}
// TODO: we can optimize more here since in most cases we will have only one index
if !non_empty_columns.is_empty() {
let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc());
Ok(Box::new(ConstScorer::new(docset, boost)))
} else {
Ok(Box::new(EmptyScorer))
if non_empty_columns.is_empty() {
return Ok(Box::new(EmptyScorer));
}
// If any column is full, all docs match.
let max_doc = reader.max_doc();
if non_empty_columns
.iter()
.any(|col| matches!(col.column_index(), ColumnIndex::Full))
{
let all_scorer = AllScorer::new(max_doc);
return Ok(Box::new(BoostScorer::new(all_scorer, boost)));
}
// If we have only a few dynamic columns, use ExistsDocSet
// NOTE: A lower number may be better for very sparse columns
if non_empty_columns.len() < 4 {
let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc());
return Ok(Box::new(ConstScorer::new(docset, boost)));
}
// If we have many dynamic columns, precompute a bitset of matching docs
let mut doc_bitset = BitSet::with_max_value(max_doc);
for column in &non_empty_columns {
match column.column_index() {
ColumnIndex::Empty { .. } => {}
ColumnIndex::Full => {
// Handled by AllScorer return above.
}
ColumnIndex::Optional(optional_index) => {
for doc in optional_index.iter_non_null_docs() {
doc_bitset.insert(doc);
}
}
ColumnIndex::Multivalued(multi_idx) => {
for doc in multi_idx.iter_non_null_docs() {
doc_bitset.insert(doc);
}
}
}
}
let docset = BitSetDocSet::from(doc_bitset);
Ok(Box::new(ConstScorer::new(docset, boost)))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
@@ -294,6 +333,43 @@ mod tests {
Ok(())
}
#[test]
fn test_exists_query_json_union_no_single_full_subpath() -> crate::Result<()> {
// Build docs where no single subpath exists for all docs, but the union does.
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
for i in 0u64..100u64 {
if i % 2 == 0 {
// only subpath `a`
index_writer.add_document(doc!(json => json!({"a": i})))?;
} else {
// only subpath `b`
index_writer.add_document(doc!(json => json!({"b": i})))?;
}
}
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
// No single subpath is full
assert_eq!(count_existing_fields(&searcher, "json.a", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "json.b", false)?, 50);
// Root exists with subpaths disabled is zero
assert_eq!(count_existing_fields(&searcher, "json", false)?, 0);
// Root exists with subpaths enabled should match all docs via union
assert_eq!(count_existing_fields(&searcher, "json", true)?, 100);
Ok(())
}
#[test]
fn test_exists_query_misc_supported_types() -> crate::Result<()> {
let mut schema_builder = Schema::builder();


@@ -1,8 +1,11 @@
use std::fmt;
use std::ops::Bound;
use std::sync::Arc;
use tantivy_fst::Regex;
use crate::query::Occur;
use crate::schema::Term;
use crate::schema::{Field, Term};
use crate::Score;
#[derive(Clone)]
@@ -21,6 +24,10 @@ pub enum LogicalLiteral {
elements: Vec<Term>,
},
All,
Regex {
pattern: Arc<Regex>,
field: Field,
},
}
pub enum LogicalAst {
@@ -147,6 +154,10 @@ impl fmt::Debug for LogicalLiteral {
write!(formatter, "]")
}
LogicalLiteral::All => write!(formatter, "*"),
LogicalLiteral::Regex {
ref pattern,
ref field,
} => write!(formatter, "Regex({field:?}, {pattern:?})"),
}
}
}


@@ -2,12 +2,14 @@ use std::net::{AddrParseError, IpAddr};
use std::num::{ParseFloatError, ParseIntError};
use std::ops::Bound;
use std::str::{FromStr, ParseBoolError};
use std::sync::Arc;
use base64::engine::general_purpose::STANDARD as BASE64;
use base64::Engine;
use itertools::Itertools;
use query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
use rustc_hash::FxHashMap;
use tantivy_fst::Regex;
use super::logical_ast::*;
use crate::index::Index;
@@ -15,7 +17,7 @@ use crate::json_utils::convert_to_fast_value_and_append_to_json_term;
use crate::query::range_query::{is_type_valid_for_fastfield_range_query, RangeQuery};
use crate::query::{
AllQuery, BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhrasePrefixQuery,
PhraseQuery, Query, TermQuery, TermSetQuery,
PhraseQuery, Query, RegexQuery, TermQuery, TermSetQuery,
};
use crate::schema::{
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
@@ -206,6 +208,7 @@ pub struct QueryParser {
tokenizer_manager: TokenizerManager,
boost: FxHashMap<Field, Score>,
fuzzy: FxHashMap<Field, Fuzzy>,
regexes_allowed: bool,
}
#[derive(Clone)]
@@ -260,6 +263,7 @@ impl QueryParser {
conjunction_by_default: false,
boost: Default::default(),
fuzzy: Default::default(),
regexes_allowed: false,
}
}
@@ -320,6 +324,11 @@ impl QueryParser {
);
}
/// Allow regexes in queries
pub fn allow_regexes(&mut self) {
self.regexes_allowed = true;
}
/// Parse a query
///
/// Note that `parse_query` returns an error if the input
@@ -486,24 +495,17 @@ impl QueryParser {
Ok(terms.into_iter().next().unwrap())
}
FieldType::JsonObject(ref json_options) => {
let get_term_with_path = || {
Term::from_field_json_path(
field,
json_path,
json_options.is_expand_dots_enabled(),
)
};
let mut term = Term::from_field_json_path(
field,
json_path,
json_options.is_expand_dots_enabled(),
);
if let Some(term) =
// Try to convert the phrase to a fast value
convert_to_fast_value_and_append_to_json_term(
get_term_with_path(),
phrase,
false,
)
convert_to_fast_value_and_append_to_json_term(&term, phrase, false)
{
Ok(term)
} else {
let mut term = get_term_with_path();
term.append_type_and_str(phrase);
Ok(term)
}
@@ -860,6 +862,51 @@ impl QueryParser {
"Range query need to target a specific field.".to_string(),
)],
),
UserInputLeaf::Regex { field, pattern } => {
if !self.regexes_allowed {
return (
None,
vec![QueryParserError::UnsupportedQuery(
"Regex queries are not allowed.".to_string(),
)],
);
}
let full_path = try_tuple!(field.ok_or_else(|| {
QueryParserError::UnsupportedQuery(
"Regex query need to target a specific field.".to_string(),
)
}));
let (field, json_path) = try_tuple!(self
.split_full_path(&full_path)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
if !json_path.is_empty() {
return (
None,
vec![QueryParserError::UnsupportedQuery(
"Regex query does not support json paths.".to_string(),
)],
);
}
if !matches!(
self.schema.get_field_entry(field).field_type(),
FieldType::Str(_)
) {
return (
None,
vec![QueryParserError::UnsupportedQuery(
"Regex query only supported on text fields".to_string(),
)],
);
}
let pattern = try_tuple!(Regex::new(&pattern).map_err(|e| {
QueryParserError::UnsupportedQuery(format!("Invalid regex: {e}"))
}));
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Regex {
pattern: Arc::new(pattern),
field,
}));
(Some(logical_ast), Vec::new())
}
}
}
}
@@ -902,6 +949,9 @@ fn convert_literal_to_query(
LogicalLiteral::Range { lower, upper } => Box::new(RangeQuery::new(lower, upper)),
LogicalLiteral::Set { elements, .. } => Box::new(TermSetQuery::new(elements)),
LogicalLiteral::All => Box::new(AllQuery),
LogicalLiteral::Regex { pattern, field } => {
Box::new(RegexQuery::from_regex(pattern, field))
}
}
}
@@ -971,7 +1021,7 @@ fn generate_literals_for_json_object(
// Try to convert the phrase to a fast value
if let Some(term) =
convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase, true)
convert_to_fast_value_and_append_to_json_term(&get_term_with_path(), phrase, true)
{
logical_literals.push(LogicalLiteral::Term(term));
}
@@ -1100,11 +1150,15 @@ mod test {
query: &str,
default_conjunction: bool,
default_fields: &[&'static str],
allow_regexes: bool,
) -> Result<LogicalAst, QueryParserError> {
let mut query_parser = make_query_parser_with_default_fields(default_fields);
if default_conjunction {
query_parser.set_conjunction_by_default();
}
if allow_regexes {
query_parser.allow_regexes();
}
query_parser.parse_query_to_logical_ast(query)
}
@@ -1116,6 +1170,7 @@ mod test {
query,
default_conjunction,
&["title", "text"],
true,
)
}
@@ -1130,6 +1185,7 @@ mod test {
query,
default_conjunction,
default_fields,
true,
)
.unwrap();
let query_str = format!("{query:?}");
@@ -1993,4 +2049,56 @@ mod test {
Err(QueryParserError::ExpectedInt(_))
);
}
#[test]
pub fn test_regex() {
let expected_regex = tantivy_fst::Regex::new(r".*b").unwrap();
test_parse_query_to_logical_ast_helper(
"title:/.*b/",
format!("Regex(Field(0), {:#?})", expected_regex).as_str(),
false,
);
// Invalid field
let err = parse_query_to_logical_ast("float:/.*b/", false).unwrap_err();
assert_eq!(
err.to_string(),
"Unsupported query: Regex query only supported on text fields"
);
// No field specified
let err = parse_query_to_logical_ast("/.*b/", false).unwrap_err();
assert_eq!(
err.to_string(),
"Unsupported query: Regex query need to target a specific field."
);
// Regex on a json path
let err = parse_query_to_logical_ast("title.subpath:/.*b/", false).unwrap_err();
assert_eq!(
err.to_string(),
"Unsupported query: Regex query does not support json paths."
);
// Invalid regex
let err = parse_query_to_logical_ast("title:/[A-Z*b/", false).unwrap_err();
assert_eq!(
err.to_string(),
"Unsupported query: Invalid regex: regex parse error:\n [A-Z*b\n ^\nerror: \
unclosed character class"
);
// Regexes not allowed
let err = parse_query_to_logical_ast_with_default_fields(
"title:/.*b/",
false,
&["title", "text"],
false,
)
.unwrap_err();
assert_eq!(
err.to_string(),
"Unsupported query: Regex queries are not allowed."
);
}
}


@@ -12,10 +12,14 @@ pub use self::range_query_fastfield::*;
// TODO is this correct?
pub(crate) fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool {
match typ {
Type::Str | Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date | Type::Json => {
true
}
Type::IpAddr => true,
Type::Str
| Type::U64
| Type::I64
| Type::F64
| Type::Bool
| Type::Date
| Type::Json
| Type::IpAddr => true,
Type::Facet | Type::Bytes => false,
}
}


@@ -11,7 +11,7 @@ mod tests {
use crate::docset::DocSet;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::query::{EnableScoring, Query, QueryParser, Scorer, TermQuery};
use crate::schema::{Field, IndexRecordOption, Schema, STRING, TEXT};
use crate::schema::{Field, IndexRecordOption, Schema, FAST, STRING, TEXT};
use crate::{assert_nearly_equals, DocAddress, Index, IndexWriter, Term, TERMINATED};
#[test]
@@ -212,4 +212,232 @@ mod tests {
}
Ok(())
}
#[test]
fn test_term_query_fallback_to_fastfield() -> crate::Result<()> {
use crate::collector::Count;
use crate::schema::FAST;
// Create a FAST-only numeric field (not indexed)
let mut schema_builder = Schema::builder();
let num_field = schema_builder.add_u64_field("num", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(num_field => 10u64))?;
index_writer.add_document(doc!(num_field => 20u64))?;
index_writer.add_document(doc!(num_field => 10u64))?;
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
// TermQuery should fall back to a fastfield range query and match correctly.
let tq_10 = TermQuery::new(
Term::from_field_u64(num_field, 10u64),
IndexRecordOption::Basic,
);
let tq_20 = TermQuery::new(
Term::from_field_u64(num_field, 20u64),
IndexRecordOption::Basic,
);
let tq_30 = TermQuery::new(
Term::from_field_u64(num_field, 30u64),
IndexRecordOption::Basic,
);
let count_10 = searcher.search(&tq_10, &Count)?;
let count_20 = searcher.search(&tq_20, &Count)?;
let count_30 = searcher.search(&tq_30, &Count)?;
assert_eq!(count_10, 2);
assert_eq!(count_20, 1);
assert_eq!(count_30, 0);
Ok(())
}
#[test]
fn test_term_query_fallback_text_fast_only() -> crate::Result<()> {
use crate::collector::Count;
// FAST-only text field (not indexed)
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field => "hello"))?;
index_writer.add_document(doc!(text_field => "world"))?;
index_writer.add_document(doc!(text_field => "hello"))?;
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
let tq_hello = TermQuery::new(
Term::from_field_text(text_field, "hello"),
IndexRecordOption::Basic,
);
let tq_world = TermQuery::new(
Term::from_field_text(text_field, "world"),
IndexRecordOption::Basic,
);
let tq_missing = TermQuery::new(
Term::from_field_text(text_field, "nope"),
IndexRecordOption::Basic,
);
assert_eq!(searcher.search(&tq_hello, &Count)?, 2);
assert_eq!(searcher.search(&tq_world, &Count)?, 1);
assert_eq!(searcher.search(&tq_missing, &Count)?, 0);
Ok(())
}
#[test]
fn test_term_query_fallback_json_fast_only() -> crate::Result<()> {
use crate::collector::Count;
use crate::fastfield::FastValue;
use crate::schema::FAST;
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(json_field => json!({"a": 10, "b": "x"})))?;
index_writer.add_document(doc!(json_field => json!({"a": 20, "b": "y"})))?;
index_writer.add_document(doc!(json_field => json!({"a": 10, "b": "z"})))?;
index_writer.commit()?;
}
fn json_term_fast<T: FastValue>(field: Field, path: &str, v: T) -> Term {
let mut term = Term::from_field_json_path(field, path, true);
term.append_type_and_fast_value(v);
term
}
fn json_term_str(field: Field, path: &str, v: &str) -> Term {
let mut term = Term::from_field_json_path(field, path, true);
term.append_type_and_str(v);
term
}
let searcher = index.reader()?.searcher();
// numeric path match
let tq_a10 = TermQuery::new(
json_term_fast(json_field, "a", 10u64),
IndexRecordOption::Basic,
);
let tq_a20 = TermQuery::new(
json_term_fast(json_field, "a", 20u64),
IndexRecordOption::Basic,
);
let tq_a30 = TermQuery::new(
json_term_fast(json_field, "a", 30u64),
IndexRecordOption::Basic,
);
assert_eq!(searcher.search(&tq_a10, &Count)?, 2);
assert_eq!(searcher.search(&tq_a20, &Count)?, 1);
assert_eq!(searcher.search(&tq_a30, &Count)?, 0);
// string path match
let tq_bx = TermQuery::new(
json_term_str(json_field, "b", "x"),
IndexRecordOption::Basic,
);
let tq_by = TermQuery::new(
json_term_str(json_field, "b", "y"),
IndexRecordOption::Basic,
);
let tq_bm = TermQuery::new(
json_term_str(json_field, "b", "missing"),
IndexRecordOption::Basic,
);
assert_eq!(searcher.search(&tq_bx, &Count)?, 1);
assert_eq!(searcher.search(&tq_by, &Count)?, 1);
assert_eq!(searcher.search(&tq_bm, &Count)?, 0);
Ok(())
}
#[test]
fn test_term_query_fallback_ip_fast_only() -> crate::Result<()> {
use std::net::IpAddr;
use std::str::FromStr;
use crate::collector::Count;
use crate::schema::{IntoIpv6Addr, FAST};
let mut schema_builder = Schema::builder();
let ip_field = schema_builder.add_ip_addr_field("ip", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let ip1 = IpAddr::from_str("127.0.0.1").unwrap().into_ipv6_addr();
let ip2 = IpAddr::from_str("127.0.0.2").unwrap().into_ipv6_addr();
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(ip_field => ip1))?;
index_writer.add_document(doc!(ip_field => ip2))?;
index_writer.add_document(doc!(ip_field => ip1))?;
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
let tq_ip1 = TermQuery::new(
Term::from_field_ip_addr(ip_field, ip1),
IndexRecordOption::Basic,
);
let tq_ip2 = TermQuery::new(
Term::from_field_ip_addr(ip_field, ip2),
IndexRecordOption::Basic,
);
let ip3 = IpAddr::from_str("127.0.0.3").unwrap().into_ipv6_addr();
let tq_ip3 = TermQuery::new(
Term::from_field_ip_addr(ip_field, ip3),
IndexRecordOption::Basic,
);
assert_eq!(searcher.search(&tq_ip1, &Count)?, 2);
assert_eq!(searcher.search(&tq_ip2, &Count)?, 1);
assert_eq!(searcher.search(&tq_ip3, &Count)?, 0);
Ok(())
}
#[test]
fn test_term_query_fallback_fastfield_with_scores_errors() -> crate::Result<()> {
use crate::collector::TopDocs;
// FAST-only numeric field (not indexed) should error when scoring is required
let mut schema_builder = Schema::builder();
let num_field = schema_builder.add_u64_field("num", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(num_field => 10u64))?;
index_writer.add_document(doc!(num_field => 20u64))?;
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
let tq = TermQuery::new(
Term::from_field_u64(num_field, 10u64),
IndexRecordOption::Basic,
);
// Using TopDocs requires scoring; since the field is not indexed,
// TermQuery cannot score and should return a SchemaError.
let res = searcher.search(&tq, &TopDocs::with_limit(1));
assert!(matches!(res, Err(crate::TantivyError::SchemaError(_))));
Ok(())
}
}


@@ -1,8 +1,10 @@
use std::fmt;
use std::ops::Bound;
use super::term_weight::TermWeight;
use crate::query::bm25::Bm25Weight;
use crate::query::{EnableScoring, Explanation, Query, Weight};
use crate::query::range_query::is_type_valid_for_fastfield_range_query;
use crate::query::{EnableScoring, Explanation, Query, RangeQuery, Weight};
use crate::schema::IndexRecordOption;
use crate::Term;
@@ -122,6 +124,24 @@ impl TermQuery {
impl Query for TermQuery {
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
// If the field is not indexed but is a suitable fast field, fall back to a range query
// on the fast field matching exactly this term.
//
// Note: This is considerably slower since it requires scanning the entire fast field.
// TODO: The range query would gain from having a single-value optimization
let schema = enable_scoring.schema();
let field_entry = schema.get_field_entry(self.term.field());
if !field_entry.is_indexed()
&& field_entry.is_fast()
&& is_type_valid_for_fastfield_range_query(self.term.typ())
&& !enable_scoring.is_scoring_enabled()
{
let range_query = RangeQuery::new(
Bound::Included(self.term.clone()),
Bound::Included(self.term.clone()),
);
return range_query.weight(enable_scoring);
}
Ok(Box::new(self.specialized_weight(enable_scoring)?))
}
fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {