Compare commits


9 Commits

Author SHA1 Message Date
Paul Masurel
3a8a83da80 tracing 2023-10-16 19:23:47 +09:00
trinity-1686a
0241a05b90 add support for exists query syntax in query parser (#2170)
* add support for exists query syntax in query parser

* rustfmt

* make Exists require a field
2023-09-19 11:10:39 +02:00
PSeitz
e125f3b041 fix test (#2178) 2023-09-19 08:21:50 +02:00
PSeitz
c520ac46fc add support for date in term agg (#2172)
support DateTime in TermsAggregation
Format dates with Rfc3339
2023-09-14 09:22:18 +02:00
PSeitz
2d7390341c increase min memory to 15MB for indexing (#2176)
With tantivy 0.20 the minimum memory consumption per SegmentWriter increased to
12MB. 7MB are for the different fast field collector types (they could be
lazily created). Increase the minimum memory from 3MB to 15MB.

Change memory variable naming from arena to budget.

closes #2156
2023-09-13 07:38:34 +02:00
dependabot[bot]
03fcdce016 Bump actions/checkout from 3 to 4 (#2171)
Bumps [actions/checkout](https://github.com/actions/checkout) from 3 to 4.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](https://github.com/actions/checkout/compare/v3...v4)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-09-11 10:47:33 +02:00
Ping Xia
e4e416ac42 extend FuzzyTermQuery to support json field (#2173)
* extend fuzzy search for json field

* comments

* comments

* fmt fix

* comments
2023-09-11 05:59:40 +02:00
Igor Motov
19325132b7 Fast-field based implementation of ExistsQuery (#2160)
Adds an implementation of ExistsQuery that takes advantage of fast fields.

Fixes #2159
2023-09-07 11:51:49 +09:00
Paul Masurel
389d36f760 Added comments 2023-09-04 11:06:56 +09:00
22 changed files with 878 additions and 264 deletions
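
Taken together, the headline changes here are the fast-field-based ExistsQuery (#2160) and the matching `field:*` syntax in the query parser (#2170). A minimal end-to-end sketch of how they fit together, mirroring the tests added below (field names are illustrative):

use tantivy::collector::Count;
use tantivy::query::ExistsQuery;
use tantivy::schema::{Schema, FAST, INDEXED, STRING};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let all = schema_builder.add_u64_field("all", INDEXED | FAST);
    let opt = schema_builder.add_text_field("opt", STRING | FAST);
    let index = Index::create_in_ram(schema_builder.build());

    // 15MB is the new minimum indexing memory budget from #2176.
    let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
    writer.add_document(doc!(all => 1u64, opt => "present"))?;
    writer.add_document(doc!(all => 2u64))?; // no value in `opt`
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    // ExistsQuery matches every document with at least one value in a fast field.
    let query = ExistsQuery::new_exists_query("opt".to_string());
    assert_eq!(searcher.search(&query, &Count)?, 1);
    Ok(())
}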

View File

@@ -15,7 +15,7 @@ jobs:
coverage:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Install Rust
run: rustup toolchain install nightly --profile minimal --component llvm-tools-preview
- uses: Swatinem/rust-cache@v2

View File

@@ -19,7 +19,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Install stable
uses: actions-rs/toolchain@v1
with:

View File

@@ -20,7 +20,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Install nightly
uses: actions-rs/toolchain@v1
@@ -60,7 +60,7 @@ jobs:
name: test-${{ matrix.features.label}}
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Install stable
uses: actions-rs/toolchain@v1

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.21.1"
version = "0.21.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -19,6 +19,7 @@ oneshot = "0.1.5"
base64 = "0.21.0"
byteorder = "1.4.3"
crc32fast = "1.3.2"
tracing = "0.1"
once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
aho-corasick = "1.0"

View File

@@ -24,7 +24,7 @@ const SPECIAL_CHARS: &[char] = &[
/// consume a field name followed by colon. Return the field name with escape sequence
/// already interpreted
fn field_name(i: &str) -> IResult<&str, String> {
fn field_name(inp: &str) -> IResult<&str, String> {
let simple_char = none_of(SPECIAL_CHARS);
let first_char = verify(none_of(SPECIAL_CHARS), |c| *c != '-');
let escape_sequence = || preceded(char('\\'), one_of(SPECIAL_CHARS));
@@ -38,12 +38,12 @@ fn field_name(i: &str) -> IResult<&str, String> {
char(':'),
),
|(first_char, next)| once(first_char).chain(next).collect(),
)(i)
)(inp)
}
/// Consume a word outside of any context.
// TODO should support escape sequences
fn word(i: &str) -> IResult<&str, &str> {
fn word(inp: &str) -> IResult<&str, &str> {
map_res(
recognize(tuple((
satisfy(|c| {
@@ -55,14 +55,14 @@ fn word(i: &str) -> IResult<&str, &str> {
})),
))),
|s| match s {
"OR" | "AND" | "NOT" | "IN" => Err(Error::new(i, ErrorKind::Tag)),
"OR" | "AND" | "NOT" | "IN" => Err(Error::new(inp, ErrorKind::Tag)),
_ => Ok(s),
},
)(i)
)(inp)
}
fn word_infallible(delimiter: &str) -> impl Fn(&str) -> JResult<&str, Option<&str>> + '_ {
|i| {
|inp| {
opt_i_err(
preceded(
space0,
@@ -71,29 +71,29 @@ fn word_infallible(delimiter: &str) -> impl Fn(&str) -> JResult<&str, Option<&st
}))),
),
"expected word",
)(i)
)(inp)
}
}
/// Consume a word inside a Range context. More values are allowed as they are
/// not ambiguous in this context.
fn relaxed_word(i: &str) -> IResult<&str, &str> {
fn relaxed_word(inp: &str) -> IResult<&str, &str> {
recognize(tuple((
satisfy(|c| !c.is_whitespace() && !['`', '{', '}', '"', '[', ']', '(', ')'].contains(&c)),
many0(satisfy(|c: char| {
!c.is_whitespace() && !['{', '}', '"', '[', ']', '(', ')'].contains(&c)
})),
)))(i)
)))(inp)
}
fn negative_number(i: &str) -> IResult<&str, &str> {
fn negative_number(inp: &str) -> IResult<&str, &str> {
recognize(preceded(
char('-'),
tuple((digit1, opt(tuple((char('.'), digit1))))),
))(i)
))(inp)
}
fn simple_term(i: &str) -> IResult<&str, (Delimiter, String)> {
fn simple_term(inp: &str) -> IResult<&str, (Delimiter, String)> {
let escaped_string = |delimiter| {
// we need this because none_of can't accept an owned array of char.
let not_delimiter = verify(anychar, move |parsed| *parsed != delimiter);
@@ -123,13 +123,13 @@ fn simple_term(i: &str) -> IResult<&str, (Delimiter, String)> {
simple_quotes,
double_quotes,
text_no_delimiter,
))(i)
))(inp)
}
fn simple_term_infallible(
delimiter: &str,
) -> impl Fn(&str) -> JResult<&str, Option<(Delimiter, String)>> + '_ {
|i| {
|inp| {
let escaped_string = |delimiter| {
// we need this because none_of can't accept an owned array of char.
let not_delimiter = verify(anychar, move |parsed| *parsed != delimiter);
@@ -162,11 +162,11 @@ fn simple_term_infallible(
map(word_infallible(delimiter), |(text, errors)| {
(text.map(|text| (Delimiter::None, text.to_string())), errors)
}),
)(i)
)(inp)
}
}
fn term_or_phrase(i: &str) -> IResult<&str, UserInputLeaf> {
fn term_or_phrase(inp: &str) -> IResult<&str, UserInputLeaf> {
map(
tuple((simple_term, fallible(slop_or_prefix_val))),
|((delimiter, phrase), (slop, prefix))| {
@@ -179,10 +179,10 @@ fn term_or_phrase(i: &str) -> IResult<&str, UserInputLeaf> {
}
.into()
},
)(i)
)(inp)
}
fn term_or_phrase_infallible(i: &str) -> JResult<&str, Option<UserInputLeaf>> {
fn term_or_phrase_infallible(inp: &str) -> JResult<&str, Option<UserInputLeaf>> {
map(
// ~* for slop/prefix, ) inside group or ast tree, ^ if boost
tuple_infallible((simple_term_infallible("*)^"), slop_or_prefix_val)),
@@ -214,10 +214,10 @@ fn term_or_phrase_infallible(i: &str) -> JResult<&str, Option<UserInputLeaf>> {
};
(leaf, errors)
},
)(i)
)(inp)
}
fn term_group(i: &str) -> IResult<&str, UserInputAst> {
fn term_group(inp: &str) -> IResult<&str, UserInputAst> {
let occur_symbol = alt((
value(Occur::MustNot, char('-')),
value(Occur::Must, char('+')),
@@ -240,12 +240,12 @@ fn term_group(i: &str) -> IResult<&str, UserInputAst> {
.collect(),
)
},
)(i)
)(inp)
}
// this is a precondition for term_group_infallible. Without it, term_group_infallible can fail
// with a panic. It does not consume its input.
fn term_group_precond(i: &str) -> IResult<&str, (), ()> {
fn term_group_precond(inp: &str) -> IResult<&str, (), ()> {
value(
(),
peek(tuple((
@@ -253,13 +253,13 @@ fn term_group_precond(i: &str) -> IResult<&str, (), ()> {
space0,
char('('), // when we are here, we know it can't be anything but a term group
))),
)(i)
)(inp)
.map_err(|e| e.map(|_| ()))
}
fn term_group_infallible(i: &str) -> JResult<&str, UserInputAst> {
let (mut i, (field_name, _, _, _)) =
tuple((field_name, space0, char('('), space0))(i).expect("precondition failed");
fn term_group_infallible(inp: &str) -> JResult<&str, UserInputAst> {
let (mut inp, (field_name, _, _, _)) =
tuple((field_name, space0, char('('), space0))(inp).expect("precondition failed");
let mut terms = Vec::new();
let mut errs = Vec::new();
@@ -270,19 +270,19 @@ fn term_group_infallible(i: &str) -> JResult<&str, UserInputAst> {
first_round = false;
Vec::new()
} else {
let (rest, (_, err)) = space1_infallible(i)?;
i = rest;
let (rest, (_, err)) = space1_infallible(inp)?;
inp = rest;
err
};
if i.is_empty() {
if inp.is_empty() {
errs.push(LenientErrorInternal {
pos: i.len(),
pos: inp.len(),
message: "missing )".to_string(),
});
break Ok((i, (UserInputAst::Clause(terms), errs)));
break Ok((inp, (UserInputAst::Clause(terms), errs)));
}
if let Some(i) = i.strip_prefix(')') {
break Ok((i, (UserInputAst::Clause(terms), errs)));
if let Some(inp) = inp.strip_prefix(')') {
break Ok((inp, (UserInputAst::Clause(terms), errs)));
}
// only append missing space error if we did not reach the end of group
errs.append(&mut space_error);
@@ -291,26 +291,57 @@ fn term_group_infallible(i: &str) -> JResult<&str, UserInputAst> {
// first byte is not `)` or ' '. If it did not, we would end up looping.
let (rest, ((occur, leaf), mut err)) =
tuple_infallible((occur_symbol, term_or_phrase_infallible))(i)?;
tuple_infallible((occur_symbol, term_or_phrase_infallible))(inp)?;
errs.append(&mut err);
if let Some(leaf) = leaf {
terms.push((occur, leaf.set_field(Some(field_name.clone())).into()));
}
i = rest;
inp = rest;
}
}
fn literal(i: &str) -> IResult<&str, UserInputAst> {
fn exists(inp: &str) -> IResult<&str, UserInputLeaf> {
value(
UserInputLeaf::Exists {
field: String::new(),
},
tuple((space0, char('*'))),
)(inp)
}
fn exists_precond(inp: &str) -> IResult<&str, (), ()> {
value(
(),
peek(tuple((
field_name,
space0,
char('*'), // when we are here, we know it can't be anything but an exists
))),
)(inp)
.map_err(|e| e.map(|_| ()))
}
fn exists_infallible(inp: &str) -> JResult<&str, UserInputAst> {
let (inp, (field_name, _, _)) =
tuple((field_name, space0, char('*')))(inp).expect("precondition failed");
let exists = UserInputLeaf::Exists { field: field_name }.into();
Ok((inp, (exists, Vec::new())))
}
fn literal(inp: &str) -> IResult<&str, UserInputAst> {
// * alone is already parsed by our caller, so if `exists` succeeds, we can be confident
// something (a field name) got parsed before
alt((
map(
tuple((opt(field_name), alt((range, set, term_or_phrase)))),
tuple((opt(field_name), alt((range, set, exists, term_or_phrase)))),
|(field_name, leaf): (Option<String>, UserInputLeaf)| leaf.set_field(field_name).into(),
),
term_group,
))(i)
))(inp)
}
fn literal_no_group_infallible(i: &str) -> JResult<&str, Option<UserInputAst>> {
fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
map(
tuple_infallible((
opt_i(field_name),
@@ -337,7 +368,7 @@ fn literal_no_group_infallible(i: &str) -> JResult<&str, Option<UserInputAst>> {
&& field_name.is_none()
{
errors.push(LenientErrorInternal {
pos: i.len(),
pos: inp.len(),
message: "parsed possible invalid field as term".to_string(),
});
}
@@ -346,7 +377,7 @@ fn literal_no_group_infallible(i: &str) -> JResult<&str, Option<UserInputAst>> {
&& field_name.is_none()
{
errors.push(LenientErrorInternal {
pos: i.len(),
pos: inp.len(),
message: "parsed keyword NOT as term. It should be quoted".to_string(),
});
}
@@ -355,34 +386,40 @@ fn literal_no_group_infallible(i: &str) -> JResult<&str, Option<UserInputAst>> {
errors,
)
},
)(i)
)(inp)
}
fn literal_infallible(i: &str) -> JResult<&str, Option<UserInputAst>> {
fn literal_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
alt_infallible(
((
term_group_precond,
map(term_group_infallible, |(group, errs)| (Some(group), errs)),
),),
(
(
term_group_precond,
map(term_group_infallible, |(group, errs)| (Some(group), errs)),
),
(
exists_precond,
map(exists_infallible, |(exists, errs)| (Some(exists), errs)),
),
),
literal_no_group_infallible,
)(i)
)(inp)
}
fn slop_or_prefix_val(i: &str) -> JResult<&str, (u32, bool)> {
fn slop_or_prefix_val(inp: &str) -> JResult<&str, (u32, bool)> {
map(
opt_i(alt((
value((0, true), char('*')),
map(preceded(char('~'), u32), |slop| (slop, false)),
))),
|(slop_or_prefix_opt, err)| (slop_or_prefix_opt.unwrap_or_default(), err),
)(i)
)(inp)
}
/// Function that parses a range out of a Stream
/// Supports ranges like:
/// [5 TO 10], {5 TO 10}, [* TO 10], [10 TO *], {10 TO *], >5, <=10
/// [a TO *], [a TO c], [abc TO bcd}
fn range(i: &str) -> IResult<&str, UserInputLeaf> {
fn range(inp: &str) -> IResult<&str, UserInputLeaf> {
let range_term_val = || {
map(
alt((negative_number, relaxed_word, tag("*"))),
@@ -442,10 +479,10 @@ fn range(i: &str) -> IResult<&str, UserInputLeaf> {
lower,
upper,
},
)(i)
)(inp)
}
fn range_infallible(i: &str) -> JResult<&str, UserInputLeaf> {
fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
let lower_to_upper = map(
tuple_infallible((
opt_i(anychar),
@@ -553,10 +590,10 @@ fn range_infallible(i: &str) -> JResult<&str, UserInputLeaf> {
errors,
)
},
)(i)
)(inp)
}
fn set(i: &str) -> IResult<&str, UserInputLeaf> {
fn set(inp: &str) -> IResult<&str, UserInputLeaf> {
map(
preceded(
tuple((space0, tag("IN"), space1)),
@@ -570,10 +607,10 @@ fn set(i: &str) -> IResult<&str, UserInputLeaf> {
field: None,
elements,
},
)(i)
)(inp)
}
fn set_infallible(mut i: &str) -> JResult<&str, UserInputLeaf> {
fn set_infallible(mut inp: &str) -> JResult<&str, UserInputLeaf> {
// `IN [` has already been parsed when we enter, we only need to parse simple terms until we
// find a `]`
let mut elements = Vec::new();
@@ -584,41 +621,41 @@ fn set_infallible(mut i: &str) -> JResult<&str, UserInputLeaf> {
first_round = false;
Vec::new()
} else {
let (rest, (_, err)) = space1_infallible(i)?;
i = rest;
let (rest, (_, err)) = space1_infallible(inp)?;
inp = rest;
err
};
if i.is_empty() {
if inp.is_empty() {
// TODO push error about missing ]
//
errs.push(LenientErrorInternal {
pos: i.len(),
pos: inp.len(),
message: "missing ]".to_string(),
});
let res = UserInputLeaf::Set {
field: None,
elements,
};
return Ok((i, (res, errs)));
return Ok((inp, (res, errs)));
}
if let Some(i) = i.strip_prefix(']') {
if let Some(inp) = inp.strip_prefix(']') {
let res = UserInputLeaf::Set {
field: None,
elements,
};
return Ok((i, (res, errs)));
return Ok((inp, (res, errs)));
}
errs.append(&mut space_error);
// TODO
// here we assume term_or_phrase_infallible always consumes something if the
// first byte is not `)` or ' '. If it did not, we would end up looping.
let (rest, (delim_term, mut err)) = simple_term_infallible("]")(i)?;
let (rest, (delim_term, mut err)) = simple_term_infallible("]")(inp)?;
errs.append(&mut err);
if let Some((_, term)) = delim_term {
elements.push(term);
}
i = rest;
inp = rest;
}
}
@@ -626,16 +663,16 @@ fn negate(expr: UserInputAst) -> UserInputAst {
expr.unary(Occur::MustNot)
}
fn leaf(i: &str) -> IResult<&str, UserInputAst> {
fn leaf(inp: &str) -> IResult<&str, UserInputAst> {
alt((
delimited(char('('), ast, char(')')),
map(char('*'), |_| UserInputAst::from(UserInputLeaf::All)),
map(preceded(tuple((tag("NOT"), space1)), leaf), negate),
literal,
))(i)
))(inp)
}
fn leaf_infallible(i: &str) -> JResult<&str, Option<UserInputAst>> {
fn leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
alt_infallible(
(
(
@@ -665,23 +702,23 @@ fn leaf_infallible(i: &str) -> JResult<&str, Option<UserInputAst>> {
),
),
literal_infallible,
)(i)
)(inp)
}
fn positive_float_number(i: &str) -> IResult<&str, f64> {
fn positive_float_number(inp: &str) -> IResult<&str, f64> {
map(
recognize(tuple((digit1, opt(tuple((char('.'), digit1)))))),
// TODO this is actually dangerous if the number is actually not representable as a f64
// (too big for instance)
|float_str: &str| float_str.parse::<f64>().unwrap(),
)(i)
)(inp)
}
fn boost(i: &str) -> JResult<&str, Option<f64>> {
opt_i(preceded(char('^'), positive_float_number))(i)
fn boost(inp: &str) -> JResult<&str, Option<f64>> {
opt_i(preceded(char('^'), positive_float_number))(inp)
}
fn boosted_leaf(i: &str) -> IResult<&str, UserInputAst> {
fn boosted_leaf(inp: &str) -> IResult<&str, UserInputAst> {
map(
tuple((leaf, fallible(boost))),
|(leaf, boost_opt)| match boost_opt {
@@ -690,10 +727,10 @@ fn boosted_leaf(i: &str) -> IResult<&str, UserInputAst> {
}
_ => leaf,
},
)(i)
)(inp)
}
fn boosted_leaf_infallible(i: &str) -> JResult<&str, Option<UserInputAst>> {
fn boosted_leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
map(
tuple_infallible((leaf_infallible, boost)),
|((leaf, boost_opt), error)| match boost_opt {
@@ -703,30 +740,30 @@ fn boosted_leaf_infallible(i: &str) -> JResult<&str, Option<UserInputAst>> {
),
_ => (leaf, error),
},
)(i)
)(inp)
}
fn occur_symbol(i: &str) -> JResult<&str, Option<Occur>> {
fn occur_symbol(inp: &str) -> JResult<&str, Option<Occur>> {
opt_i(alt((
value(Occur::MustNot, char('-')),
value(Occur::Must, char('+')),
)))(i)
)))(inp)
}
fn occur_leaf(i: &str) -> IResult<&str, (Option<Occur>, UserInputAst)> {
tuple((fallible(occur_symbol), boosted_leaf))(i)
fn occur_leaf(inp: &str) -> IResult<&str, (Option<Occur>, UserInputAst)> {
tuple((fallible(occur_symbol), boosted_leaf))(inp)
}
#[allow(clippy::type_complexity)]
fn operand_occur_leaf_infallible(
i: &str,
inp: &str,
) -> JResult<&str, (Option<BinaryOperand>, Option<Occur>, Option<UserInputAst>)> {
// TODO maybe this should support multiple chained AND/OR, and "fuse" them?
tuple_infallible((
delimited_infallible(nothing, opt_i(binary_operand), space0_infallible),
occur_symbol,
boosted_leaf_infallible,
))(i)
))(inp)
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
@@ -735,11 +772,11 @@ enum BinaryOperand {
And,
}
fn binary_operand(i: &str) -> IResult<&str, BinaryOperand> {
fn binary_operand(inp: &str) -> IResult<&str, BinaryOperand> {
alt((
value(BinaryOperand::And, tag("AND ")),
value(BinaryOperand::Or, tag("OR ")),
))(i)
))(inp)
}
fn aggregate_binary_expressions(
@@ -880,14 +917,14 @@ fn aggregate_infallible_expressions(
}
}
fn operand_leaf(i: &str) -> IResult<&str, (BinaryOperand, UserInputAst)> {
fn operand_leaf(inp: &str) -> IResult<&str, (BinaryOperand, UserInputAst)> {
tuple((
terminated(binary_operand, space0),
terminated(boosted_leaf, space0),
))(i)
))(inp)
}
fn ast(i: &str) -> IResult<&str, UserInputAst> {
fn ast(inp: &str) -> IResult<&str, UserInputAst> {
let boolean_expr = map(
separated_pair(boosted_leaf, space1, many1(operand_leaf)),
|(left, right)| aggregate_binary_expressions(left, right),
@@ -908,10 +945,10 @@ fn ast(i: &str) -> IResult<&str, UserInputAst> {
space0,
alt((boolean_expr, whitespace_separated_leaves)),
space0,
)(i)
)(inp)
}
fn ast_infallible(i: &str) -> JResult<&str, UserInputAst> {
fn ast_infallible(inp: &str) -> JResult<&str, UserInputAst> {
// ast() parse either `term AND term OR term` or `+term term -term`
// both are locally ambiguous, and as we allow error, it's hard to permit backtracking.
// Instead, we allow a mix of both syntaxes, trying to make sense of what a user meant.
@@ -928,13 +965,13 @@ fn ast_infallible(i: &str) -> JResult<&str, UserInputAst> {
},
);
delimited_infallible(space0_infallible, expression, space0_infallible)(i)
delimited_infallible(space0_infallible, expression, space0_infallible)(inp)
}
pub fn parse_to_ast(i: &str) -> IResult<&str, UserInputAst> {
pub fn parse_to_ast(inp: &str) -> IResult<&str, UserInputAst> {
map(delimited(space0, opt(ast), eof), |opt_ast| {
rewrite_ast(opt_ast.unwrap_or_else(UserInputAst::empty_query))
})(i)
})(inp)
}
pub fn parse_to_ast_lenient(query_str: &str) -> (UserInputAst, Vec<LenientError>) {
@@ -1538,6 +1575,17 @@ mod test {
test_parse_query_to_ast_helper("foo:\"\"*", "\"foo\":\"\"*");
}
#[test]
fn test_exist_query() {
test_parse_query_to_ast_helper("a:*", "\"a\":*");
test_parse_query_to_ast_helper("a: *", "\"a\":*");
// an exist followed by default term being b
test_is_parse_err("a:*b", "(*\"a\":* *b)");
// this is a term query (not a phrase prefix)
test_parse_query_to_ast_helper("a:b*", "\"a\":b*");
}
#[test]
fn test_not_queries_are_consistent() {
test_parse_query_to_ast_helper("tata -toto", "(*tata -toto)");

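In short, `field:*` now parses to an `Exists` leaf in both the strict and lenient parsers. A small sketch against the two parse entry points shown above, with the expected Debug output taken from the tests (illustrative, inside the grammar crate):

// Strict parser: "a:*" yields an Exists leaf bound to field "a".
let (_rest, ast) = parse_to_ast("a:*").unwrap();
assert_eq!(format!("{ast:?}"), "\"a\":*");

// Lenient parser: same AST plus a list of recoverable errors (empty here).
let (ast, errors) = parse_to_ast_lenient("a: *");
assert!(errors.is_empty());
assert_eq!(format!("{ast:?}"), "\"a\":*");
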
View File

@@ -16,6 +16,9 @@ pub enum UserInputLeaf {
field: Option<String>,
elements: Vec<String>,
},
Exists {
field: String,
},
}
impl UserInputLeaf {
@@ -36,6 +39,9 @@ impl UserInputLeaf {
upper,
},
UserInputLeaf::Set { field: _, elements } => UserInputLeaf::Set { field, elements },
UserInputLeaf::Exists { field: _ } => UserInputLeaf::Exists {
field: field.expect("Exist query without a field isn't allowed"),
},
}
}
}
@@ -74,6 +80,9 @@ impl Debug for UserInputLeaf {
write!(formatter, "]")
}
UserInputLeaf::All => write!(formatter, "*"),
UserInputLeaf::Exists { field } => {
write!(formatter, "\"{field}\":*")
}
}
}
}

View File

@@ -117,10 +117,10 @@ impl AggregationWithAccessor {
ColumnType::U64,
ColumnType::F64,
ColumnType::Str,
ColumnType::DateTime,
// ColumnType::Bytes Unsupported
// ColumnType::Bool Unsupported
// ColumnType::IpAddr Unsupported
// ColumnType::DateTime Unsupported
];
// In case the column is empty we want the shim column to match the missing type
@@ -145,7 +145,18 @@ impl AggregationWithAccessor {
.map(|m| matches!(m, Key::Str(_)))
.unwrap_or(false);
let use_special_missing_agg = missing_and_more_than_one_col || text_on_non_text_col;
// Actually we could convert the text to a number and have the fast path, if it is
// provided in Rfc3339 format. But this use case is probably not common
// enough to justify the effort.
let text_on_date_col = column_and_types.len() == 1
&& column_and_types[0].1 == ColumnType::DateTime
&& missing
.as_ref()
.map(|m| matches!(m, Key::Str(_)))
.unwrap_or(false);
let use_special_missing_agg =
missing_and_more_than_one_col || text_on_non_text_col || text_on_date_col;
if use_special_missing_agg {
let column_and_types =
get_all_ff_reader_or_empty(reader, field_name, None, fallback_type)?;

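With ColumnType::DateTime accepted above, a terms aggregation can now target a date fast field directly; bucket keys are emitted as Rfc3339 strings (see the tests in the next file). A sketch of the request shape, assuming serde_json is available:

use serde_json::json;
use tantivy::aggregation::agg_req::Aggregations;

fn date_terms_request() -> serde_json::Result<Aggregations> {
    // Response buckets come back keyed by Rfc3339 dates, e.g.
    // {"key": "1982-09-17T00:00:00Z", "doc_count": 2}.
    serde_json::from_value(json!({
        "my_date": {
            "terms": { "field": "date_field" }
        }
    }))
}
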
View File

@@ -1,6 +1,6 @@
use std::fmt::Debug;
use columnar::{BytesColumn, ColumnType, StrColumn};
use columnar::{BytesColumn, ColumnType, MonotonicallyMappableToU64, StrColumn};
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
@@ -16,7 +16,7 @@ use crate::aggregation::intermediate_agg_result::{
use crate::aggregation::segment_agg_result::{
build_segment_agg_collector, SegmentAggregationCollector,
};
use crate::aggregation::{f64_from_fastfield_u64, Key};
use crate::aggregation::{f64_from_fastfield_u64, format_date, Key};
use crate::error::DataCorruption;
use crate::TantivyError;
@@ -531,6 +531,13 @@ impl SegmentTermCollector {
});
}
}
} else if self.field_type == ColumnType::DateTime {
for (val, doc_count) in entries {
let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?;
let val = i64::from_u64(val);
let date = format_date(val)?;
dict.insert(IntermediateKey::Str(date), intermediate_entry);
}
} else {
for (val, doc_count) in entries {
let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?;
@@ -583,6 +590,9 @@ pub(crate) fn cut_off_buckets<T: GetDocCount + Debug>(
#[cfg(test)]
mod tests {
use common::DateTime;
use time::{Date, Month};
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::{
exec_request, exec_request_with_query, exec_request_with_query_and_memory_limit,
@@ -1813,4 +1823,75 @@ mod tests {
Ok(())
}
#[test]
fn terms_aggregation_date() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1983, Month::September, 27)?.with_hms(0, 0, 0)?)))?;
writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"my_date": {
"terms": {
"field": "date_field"
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// date_field field
assert_eq!(res["my_date"]["buckets"][0]["key"], "1982-09-17T00:00:00Z");
assert_eq!(res["my_date"]["buckets"][0]["doc_count"], 2);
assert_eq!(res["my_date"]["buckets"][1]["key"], "1983-09-27T00:00:00Z");
assert_eq!(res["my_date"]["buckets"][1]["doc_count"], 1);
assert_eq!(res["my_date"]["buckets"][2]["key"], serde_json::Value::Null);
Ok(())
}
#[test]
fn terms_aggregation_date_missing() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1983, Month::September, 27)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!())?;
writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"my_date": {
"terms": {
"field": "date_field",
"missing": "1982-09-17T00:00:00Z"
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// date_field field
assert_eq!(res["my_date"]["buckets"][0]["key"], "1982-09-17T00:00:00Z");
assert_eq!(res["my_date"]["buckets"][0]["doc_count"], 3);
assert_eq!(res["my_date"]["buckets"][1]["key"], "1983-09-27T00:00:00Z");
assert_eq!(res["my_date"]["buckets"][1]["doc_count"], 1);
assert_eq!(res["my_date"]["buckets"][2]["key"], serde_json::Value::Null);
Ok(())
}
}

View File

@@ -234,6 +234,22 @@ impl FastFieldReaders {
Ok(dynamic_column_handle_opt)
}
/// Returns all `dynamic_column_handle`s for the given field.
pub fn dynamic_column_handles(
&self,
field_name: &str,
) -> crate::Result<Vec<DynamicColumnHandle>> {
let Some(resolved_field_name) = self.resolve_field(field_name)? else {
return Ok(Vec::new());
};
let dynamic_column_handles = self
.columnar
.read_columns(&resolved_field_name)?
.into_iter()
.collect();
Ok(dynamic_column_handles)
}
#[doc(hidden)]
pub async fn list_dynamic_column_handles(
&self,
@@ -338,6 +354,8 @@ impl FastFieldReaders {
#[cfg(test)]
mod tests {
use columnar::ColumnType;
use crate::schema::{JsonObjectOptions, Schema, FAST};
use crate::{Document, Index};
@@ -417,4 +435,45 @@ mod tests {
Some("_dyna\u{1}notinschema\u{1}attr\u{1}color".to_string())
);
}
#[test]
fn test_fast_field_reader_dynamic_column_handles() {
let mut schema_builder = Schema::builder();
let id = schema_builder.add_u64_field("id", FAST);
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(id=> 1u64, json => json!({"foo": 42})))
.unwrap();
index_writer
.add_document(doc!(id=> 2u64, json => json!({"foo": true})))
.unwrap();
index_writer
.add_document(doc!(id=> 3u64, json => json!({"foo": "bar"})))
.unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let reader = searcher.segment_reader(0u32);
let fast_fields = reader.fast_fields();
let id_columns = fast_fields.dynamic_column_handles("id").unwrap();
assert_eq!(id_columns.len(), 1);
assert_eq!(id_columns.first().unwrap().column_type(), ColumnType::U64);
let foo_columns = fast_fields.dynamic_column_handles("json.foo").unwrap();
assert_eq!(foo_columns.len(), 3);
assert!(foo_columns
.iter()
.any(|column| column.column_type() == ColumnType::I64));
assert!(foo_columns
.iter()
.any(|column| column.column_type() == ColumnType::Bool));
assert!(foo_columns
.iter()
.any(|column| column.column_type() == ColumnType::Str));
println!("*** {:?}", fast_fields.columnar().list_columns());
}
}

View File

@@ -1703,8 +1703,7 @@ mod tests {
let old_reader = index.reader()?;
// Every 3rd doc has only id field
let id_is_full_doc = |id| id % 3 != 0;
let id_exists = |id| id % 3 != 0; // 0 does not exist
let multi_text_field_text1 = "test1 test2 test3 test1 test2 test3";
// rotate left
@@ -1720,7 +1719,7 @@ mod tests {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
let ip = ip_from_id(id);
if !id_is_full_doc(id) {
if !id_exists(id) {
// every 3rd doc has no ip field
index_writer.add_document(doc!(
id_field=>id,
@@ -1840,7 +1839,7 @@ mod tests {
let num_docs_with_values = expected_ids_and_num_occurrences
.iter()
.filter(|(id, _id_occurrences)| id_is_full_doc(**id))
.filter(|(id, _id_occurrences)| id_exists(**id))
.map(|(_, id_occurrences)| *id_occurrences as usize)
.sum::<usize>();
@@ -1864,7 +1863,7 @@ mod tests {
if force_end_merge && num_segments_before_merge > 1 && num_segments_after_merge == 1 {
let mut expected_multi_ips: Vec<_> = id_list
.iter()
.filter(|id| id_is_full_doc(**id))
.filter(|id| id_exists(**id))
.flat_map(|id| vec![ip_from_id(*id), ip_from_id(*id)])
.collect();
assert_eq!(num_ips, expected_multi_ips.len() as u32);
@@ -1902,7 +1901,7 @@ mod tests {
let expected_ips = expected_ids_and_num_occurrences
.keys()
.flat_map(|id| {
if !id_is_full_doc(*id) {
if !id_exists(*id) {
None
} else {
Some(Ipv6Addr::from_u128(*id as u128))
@@ -1914,7 +1913,7 @@ mod tests {
let expected_ips = expected_ids_and_num_occurrences
.keys()
.filter_map(|id| {
if !id_is_full_doc(*id) {
if !id_exists(*id) {
None
} else {
Some(Ipv6Addr::from_u128(*id as u128))
@@ -1949,7 +1948,7 @@ mod tests {
let id = id_reader.first(doc).unwrap();
let vals: Vec<u64> = ff_reader.values_for_doc(doc).collect();
if id_is_full_doc(id) {
if id_exists(id) {
assert_eq!(vals.len(), 2);
assert_eq!(vals[0], vals[1]);
assert!(expected_ids_and_num_occurrences.contains_key(&vals[0]));
@@ -1959,7 +1958,7 @@ mod tests {
}
let bool_vals: Vec<bool> = bool_ff_reader.values_for_doc(doc).collect();
if id_is_full_doc(id) {
if id_exists(id) {
assert_eq!(bool_vals.len(), 2);
assert_ne!(bool_vals[0], bool_vals[1]);
} else {
@@ -1988,7 +1987,7 @@ mod tests {
.as_u64()
.unwrap();
assert!(expected_ids_and_num_occurrences.contains_key(&id));
if id_is_full_doc(id) {
if id_exists(id) {
let id2 = store_reader
.get(doc_id)
.unwrap()
@@ -2035,7 +2034,7 @@ mod tests {
let (existing_id, count) = (*id, *count);
let get_num_hits = |field| do_search(&existing_id.to_string(), field).len() as u64;
assert_eq!(get_num_hits(id_field), count);
if !id_is_full_doc(existing_id) {
if !id_exists(existing_id) {
continue;
}
assert_eq!(get_num_hits(text_field), count);
@@ -2085,7 +2084,7 @@ mod tests {
//
for (existing_id, count) in &expected_ids_and_num_occurrences {
let (existing_id, count) = (*existing_id, *count);
if !id_is_full_doc(existing_id) {
if !id_exists(existing_id) {
continue;
}
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
@@ -2102,84 +2101,34 @@ mod tests {
}
}
// Range query
// assert data is like expected
//
// Take half as sample
let mut sample: Vec<_> = expected_ids_and_num_occurrences.iter().collect();
sample.sort_by_key(|(k, _num_occurences)| *k);
// sample.truncate(sample.len() / 2);
if !sample.is_empty() {
let (left_sample, right_sample) = sample.split_at(sample.len() / 2);
let expected_count = |sample: &[(&u64, &u64)]| {
sample
.iter()
.filter(|(id, _)| id_is_full_doc(**id))
.map(|(_id, num_occurences)| **num_occurences)
.sum::<u64>()
};
fn gen_query_inclusive<T1: ToString, T2: ToString>(
field: &str,
from: T1,
to: T2,
) -> String {
for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) {
let (existing_id, count) = (*existing_id, *count);
if !id_exists(existing_id) {
continue;
}
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string())
}
};
let ip = ip_from_id(existing_id);
// Query first half
if !left_sample.is_empty() {
let expected_count = expected_count(left_sample);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
// Range query on single value field
let query = gen_query_inclusive("ip", ip, ip);
assert_eq!(do_search_ip_field(&query), count);
let start_range = *left_sample[0].0;
let end_range = *left_sample.last().unwrap().0;
let query = gen_query_inclusive("id_opt", start_range, end_range);
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip, ip);
// Range query on ip field
let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", "*", ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ips", "*", ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
}
// Query second half
if !right_sample.is_empty() {
let expected_count = expected_count(right_sample);
let start_range = *right_sample[0].0;
let end_range = *right_sample.last().unwrap().0;
// Range query on id opt field
let query =
gen_query_inclusive("id_opt", start_range.to_string(), end_range.to_string());
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
// Range query on ip field
let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", ip1, "*");
assert_eq!(do_search_ip_field(&query), expected_count);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ips", ip1, "*");
assert_eq!(do_search_ip_field(&query), expected_count);
}
assert_eq!(do_search_ip_field(&query), count);
}
// ip range query on fast field
//
for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) {
let (existing_id, count) = (*existing_id, *count);
if !id_is_full_doc(existing_id) {
if !id_exists(existing_id) {
continue;
}
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
@@ -2207,7 +2156,7 @@ mod tests {
.first_or_default_col(9999);
for doc_id in segment_reader.doc_ids_alive() {
let id = ff_reader.get_val(doc_id);
if !id_is_full_doc(id) {
if !id_exists(id) {
continue;
}
let facet_ords: Vec<u64> = facet_reader.facet_ords(doc_id).collect();
@@ -2245,12 +2194,6 @@ mod tests {
Ok(index)
}
#[test]
fn test_fast_field_range() {
let ops: Vec<_> = (0..1000).map(|id| IndexingOp::AddDoc { id }).collect();
assert!(test_operation_strategy(&ops, false, true).is_ok());
}
#[test]
fn test_sort_index_on_opt_field_regression() {
assert!(test_operation_strategy(

View File

@@ -596,10 +596,15 @@ impl SegmentUpdater {
);
{
if let Some(after_merge_segment_entry) = after_merge_segment_entry.as_mut() {
// Deletes and commits could have happened as we were merging.
// We need to make sure we are up to date with deletes before accepting the
// segment.
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
if let Some(delete_operation) = delete_cursor.get() {
let committed_opstamp = segment_updater.load_meta().opstamp;
if delete_operation.opstamp < committed_opstamp {
// We are not up to date! Let's create a new tombstone file for our
// freshly created split.
let index = &segment_updater.index;
let segment = index.segment(after_merge_segment_entry.meta().clone());
if let Err(advance_deletes_err) = advance_deletes(

View File

@@ -4,6 +4,7 @@ use std::sync::Arc;
use common::BitSet;
use tantivy_fst::Automaton;
use super::phrase_prefix_query::prefix_end;
use crate::core::SegmentReader;
use crate::query::{BitSetDocSet, ConstScorer, Explanation, Scorer, Weight};
use crate::schema::{Field, IndexRecordOption};
@@ -14,6 +15,10 @@ use crate::{DocId, Score, TantivyError};
pub struct AutomatonWeight<A> {
field: Field,
automaton: Arc<A>,
// For JSON fields, the term dictionary includes terms from all paths.
// We apply additional filtering based on the given JSON path when searching the term
// dictionary. This prevents terms from unrelated paths from matching the search criteria.
json_path_bytes: Option<Box<[u8]>>,
}
impl<A> AutomatonWeight<A>
@@ -26,6 +31,20 @@ where
AutomatonWeight {
field,
automaton: automaton.into(),
json_path_bytes: None,
}
}
/// Create a new AutomatonWeight for a json path
pub fn new_for_json_path<IntoArcA: Into<Arc<A>>>(
field: Field,
automaton: IntoArcA,
json_path_bytes: &[u8],
) -> AutomatonWeight<A> {
AutomatonWeight {
field,
automaton: automaton.into(),
json_path_bytes: Some(json_path_bytes.to_vec().into_boxed_slice()),
}
}
@@ -34,7 +53,15 @@ where
term_dict: &'a TermDictionary,
) -> io::Result<TermStreamer<'a, &'a A>> {
let automaton: &A = &self.automaton;
let term_stream_builder = term_dict.search(automaton);
let mut term_stream_builder = term_dict.search(automaton);
if let Some(json_path_bytes) = &self.json_path_bytes {
term_stream_builder = term_stream_builder.ge(json_path_bytes);
if let Some(end) = prefix_end(json_path_bytes) {
term_stream_builder = term_stream_builder.lt(&end);
}
}
term_stream_builder.into_stream()
}
}

src/query/exist_query.rs (new file, +345 lines)
View File

@@ -0,0 +1,345 @@
use core::fmt::Debug;
use columnar::{ColumnIndex, DynamicColumn};
use super::{ConstScorer, EmptyScorer};
use crate::core::SegmentReader;
use crate::docset::{DocSet, TERMINATED};
use crate::query::explanation::does_not_match;
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
use crate::{DocId, Score, TantivyError};
/// Query that matches all documents with a non-null value in the specified field.
///
/// All of the matched documents get the score 1.0.
#[derive(Clone, Debug)]
pub struct ExistsQuery {
field_name: String,
}
impl ExistsQuery {
/// Creates a new `ExistsQuery` from the given field.
///
/// This query matches all documents with at least one non-null value in the specified field.
/// This constructor never fails, but executing the search with this query will return an
/// error if the specified field doesn't exist or is not a fast field.
pub fn new_exists_query(field: String) -> ExistsQuery {
ExistsQuery { field_name: field }
}
}
impl Query for ExistsQuery {
fn weight(&self, enable_scoring: EnableScoring) -> crate::Result<Box<dyn Weight>> {
let schema = enable_scoring.schema();
let Some((field, _path)) = schema.find_field(&self.field_name) else {
return Err(TantivyError::FieldNotFound(self.field_name.clone()));
};
let field_type = schema.get_field_entry(field).field_type();
if !field_type.is_fast() {
return Err(TantivyError::SchemaError(format!(
"Field {} is not a fast field.",
self.field_name
)));
}
Ok(Box::new(ExistsWeight {
field_name: self.field_name.clone(),
}))
}
}
/// Weight associated with the `ExistsQuery` query.
pub struct ExistsWeight {
field_name: String,
}
impl Weight for ExistsWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let fast_field_reader = reader.fast_fields();
let dynamic_columns: crate::Result<Vec<DynamicColumn>> = fast_field_reader
.dynamic_column_handles(&self.field_name)?
.into_iter()
.map(|handle| handle.open().map_err(|io_error| io_error.into()))
.collect();
let mut non_empty_columns = Vec::new();
for column in dynamic_columns? {
if !matches!(column.column_index(), ColumnIndex::Empty { .. }) {
non_empty_columns.push(column)
}
}
// TODO: we could optimize more here, since in most cases we will have only one index
if !non_empty_columns.is_empty() {
let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc());
Ok(Box::new(ConstScorer::new(docset, boost)))
} else {
Ok(Box::new(EmptyScorer))
}
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
}
Ok(Explanation::new("ExistsQuery", 1.0))
}
}
pub(crate) struct ExistsDocSet {
columns: Vec<DynamicColumn>,
doc: DocId,
max_doc: DocId,
}
impl ExistsDocSet {
pub(crate) fn new(columns: Vec<DynamicColumn>, max_doc: DocId) -> Self {
let mut set = Self {
columns,
doc: 0u32,
max_doc,
};
set.find_next();
set
}
fn find_next(&mut self) -> DocId {
while self.doc < self.max_doc {
if self
.columns
.iter()
.any(|col| col.column_index().has_value(self.doc))
{
return self.doc;
}
self.doc += 1;
}
self.doc = TERMINATED;
TERMINATED
}
}
impl DocSet for ExistsDocSet {
fn advance(&mut self) -> DocId {
self.seek(self.doc + 1)
}
fn size_hint(&self) -> u32 {
0
}
fn doc(&self) -> DocId {
self.doc
}
#[inline(always)]
fn seek(&mut self, target: DocId) -> DocId {
self.doc = target;
self.find_next()
}
}
#[cfg(test)]
mod tests {
use std::net::Ipv6Addr;
use std::ops::Bound;
use common::DateTime;
use time::OffsetDateTime;
use crate::collector::Count;
use crate::query::exist_query::ExistsQuery;
use crate::query::{BooleanQuery, RangeQuery};
use crate::schema::{Facet, FacetOptions, Schema, FAST, INDEXED, STRING, TEXT};
use crate::{doc, Index, Searcher};
#[test]
fn test_exists_query_simple() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let all_field = schema_builder.add_u64_field("all", INDEXED | FAST);
let even_field = schema_builder.add_u64_field("even", INDEXED | FAST);
let odd_field = schema_builder.add_text_field("odd", STRING | FAST);
let multi_field = schema_builder.add_text_field("multi", FAST);
let _never_field = schema_builder.add_u64_field("never", INDEXED | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
for i in 0u64..100u64 {
if i % 2 == 0 {
if i % 10 == 0 {
index_writer.add_document(doc!(all_field => i, even_field => i, multi_field => i.to_string(), multi_field => (i + 1).to_string()))?;
} else {
index_writer.add_document(doc!(all_field => i, even_field => i))?;
}
} else {
index_writer.add_document(doc!(all_field => i, odd_field => i.to_string()))?;
}
}
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(count_existing_fields(&searcher, "all")?, 100);
assert_eq!(count_existing_fields(&searcher, "odd")?, 50);
assert_eq!(count_existing_fields(&searcher, "even")?, 50);
assert_eq!(count_existing_fields(&searcher, "multi")?, 10);
assert_eq!(count_existing_fields(&searcher, "never")?, 0);
// exercise seek
let query = BooleanQuery::intersection(vec![
Box::new(RangeQuery::new_u64_bounds(
"all".to_string(),
Bound::Included(50),
Bound::Unbounded,
)),
Box::new(ExistsQuery::new_exists_query("even".to_string())),
]);
assert_eq!(searcher.search(&query, &Count)?, 25);
let query = BooleanQuery::intersection(vec![
Box::new(RangeQuery::new_u64_bounds(
"all".to_string(),
Bound::Included(0),
Bound::Excluded(50),
)),
Box::new(ExistsQuery::new_exists_query("odd".to_string())),
]);
assert_eq!(searcher.search(&query, &Count)?, 25);
Ok(())
}
#[test]
fn test_exists_query_json() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
for i in 0u64..100u64 {
if i % 2 == 0 {
index_writer.add_document(doc!(json => json!({"all": i, "even": true})))?;
} else {
index_writer
.add_document(doc!(json => json!({"all": i.to_string(), "odd": true})))?;
}
}
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(count_existing_fields(&searcher, "json.all")?, 100);
assert_eq!(count_existing_fields(&searcher, "json.even")?, 50);
assert_eq!(count_existing_fields(&searcher, "json.odd")?, 50);
// Handling of non-existing fields:
assert_eq!(count_existing_fields(&searcher, "json.absent")?, 0);
assert_eq!(
searcher
.search(
&ExistsQuery::new_exists_query("does_not_exists.absent".to_string()),
&Count
)
.unwrap_err()
.to_string(),
"The field does not exist: 'does_not_exists.absent'"
);
Ok(())
}
#[test]
fn test_exists_query_misc_supported_types() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let bool = schema_builder.add_bool_field("bool", FAST);
let bytes = schema_builder.add_bytes_field("bytes", FAST);
let date = schema_builder.add_date_field("date", FAST);
let f64 = schema_builder.add_f64_field("f64", FAST);
let ip_addr = schema_builder.add_ip_addr_field("ip_addr", FAST);
let facet = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let now = OffsetDateTime::now_utc().unix_timestamp();
for i in 0u8..100u8 {
if i % 2 == 0 {
let date_val = DateTime::from_utc(OffsetDateTime::from_unix_timestamp(
now + i as i64 * 100,
)?);
index_writer.add_document(
doc!(bool => i % 3 == 0, bytes => vec![i, i + 1, i + 2], date => date_val),
)?;
} else {
let ip_addr_v6 = Ipv6Addr::new(0, 0, 0, 0, 0, 0xffff, 0xc00a, i.into());
index_writer
.add_document(doc!(f64 => i as f64 * 0.5, ip_addr => ip_addr_v6, facet => Facet::from("/facet/foo"), facet => Facet::from("/facet/bar")))?;
}
}
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(count_existing_fields(&searcher, "bool")?, 50);
assert_eq!(count_existing_fields(&searcher, "bytes")?, 50);
assert_eq!(count_existing_fields(&searcher, "date")?, 50);
assert_eq!(count_existing_fields(&searcher, "f64")?, 50);
assert_eq!(count_existing_fields(&searcher, "ip_addr")?, 50);
assert_eq!(count_existing_fields(&searcher, "facet")?, 50);
Ok(())
}
#[test]
fn test_exists_query_unsupported_types() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let not_fast = schema_builder.add_text_field("not_fast", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
not_fast => "slow",
))?;
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(
searcher
.search(
&ExistsQuery::new_exists_query("not_fast".to_string()),
&Count
)
.unwrap_err()
.to_string(),
"Schema error: 'Field not_fast is not a fast field.'"
);
assert_eq!(
searcher
.search(
&ExistsQuery::new_exists_query("does_not_exists".to_string()),
&Count
)
.unwrap_err()
.to_string(),
"The field does not exist: 'does_not_exists'"
);
Ok(())
}
fn count_existing_fields(searcher: &Searcher, field: &str) -> crate::Result<usize> {
let query = ExistsQuery::new_exists_query(field.to_string());
searcher.search(&query, &Count)
}
}

View File

@@ -3,7 +3,7 @@ use once_cell::sync::OnceCell;
use tantivy_fst::Automaton;
use crate::query::{AutomatonWeight, EnableScoring, Query, Weight};
use crate::schema::Term;
use crate::schema::{Term, Type};
use crate::TantivyError::InvalidArgument;
pub(crate) struct DfaWrapper(pub DFA);
@@ -132,18 +132,46 @@ impl FuzzyTermQuery {
});
let term_value = self.term.value();
let term_text = term_value.as_str().ok_or_else(|| {
InvalidArgument("The fuzzy term query requires a string term.".to_string())
})?;
let term_text = if term_value.typ() == Type::Json {
if let Some(json_path_type) = term_value.json_path_type() {
if json_path_type != Type::Str {
return Err(InvalidArgument(format!(
"The fuzzy term query requires a string path type for a json term. Found \
{:?}",
json_path_type
)));
}
}
std::str::from_utf8(self.term.serialized_value_bytes()).map_err(|_| {
InvalidArgument(
"Failed to convert json term value bytes to utf8 string.".to_string(),
)
})?
} else {
term_value.as_str().ok_or_else(|| {
InvalidArgument("The fuzzy term query requires a string term.".to_string())
})?
};
let automaton = if self.prefix {
automaton_builder.build_prefix_dfa(term_text)
} else {
automaton_builder.build_dfa(term_text)
};
Ok(AutomatonWeight::new(
self.term.field(),
DfaWrapper(automaton),
))
if let Some((json_path_bytes, _)) = term_value.as_json() {
Ok(AutomatonWeight::new_for_json_path(
self.term.field(),
DfaWrapper(automaton),
json_path_bytes,
))
} else {
Ok(AutomatonWeight::new(
self.term.field(),
DfaWrapper(automaton),
))
}
}
}
@@ -157,9 +185,89 @@ impl Query for FuzzyTermQuery {
mod test {
use super::FuzzyTermQuery;
use crate::collector::{Count, TopDocs};
use crate::schema::{Schema, TEXT};
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::schema::{Schema, STORED, TEXT};
use crate::{assert_nearly_equals, Index, Term};
#[test]
pub fn test_fuzzy_json_path() -> crate::Result<()> {
// # Defining the schema
let mut schema_builder = Schema::builder();
let attributes = schema_builder.add_json_field("attributes", TEXT | STORED);
let schema = schema_builder.build();
// # Indexing documents
let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
let doc = schema.parse_document(
r#"{
"attributes": {
"a": "japan"
}
}"#,
)?;
index_writer.add_document(doc)?;
let doc = schema.parse_document(
r#"{
"attributes": {
"aa": "japan"
}
}"#,
)?;
index_writer.add_document(doc)?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
// # Fuzzy search
let query_parser = QueryParser::for_index(&index, vec![attributes]);
let get_json_path_term = |query: &str| -> crate::Result<Term> {
let query = query_parser.parse_query(query)?;
let mut terms = Vec::new();
query.query_terms(&mut |term, _| {
terms.push(term.clone());
});
Ok(terms[0].clone())
};
// shall not match the first document due to json path mismatch
{
let term = get_json_path_term("attributes.aa:japan")?;
let fuzzy_query = FuzzyTermQuery::new(term, 2, true);
let top_docs = searcher.search(&fuzzy_query, &TopDocs::with_limit(2))?;
assert_eq!(top_docs.len(), 1, "Expected only 1 document");
assert_eq!(top_docs[0].1.doc_id, 1, "Expected the second document");
}
// shall match the first document because Levenshtein distance is 1 (substitute 'o' with
// 'a')
{
let term = get_json_path_term("attributes.a:japon")?;
let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
let top_docs = searcher.search(&fuzzy_query, &TopDocs::with_limit(2))?;
assert_eq!(top_docs.len(), 1, "Expected only 1 document");
assert_eq!(top_docs[0].1.doc_id, 0, "Expected the first document");
}
// shall not match because non-prefix Levenshtein distance is more than 1 (add 'a' and 'n')
{
let term = get_json_path_term("attributes.a:jap")?;
let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
let top_docs = searcher.search(&fuzzy_query, &TopDocs::with_limit(2))?;
assert_eq!(top_docs.len(), 0, "Expected no document");
}
Ok(())
}
#[test]
pub fn test_fuzzy_term() -> crate::Result<()> {
let mut schema_builder = Schema::builder();

View File

@@ -8,6 +8,7 @@ mod const_score_query;
mod disjunction_max_query;
mod empty_query;
mod exclude;
mod exist_query;
mod explanation;
mod fuzzy_query;
mod intersection;
@@ -41,6 +42,7 @@ pub use self::const_score_query::{ConstScoreQuery, ConstScorer};
pub use self::disjunction_max_query::DisjunctionMaxQuery;
pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight};
pub use self::exclude::Exclude;
pub use self::exist_query::ExistsQuery;
pub use self::explanation::Explanation;
#[cfg(test)]
pub(crate) use self::fuzzy_query::DfaWrapper;

View File

@@ -6,7 +6,7 @@ pub use phrase_prefix_query::PhrasePrefixQuery;
pub use phrase_prefix_scorer::PhrasePrefixScorer;
pub use phrase_prefix_weight::PhrasePrefixWeight;
fn prefix_end(prefix_start: &[u8]) -> Option<Vec<u8>> {
pub(crate) fn prefix_end(prefix_start: &[u8]) -> Option<Vec<u8>> {
let mut res = prefix_start.to_owned();
while !res.is_empty() {
let end = res.len() - 1;

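For reference, prefix_end (made pub(crate) here so AutomatonWeight can reuse it) computes the exclusive upper bound covering every byte string that starts with the given prefix: trailing 0xff bytes are dropped, then the last remaining byte is incremented. Illustrative assertions under those semantics:

assert_eq!(prefix_end(b"ja"), Some(b"jb".to_vec()));        // increment the last byte
assert_eq!(prefix_end(&[b'a', 0xff]), Some(b"b".to_vec())); // drop 0xff, then increment
assert_eq!(prefix_end(&[0xff, 0xff]), None);                // no finite upper bound
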
View File

@@ -847,6 +847,12 @@ impl QueryParser {
}));
(Some(logical_ast), errors)
}
UserInputLeaf::Exists { .. } => (
None,
vec![QueryParserError::UnsupportedQuery(
"Range query need to target a specific field.".to_string(),
)],
),
}
}
}

View File

@@ -31,8 +31,8 @@ impl VecCursor {
self.current_pos = 0;
&mut self.docs
}
fn last_doc(&self) -> Option<u32> {
self.docs.last().cloned()
fn last_value(&self) -> Option<u32> {
self.docs.iter().last().cloned()
}
fn is_empty(&self) -> bool {
self.current().is_none()
@@ -112,15 +112,15 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> RangeDocSet<T> {
finished_to_end = true;
}
let last_doc = self.loaded_docs.last_doc();
let last_value = self.loaded_docs.last_value();
let doc_buffer: &mut Vec<DocId> = self.loaded_docs.get_cleared_data();
self.column.get_docids_for_value_range(
self.value_range.clone(),
self.next_fetch_start..end,
doc_buffer,
);
if let Some(last_doc) = last_doc {
while self.loaded_docs.current() == Some(last_doc) {
if let Some(last_value) = last_value {
while self.loaded_docs.current() == Some(last_value) {
self.loaded_docs.next();
}
}
@@ -136,7 +136,7 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
if let Some(docid) = self.loaded_docs.next() {
return docid;
}
if self.next_fetch_start >= self.column.num_docs() {
if self.next_fetch_start >= self.column.values.num_vals() {
return TERMINATED;
}
self.fetch_block();
@@ -177,54 +177,3 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
0 // heuristic possible by checking number of hits when fetching a block
}
}
#[cfg(test)]
mod tests {
use crate::collector::Count;
use crate::directory::RamDirectory;
use crate::query::RangeQuery;
use crate::{schema, Document, IndexBuilder};
#[test]
fn range_query_fast_optional_field_minimum() {
let mut schema_builder = schema::SchemaBuilder::new();
let id_field = schema_builder.add_text_field("id", schema::STRING);
let score_field = schema_builder.add_u64_field("score", schema::FAST | schema::INDEXED);
let dir = RamDirectory::default();
let index = IndexBuilder::new()
.schema(schema_builder.build())
.open_or_create(dir)
.unwrap();
{
let mut writer = index.writer(15_000_000).unwrap();
let count = 1000;
for i in 0..count {
let mut doc = Document::new();
doc.add_text(id_field, format!("doc{i}"));
let nb_scores = i % 2; // 0 or 1 scores
for _ in 0..nb_scores {
doc.add_u64(score_field, 80);
}
writer.add_document(doc).unwrap();
}
writer.commit().unwrap();
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let query = RangeQuery::new_u64_bounds(
"score".to_string(),
std::ops::Bound::Included(70),
std::ops::Bound::Unbounded,
);
let count = searcher.search(&query, &Count).unwrap();
assert_eq!(count, 500);
}
}

View File

@@ -397,20 +397,29 @@ where B: AsRef<[u8]>
Some(Ipv6Addr::from_u128(ip_u128))
}
/// Returns the json path (without non-human friendly separators),
/// Returns the json path type.
///
/// Returns `None` if the value is not JSON.
pub fn json_path_type(&self) -> Option<Type> {
let json_value_bytes = self.as_json_value_bytes()?;
Some(json_value_bytes.typ())
}
/// Returns the json path bytes (including the JSON_END_OF_PATH byte),
/// and the encoded ValueBytes after the json path.
///
/// Returns `None` if the value is not JSON.
pub(crate) fn as_json(&self) -> Option<(&str, ValueBytes<&[u8]>)> {
pub(crate) fn as_json(&self) -> Option<(&[u8], ValueBytes<&[u8]>)> {
if self.typ() != Type::Json {
return None;
}
let bytes = self.value_bytes();
let pos = bytes.iter().cloned().position(|b| b == JSON_END_OF_PATH)?;
let (json_path_bytes, term) = bytes.split_at(pos);
let json_path = str::from_utf8(json_path_bytes).ok()?;
Some((json_path, ValueBytes::wrap(&term[1..])))
// split at pos + 1, so that json_path_bytes includes the JSON_END_OF_PATH byte.
let (json_path_bytes, term) = bytes.split_at(pos + 1);
Some((json_path_bytes, ValueBytes::wrap(term)))
}
/// Returns the encoded ValueBytes after the json path.
@@ -469,7 +478,10 @@ where B: AsRef<[u8]>
write_opt(f, self.as_bytes())?;
}
Type::Json => {
if let Some((path, sub_value_bytes)) = self.as_json() {
if let Some((path_bytes, sub_value_bytes)) = self.as_json() {
// Remove the JSON_END_OF_PATH byte & convert to utf8.
let path = str::from_utf8(&path_bytes[..path_bytes.len() - 1])
.map_err(|_| std::fmt::Error)?;
let path_pretty = path.replace(JSON_PATH_SEGMENT_SEP_STR, ".");
write!(f, "path={path_pretty}, ")?;
sub_value_bytes.debug_value_bytes(f)?;

View File

@@ -13,7 +13,8 @@ description = "sstables for tantivy"
common = {version= "0.6", path="../common", package="tantivy-common"}
tantivy-fst = "0.4"
# experimental gives us access to Decompressor::upper_bound
zstd = { version = "0.12", features = ["experimental"] }
zstd = { version = "0.13", features = ["experimental"] }
tracing = "0.1"
[dev-dependencies]
proptest = "1"

View File

@@ -3,6 +3,7 @@ use std::io;
use std::marker::PhantomData;
use std::ops::{Bound, RangeBounds};
use std::sync::Arc;
use tracing::instrument;
use common::file_slice::FileSlice;
use common::{BinarySerializable, OwnedBytes};
@@ -94,6 +95,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
Ok(TSSTable::delta_reader(data))
}
#[instrument(skip_all)]
pub(crate) async fn sstable_delta_reader_block_async(
&self,
block_addr: BlockAddr,
@@ -232,6 +234,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
/// If the key was not found, returns Ok(None).
/// After calling this function, it is possible to call `DeltaReader::value` to get the
/// associated value.
#[instrument(skip_all)]
fn decode_up_to_key<K: AsRef<[u8]>>(
&self,
key: K,
@@ -344,6 +347,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
}
/// Lookups the value corresponding to the key.
#[instrument(skip_all)]
pub async fn get_async<K: AsRef<[u8]>>(&self, key: K) -> io::Result<Option<TSSTable::Value>> {
if let Some(block_addr) = self.sstable_index.get_block_with_key(key.as_ref()) {
let sstable_reader = self.sstable_delta_reader_block_async(block_addr).await?;

View File

@@ -1,6 +1,7 @@
use std::io::{self, Write};
use std::ops::Range;
use tracing::instrument;
use common::OwnedBytes;
use crate::{common_prefix_len, SSTable, SSTableDataCorruption, TermOrdinal};
@@ -27,6 +28,7 @@ impl SSTableIndex {
}
/// Get the [`BlockAddr`] of the requested block.
#[instrument]
pub(crate) fn get_block(&self, block_id: usize) -> Option<BlockAddr> {
self.blocks
.get(block_id)
@@ -56,6 +58,7 @@ impl SSTableIndex {
/// Get the [`BlockAddr`] of the block that would contain `key`.
///
/// Returns None if `key` is lexicographically after the last key recorded.
#[instrument]
pub fn get_block_with_key(&self, key: &[u8]) -> Option<BlockAddr> {
self.locate_with_key(key).and_then(|id| self.get_block(id))
}