Compare commits

..

3 Commits

Author SHA1 Message Date
Pascal Seitz
722b6c5205 bump version 2023-10-25 20:41:07 +08:00
Pascal Seitz
0f2211ca44 increase min memory to 15MB for indexing
With tantivy 0.20 the minimum memory consumption per SegmentWriter increased to
12MB, 7MB of which go to the different fast field collector types (they could be
created lazily). Increase the minimum memory from 3MB to 15MB.

Rename the memory variable from arena to budget.

closes #2156
2023-10-25 20:37:47 +08:00
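A minimal sketch (not part of this diff) of how the budget is passed to the writer, assuming tantivy's public writer_with_num_threads API, which the touched tests below also use; the schema and field name are illustrative.

use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    // One indexing thread with a 15MB budget, the new minimum accepted value;
    // each SegmentWriter works within its share of this budget (formerly
    // called the "arena", now the "budget").
    let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
    writer.add_document(doc!(text => "hello budget"))?;
    writer.commit()?;
    Ok(())
}
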
PSeitz
21aabf961c Fix range query (#2226)
Fix the range query end check in advance()
Rename vars to reduce ambiguity
Add tests

Fixes #2225
2023-10-25 20:37:36 +08:00
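A minimal sketch (not part of this diff) of the kind of fast-field range query whose end check this commit fixes, assuming the RangeQuery::new_u64_bounds constructor that also appears in the touched tests; the id field and document count mirror the added test_fast_field_range test.

use std::ops::Bound;

use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, FAST, INDEXED};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let id = schema_builder.add_u64_field("id", INDEXED | FAST);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
    for i in 0u64..1_000 {
        writer.add_document(doc!(id => i))?;
    }
    writer.commit()?;

    // Inclusive range over the u64 fast field; queries like this exercise the
    // end-of-range check in advance that issue #2225 reported as broken.
    let query = RangeQuery::new_u64_bounds(
        "id".to_string(),
        Bound::Included(100u64),
        Bound::Included(199u64),
    );
    let searcher = index.reader()?.searcher();
    assert_eq!(searcher.search(&query, &Count)?, 100);
    Ok(())
}
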
23 changed files with 286 additions and 1046 deletions

View File

@@ -15,13 +15,13 @@ jobs:
coverage:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Install Rust
run: rustup toolchain install nightly-2023-09-10 --profile minimal --component llvm-tools-preview
run: rustup toolchain install nightly --profile minimal --component llvm-tools-preview
- uses: Swatinem/rust-cache@v2
- uses: taiki-e/install-action@cargo-llvm-cov
- name: Generate code coverage
run: cargo +nightly-2023-09-10 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
run: cargo +nightly llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
continue-on-error: true

View File

@@ -19,7 +19,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Install stable
uses: actions-rs/toolchain@v1
with:

View File

@@ -20,7 +20,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Install nightly
uses: actions-rs/toolchain@v1
@@ -60,7 +60,7 @@ jobs:
name: test-${{ matrix.features.label}}
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Install stable
uses: actions-rs/toolchain@v1

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.21.0"
version = "0.21.1"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]

View File

@@ -24,7 +24,7 @@ const SPECIAL_CHARS: &[char] = &[
/// consume a field name followed by colon. Return the field name with escape sequence
/// already interpreted
fn field_name(inp: &str) -> IResult<&str, String> {
fn field_name(i: &str) -> IResult<&str, String> {
let simple_char = none_of(SPECIAL_CHARS);
let first_char = verify(none_of(SPECIAL_CHARS), |c| *c != '-');
let escape_sequence = || preceded(char('\\'), one_of(SPECIAL_CHARS));
@@ -38,12 +38,12 @@ fn field_name(inp: &str) -> IResult<&str, String> {
char(':'),
),
|(first_char, next)| once(first_char).chain(next).collect(),
)(inp)
)(i)
}
/// Consume a word outside of any context.
// TODO should support escape sequences
fn word(inp: &str) -> IResult<&str, &str> {
fn word(i: &str) -> IResult<&str, &str> {
map_res(
recognize(tuple((
satisfy(|c| {
@@ -55,14 +55,14 @@ fn word(inp: &str) -> IResult<&str, &str> {
})),
))),
|s| match s {
"OR" | "AND" | "NOT" | "IN" => Err(Error::new(inp, ErrorKind::Tag)),
"OR" | "AND" | "NOT" | "IN" => Err(Error::new(i, ErrorKind::Tag)),
_ => Ok(s),
},
)(inp)
)(i)
}
fn word_infallible(delimiter: &str) -> impl Fn(&str) -> JResult<&str, Option<&str>> + '_ {
|inp| {
|i| {
opt_i_err(
preceded(
space0,
@@ -71,29 +71,29 @@ fn word_infallible(delimiter: &str) -> impl Fn(&str) -> JResult<&str, Option<&st
}))),
),
"expected word",
)(inp)
)(i)
}
}
/// Consume a word inside a Range context. More values are allowed as they are
/// not ambiguous in this context.
fn relaxed_word(inp: &str) -> IResult<&str, &str> {
fn relaxed_word(i: &str) -> IResult<&str, &str> {
recognize(tuple((
satisfy(|c| !c.is_whitespace() && !['`', '{', '}', '"', '[', ']', '(', ')'].contains(&c)),
many0(satisfy(|c: char| {
!c.is_whitespace() && !['{', '}', '"', '[', ']', '(', ')'].contains(&c)
})),
)))(inp)
)))(i)
}
fn negative_number(inp: &str) -> IResult<&str, &str> {
fn negative_number(i: &str) -> IResult<&str, &str> {
recognize(preceded(
char('-'),
tuple((digit1, opt(tuple((char('.'), digit1))))),
))(inp)
))(i)
}
fn simple_term(inp: &str) -> IResult<&str, (Delimiter, String)> {
fn simple_term(i: &str) -> IResult<&str, (Delimiter, String)> {
let escaped_string = |delimiter| {
// we need this because none_of can't accept an owned array of char.
let not_delimiter = verify(anychar, move |parsed| *parsed != delimiter);
@@ -123,13 +123,13 @@ fn simple_term(inp: &str) -> IResult<&str, (Delimiter, String)> {
simple_quotes,
double_quotes,
text_no_delimiter,
))(inp)
))(i)
}
fn simple_term_infallible(
delimiter: &str,
) -> impl Fn(&str) -> JResult<&str, Option<(Delimiter, String)>> + '_ {
|inp| {
|i| {
let escaped_string = |delimiter| {
// we need this because none_of can't accept an owned array of char.
let not_delimiter = verify(anychar, move |parsed| *parsed != delimiter);
@@ -162,11 +162,11 @@ fn simple_term_infallible(
map(word_infallible(delimiter), |(text, errors)| {
(text.map(|text| (Delimiter::None, text.to_string())), errors)
}),
)(inp)
)(i)
}
}
fn term_or_phrase(inp: &str) -> IResult<&str, UserInputLeaf> {
fn term_or_phrase(i: &str) -> IResult<&str, UserInputLeaf> {
map(
tuple((simple_term, fallible(slop_or_prefix_val))),
|((delimiter, phrase), (slop, prefix))| {
@@ -179,10 +179,10 @@ fn term_or_phrase(inp: &str) -> IResult<&str, UserInputLeaf> {
}
.into()
},
)(inp)
)(i)
}
fn term_or_phrase_infallible(inp: &str) -> JResult<&str, Option<UserInputLeaf>> {
fn term_or_phrase_infallible(i: &str) -> JResult<&str, Option<UserInputLeaf>> {
map(
// ~* for slop/prefix, ) inside group or ast tree, ^ if boost
tuple_infallible((simple_term_infallible("*)^"), slop_or_prefix_val)),
@@ -214,10 +214,10 @@ fn term_or_phrase_infallible(inp: &str) -> JResult<&str, Option<UserInputLeaf>>
};
(leaf, errors)
},
)(inp)
)(i)
}
fn term_group(inp: &str) -> IResult<&str, UserInputAst> {
fn term_group(i: &str) -> IResult<&str, UserInputAst> {
let occur_symbol = alt((
value(Occur::MustNot, char('-')),
value(Occur::Must, char('+')),
@@ -240,12 +240,12 @@ fn term_group(inp: &str) -> IResult<&str, UserInputAst> {
.collect(),
)
},
)(inp)
)(i)
}
// this is a precondition for term_group_infallible. Without it, term_group_infallible can fail
// with a panic. It does not consume its input.
fn term_group_precond(inp: &str) -> IResult<&str, (), ()> {
fn term_group_precond(i: &str) -> IResult<&str, (), ()> {
value(
(),
peek(tuple((
@@ -253,13 +253,13 @@ fn term_group_precond(inp: &str) -> IResult<&str, (), ()> {
space0,
char('('), // when we are here, we know it can't be anything but a term group
))),
)(inp)
)(i)
.map_err(|e| e.map(|_| ()))
}
fn term_group_infallible(inp: &str) -> JResult<&str, UserInputAst> {
let (mut inp, (field_name, _, _, _)) =
tuple((field_name, space0, char('('), space0))(inp).expect("precondition failed");
fn term_group_infallible(i: &str) -> JResult<&str, UserInputAst> {
let (mut i, (field_name, _, _, _)) =
tuple((field_name, space0, char('('), space0))(i).expect("precondition failed");
let mut terms = Vec::new();
let mut errs = Vec::new();
@@ -270,19 +270,19 @@ fn term_group_infallible(inp: &str) -> JResult<&str, UserInputAst> {
first_round = false;
Vec::new()
} else {
let (rest, (_, err)) = space1_infallible(inp)?;
inp = rest;
let (rest, (_, err)) = space1_infallible(i)?;
i = rest;
err
};
if inp.is_empty() {
if i.is_empty() {
errs.push(LenientErrorInternal {
pos: inp.len(),
pos: i.len(),
message: "missing )".to_string(),
});
break Ok((inp, (UserInputAst::Clause(terms), errs)));
break Ok((i, (UserInputAst::Clause(terms), errs)));
}
if let Some(inp) = inp.strip_prefix(')') {
break Ok((inp, (UserInputAst::Clause(terms), errs)));
if let Some(i) = i.strip_prefix(')') {
break Ok((i, (UserInputAst::Clause(terms), errs)));
}
// only append missing space error if we did not reach the end of group
errs.append(&mut space_error);
@@ -291,57 +291,26 @@ fn term_group_infallible(inp: &str) -> JResult<&str, UserInputAst> {
// first byte is not `)` or ' '. If it did not, we would end up looping.
let (rest, ((occur, leaf), mut err)) =
tuple_infallible((occur_symbol, term_or_phrase_infallible))(inp)?;
tuple_infallible((occur_symbol, term_or_phrase_infallible))(i)?;
errs.append(&mut err);
if let Some(leaf) = leaf {
terms.push((occur, leaf.set_field(Some(field_name.clone())).into()));
}
inp = rest;
i = rest;
}
}
fn exists(inp: &str) -> IResult<&str, UserInputLeaf> {
value(
UserInputLeaf::Exists {
field: String::new(),
},
tuple((space0, char('*'))),
)(inp)
}
fn exists_precond(inp: &str) -> IResult<&str, (), ()> {
value(
(),
peek(tuple((
field_name,
space0,
char('*'), // when we are here, we know it can't be anything but an exists
))),
)(inp)
.map_err(|e| e.map(|_| ()))
}
fn exists_infallible(inp: &str) -> JResult<&str, UserInputAst> {
let (inp, (field_name, _, _)) =
tuple((field_name, space0, char('*')))(inp).expect("precondition failed");
let exists = UserInputLeaf::Exists { field: field_name }.into();
Ok((inp, (exists, Vec::new())))
}
fn literal(inp: &str) -> IResult<&str, UserInputAst> {
// * alone is already parsed by our caller, so if `exists` succeeds, we can be confident
// something (a field name) got parsed before
fn literal(i: &str) -> IResult<&str, UserInputAst> {
alt((
map(
tuple((opt(field_name), alt((range, set, exists, term_or_phrase)))),
tuple((opt(field_name), alt((range, set, term_or_phrase)))),
|(field_name, leaf): (Option<String>, UserInputLeaf)| leaf.set_field(field_name).into(),
),
term_group,
))(inp)
))(i)
}
fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
fn literal_no_group_infallible(i: &str) -> JResult<&str, Option<UserInputAst>> {
map(
tuple_infallible((
opt_i(field_name),
@@ -368,7 +337,7 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
&& field_name.is_none()
{
errors.push(LenientErrorInternal {
pos: inp.len(),
pos: i.len(),
message: "parsed possible invalid field as term".to_string(),
});
}
@@ -377,7 +346,7 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
&& field_name.is_none()
{
errors.push(LenientErrorInternal {
pos: inp.len(),
pos: i.len(),
message: "parsed keyword NOT as term. It should be quoted".to_string(),
});
}
@@ -386,40 +355,34 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
errors,
)
},
)(inp)
)(i)
}
fn literal_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
fn literal_infallible(i: &str) -> JResult<&str, Option<UserInputAst>> {
alt_infallible(
(
(
term_group_precond,
map(term_group_infallible, |(group, errs)| (Some(group), errs)),
),
(
exists_precond,
map(exists_infallible, |(exists, errs)| (Some(exists), errs)),
),
),
((
term_group_precond,
map(term_group_infallible, |(group, errs)| (Some(group), errs)),
),),
literal_no_group_infallible,
)(inp)
)(i)
}
fn slop_or_prefix_val(inp: &str) -> JResult<&str, (u32, bool)> {
fn slop_or_prefix_val(i: &str) -> JResult<&str, (u32, bool)> {
map(
opt_i(alt((
value((0, true), char('*')),
map(preceded(char('~'), u32), |slop| (slop, false)),
))),
|(slop_or_prefix_opt, err)| (slop_or_prefix_opt.unwrap_or_default(), err),
)(inp)
)(i)
}
/// Function that parses a range out of a Stream
/// Supports ranges like:
/// [5 TO 10], {5 TO 10}, [* TO 10], [10 TO *], {10 TO *], >5, <=10
/// [a TO *], [a TO c], [abc TO bcd}
fn range(inp: &str) -> IResult<&str, UserInputLeaf> {
fn range(i: &str) -> IResult<&str, UserInputLeaf> {
let range_term_val = || {
map(
alt((negative_number, relaxed_word, tag("*"))),
@@ -479,10 +442,10 @@ fn range(inp: &str) -> IResult<&str, UserInputLeaf> {
lower,
upper,
},
)(inp)
)(i)
}
fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
fn range_infallible(i: &str) -> JResult<&str, UserInputLeaf> {
let lower_to_upper = map(
tuple_infallible((
opt_i(anychar),
@@ -590,10 +553,10 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
errors,
)
},
)(inp)
)(i)
}
fn set(inp: &str) -> IResult<&str, UserInputLeaf> {
fn set(i: &str) -> IResult<&str, UserInputLeaf> {
map(
preceded(
tuple((space0, tag("IN"), space1)),
@@ -607,10 +570,10 @@ fn set(inp: &str) -> IResult<&str, UserInputLeaf> {
field: None,
elements,
},
)(inp)
)(i)
}
fn set_infallible(mut inp: &str) -> JResult<&str, UserInputLeaf> {
fn set_infallible(mut i: &str) -> JResult<&str, UserInputLeaf> {
// `IN [` has already been parsed when we enter, we only need to parse simple terms until we
// find a `]`
let mut elements = Vec::new();
@@ -621,41 +584,41 @@ fn set_infallible(mut inp: &str) -> JResult<&str, UserInputLeaf> {
first_round = false;
Vec::new()
} else {
let (rest, (_, err)) = space1_infallible(inp)?;
inp = rest;
let (rest, (_, err)) = space1_infallible(i)?;
i = rest;
err
};
if inp.is_empty() {
if i.is_empty() {
// TODO push error about missing ]
//
errs.push(LenientErrorInternal {
pos: inp.len(),
pos: i.len(),
message: "missing ]".to_string(),
});
let res = UserInputLeaf::Set {
field: None,
elements,
};
return Ok((inp, (res, errs)));
return Ok((i, (res, errs)));
}
if let Some(inp) = inp.strip_prefix(']') {
if let Some(i) = i.strip_prefix(']') {
let res = UserInputLeaf::Set {
field: None,
elements,
};
return Ok((inp, (res, errs)));
return Ok((i, (res, errs)));
}
errs.append(&mut space_error);
// TODO
// here we make the assumption term_or_phrase_infallible always consumes something if the
// first byte is not `)` or ' '. If it did not, we would end up looping.
let (rest, (delim_term, mut err)) = simple_term_infallible("]")(inp)?;
let (rest, (delim_term, mut err)) = simple_term_infallible("]")(i)?;
errs.append(&mut err);
if let Some((_, term)) = delim_term {
elements.push(term);
}
inp = rest;
i = rest;
}
}
@@ -663,16 +626,16 @@ fn negate(expr: UserInputAst) -> UserInputAst {
expr.unary(Occur::MustNot)
}
fn leaf(inp: &str) -> IResult<&str, UserInputAst> {
fn leaf(i: &str) -> IResult<&str, UserInputAst> {
alt((
delimited(char('('), ast, char(')')),
map(char('*'), |_| UserInputAst::from(UserInputLeaf::All)),
map(preceded(tuple((tag("NOT"), space1)), leaf), negate),
literal,
))(inp)
))(i)
}
fn leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
fn leaf_infallible(i: &str) -> JResult<&str, Option<UserInputAst>> {
alt_infallible(
(
(
@@ -702,23 +665,23 @@ fn leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
),
),
literal_infallible,
)(inp)
)(i)
}
fn positive_float_number(inp: &str) -> IResult<&str, f64> {
fn positive_float_number(i: &str) -> IResult<&str, f64> {
map(
recognize(tuple((digit1, opt(tuple((char('.'), digit1)))))),
// TODO this is actually dangerous if the number is actually not representable as a f64
// (too big for instance)
|float_str: &str| float_str.parse::<f64>().unwrap(),
)(inp)
)(i)
}
fn boost(inp: &str) -> JResult<&str, Option<f64>> {
opt_i(preceded(char('^'), positive_float_number))(inp)
fn boost(i: &str) -> JResult<&str, Option<f64>> {
opt_i(preceded(char('^'), positive_float_number))(i)
}
fn boosted_leaf(inp: &str) -> IResult<&str, UserInputAst> {
fn boosted_leaf(i: &str) -> IResult<&str, UserInputAst> {
map(
tuple((leaf, fallible(boost))),
|(leaf, boost_opt)| match boost_opt {
@@ -727,10 +690,10 @@ fn boosted_leaf(inp: &str) -> IResult<&str, UserInputAst> {
}
_ => leaf,
},
)(inp)
)(i)
}
fn boosted_leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
fn boosted_leaf_infallible(i: &str) -> JResult<&str, Option<UserInputAst>> {
map(
tuple_infallible((leaf_infallible, boost)),
|((leaf, boost_opt), error)| match boost_opt {
@@ -740,30 +703,30 @@ fn boosted_leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
),
_ => (leaf, error),
},
)(inp)
)(i)
}
fn occur_symbol(inp: &str) -> JResult<&str, Option<Occur>> {
fn occur_symbol(i: &str) -> JResult<&str, Option<Occur>> {
opt_i(alt((
value(Occur::MustNot, char('-')),
value(Occur::Must, char('+')),
)))(inp)
)))(i)
}
fn occur_leaf(inp: &str) -> IResult<&str, (Option<Occur>, UserInputAst)> {
tuple((fallible(occur_symbol), boosted_leaf))(inp)
fn occur_leaf(i: &str) -> IResult<&str, (Option<Occur>, UserInputAst)> {
tuple((fallible(occur_symbol), boosted_leaf))(i)
}
#[allow(clippy::type_complexity)]
fn operand_occur_leaf_infallible(
inp: &str,
i: &str,
) -> JResult<&str, (Option<BinaryOperand>, Option<Occur>, Option<UserInputAst>)> {
// TODO maybe this should support multiple chained AND/OR, and "fuse" them?
tuple_infallible((
delimited_infallible(nothing, opt_i(binary_operand), space0_infallible),
occur_symbol,
boosted_leaf_infallible,
))(inp)
))(i)
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
@@ -772,11 +735,11 @@ enum BinaryOperand {
And,
}
fn binary_operand(inp: &str) -> IResult<&str, BinaryOperand> {
fn binary_operand(i: &str) -> IResult<&str, BinaryOperand> {
alt((
value(BinaryOperand::And, tag("AND ")),
value(BinaryOperand::Or, tag("OR ")),
))(inp)
))(i)
}
fn aggregate_binary_expressions(
@@ -917,14 +880,14 @@ fn aggregate_infallible_expressions(
}
}
fn operand_leaf(inp: &str) -> IResult<&str, (BinaryOperand, UserInputAst)> {
fn operand_leaf(i: &str) -> IResult<&str, (BinaryOperand, UserInputAst)> {
tuple((
terminated(binary_operand, space0),
terminated(boosted_leaf, space0),
))(inp)
))(i)
}
fn ast(inp: &str) -> IResult<&str, UserInputAst> {
fn ast(i: &str) -> IResult<&str, UserInputAst> {
let boolean_expr = map(
separated_pair(boosted_leaf, space1, many1(operand_leaf)),
|(left, right)| aggregate_binary_expressions(left, right),
@@ -945,10 +908,10 @@ fn ast(inp: &str) -> IResult<&str, UserInputAst> {
space0,
alt((boolean_expr, whitespace_separated_leaves)),
space0,
)(inp)
)(i)
}
fn ast_infallible(inp: &str) -> JResult<&str, UserInputAst> {
fn ast_infallible(i: &str) -> JResult<&str, UserInputAst> {
// ast() parses either `term AND term OR term` or `+term term -term`
// both are locally ambiguous, and as we allow error, it's hard to permit backtracking.
// Instead, we allow a mix of both syntaxes, trying to make sense of what a user meant.
@@ -965,13 +928,13 @@ fn ast_infallible(inp: &str) -> JResult<&str, UserInputAst> {
},
);
delimited_infallible(space0_infallible, expression, space0_infallible)(inp)
delimited_infallible(space0_infallible, expression, space0_infallible)(i)
}
pub fn parse_to_ast(inp: &str) -> IResult<&str, UserInputAst> {
pub fn parse_to_ast(i: &str) -> IResult<&str, UserInputAst> {
map(delimited(space0, opt(ast), eof), |opt_ast| {
rewrite_ast(opt_ast.unwrap_or_else(UserInputAst::empty_query))
})(inp)
})(i)
}
pub fn parse_to_ast_lenient(query_str: &str) -> (UserInputAst, Vec<LenientError>) {
@@ -1575,17 +1538,6 @@ mod test {
test_parse_query_to_ast_helper("foo:\"\"*", "\"foo\":\"\"*");
}
#[test]
fn test_exist_query() {
test_parse_query_to_ast_helper("a:*", "\"a\":*");
test_parse_query_to_ast_helper("a: *", "\"a\":*");
// an exist followed by default term being b
test_is_parse_err("a:*b", "(*\"a\":* *b)");
// this is a term query (not a phrase prefix)
test_parse_query_to_ast_helper("a:b*", "\"a\":b*");
}
#[test]
fn test_not_queries_are_consistent() {
test_parse_query_to_ast_helper("tata -toto", "(*tata -toto)");

View File

@@ -16,9 +16,6 @@ pub enum UserInputLeaf {
field: Option<String>,
elements: Vec<String>,
},
Exists {
field: String,
},
}
impl UserInputLeaf {
@@ -39,9 +36,6 @@ impl UserInputLeaf {
upper,
},
UserInputLeaf::Set { field: _, elements } => UserInputLeaf::Set { field, elements },
UserInputLeaf::Exists { field: _ } => UserInputLeaf::Exists {
field: field.expect("Exist query without a field isn't allowed"),
},
}
}
}
@@ -80,9 +74,6 @@ impl Debug for UserInputLeaf {
write!(formatter, "]")
}
UserInputLeaf::All => write!(formatter, "*"),
UserInputLeaf::Exists { field } => {
write!(formatter, "\"{field}\":*")
}
}
}
}

View File

@@ -134,142 +134,3 @@ impl Drop for ResourceLimitGuard {
.fetch_sub(self.allocated_with_the_guard, Ordering::Relaxed);
}
}
#[cfg(test)]
mod tests {
use crate::aggregation::tests::exec_request_with_query;
// https://github.com/quickwit-oss/quickwit/issues/3837
#[test]
fn test_agg_limits_with_empty_merge() {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::bucket::tests::get_test_index_from_docs;
let docs = vec![
vec![r#"{ "date": "2015-01-02T00:00:00Z", "text": "bbb", "text2": "bbb" }"#],
vec![r#"{ "text": "aaa", "text2": "bbb" }"#],
];
let index = get_test_index_from_docs(false, &docs).unwrap();
{
let elasticsearch_compatible_json = json!(
{
"1": {
"terms": {"field": "text2", "min_doc_count": 0},
"aggs": {
"2":{
"date_histogram": {
"field": "date",
"fixed_interval": "1d",
"extended_bounds": {
"min": "2015-01-01T00:00:00Z",
"max": "2015-01-10T00:00:00Z"
}
}
}
}
}
}
);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request_with_query(agg_req, &index, Some(("text", "bbb"))).unwrap();
let expected_res = json!({
"1": {
"buckets": [
{
"2": {
"buckets": [
{ "doc_count": 0, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z" },
{ "doc_count": 1, "key": 1420156800000.0, "key_as_string": "2015-01-02T00:00:00Z" },
{ "doc_count": 0, "key": 1420243200000.0, "key_as_string": "2015-01-03T00:00:00Z" },
{ "doc_count": 0, "key": 1420329600000.0, "key_as_string": "2015-01-04T00:00:00Z" },
{ "doc_count": 0, "key": 1420416000000.0, "key_as_string": "2015-01-05T00:00:00Z" },
{ "doc_count": 0, "key": 1420502400000.0, "key_as_string": "2015-01-06T00:00:00Z" },
{ "doc_count": 0, "key": 1420588800000.0, "key_as_string": "2015-01-07T00:00:00Z" },
{ "doc_count": 0, "key": 1420675200000.0, "key_as_string": "2015-01-08T00:00:00Z" },
{ "doc_count": 0, "key": 1420761600000.0, "key_as_string": "2015-01-09T00:00:00Z" },
{ "doc_count": 0, "key": 1420848000000.0, "key_as_string": "2015-01-10T00:00:00Z" }
]
},
"doc_count": 1,
"key": "bbb"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
});
assert_eq!(res, expected_res);
}
}
// https://github.com/quickwit-oss/quickwit/issues/3837
#[test]
fn test_agg_limits_with_empty_data() {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::bucket::tests::get_test_index_from_docs;
let docs = vec![vec![r#"{ "text": "aaa", "text2": "bbb" }"#]];
let index = get_test_index_from_docs(false, &docs).unwrap();
{
// Empty result since there is no doc with dates
let elasticsearch_compatible_json = json!(
{
"1": {
"terms": {"field": "text2", "min_doc_count": 0},
"aggs": {
"2":{
"date_histogram": {
"field": "date",
"fixed_interval": "1d",
"extended_bounds": {
"min": "2015-01-01T00:00:00Z",
"max": "2015-01-10T00:00:00Z"
}
}
}
}
}
}
);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request_with_query(agg_req, &index, Some(("text", "bbb"))).unwrap();
let expected_res = json!({
"1": {
"buckets": [
{
"2": {
"buckets": [
{ "doc_count": 0, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z" },
{ "doc_count": 0, "key": 1420156800000.0, "key_as_string": "2015-01-02T00:00:00Z" },
{ "doc_count": 0, "key": 1420243200000.0, "key_as_string": "2015-01-03T00:00:00Z" },
{ "doc_count": 0, "key": 1420329600000.0, "key_as_string": "2015-01-04T00:00:00Z" },
{ "doc_count": 0, "key": 1420416000000.0, "key_as_string": "2015-01-05T00:00:00Z" },
{ "doc_count": 0, "key": 1420502400000.0, "key_as_string": "2015-01-06T00:00:00Z" },
{ "doc_count": 0, "key": 1420588800000.0, "key_as_string": "2015-01-07T00:00:00Z" },
{ "doc_count": 0, "key": 1420675200000.0, "key_as_string": "2015-01-08T00:00:00Z" },
{ "doc_count": 0, "key": 1420761600000.0, "key_as_string": "2015-01-09T00:00:00Z" },
{ "doc_count": 0, "key": 1420848000000.0, "key_as_string": "2015-01-10T00:00:00Z" }
]
},
"doc_count": 0,
"key": "bbb"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
});
assert_eq!(res, expected_res);
}
}
}

View File

@@ -103,8 +103,7 @@ impl AggregationWithAccessor {
field: field_name, ..
}) => {
let (accessor, column_type) =
// Only DateTime is supported for DateHistogram
get_ff_reader(reader, field_name, Some(&[ColumnType::DateTime]))?;
get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
add_agg_with_accessor(accessor, column_type, &mut res)?;
}
Terms(TermsAggregation {
@@ -118,10 +117,10 @@ impl AggregationWithAccessor {
ColumnType::U64,
ColumnType::F64,
ColumnType::Str,
ColumnType::DateTime,
// ColumnType::Bytes Unsupported
// ColumnType::Bool Unsupported
// ColumnType::IpAddr Unsupported
// ColumnType::DateTime Unsupported
];
// In case the column is empty we want the shim column to match the missing type
@@ -146,18 +145,7 @@ impl AggregationWithAccessor {
.map(|m| matches!(m, Key::Str(_)))
.unwrap_or(false);
// Actually we could convert the text to a number and have the fast path, if it is
// provided in Rfc3339 format. But this use case is probably not common
// enough to justify the effort.
let text_on_date_col = column_and_types.len() == 1
&& column_and_types[0].1 == ColumnType::DateTime
&& missing
.as_ref()
.map(|m| matches!(m, Key::Str(_)))
.unwrap_or(false);
let use_special_missing_agg =
missing_and_more_than_one_col || text_on_non_text_col || text_on_date_col;
let use_special_missing_agg = missing_and_more_than_one_col || text_on_non_text_col;
if use_special_missing_agg {
let column_and_types =
get_all_ff_reader_or_empty(reader, field_name, None, fallback_type)?;

View File

@@ -132,7 +132,6 @@ impl DateHistogramAggregationReq {
hard_bounds: self.hard_bounds,
extended_bounds: self.extended_bounds,
keyed: self.keyed,
is_normalized_to_ns: false,
})
}
@@ -244,14 +243,14 @@ fn parse_into_milliseconds(input: &str) -> Result<i64, AggregationError> {
}
#[cfg(test)]
pub mod tests {
mod tests {
use pretty_assertions::assert_eq;
use super::*;
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::exec_request;
use crate::indexer::NoMergePolicy;
use crate::schema::{Schema, FAST, STRING};
use crate::schema::{Schema, FAST};
use crate::Index;
#[test]
@@ -307,8 +306,7 @@ pub mod tests {
) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
schema_builder.add_date_field("date", FAST);
schema_builder.add_text_field("text", FAST | STRING);
schema_builder.add_text_field("text2", FAST | STRING);
schema_builder.add_text_field("text", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
{

View File

@@ -122,14 +122,11 @@ pub struct HistogramAggregation {
/// Whether to return the buckets as a hash map
#[serde(default)]
pub keyed: bool,
/// Whether the values are normalized to ns for date time values. Defaults to false.
#[serde(default)]
pub is_normalized_to_ns: bool,
}
impl HistogramAggregation {
pub(crate) fn normalize_date_time(&mut self) {
if !self.is_normalized_to_ns {
pub(crate) fn normalize(&mut self, column_type: ColumnType) {
if column_type.is_date_time() {
// values are provided in ms, but the fastfield is in nano seconds
self.interval *= 1_000_000.0;
self.offset = self.offset.map(|off| off * 1_000_000.0);
@@ -141,7 +138,6 @@ impl HistogramAggregation {
min: bounds.min * 1_000_000.0,
max: bounds.max * 1_000_000.0,
});
self.is_normalized_to_ns = true;
}
}
@@ -374,7 +370,7 @@ impl SegmentHistogramCollector {
Ok(IntermediateBucketResult::Histogram {
buckets,
is_date_agg: self.column_type == ColumnType::DateTime,
column_type: Some(self.column_type),
})
}
@@ -385,9 +381,7 @@ impl SegmentHistogramCollector {
accessor_idx: usize,
) -> crate::Result<Self> {
req.validate()?;
if field_type == ColumnType::DateTime {
req.normalize_date_time();
}
req.normalize(field_type);
let sub_aggregation_blueprint = if sub_aggregation.is_empty() {
None
@@ -445,7 +439,6 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
// memory check upfront
let (_, first_bucket_num, last_bucket_num) =
generate_bucket_pos_with_opt_minmax(histogram_req, min_max);
// It's based on user input, so we need to account for overflows
let added_buckets = ((last_bucket_num.saturating_sub(first_bucket_num)).max(0) as u64)
.saturating_sub(buckets.len() as u64);
@@ -489,7 +482,7 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
// Convert to BucketEntry
pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
buckets: Vec<IntermediateHistogramBucketEntry>,
is_date_agg: bool,
column_type: Option<ColumnType>,
histogram_req: &HistogramAggregation,
sub_aggregation: &Aggregations,
limits: &AggregationLimits,
@@ -498,8 +491,8 @@ pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
// The request used in the call to final is not yet normalized.
// Normalization is changing the precision from milliseconds to nanoseconds.
let mut histogram_req = histogram_req.clone();
if is_date_agg {
histogram_req.normalize_date_time();
if let Some(column_type) = column_type {
histogram_req.normalize(column_type);
}
let mut buckets = if histogram_req.min_doc_count() == 0 {
// With min_doc_count != 0, we may need to add buckets, so that there are no
@@ -523,7 +516,7 @@ pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
// If we have a date type on the histogram buckets, we add the `key_as_string` field as RFC 3339
// and normalize from nanoseconds to milliseconds
if is_date_agg {
if column_type == Some(ColumnType::DateTime) {
for bucket in buckets.iter_mut() {
if let crate::aggregation::Key::F64(ref mut val) = bucket.key {
let key_as_string = format_date(*val as i64)?;

View File

@@ -1,6 +1,6 @@
use std::fmt::Debug;
use columnar::{BytesColumn, ColumnType, MonotonicallyMappableToU64, StrColumn};
use columnar::{BytesColumn, ColumnType, StrColumn};
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
@@ -16,7 +16,7 @@ use crate::aggregation::intermediate_agg_result::{
use crate::aggregation::segment_agg_result::{
build_segment_agg_collector, SegmentAggregationCollector,
};
use crate::aggregation::{f64_from_fastfield_u64, format_date, Key};
use crate::aggregation::{f64_from_fastfield_u64, Key};
use crate::error::DataCorruption;
use crate::TantivyError;
@@ -531,13 +531,6 @@ impl SegmentTermCollector {
});
}
}
} else if self.field_type == ColumnType::DateTime {
for (val, doc_count) in entries {
let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?;
let val = i64::from_u64(val);
let date = format_date(val)?;
dict.insert(IntermediateKey::Str(date), intermediate_entry);
}
} else {
for (val, doc_count) in entries {
let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?;
@@ -590,9 +583,6 @@ pub(crate) fn cut_off_buckets<T: GetDocCount + Debug>(
#[cfg(test)]
mod tests {
use common::DateTime;
use time::{Date, Month};
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::{
exec_request, exec_request_with_query, exec_request_with_query_and_memory_limit,
@@ -1823,75 +1813,4 @@ mod tests {
Ok(())
}
#[test]
fn terms_aggregation_date() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1983, Month::September, 27)?.with_hms(0, 0, 0)?)))?;
writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"my_date": {
"terms": {
"field": "date_field"
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// date_field field
assert_eq!(res["my_date"]["buckets"][0]["key"], "1982-09-17T00:00:00Z");
assert_eq!(res["my_date"]["buckets"][0]["doc_count"], 2);
assert_eq!(res["my_date"]["buckets"][1]["key"], "1983-09-27T00:00:00Z");
assert_eq!(res["my_date"]["buckets"][1]["doc_count"], 1);
assert_eq!(res["my_date"]["buckets"][2]["key"], serde_json::Value::Null);
Ok(())
}
#[test]
fn terms_aggregation_date_missing() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1983, Month::September, 27)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!())?;
writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"my_date": {
"terms": {
"field": "date_field",
"missing": "1982-09-17T00:00:00Z"
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// date_field field
assert_eq!(res["my_date"]["buckets"][0]["key"], "1982-09-17T00:00:00Z");
assert_eq!(res["my_date"]["buckets"][0]["doc_count"], 3);
assert_eq!(res["my_date"]["buckets"][1]["key"], "1983-09-27T00:00:00Z");
assert_eq!(res["my_date"]["buckets"][1]["doc_count"], 1);
assert_eq!(res["my_date"]["buckets"][2]["key"], serde_json::Value::Null);
Ok(())
}
}

View File

@@ -172,16 +172,10 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
Range(_) => IntermediateAggregationResult::Bucket(IntermediateBucketResult::Range(
Default::default(),
)),
Histogram(_) => {
Histogram(_) | DateHistogram(_) => {
IntermediateAggregationResult::Bucket(IntermediateBucketResult::Histogram {
buckets: Vec::new(),
is_date_agg: false,
})
}
DateHistogram(_) => {
IntermediateAggregationResult::Bucket(IntermediateBucketResult::Histogram {
buckets: Vec::new(),
is_date_agg: true,
column_type: None,
})
}
Average(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Average(
@@ -349,8 +343,8 @@ pub enum IntermediateBucketResult {
/// This is the histogram entry for a bucket, which contains a key, count, and optionally
/// sub_aggregations.
Histogram {
/// The column_type of the underlying `Column` is DateTime
is_date_agg: bool,
/// The column_type of the underlying `Column`
column_type: Option<ColumnType>,
/// The buckets
buckets: Vec<IntermediateHistogramBucketEntry>,
},
@@ -405,7 +399,7 @@ impl IntermediateBucketResult {
Ok(BucketResult::Range { buckets })
}
IntermediateBucketResult::Histogram {
is_date_agg,
column_type,
buckets,
} => {
let histogram_req = &req
@@ -414,7 +408,7 @@ impl IntermediateBucketResult {
.expect("unexpected aggregation, expected histogram aggregation");
let buckets = intermediate_histogram_buckets_to_final_buckets(
buckets,
is_date_agg,
column_type,
histogram_req,
req.sub_aggregation(),
limits,
@@ -463,11 +457,11 @@ impl IntermediateBucketResult {
(
IntermediateBucketResult::Histogram {
buckets: buckets_left,
is_date_agg: _,
..
},
IntermediateBucketResult::Histogram {
buckets: buckets_right,
is_date_agg: _,
..
},
) => {
let buckets: Result<Vec<IntermediateHistogramBucketEntry>, TantivyError> =

View File

@@ -234,22 +234,6 @@ impl FastFieldReaders {
Ok(dynamic_column_handle_opt)
}
/// Returning all `dynamic_column_handle`.
pub fn dynamic_column_handles(
&self,
field_name: &str,
) -> crate::Result<Vec<DynamicColumnHandle>> {
let Some(resolved_field_name) = self.resolve_field(field_name)? else {
return Ok(Vec::new());
};
let dynamic_column_handles = self
.columnar
.read_columns(&resolved_field_name)?
.into_iter()
.collect();
Ok(dynamic_column_handles)
}
#[doc(hidden)]
pub async fn list_dynamic_column_handles(
&self,
@@ -354,8 +338,6 @@ impl FastFieldReaders {
#[cfg(test)]
mod tests {
use columnar::ColumnType;
use crate::schema::{JsonObjectOptions, Schema, FAST};
use crate::{Document, Index};
@@ -435,45 +417,4 @@ mod tests {
Some("_dyna\u{1}notinschema\u{1}attr\u{1}color".to_string())
);
}
#[test]
fn test_fast_field_reader_dynamic_column_handles() {
let mut schema_builder = Schema::builder();
let id = schema_builder.add_u64_field("id", FAST);
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(id=> 1u64, json => json!({"foo": 42})))
.unwrap();
index_writer
.add_document(doc!(id=> 2u64, json => json!({"foo": true})))
.unwrap();
index_writer
.add_document(doc!(id=> 3u64, json => json!({"foo": "bar"})))
.unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let reader = searcher.segment_reader(0u32);
let fast_fields = reader.fast_fields();
let id_columns = fast_fields.dynamic_column_handles("id").unwrap();
assert_eq!(id_columns.len(), 1);
assert_eq!(id_columns.first().unwrap().column_type(), ColumnType::U64);
let foo_columns = fast_fields.dynamic_column_handles("json.foo").unwrap();
assert_eq!(foo_columns.len(), 3);
assert!(foo_columns
.iter()
.any(|column| column.column_type() == ColumnType::I64));
assert!(foo_columns
.iter()
.any(|column| column.column_type() == ColumnType::Bool));
assert!(foo_columns
.iter()
.any(|column| column.column_type() == ColumnType::Str));
println!("*** {:?}", fast_fields.columnar().list_columns());
}
}

View File

@@ -620,7 +620,7 @@ impl IndexWriter {
for worker_handle in former_workers_join_handle {
let indexing_worker_result = worker_handle
.join()
.map_err(|e| TantivyError::ErrorInThread(e.to_string()))?;
.map_err(|e| TantivyError::ErrorInThread(format!("{e:?}")))?;
indexing_worker_result?;
self.add_indexing_worker()?;
}
@@ -1703,7 +1703,8 @@ mod tests {
let old_reader = index.reader()?;
let id_exists = |id| id % 3 != 0; // 0 does not exist
// Every 3rd doc has only id field
let id_is_full_doc = |id| id % 3 != 0;
let multi_text_field_text1 = "test1 test2 test3 test1 test2 test3";
// rotate left
@@ -1719,7 +1720,7 @@ mod tests {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
let ip = ip_from_id(id);
if !id_exists(id) {
if !id_is_full_doc(id) {
// every 3rd doc has no ip field
index_writer.add_document(doc!(
id_field=>id,
@@ -1839,7 +1840,7 @@ mod tests {
let num_docs_with_values = expected_ids_and_num_occurrences
.iter()
.filter(|(id, _id_occurrences)| id_exists(**id))
.filter(|(id, _id_occurrences)| id_is_full_doc(**id))
.map(|(_, id_occurrences)| *id_occurrences as usize)
.sum::<usize>();
@@ -1863,7 +1864,7 @@ mod tests {
if force_end_merge && num_segments_before_merge > 1 && num_segments_after_merge == 1 {
let mut expected_multi_ips: Vec<_> = id_list
.iter()
.filter(|id| id_exists(**id))
.filter(|id| id_is_full_doc(**id))
.flat_map(|id| vec![ip_from_id(*id), ip_from_id(*id)])
.collect();
assert_eq!(num_ips, expected_multi_ips.len() as u32);
@@ -1901,7 +1902,7 @@ mod tests {
let expected_ips = expected_ids_and_num_occurrences
.keys()
.flat_map(|id| {
if !id_exists(*id) {
if !id_is_full_doc(*id) {
None
} else {
Some(Ipv6Addr::from_u128(*id as u128))
@@ -1913,7 +1914,7 @@ mod tests {
let expected_ips = expected_ids_and_num_occurrences
.keys()
.filter_map(|id| {
if !id_exists(*id) {
if !id_is_full_doc(*id) {
None
} else {
Some(Ipv6Addr::from_u128(*id as u128))
@@ -1948,7 +1949,7 @@ mod tests {
let id = id_reader.first(doc).unwrap();
let vals: Vec<u64> = ff_reader.values_for_doc(doc).collect();
if id_exists(id) {
if id_is_full_doc(id) {
assert_eq!(vals.len(), 2);
assert_eq!(vals[0], vals[1]);
assert!(expected_ids_and_num_occurrences.contains_key(&vals[0]));
@@ -1958,7 +1959,7 @@ mod tests {
}
let bool_vals: Vec<bool> = bool_ff_reader.values_for_doc(doc).collect();
if id_exists(id) {
if id_is_full_doc(id) {
assert_eq!(bool_vals.len(), 2);
assert_ne!(bool_vals[0], bool_vals[1]);
} else {
@@ -1987,7 +1988,7 @@ mod tests {
.as_u64()
.unwrap();
assert!(expected_ids_and_num_occurrences.contains_key(&id));
if id_exists(id) {
if id_is_full_doc(id) {
let id2 = store_reader
.get(doc_id)
.unwrap()
@@ -2034,7 +2035,7 @@ mod tests {
let (existing_id, count) = (*id, *count);
let get_num_hits = |field| do_search(&existing_id.to_string(), field).len() as u64;
assert_eq!(get_num_hits(id_field), count);
if !id_exists(existing_id) {
if !id_is_full_doc(existing_id) {
continue;
}
assert_eq!(get_num_hits(text_field), count);
@@ -2084,7 +2085,7 @@ mod tests {
//
for (existing_id, count) in &expected_ids_and_num_occurrences {
let (existing_id, count) = (*existing_id, *count);
if !id_exists(existing_id) {
if !id_is_full_doc(existing_id) {
continue;
}
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
@@ -2101,34 +2102,84 @@ mod tests {
}
}
// assert data is like expected
// Range query
//
for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) {
let (existing_id, count) = (*existing_id, *count);
if !id_exists(existing_id) {
continue;
}
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string())
// Take half as sample
let mut sample: Vec<_> = expected_ids_and_num_occurrences.iter().collect();
sample.sort_by_key(|(k, _num_occurences)| *k);
// sample.truncate(sample.len() / 2);
if !sample.is_empty() {
let (left_sample, right_sample) = sample.split_at(sample.len() / 2);
let expected_count = |sample: &[(&u64, &u64)]| {
sample
.iter()
.filter(|(id, _)| id_is_full_doc(**id))
.map(|(_id, num_occurences)| **num_occurences)
.sum::<u64>()
};
let ip = ip_from_id(existing_id);
fn gen_query_inclusive<T1: ToString, T2: ToString>(
field: &str,
from: T1,
to: T2,
) -> String {
format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string())
}
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
// Range query on single value field
let query = gen_query_inclusive("ip", ip, ip);
assert_eq!(do_search_ip_field(&query), count);
// Query first half
if !left_sample.is_empty() {
let expected_count = expected_count(left_sample);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip, ip);
let start_range = *left_sample[0].0;
let end_range = *left_sample.last().unwrap().0;
let query = gen_query_inclusive("id_opt", start_range, end_range);
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
assert_eq!(do_search_ip_field(&query), count);
// Range query on ip field
let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", "*", ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ips", "*", ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
}
// Query second half
if !right_sample.is_empty() {
let expected_count = expected_count(right_sample);
let start_range = *right_sample[0].0;
let end_range = *right_sample.last().unwrap().0;
// Range query on id opt field
let query =
gen_query_inclusive("id_opt", start_range.to_string(), end_range.to_string());
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
// Range query on ip field
let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", ip1, "*");
assert_eq!(do_search_ip_field(&query), expected_count);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ips", ip1, "*");
assert_eq!(do_search_ip_field(&query), expected_count);
}
}
// ip range query on fast field
//
for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) {
let (existing_id, count) = (*existing_id, *count);
if !id_exists(existing_id) {
if !id_is_full_doc(existing_id) {
continue;
}
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
@@ -2156,7 +2207,7 @@ mod tests {
.first_or_default_col(9999);
for doc_id in segment_reader.doc_ids_alive() {
let id = ff_reader.get_val(doc_id);
if !id_exists(id) {
if !id_is_full_doc(id) {
continue;
}
let facet_ords: Vec<u64> = facet_reader.facet_ords(doc_id).collect();
@@ -2194,6 +2245,12 @@ mod tests {
Ok(index)
}
#[test]
fn test_fast_field_range() {
let ops: Vec<_> = (0..1000).map(|id| IndexingOp::AddDoc { id }).collect();
assert!(test_operation_strategy(&ops, false, true).is_ok());
}
#[test]
fn test_sort_index_on_opt_field_regression() {
assert!(test_operation_strategy(

View File

@@ -596,15 +596,10 @@ impl SegmentUpdater {
);
{
if let Some(after_merge_segment_entry) = after_merge_segment_entry.as_mut() {
// Deletes and commits could have happened as we were merging.
// We need to make sure we are up to date with deletes before accepting the
// segment.
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
if let Some(delete_operation) = delete_cursor.get() {
let committed_opstamp = segment_updater.load_meta().opstamp;
if delete_operation.opstamp < committed_opstamp {
// We are not up to date! Let's create a new tombstone file for our
// freshly created split.
let index = &segment_updater.index;
let segment = index.segment(after_merge_segment_entry.meta().clone());
if let Err(advance_deletes_err) = advance_deletes(

View File

@@ -4,7 +4,6 @@ use std::sync::Arc;
use common::BitSet;
use tantivy_fst::Automaton;
use super::phrase_prefix_query::prefix_end;
use crate::core::SegmentReader;
use crate::query::{BitSetDocSet, ConstScorer, Explanation, Scorer, Weight};
use crate::schema::{Field, IndexRecordOption};
@@ -15,10 +14,6 @@ use crate::{DocId, Score, TantivyError};
pub struct AutomatonWeight<A> {
field: Field,
automaton: Arc<A>,
// For JSON fields, the term dictionary includes terms from all paths.
// We apply additional filtering based on the given JSON path, when searching within the term
// dictionary. This prevents terms from unrelated paths from matching the search criteria.
json_path_bytes: Option<Box<[u8]>>,
}
impl<A> AutomatonWeight<A>
@@ -31,20 +26,6 @@ where
AutomatonWeight {
field,
automaton: automaton.into(),
json_path_bytes: None,
}
}
/// Create a new AutomationWeight for a json path
pub fn new_for_json_path<IntoArcA: Into<Arc<A>>>(
field: Field,
automaton: IntoArcA,
json_path_bytes: &[u8],
) -> AutomatonWeight<A> {
AutomatonWeight {
field,
automaton: automaton.into(),
json_path_bytes: Some(json_path_bytes.to_vec().into_boxed_slice()),
}
}
@@ -53,15 +34,7 @@ where
term_dict: &'a TermDictionary,
) -> io::Result<TermStreamer<'a, &'a A>> {
let automaton: &A = &self.automaton;
let mut term_stream_builder = term_dict.search(automaton);
if let Some(json_path_bytes) = &self.json_path_bytes {
term_stream_builder = term_stream_builder.ge(json_path_bytes);
if let Some(end) = prefix_end(json_path_bytes) {
term_stream_builder = term_stream_builder.lt(&end);
}
}
let term_stream_builder = term_dict.search(automaton);
term_stream_builder.into_stream()
}
}

View File

@@ -1,345 +0,0 @@
use core::fmt::Debug;
use columnar::{ColumnIndex, DynamicColumn};
use super::{ConstScorer, EmptyScorer};
use crate::core::SegmentReader;
use crate::docset::{DocSet, TERMINATED};
use crate::query::explanation::does_not_match;
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
use crate::{DocId, Score, TantivyError};
/// Query that matches all documents with a non-null value in the specified field.
///
/// All of the matched documents get the score 1.0.
#[derive(Clone, Debug)]
pub struct ExistsQuery {
field_name: String,
}
impl ExistsQuery {
/// Creates a new `ExistsQuery` from the given field.
///
/// This query matches all documents with at least one non-null value in the specified field.
/// This constructor never fails, but executing the search with this query will return an
/// error if the specified field doesn't exist or is not a fast field.
pub fn new_exists_query(field: String) -> ExistsQuery {
ExistsQuery { field_name: field }
}
}
impl Query for ExistsQuery {
fn weight(&self, enable_scoring: EnableScoring) -> crate::Result<Box<dyn Weight>> {
let schema = enable_scoring.schema();
let Some((field, _path)) = schema.find_field(&self.field_name) else {
return Err(TantivyError::FieldNotFound(self.field_name.clone()));
};
let field_type = schema.get_field_entry(field).field_type();
if !field_type.is_fast() {
return Err(TantivyError::SchemaError(format!(
"Field {} is not a fast field.",
self.field_name
)));
}
Ok(Box::new(ExistsWeight {
field_name: self.field_name.clone(),
}))
}
}
/// Weight associated with the `ExistsQuery` query.
pub struct ExistsWeight {
field_name: String,
}
impl Weight for ExistsWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let fast_field_reader = reader.fast_fields();
let dynamic_columns: crate::Result<Vec<DynamicColumn>> = fast_field_reader
.dynamic_column_handles(&self.field_name)?
.into_iter()
.map(|handle| handle.open().map_err(|io_error| io_error.into()))
.collect();
let mut non_empty_columns = Vec::new();
for column in dynamic_columns? {
if !matches!(column.column_index(), ColumnIndex::Empty { .. }) {
non_empty_columns.push(column)
}
}
// TODO: we can optimize more here since in most cases we will have only one index
if !non_empty_columns.is_empty() {
let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc());
Ok(Box::new(ConstScorer::new(docset, boost)))
} else {
Ok(Box::new(EmptyScorer))
}
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
}
Ok(Explanation::new("ExistsQuery", 1.0))
}
}
pub(crate) struct ExistsDocSet {
columns: Vec<DynamicColumn>,
doc: DocId,
max_doc: DocId,
}
impl ExistsDocSet {
pub(crate) fn new(columns: Vec<DynamicColumn>, max_doc: DocId) -> Self {
let mut set = Self {
columns,
doc: 0u32,
max_doc,
};
set.find_next();
set
}
fn find_next(&mut self) -> DocId {
while self.doc < self.max_doc {
if self
.columns
.iter()
.any(|col| col.column_index().has_value(self.doc))
{
return self.doc;
}
self.doc += 1;
}
self.doc = TERMINATED;
TERMINATED
}
}
impl DocSet for ExistsDocSet {
fn advance(&mut self) -> DocId {
self.seek(self.doc + 1)
}
fn size_hint(&self) -> u32 {
0
}
fn doc(&self) -> DocId {
self.doc
}
#[inline(always)]
fn seek(&mut self, target: DocId) -> DocId {
self.doc = target;
self.find_next()
}
}
#[cfg(test)]
mod tests {
use std::net::Ipv6Addr;
use std::ops::Bound;
use common::DateTime;
use time::OffsetDateTime;
use crate::collector::Count;
use crate::query::exist_query::ExistsQuery;
use crate::query::{BooleanQuery, RangeQuery};
use crate::schema::{Facet, FacetOptions, Schema, FAST, INDEXED, STRING, TEXT};
use crate::{doc, Index, Searcher};
#[test]
fn test_exists_query_simple() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let all_field = schema_builder.add_u64_field("all", INDEXED | FAST);
let even_field = schema_builder.add_u64_field("even", INDEXED | FAST);
let odd_field = schema_builder.add_text_field("odd", STRING | FAST);
let multi_field = schema_builder.add_text_field("multi", FAST);
let _never_field = schema_builder.add_u64_field("never", INDEXED | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
for i in 0u64..100u64 {
if i % 2 == 0 {
if i % 10 == 0 {
index_writer.add_document(doc!(all_field => i, even_field => i, multi_field => i.to_string(), multi_field => (i + 1).to_string()))?;
} else {
index_writer.add_document(doc!(all_field => i, even_field => i))?;
}
} else {
index_writer.add_document(doc!(all_field => i, odd_field => i.to_string()))?;
}
}
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(count_existing_fields(&searcher, "all")?, 100);
assert_eq!(count_existing_fields(&searcher, "odd")?, 50);
assert_eq!(count_existing_fields(&searcher, "even")?, 50);
assert_eq!(count_existing_fields(&searcher, "multi")?, 10);
assert_eq!(count_existing_fields(&searcher, "never")?, 0);
// exercise seek
let query = BooleanQuery::intersection(vec![
Box::new(RangeQuery::new_u64_bounds(
"all".to_string(),
Bound::Included(50),
Bound::Unbounded,
)),
Box::new(ExistsQuery::new_exists_query("even".to_string())),
]);
assert_eq!(searcher.search(&query, &Count)?, 25);
let query = BooleanQuery::intersection(vec![
Box::new(RangeQuery::new_u64_bounds(
"all".to_string(),
Bound::Included(0),
Bound::Excluded(50),
)),
Box::new(ExistsQuery::new_exists_query("odd".to_string())),
]);
assert_eq!(searcher.search(&query, &Count)?, 25);
Ok(())
}
#[test]
fn test_exists_query_json() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
for i in 0u64..100u64 {
if i % 2 == 0 {
index_writer.add_document(doc!(json => json!({"all": i, "even": true})))?;
} else {
index_writer
.add_document(doc!(json => json!({"all": i.to_string(), "odd": true})))?;
}
}
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(count_existing_fields(&searcher, "json.all")?, 100);
assert_eq!(count_existing_fields(&searcher, "json.even")?, 50);
assert_eq!(count_existing_fields(&searcher, "json.odd")?, 50);
// Handling of non-existing fields:
assert_eq!(count_existing_fields(&searcher, "json.absent")?, 0);
assert_eq!(
searcher
.search(
&ExistsQuery::new_exists_query("does_not_exists.absent".to_string()),
&Count
)
.unwrap_err()
.to_string(),
"The field does not exist: 'does_not_exists.absent'"
);
Ok(())
}
#[test]
fn test_exists_query_misc_supported_types() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let bool = schema_builder.add_bool_field("bool", FAST);
let bytes = schema_builder.add_bytes_field("bytes", FAST);
let date = schema_builder.add_date_field("date", FAST);
let f64 = schema_builder.add_f64_field("f64", FAST);
let ip_addr = schema_builder.add_ip_addr_field("ip_addr", FAST);
let facet = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let now = OffsetDateTime::now_utc().unix_timestamp();
for i in 0u8..100u8 {
if i % 2 == 0 {
let date_val = DateTime::from_utc(OffsetDateTime::from_unix_timestamp(
now + i as i64 * 100,
)?);
index_writer.add_document(
doc!(bool => i % 3 == 0, bytes => vec![i, i + 1, i + 2], date => date_val),
)?;
} else {
let ip_addr_v6 = Ipv6Addr::new(0, 0, 0, 0, 0, 0xffff, 0xc00a, i.into());
index_writer
.add_document(doc!(f64 => i as f64 * 0.5, ip_addr => ip_addr_v6, facet => Facet::from("/facet/foo"), facet => Facet::from("/facet/bar")))?;
}
}
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(count_existing_fields(&searcher, "bool")?, 50);
assert_eq!(count_existing_fields(&searcher, "bytes")?, 50);
assert_eq!(count_existing_fields(&searcher, "date")?, 50);
assert_eq!(count_existing_fields(&searcher, "f64")?, 50);
assert_eq!(count_existing_fields(&searcher, "ip_addr")?, 50);
assert_eq!(count_existing_fields(&searcher, "facet")?, 50);
Ok(())
}
#[test]
fn test_exists_query_unsupported_types() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let not_fast = schema_builder.add_text_field("not_fast", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
not_fast => "slow",
))?;
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(
searcher
.search(
&ExistsQuery::new_exists_query("not_fast".to_string()),
&Count
)
.unwrap_err()
.to_string(),
"Schema error: 'Field not_fast is not a fast field.'"
);
assert_eq!(
searcher
.search(
&ExistsQuery::new_exists_query("does_not_exists".to_string()),
&Count
)
.unwrap_err()
.to_string(),
"The field does not exist: 'does_not_exists'"
);
Ok(())
}
fn count_existing_fields(searcher: &Searcher, field: &str) -> crate::Result<usize> {
let query = ExistsQuery::new_exists_query(field.to_string());
searcher.search(&query, &Count)
}
}
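
The tests above drive ExistsQuery through the crate-internal writer_for_tests helper; a minimal sketch of the same query against the public API, assuming serde_json as a dependency and using illustrative field names:

use tantivy::collector::Count;
use tantivy::query::ExistsQuery;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{doc, Index};

fn count_docs_with_field() -> tantivy::Result<usize> {
    let mut schema_builder = Schema::builder();
    let json = schema_builder.add_json_field("json", TEXT | FAST);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer(15_000_000)?;
    writer.add_document(doc!(json => serde_json::json!({"all": 0, "even": true})))?;
    writer.add_document(doc!(json => serde_json::json!({"all": "1", "odd": true})))?;
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    // Only the second document has a value under "json.odd".
    searcher.search(&ExistsQuery::new_exists_query("json.odd".to_string()), &Count)
}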

View File

@@ -3,7 +3,7 @@ use once_cell::sync::OnceCell;
use tantivy_fst::Automaton;
use crate::query::{AutomatonWeight, EnableScoring, Query, Weight};
use crate::schema::{Term, Type};
use crate::schema::Term;
use crate::TantivyError::InvalidArgument;
pub(crate) struct DfaWrapper(pub DFA);
@@ -132,46 +132,18 @@ impl FuzzyTermQuery {
});
let term_value = self.term.value();
let term_text = if term_value.typ() == Type::Json {
if let Some(json_path_type) = term_value.json_path_type() {
if json_path_type != Type::Str {
return Err(InvalidArgument(format!(
"The fuzzy term query requires a string path type for a json term. Found \
{:?}",
json_path_type
)));
}
}
std::str::from_utf8(self.term.serialized_value_bytes()).map_err(|_| {
InvalidArgument(
"Failed to convert json term value bytes to utf8 string.".to_string(),
)
})?
} else {
term_value.as_str().ok_or_else(|| {
InvalidArgument("The fuzzy term query requires a string term.".to_string())
})?
};
let term_text = term_value.as_str().ok_or_else(|| {
InvalidArgument("The fuzzy term query requires a string term.".to_string())
})?;
let automaton = if self.prefix {
automaton_builder.build_prefix_dfa(term_text)
} else {
automaton_builder.build_dfa(term_text)
};
if let Some((json_path_bytes, _)) = term_value.as_json() {
Ok(AutomatonWeight::new_for_json_path(
self.term.field(),
DfaWrapper(automaton),
json_path_bytes,
))
} else {
Ok(AutomatonWeight::new(
self.term.field(),
DfaWrapper(automaton),
))
}
Ok(AutomatonWeight::new(
self.term.field(),
DfaWrapper(automaton),
))
}
}
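
With the JSON path handling gone, the fuzzy automaton is built from a plain string term only; a minimal sketch of that remaining path through the public API, with an illustrative field name and sample text:

use tantivy::collector::TopDocs;
use tantivy::query::FuzzyTermQuery;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, Term};

fn fuzzy_text_search() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let country = schema_builder.add_text_field("country", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer(15_000_000)?;
    writer.add_document(doc!(country => "japan"))?;
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    // "japon" is one substitution away from "japan", so a distance of 1 matches.
    let term = Term::from_field_text(country, "japon");
    let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
    let top_docs = searcher.search(&fuzzy_query, &TopDocs::with_limit(2))?;
    assert_eq!(top_docs.len(), 1);
    Ok(())
}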
@@ -185,89 +157,9 @@ impl Query for FuzzyTermQuery {
mod test {
use super::FuzzyTermQuery;
use crate::collector::{Count, TopDocs};
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::schema::{Schema, STORED, TEXT};
use crate::schema::{Schema, TEXT};
use crate::{assert_nearly_equals, Index, Term};
#[test]
pub fn test_fuzzy_json_path() -> crate::Result<()> {
// # Defining the schema
let mut schema_builder = Schema::builder();
let attributes = schema_builder.add_json_field("attributes", TEXT | STORED);
let schema = schema_builder.build();
// # Indexing documents
let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
let doc = schema.parse_document(
r#"{
"attributes": {
"a": "japan"
}
}"#,
)?;
index_writer.add_document(doc)?;
let doc = schema.parse_document(
r#"{
"attributes": {
"aa": "japan"
}
}"#,
)?;
index_writer.add_document(doc)?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
// # Fuzzy search
let query_parser = QueryParser::for_index(&index, vec![attributes]);
let get_json_path_term = |query: &str| -> crate::Result<Term> {
let query = query_parser.parse_query(query)?;
let mut terms = Vec::new();
query.query_terms(&mut |term, _| {
terms.push(term.clone());
});
Ok(terms[0].clone())
};
// shall not match the first document due to json path mismatch
{
let term = get_json_path_term("attributes.aa:japan")?;
let fuzzy_query = FuzzyTermQuery::new(term, 2, true);
let top_docs = searcher.search(&fuzzy_query, &TopDocs::with_limit(2))?;
assert_eq!(top_docs.len(), 1, "Expected only 1 document");
assert_eq!(top_docs[0].1.doc_id, 1, "Expected the second document");
}
// shall match the first document because Levenshtein distance is 1 (substitute 'o' with
// 'a')
{
let term = get_json_path_term("attributes.a:japon")?;
let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
let top_docs = searcher.search(&fuzzy_query, &TopDocs::with_limit(2))?;
assert_eq!(top_docs.len(), 1, "Expected only 1 document");
assert_eq!(top_docs[0].1.doc_id, 0, "Expected the first document");
}
// shall not match because non-prefix Levenshtein distance is more than 1 (add 'a' and 'n')
{
let term = get_json_path_term("attributes.a:jap")?;
let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
let top_docs = searcher.search(&fuzzy_query, &TopDocs::with_limit(2))?;
assert_eq!(top_docs.len(), 0, "Expected no document");
}
Ok(())
}
#[test]
pub fn test_fuzzy_term() -> crate::Result<()> {
let mut schema_builder = Schema::builder();

View File

@@ -8,7 +8,6 @@ mod const_score_query;
mod disjunction_max_query;
mod empty_query;
mod exclude;
mod exist_query;
mod explanation;
mod fuzzy_query;
mod intersection;
@@ -42,7 +41,6 @@ pub use self::const_score_query::{ConstScoreQuery, ConstScorer};
pub use self::disjunction_max_query::DisjunctionMaxQuery;
pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight};
pub use self::exclude::Exclude;
pub use self::exist_query::ExistsQuery;
pub use self::explanation::Explanation;
#[cfg(test)]
pub(crate) use self::fuzzy_query::DfaWrapper;

View File

@@ -6,7 +6,7 @@ pub use phrase_prefix_query::PhrasePrefixQuery;
pub use phrase_prefix_scorer::PhrasePrefixScorer;
pub use phrase_prefix_weight::PhrasePrefixWeight;
pub(crate) fn prefix_end(prefix_start: &[u8]) -> Option<Vec<u8>> {
fn prefix_end(prefix_start: &[u8]) -> Option<Vec<u8>> {
let mut res = prefix_start.to_owned();
while !res.is_empty() {
let end = res.len() - 1;
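
The hunk above cuts off mid-function; a sketch of what prefix_end computes, assuming it returns the smallest byte string greater than every string that starts with the prefix: trailing 0xff bytes are dropped, the last remaining byte is incremented, and None means the upper bound stays unbounded. This is a reconstruction, not the verbatim body:

// Illustrative reconstruction, not the verbatim function body.
fn prefix_end(prefix_start: &[u8]) -> Option<Vec<u8>> {
    let mut res = prefix_start.to_owned();
    while !res.is_empty() {
        let end = res.len() - 1;
        if res[end] == u8::MAX {
            // A trailing 0xff cannot be incremented, so drop it and carry.
            res.pop();
        } else {
            res[end] += 1;
            return Some(res);
        }
    }
    // Every byte was 0xff: no finite upper bound exists for this prefix.
    None
}

fn main() {
    assert_eq!(prefix_end(b"abc"), Some(b"abd".to_vec()));
    assert_eq!(prefix_end(b"ab\xff"), Some(b"ac".to_vec()));
    assert_eq!(prefix_end(b"\xff"), None);
}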

View File

@@ -847,12 +847,6 @@ impl QueryParser {
}));
(Some(logical_ast), errors)
}
UserInputLeaf::Exists { .. } => (
None,
vec![QueryParserError::UnsupportedQuery(
"Range query need to target a specific field.".to_string(),
)],
),
}
}
}

View File

@@ -31,8 +31,8 @@ impl VecCursor {
self.current_pos = 0;
&mut self.docs
}
fn last_value(&self) -> Option<u32> {
self.docs.iter().last().cloned()
fn last_doc(&self) -> Option<u32> {
self.docs.last().cloned()
}
fn is_empty(&self) -> bool {
self.current().is_none()
@@ -112,15 +112,15 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> RangeDocSet<T> {
finished_to_end = true;
}
let last_value = self.loaded_docs.last_value();
let last_doc = self.loaded_docs.last_doc();
let doc_buffer: &mut Vec<DocId> = self.loaded_docs.get_cleared_data();
self.column.get_docids_for_value_range(
self.value_range.clone(),
self.next_fetch_start..end,
doc_buffer,
);
if let Some(last_value) = last_value {
while self.loaded_docs.current() == Some(last_value) {
if let Some(last_doc) = last_doc {
while self.loaded_docs.current() == Some(last_doc) {
self.loaded_docs.next();
}
}
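
The skip loop above keeps a document from being returned twice when consecutive fetches both emit it at the block boundary, as can happen for multi-valued fast fields; a standalone sketch of that step, with an illustrative function name:

// Skip the leading doc ids of a freshly fetched block that were already
// returned as the last doc of the previous block.
fn skip_boundary_duplicates(last_doc: Option<u32>, block: &[u32]) -> usize {
    match last_doc {
        Some(last) => block.iter().take_while(|&&doc| doc == last).count(),
        None => 0,
    }
}

fn main() {
    // Doc 7 straddles the boundary and shows up again at the head of the
    // next block, so two entries are skipped.
    assert_eq!(skip_boundary_duplicates(Some(7), &[7, 7, 9]), 2);
    assert_eq!(skip_boundary_duplicates(None, &[7, 7, 9]), 0);
}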
@@ -136,7 +136,7 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
if let Some(docid) = self.loaded_docs.next() {
return docid;
}
if self.next_fetch_start >= self.column.values.num_vals() {
if self.next_fetch_start >= self.column.num_docs() {
return TERMINATED;
}
self.fetch_block();
@@ -177,3 +177,54 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
0 // heuristic possible by checking number of hits when fetching a block
}
}
#[cfg(test)]
mod tests {
use crate::collector::Count;
use crate::directory::RamDirectory;
use crate::query::RangeQuery;
use crate::{schema, Document, IndexBuilder};
#[test]
fn range_query_fast_optional_field_minimum() {
let mut schema_builder = schema::SchemaBuilder::new();
let id_field = schema_builder.add_text_field("id", schema::STRING);
let score_field = schema_builder.add_u64_field("score", schema::FAST | schema::INDEXED);
let dir = RamDirectory::default();
let index = IndexBuilder::new()
.schema(schema_builder.build())
.open_or_create(dir)
.unwrap();
{
let mut writer = index.writer(15_000_000).unwrap();
let count = 1000;
for i in 0..count {
let mut doc = Document::new();
doc.add_text(id_field, format!("doc{i}"));
let nb_scores = i % 2; // 0 or 1 scores
for _ in 0..nb_scores {
doc.add_u64(score_field, 80);
}
writer.add_document(doc).unwrap();
}
writer.commit().unwrap();
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let query = RangeQuery::new_u64_bounds(
"score".to_string(),
std::ops::Bound::Included(70),
std::ops::Bound::Unbounded,
);
let count = searcher.search(&query, &Count).unwrap();
assert_eq!(count, 500);
}
}

View File

@@ -397,29 +397,20 @@ where B: AsRef<[u8]>
Some(Ipv6Addr::from_u128(ip_u128))
}
/// Returns the json path type.
///
/// Returns `None` if the value is not JSON.
pub fn json_path_type(&self) -> Option<Type> {
let json_value_bytes = self.as_json_value_bytes()?;
Some(json_value_bytes.typ())
}
/// Returns the json path bytes (including the JSON_END_OF_PATH byte),
/// Returns the json path (without non-human friendly separators),
/// and the encoded ValueBytes after the json path.
///
/// Returns `None` if the value is not JSON.
pub(crate) fn as_json(&self) -> Option<(&[u8], ValueBytes<&[u8]>)> {
pub(crate) fn as_json(&self) -> Option<(&str, ValueBytes<&[u8]>)> {
if self.typ() != Type::Json {
return None;
}
let bytes = self.value_bytes();
let pos = bytes.iter().cloned().position(|b| b == JSON_END_OF_PATH)?;
// split at pos + 1, so that json_path_bytes includes the JSON_END_OF_PATH byte.
let (json_path_bytes, term) = bytes.split_at(pos + 1);
Some((json_path_bytes, ValueBytes::wrap(&term)))
let (json_path_bytes, term) = bytes.split_at(pos);
let json_path = str::from_utf8(json_path_bytes).ok()?;
Some((json_path, ValueBytes::wrap(&term[1..])))
}
/// Returns the encoded ValueBytes after the json path.
@@ -478,10 +469,7 @@ where B: AsRef<[u8]>
write_opt(f, self.as_bytes())?;
}
Type::Json => {
if let Some((path_bytes, sub_value_bytes)) = self.as_json() {
// Remove the JSON_END_OF_PATH byte & convert to utf8.
let path = str::from_utf8(&path_bytes[..path_bytes.len() - 1])
.map_err(|_| std::fmt::Error)?;
if let Some((path, sub_value_bytes)) = self.as_json() {
let path_pretty = path.replace(JSON_PATH_SEGMENT_SEP_STR, ".");
write!(f, "path={path_pretty}, ")?;
sub_value_bytes.debug_value_bytes(f)?;
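
The reworked as_json splits the raw term bytes at the end-of-path marker and hands back the path as a &str together with the remaining value bytes; a minimal sketch of that split in isolation, using 0u8 as a stand-in sentinel and purely illustrative bytes:

// Illustrative sentinel; the real JSON_END_OF_PATH constant lives in the
// schema term encoding.
const JSON_END_OF_PATH: u8 = 0;

fn split_json_term(bytes: &[u8]) -> Option<(&str, &[u8])> {
    // Everything before the sentinel is the UTF-8 path; the sentinel itself
    // is dropped from the value side.
    let pos = bytes.iter().position(|&b| b == JSON_END_OF_PATH)?;
    let (path_bytes, rest) = bytes.split_at(pos);
    let path = std::str::from_utf8(path_bytes).ok()?;
    Some((path, &rest[1..]))
}

fn main() {
    let raw = b"color\x00red";
    let (path, value) = split_json_term(raw).unwrap();
    assert_eq!(path, "color");
    assert_eq!(value, b"red");
}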