Compare commits


9 Commits

Author SHA1 Message Date
Paul Masurel
2a45af77e0 low hanging fruit in optimization 2024-06-11 15:53:22 +09:00
trinity-1686a
08b9fc0b31 fix de-escaping too much in query parser (#2427)
* fix de-escaping too much in query parser
2024-06-10 11:19:01 +02:00
PSeitz
714f363d43 add bench & test for columnar merging (#2428)
* add merge columnar proptest

* add columnar merge benchmark
2024-06-10 16:26:16 +08:00
PSeitz
93ff7365b0 reduce top hits aggregation memory consumption (#2426)
move the request structure out of the top hits aggregation collector and use the
passed structure instead

full
terms_many_with_top_hits    Memory: 58.2 MB (-43.64%)    Avg: 425.9680ms (-21.38%)    Median: 415.1097ms (-23.56%)    [395.5303ms .. 484.6325ms]
dense
terms_many_with_top_hits    Memory: 58.2 MB (-43.64%)    Avg: 440.0817ms (-19.68%)    Median: 432.2286ms (-21.10%)    [403.5632ms .. 497.7541ms]
sparse
terms_many_with_top_hits    Memory: 13.1 MB (-49.31%)    Avg: 33.3568ms (-32.19%)    Median: 33.0834ms (-31.86%)    [32.5126ms .. 35.7397ms]
multivalue
terms_many_with_top_hits    Memory: 58.2 MB (-43.64%)    Avg: 414.2340ms (-25.44%)    Median: 413.4144ms (-25.64%)    [403.9919ms .. 430.3170ms]
2024-06-06 22:32:58 +08:00
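The technique behind the numbers above, as a minimal standalone sketch (hypothetical types, not tantivy's actual API): per-segment collectors stop owning a clone of the request and instead reach it through an index into shared state, so memory no longer grows with one request clone per segment collector.

struct Request {
    size: usize,
    from: Option<usize>,
}

struct SharedState {
    requests: Vec<Request>,
}

struct SegmentCollector {
    // before: `req: Request` cloned into every segment collector
    accessor_idx: usize, // after: only an index into the shared state
    hits: Vec<u64>,
}

impl SegmentCollector {
    fn collect(&mut self, doc: u64, shared: &SharedState) {
        // Borrow the request through the shared state instead of owning a clone.
        let req = &shared.requests[self.accessor_idx];
        if self.hits.len() < req.size + req.from.unwrap_or(0) {
            self.hits.push(doc);
        }
    }
}

fn main() {
    let shared = SharedState {
        requests: vec![Request { size: 2, from: None }],
    };
    let mut collector = SegmentCollector {
        accessor_idx: 0,
        hits: Vec::new(),
    };
    for doc in 0..10 {
        collector.collect(doc, &shared);
    }
    assert_eq!(collector.hits, vec![0, 1]);
}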
Adam Reichold
8151925068 Panicking in spawned Rayon tasks will abort the process by default. (#2409) 2024-06-04 17:04:30 +09:00
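The diff for this commit is not shown below; as an assumption about the mechanism, rayon's public `ThreadPoolBuilder::panic_handler` hook is one way to make a panic in a spawned task abort the process instead of being silently swallowed:

use rayon::ThreadPoolBuilder;

fn main() -> Result<(), rayon::ThreadPoolBuildError> {
    // Without a handler, a panic inside `spawn` has no caller to propagate to
    // and can go unnoticed; aborting turns it into a loud, immediate failure.
    let pool = ThreadPoolBuilder::new()
        .num_threads(2)
        .panic_handler(|_err| std::process::abort())
        .build()?;
    pool.spawn(|| {
        // a panic!() here would now abort the whole process
    });
    Ok(())
}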
dependabot[bot]
b960e40bc8 Update sketches-ddsketch requirement from 0.2.1 to 0.3.0 (#2423)
Updates the requirements on [sketches-ddsketch](https://github.com/mheffner/rust-sketches-ddsketch) to permit the latest version.
- [Release notes](https://github.com/mheffner/rust-sketches-ddsketch/releases)
- [Commits](https://github.com/mheffner/rust-sketches-ddsketch/compare/v0.2.1...v0.3.0)

---
updated-dependencies:
- dependency-name: sketches-ddsketch
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-06-04 15:50:23 +08:00
giovannicuccu
1095c9b073 Issue 1787 extended stats (#2247)
* first version of extended stats along with its tests

* using IntermediateExtendStats instead of IntermediateStats with all tests passing

* Created struct for request and response

* first test with extended_stats

* kahan summation and tests with approximate equality

* version ready for merge

* removed approx dependency

* refactor for using ExtendedStats only when needed

* interim version

* refined version with code formatted

* refactored a struct

* cosmetic refactor

* fix after merge

* fix format

* added extended_stat bench

* merge and new benchmark for extended stats

* split stat segment collectors

* wrapped intermediate extended stat with a box to limit memory usage

* Revert "wrapped intermediate extended stat with a box to limit memory usage"

This reverts commit 5b4aa9f393.

* some code reformat, commented kahan summation

* refactor after review

* refactor after code review

* fix after incorrectly restoring kahan summation

* modifications for code review + bug fix in merge_fruit

* refactor assert_nearly_equals macro

* update after code review

---------

Co-authored-by: Giovanni Cuccu <gcuccu@imolainformatica.it>
2024-06-04 14:25:17 +08:00
PSeitz
c0686515a9 update one_shot (#2420) 2024-05-31 11:07:35 +08:00
trinity-1686a
455156f51c improve query parser (#2416)
* support escape sequences in more places

and fix a bug with single-quoted strings

* add query parser test for range query on default field
2024-05-30 17:29:27 +02:00
24 changed files with 1747 additions and 130 deletions

View File

@@ -15,8 +15,7 @@ rust-version = "1.63"
 exclude = ["benches/*.json", "benches/*.txt"]

 [dependencies]
-# Switch back to the non-forked oneshot crate once https://github.com/faern/oneshot/pull/35 is merged
-oneshot = { git = "https://github.com/fulmicoton/oneshot.git", rev = "b208f49" }
+oneshot = "0.1.7"
 base64 = "0.22.0"
 byteorder = "1.4.3"
 crc32fast = "1.3.2"
@@ -64,7 +63,7 @@ query-grammar = { version = "0.22.0", path = "./query-grammar", package = "tanti
 tantivy-bitpacker = { version = "0.6", path = "./bitpacker" }
 common = { version = "0.7", path = "./common/", package = "tantivy-common" }
 tokenizer-api = { version = "0.3", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
-sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
+sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
 futures-util = { version = "0.3.28", optional = true }
 fnv = "1.0.7"

View File

@@ -47,6 +47,7 @@ fn bench_agg(mut group: InputGroup<Index>) {
     register!(group, average_f64);
     register!(group, average_f64_u64);
     register!(group, stats_f64);
+    register!(group, extendedstats_f64);
     register!(group, percentiles_f64);
     register!(group, terms_few);
     register!(group, terms_many);
@@ -105,7 +106,12 @@ fn stats_f64(index: &Index) {
     });
     exec_term_with_agg(index, agg_req)
 }
+fn extendedstats_f64(index: &Index) {
+    let agg_req = json!({
+        "extendedstats_f64": { "extended_stats": { "field": "score_f64", } }
+    });
+    exec_term_with_agg(index, agg_req)
+}
 fn percentiles_f64(index: &Index) {
     let agg_req = json!({
         "mypercentiles": {

View File

@@ -23,6 +23,12 @@ downcast-rs = "1.2.0"
 proptest = "1"
 more-asserts = "0.3.1"
 rand = "0.8"
+binggan = "0.8.1"
+
+[[bench]]
+name = "bench_merge"
+harness = false

 [features]
 unstable = []

View File

@@ -0,0 +1,101 @@
#![feature(test)]
extern crate test;
use core::fmt;
use std::fmt::{Display, Formatter};
use binggan::{black_box, BenchRunner};
use tantivy_columnar::*;
enum Card {
Multi,
Sparse,
Dense,
}
impl Display for Card {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
match self {
Card::Multi => write!(f, "multi"),
Card::Sparse => write!(f, "sparse"),
Card::Dense => write!(f, "dense"),
}
}
}
const NUM_DOCS: u32 = 100_000;
fn generate_columnar(card: Card, num_docs: u32) -> ColumnarReader {
use tantivy_columnar::ColumnarWriter;
let mut columnar_writer = ColumnarWriter::default();
match card {
Card::Multi => {
columnar_writer.record_numerical(0, "price", 10u64);
columnar_writer.record_numerical(0, "price", 10u64);
}
_ => {}
}
for i in 0..num_docs {
match card {
Card::Multi | Card::Sparse => {
if i % 8 == 0 {
columnar_writer.record_numerical(i, "price", i as u64);
}
}
Card::Dense => {
if i % 6 == 0 {
columnar_writer.record_numerical(i, "price", i as u64);
}
}
}
}
let mut wrt: Vec<u8> = Vec::new();
columnar_writer.serialize(num_docs, None, &mut wrt).unwrap();
ColumnarReader::open(wrt).unwrap()
}
fn main() {
let mut inputs = Vec::new();
let mut add_combo = |card1: Card, card2: Card| {
inputs.push((
format!("merge_{card1}_and_{card2}"),
vec![
generate_columnar(card1, NUM_DOCS),
generate_columnar(card2, NUM_DOCS),
],
));
};
add_combo(Card::Multi, Card::Multi);
add_combo(Card::Dense, Card::Dense);
add_combo(Card::Sparse, Card::Sparse);
add_combo(Card::Sparse, Card::Dense);
add_combo(Card::Multi, Card::Dense);
add_combo(Card::Multi, Card::Sparse);
let runner: BenchRunner = BenchRunner::new();
let mut group = runner.new_group();
for (input_name, columnar_readers) in inputs.iter() {
group.register_with_input(
input_name,
columnar_readers,
move |columnar_readers: &Vec<ColumnarReader>| {
let mut out = vec![];
let columnar_readers = columnar_readers.iter().collect::<Vec<_>>();
let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
let _ = black_box(merge_columnar(
&columnar_readers,
&[],
merge_row_order.into(),
&mut out,
));
},
);
}
group.run();
}
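
With the `[[bench]]` entry added to the columnar Cargo.toml above (`harness = false`), this benchmark should be runnable with something like `cargo bench --bench bench_merge` from the columnar crate; the exact command is an assumption, not part of the diff.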

View File

@@ -196,6 +196,7 @@ impl Set<RowId> for OptionalIndex {
         } = row_addr_from_row_id(doc_id);
         let block_meta = self.block_metas[block_id as usize];
         let block = self.block(block_meta);
         let block_offset_row_id = match block {
             Block::Dense(dense_block) => dense_block.rank(in_block_row_id),
             Block::Sparse(sparse_block) => sparse_block.rank(in_block_row_id),

View File

@@ -19,14 +19,13 @@ use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
 use tempfile::TempDir;

 fn main() -> tantivy::Result<()> {
-    // Normally you would use `MMapDirectory` instead to persist data on disk.
-    // https://docs.rs/tantivy/latest/tantivy/directory/struct.MmapDirectory.html
-    // But for this example, we will use a temporary directory `TempDir`.
+    // Let's create a temporary directory for the
+    // sake of this example
     let index_path = TempDir::new()?;

     // # Defining the schema
     //
-    // The Tantivy index requires a schema.
+    // The Tantivy index requires a very strict schema.
     // The schema declares which fields are in the index,
     // and for each field, its type and "the way it should
     // be indexed".

View File

@@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use std::iter::once;

 use nom::branch::alt;
@@ -19,7 +20,7 @@ use crate::Occur;
 // Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to
 // special characters.
 const SPECIAL_CHARS: &[char] = &[
-    '+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '!', '\\', '*', ' ',
+    '+', '^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '!', '\\', '*', ' ',
 ];

 /// consume a field name followed by colon. Return the field name with escape sequence
@@ -41,36 +42,92 @@ fn field_name(inp: &str) -> IResult<&str, String> {
     )(inp)
 }

+const ESCAPE_IN_WORD: &[char] = &['^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '\\'];
+
+fn interpret_escape(source: &str) -> String {
+    let mut res = String::with_capacity(source.len());
+    let mut in_escape = false;
+    let require_escape = |c: char| c.is_whitespace() || ESCAPE_IN_WORD.contains(&c) || c == '-';
+    for c in source.chars() {
+        if in_escape {
+            if !require_escape(c) {
+                // we re-add the escape sequence
+                res.push('\\');
+            }
+            res.push(c);
+            in_escape = false;
+        } else if c == '\\' {
+            in_escape = true;
+        } else {
+            res.push(c);
+        }
+    }
+    res
+}
+
 /// Consume a word outside of any context.
 // TODO should support escape sequences
-fn word(inp: &str) -> IResult<&str, &str> {
+fn word(inp: &str) -> IResult<&str, Cow<str>> {
     map_res(
         recognize(tuple((
-            satisfy(|c| {
-                !c.is_whitespace()
-                    && !['-', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
-            }),
-            many0(satisfy(|c: char| {
-                !c.is_whitespace() && ![':', '^', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
-            })),
+            alt((
+                preceded(char('\\'), anychar),
+                satisfy(|c| !c.is_whitespace() && !ESCAPE_IN_WORD.contains(&c) && c != '-'),
+            )),
+            many0(alt((
+                preceded(char('\\'), anychar),
+                satisfy(|c: char| !c.is_whitespace() && !ESCAPE_IN_WORD.contains(&c)),
+            ))),
         ))),
         |s| match s {
             "OR" | "AND" | "NOT" | "IN" => Err(Error::new(inp, ErrorKind::Tag)),
-            _ => Ok(s),
+            s if s.contains('\\') => Ok(Cow::Owned(interpret_escape(s))),
+            s => Ok(Cow::Borrowed(s)),
         },
     )(inp)
 }

-fn word_infallible(delimiter: &str) -> impl Fn(&str) -> JResult<&str, Option<&str>> + '_ {
-    |inp| {
-        opt_i_err(
-            preceded(
-                multispace0,
-                recognize(many1(satisfy(|c| {
-                    !c.is_whitespace() && !delimiter.contains(c)
-                }))),
+fn word_infallible(
+    delimiter: &str,
+    emit_error: bool,
+) -> impl Fn(&str) -> JResult<&str, Option<Cow<str>>> + '_ {
+    // emit error is set when receiving an unescaped `:` should emit an error
+    move |inp| {
+        map(
+            opt_i_err(
+                preceded(
+                    multispace0,
+                    recognize(many1(alt((
+                        preceded(char::<&str, _>('\\'), anychar),
+                        satisfy(|c| !c.is_whitespace() && !delimiter.contains(c)),
+                    )))),
+                ),
+                "expected word",
             ),
-            "expected word",
+            |(opt_s, mut errors)| match opt_s {
+                Some(s) => {
+                    if emit_error
+                        && (s
+                            .as_bytes()
+                            .windows(2)
+                            .any(|window| window[0] != b'\\' && window[1] == b':')
+                            || s.starts_with(':'))
+                    {
+                        errors.push(LenientErrorInternal {
+                            pos: inp.len(),
+                            message: "parsed possible invalid field as term".to_string(),
+                        });
+                    }
+                    if s.contains('\\') {
+                        (Some(Cow::Owned(interpret_escape(s))), errors)
+                    } else {
+                        (Some(Cow::Borrowed(s)), errors)
+                    }
+                }
+                None => (None, errors),
+            },
         )(inp)
     }
 }
@@ -159,7 +216,7 @@ fn simple_term_infallible(
         (value((), char('\'')), simple_quotes),
     ),
     // numbers are parsed with words in this case, as we allow string starting with a -
-    map(word_infallible(delimiter), |(text, errors)| {
+    map(word_infallible(delimiter, true), |(text, errors)| {
         (text.map(|text| (Delimiter::None, text.to_string())), errors)
     }),
 )(inp)
@@ -322,15 +379,6 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
         |((field_name, _, leaf), mut errors)| {
             (
                 leaf.map(|leaf| {
-                    if matches!(&leaf, UserInputLeaf::Literal(literal)
-                        if literal.phrase.contains(':') && literal.delimiter == Delimiter::None)
-                        && field_name.is_none()
-                    {
-                        errors.push(LenientErrorInternal {
-                            pos: inp.len(),
-                            message: "parsed possible invalid field as term".to_string(),
-                        });
-                    }
                     if matches!(&leaf, UserInputLeaf::Literal(literal)
                         if literal.phrase == "NOT" && literal.delimiter == Delimiter::None)
                         && field_name.is_none()
@@ -449,20 +497,20 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
     tuple_infallible((
         opt_i(anychar),
         space0_infallible,
-        word_infallible("]}"),
+        word_infallible("]}", false),
         space1_infallible,
         opt_i_err(
             terminated(tag("TO"), alt((value((), multispace1), value((), eof)))),
             "missing keyword TO",
         ),
-        word_infallible("]}"),
+        word_infallible("]}", false),
         opt_i_err(one_of("]}"), "missing range delimiter"),
     )),
     |(
         (lower_bound_kind, _multispace0, lower, _multispace1, to, upper, upper_bound_kind),
         errs,
     )| {
-        let lower_bound = match (lower_bound_kind, lower) {
+        let lower_bound = match (lower_bound_kind, lower.as_deref()) {
             (_, Some("*")) => UserInputBound::Unbounded,
             (_, None) => UserInputBound::Unbounded,
             // if it is some, TO was actually the bound (i.e. [TO TO something])
@@ -471,7 +519,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
             (Some('{'), Some(bound)) => UserInputBound::Exclusive(bound.to_string()),
             _ => unreachable!("precondition failed, range did not start with [ or {{"),
         };
-        let upper_bound = match (upper_bound_kind, upper) {
+        let upper_bound = match (upper_bound_kind, upper.as_deref()) {
             (_, Some("*")) => UserInputBound::Unbounded,
             (_, None) => UserInputBound::Unbounded,
             (Some(']'), Some(bound)) => UserInputBound::Inclusive(bound.to_string()),
@@ -488,7 +536,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
         (
             (
                 value((), tag(">=")),
-                map(word_infallible(""), |(bound, err)| {
+                map(word_infallible("", false), |(bound, err)| {
                     (
                         (
                             bound
@@ -502,7 +550,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
         ),
         (
             value((), tag("<=")),
-            map(word_infallible(""), |(bound, err)| {
+            map(word_infallible("", false), |(bound, err)| {
                 (
                     (
                         UserInputBound::Unbounded,
@@ -516,7 +564,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
         ),
         (
             value((), tag(">")),
-            map(word_infallible(""), |(bound, err)| {
+            map(word_infallible("", false), |(bound, err)| {
                 (
                     (
                         bound
@@ -530,7 +578,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
         ),
         (
             value((), tag("<")),
-            map(word_infallible(""), |(bound, err)| {
+            map(word_infallible("", false), |(bound, err)| {
                 (
                     (
                         UserInputBound::Unbounded,
@@ -1157,6 +1205,12 @@ mod test {
         test_parse_query_to_ast_helper("weight: <= 70", "\"weight\":{\"*\" TO \"70\"]");
         test_parse_query_to_ast_helper("weight: <= 70.5", "\"weight\":{\"*\" TO \"70.5\"]");
+        test_parse_query_to_ast_helper(">a", "{\"a\" TO \"*\"}");
+        test_parse_query_to_ast_helper(">=a", "[\"a\" TO \"*\"}");
+        test_parse_query_to_ast_helper("<a", "{\"*\" TO \"a\"}");
+        test_parse_query_to_ast_helper("<=a", "{\"*\" TO \"a\"]");
+        test_parse_query_to_ast_helper("<=bsd", "{\"*\" TO \"bsd\"]");
     }

     #[test]
@@ -1590,5 +1644,21 @@ mod test {
             r#"myfield:'hello\"happy\'tax'"#,
             r#""myfield":'hello"happy'tax'"#,
         );
+        // we don't process escape sequence for chars which don't require it
+        test_parse_query_to_ast_helper(r#"abc\*"#, r#"abc\*"#);
+    }
+
+    #[test]
+    fn test_queries_with_colons() {
+        test_parse_query_to_ast_helper(r#""abc:def""#, r#""abc:def""#);
+        test_parse_query_to_ast_helper(r#"'abc:def'"#, r#"'abc:def'"#);
+        test_parse_query_to_ast_helper(r#"abc\:def"#, r#"abc:def"#);
+        test_parse_query_to_ast_helper(r#""abc\:def""#, r#""abc:def""#);
+        test_parse_query_to_ast_helper(r#"'abc\:def'"#, r#"'abc:def'"#);
+    }
+
+    #[test]
+    fn test_invalid_field() {
+        test_is_parse_err(r#"!bc:def"#, "!bc:def");
     }
 }
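
Extracted into a standalone program (reusing the `interpret_escape` body from the hunk above verbatim), the new de-escaping rule behaves exactly like the two test cases: the backslash is only consumed for characters that actually require escaping.

const ESCAPE_IN_WORD: &[char] = &['^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '\\'];

// Copy of the `interpret_escape` shown in the diff above.
fn interpret_escape(source: &str) -> String {
    let mut res = String::with_capacity(source.len());
    let mut in_escape = false;
    let require_escape = |c: char| c.is_whitespace() || ESCAPE_IN_WORD.contains(&c) || c == '-';
    for c in source.chars() {
        if in_escape {
            if !require_escape(c) {
                // the escape was unnecessary: keep the backslash
                res.push('\\');
            }
            res.push(c);
            in_escape = false;
        } else if c == '\\' {
            in_escape = true;
        } else {
            res.push(c);
        }
    }
    res
}

fn main() {
    // `:` requires escaping, so the backslash is consumed.
    assert_eq!(interpret_escape(r"abc\:def"), "abc:def");
    // `*` does not require escaping, so the backslash is preserved,
    // matching the `abc\*` test case in the diff.
    assert_eq!(interpret_escape(r"abc\*"), r"abc\*");
}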

View File

@@ -34,7 +34,7 @@ use super::bucket::{
     DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
 };
 use super::metric::{
-    AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
+    AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
     PercentilesAggregationReq, StatsAggregation, SumAggregation, TopHitsAggregation,
 };
@@ -146,6 +146,11 @@ pub enum AggregationVariants {
     /// extracted values.
     #[serde(rename = "stats")]
     Stats(StatsAggregation),
+    /// Computes a collection of extended statistics (`min`, `max`, `sum`, `count`, `avg`,
+    /// `sum_of_squares`, `variance`, `variance_sampling`, `std_deviation`,
+    /// `std_deviation_sampling`) over the extracted values.
+    #[serde(rename = "extended_stats")]
+    ExtendedStats(ExtendedStatsAggregation),
     /// Computes the sum of the extracted values.
     #[serde(rename = "sum")]
     Sum(SumAggregation),
@@ -170,6 +175,7 @@ impl AggregationVariants {
             AggregationVariants::Max(max) => vec![max.field_name()],
             AggregationVariants::Min(min) => vec![min.field_name()],
             AggregationVariants::Stats(stats) => vec![stats.field_name()],
+            AggregationVariants::ExtendedStats(extended_stats) => vec![extended_stats.field_name()],
             AggregationVariants::Sum(sum) => vec![sum.field_name()],
             AggregationVariants::Percentiles(per) => vec![per.field_name()],
             AggregationVariants::TopHits(top_hits) => top_hits.field_names(),
@@ -197,6 +203,12 @@ impl AggregationVariants {
             _ => None,
         }
     }
+
+    pub(crate) fn as_top_hits(&self) -> Option<&TopHitsAggregation> {
+        match &self {
+            AggregationVariants::TopHits(top_hits) => Some(top_hits),
+            _ => None,
+        }
+    }
+
     pub(crate) fn as_percentile(&self) -> Option<&PercentilesAggregationReq> {
         match &self {
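
For reference, a request against the new `extended_stats` variant follows the same shape as the bench JSON earlier in this compare; a sketch using `serde_json` ("score" is a placeholder field name, and the "sigma" key is an assumption inferred from the `ExtendedStatsAggregation { missing, sigma, .. }` destructuring in the segment collector diff further down):

use serde_json::json;

fn main() {
    let agg_req = json!({
        "my_extended_stats": {
            "extended_stats": {
                "field": "score",
                "sigma": 3.0
            }
        }
    });
    println!("{agg_req}");
}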

View File

@@ -11,8 +11,8 @@ use super::bucket::{
     DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
 };
 use super::metric::{
-    AverageAggregation, CountAggregation, MaxAggregation, MinAggregation, StatsAggregation,
-    SumAggregation,
+    AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
+    StatsAggregation, SumAggregation,
 };
 use super::segment_agg_result::AggregationLimits;
 use super::VecWithNames;
@@ -276,6 +276,10 @@ impl AggregationWithAccessor {
                 field: ref field_name,
                 ..
             })
+            | ExtendedStats(ExtendedStatsAggregation {
+                field: ref field_name,
+                ..
+            })
             | Sum(SumAggregation {
                 field: ref field_name,
                 ..

View File

@@ -8,7 +8,9 @@ use rustc_hash::FxHashMap;
 use serde::{Deserialize, Serialize};

 use super::bucket::GetDocCount;
-use super::metric::{PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult};
+use super::metric::{
+    ExtendedStats, PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult,
+};
 use super::{AggregationError, Key};
 use crate::TantivyError;
@@ -88,6 +90,8 @@ pub enum MetricResult {
     Min(SingleMetricResult),
     /// Stats metric result.
     Stats(Stats),
+    /// ExtendedStats metric result.
+    ExtendedStats(Box<ExtendedStats>),
     /// Sum metric result.
     Sum(SingleMetricResult),
     /// Percentiles metric result.
@@ -104,6 +108,7 @@ impl MetricResult {
         MetricResult::Max(max) => Ok(max.value),
         MetricResult::Min(min) => Ok(min.value),
         MetricResult::Stats(stats) => stats.get_value(agg_property),
+        MetricResult::ExtendedStats(extended_stats) => extended_stats.get_value(agg_property),
         MetricResult::Sum(sum) => Ok(sum.value),
         MetricResult::Percentiles(_) => Err(TantivyError::AggregationError(
             AggregationError::InvalidRequest("percentiles can't be used to order".to_string()),

View File

@@ -19,8 +19,8 @@ use super::bucket::{
     GetDocCount, Order, OrderTarget, RangeAggregation, TermsAggregation,
 };
 use super::metric::{
-    IntermediateAverage, IntermediateCount, IntermediateMax, IntermediateMin, IntermediateStats,
-    IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
+    IntermediateAverage, IntermediateCount, IntermediateExtendedStats, IntermediateMax,
+    IntermediateMin, IntermediateStats, IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
 };
 use super::segment_agg_result::AggregationLimits;
 use super::{format_date, AggregationError, Key, SerializedKey};
@@ -215,6 +215,9 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
         Stats(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Stats(
             IntermediateStats::default(),
         )),
+        ExtendedStats(_) => IntermediateAggregationResult::Metric(
+            IntermediateMetricResult::ExtendedStats(IntermediateExtendedStats::default()),
+        ),
         Sum(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Sum(
             IntermediateSum::default(),
         )),
@@ -222,7 +225,7 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
             IntermediateMetricResult::Percentiles(PercentilesCollector::default()),
         ),
         TopHits(ref req) => IntermediateAggregationResult::Metric(
-            IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req.clone())),
+            IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req)),
         ),
     }
 }
@@ -282,6 +285,8 @@ pub enum IntermediateMetricResult {
     Min(IntermediateMin),
     /// Intermediate stats result.
     Stats(IntermediateStats),
+    /// Intermediate stats result.
+    ExtendedStats(IntermediateExtendedStats),
     /// Intermediate sum result.
     Sum(IntermediateSum),
     /// Intermediate top_hits result
@@ -306,6 +311,9 @@ impl IntermediateMetricResult {
             IntermediateMetricResult::Stats(intermediate_stats) => {
                 MetricResult::Stats(intermediate_stats.finalize())
             }
+            IntermediateMetricResult::ExtendedStats(intermediate_stats) => {
+                MetricResult::ExtendedStats(intermediate_stats.finalize())
+            }
             IntermediateMetricResult::Sum(intermediate_sum) => {
                 MetricResult::Sum(intermediate_sum.finalize().into())
             }
@@ -346,6 +354,12 @@ impl IntermediateMetricResult {
             ) => {
                 stats_left.merge_fruits(stats_right);
             }
+            (
+                IntermediateMetricResult::ExtendedStats(extended_stats_left),
+                IntermediateMetricResult::ExtendedStats(extended_stats_right),
+            ) => {
+                extended_stats_left.merge_fruits(extended_stats_right);
+            }
             (IntermediateMetricResult::Sum(sum_left), IntermediateMetricResult::Sum(sum_right)) => {
                 sum_left.merge_fruits(sum_right);
             }

File diff suppressed because it is too large

View File

@@ -18,6 +18,7 @@
 mod average;
 mod count;
+mod extended_stats;
 mod max;
 mod min;
 mod percentiles;
@@ -29,6 +30,7 @@ use std::collections::HashMap;
 pub use average::*;
 pub use count::*;
+pub use extended_stats::*;
 pub use max::*;
 pub use min::*;
 pub use percentiles::*;

View File

@@ -1,3 +1,5 @@
+use std::fmt::Debug;
+
 use serde::{Deserialize, Serialize};

 use super::*;
@@ -85,13 +87,15 @@ impl Stats {
 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 pub struct IntermediateStats {
     /// The number of extracted values.
-    count: u64,
+    pub(crate) count: u64,
     /// The sum of the extracted values.
-    sum: f64,
+    pub(crate) sum: f64,
+    /// delta for sum needed for [Kahan algorithm for summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
+    pub(crate) delta: f64,
     /// The min value.
-    min: f64,
+    pub(crate) min: f64,
     /// The max value.
-    max: f64,
+    pub(crate) max: f64,
 }

 impl Default for IntermediateStats {
@@ -99,6 +103,7 @@ impl Default for IntermediateStats {
         Self {
             count: 0,
             sum: 0.0,
+            delta: 0.0,
             min: f64::MAX,
             max: f64::MIN,
         }
@@ -109,7 +114,13 @@ impl IntermediateStats {
     /// Merges the other stats intermediate result into self.
     pub fn merge_fruits(&mut self, other: IntermediateStats) {
         self.count += other.count;
-        self.sum += other.sum;
+        // kahan algorithm for sum
+        let y = other.sum - (self.delta + other.delta);
+        let t = self.sum + y;
+        self.delta = (t - self.sum) - y;
+        self.sum = t;
         self.min = self.min.min(other.min);
         self.max = self.max.max(other.max);
     }
@@ -141,9 +152,15 @@ impl IntermediateStats {
     }

     #[inline]
-    fn collect(&mut self, value: f64) {
+    pub(in crate::aggregation::metric) fn collect(&mut self, value: f64) {
         self.count += 1;
-        self.sum += value;
+        // kahan algorithm for sum
+        let y = value - self.delta;
+        let t = self.sum + y;
+        self.delta = (t - self.sum) - y;
+        self.sum = t;
         self.min = self.min.min(value);
         self.max = self.max.max(value);
     }
@@ -288,7 +305,6 @@ impl SegmentAggregationCollector for SegmentStatsCollector {
 #[cfg(test)]
 mod tests {
     use serde_json::Value;
-
     use crate::aggregation::agg_req::{Aggregation, Aggregations};
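
A standalone demonstration of why the hunks above replace `self.sum += value` with compensated (Kahan) summation; this is illustrative code, not tantivy's:

// Standalone demonstration of the compensated (Kahan) summation used above.
fn kahan_sum(values: &[f64]) -> f64 {
    let mut sum = 0.0;
    let mut delta = 0.0; // running compensation for lost low-order bits
    for &value in values {
        let y = value - delta;
        let t = sum + y;
        delta = (t - sum) - y; // (t - sum) recovers the part of y that survived
        sum = t;
    }
    sum
}

fn main() {
    // 1.0 followed by a million tiny values: naive summation loses all of them.
    let mut values = vec![1.0f64];
    values.extend(std::iter::repeat(1e-16).take(1_000_000));

    let naive: f64 = values.iter().sum();
    let kahan = kahan_sum(&values);

    println!("naive: {naive:.17}"); // stays at 1.0: each 1e-16 is rounded away
    println!("kahan: {kahan:.17}"); // ~1.0000000001, the correct result
}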

View File

@@ -1,7 +1,7 @@
 use std::collections::HashMap;
 use std::net::Ipv6Addr;

-use columnar::{ColumnarReader, DynamicColumn};
+use columnar::{Column, ColumnType, ColumnarReader, DynamicColumn};
 use common::json_path_writer::JSON_PATH_SEGMENT_SEP_STR;
 use common::DateTime;
 use regex::Regex;
@@ -443,10 +443,10 @@ impl std::cmp::PartialEq for TopHitsTopNComputer {

 impl TopHitsTopNComputer {
     /// Create a new TopHitsCollector
-    pub fn new(req: TopHitsAggregation) -> Self {
+    pub fn new(req: &TopHitsAggregation) -> Self {
         Self {
             top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
-            req,
+            req: req.clone(),
         }
     }
@@ -491,7 +491,6 @@ impl TopHitsTopNComputer {
 pub(crate) struct TopHitsSegmentCollector {
     segment_ordinal: SegmentOrdinal,
     accessor_idx: usize,
-    req: TopHitsAggregation,
     top_n: TopNComputer<Vec<DocValueAndOrder>, DocAddress, false>,
 }
@@ -502,7 +501,6 @@ impl TopHitsSegmentCollector {
         segment_ordinal: SegmentOrdinal,
     ) -> Self {
         Self {
-            req: req.clone(),
             top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
             segment_ordinal,
             accessor_idx,
@@ -511,14 +509,13 @@ impl TopHitsSegmentCollector {
     fn into_top_hits_collector(
         self,
         value_accessors: &HashMap<String, Vec<DynamicColumn>>,
+        req: &TopHitsAggregation,
     ) -> TopHitsTopNComputer {
-        let mut top_hits_computer = TopHitsTopNComputer::new(self.req.clone());
+        let mut top_hits_computer = TopHitsTopNComputer::new(req);
         let top_results = self.top_n.into_vec();

         for res in top_results {
-            let doc_value_fields = self
-                .req
-                .get_document_field_data(value_accessors, res.doc.doc_id);
+            let doc_value_fields = req.get_document_field_data(value_accessors, res.doc.doc_id);
             top_hits_computer.collect(
                 DocSortValuesAndFields {
                     sorts: res.feature,
@@ -530,34 +527,15 @@
         top_hits_computer
     }
-}
-
-impl SegmentAggregationCollector for TopHitsSegmentCollector {
-    fn add_intermediate_aggregation_result(
-        self: Box<Self>,
-        agg_with_accessor: &crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
-        results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
-    ) -> crate::Result<()> {
-        let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
-        let value_accessors = &agg_with_accessor.aggs.values[self.accessor_idx].value_accessors;
-
-        let intermediate_result =
-            IntermediateMetricResult::TopHits(self.into_top_hits_collector(value_accessors));
-        results.push(
-            name,
-            IntermediateAggregationResult::Metric(intermediate_result),
-        )
-    }
-
-    fn collect(
+
+    /// TODO add a specialized variant for a single sort field
+    fn collect_with(
         &mut self,
         doc_id: crate::DocId,
-        agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
+        req: &TopHitsAggregation,
+        accessors: &[(Column<u64>, ColumnType)],
     ) -> crate::Result<()> {
-        let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
-        let sorts: Vec<DocValueAndOrder> = self
-            .req
+        let sorts: Vec<DocValueAndOrder> = req
             .sort
             .iter()
             .enumerate()
@@ -582,15 +560,62 @@
         );
         Ok(())
     }
+}

+impl SegmentAggregationCollector for TopHitsSegmentCollector {
+    fn add_intermediate_aggregation_result(
+        self: Box<Self>,
+        agg_with_accessor: &crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
+        results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
+    ) -> crate::Result<()> {
+        let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
+        let value_accessors = &agg_with_accessor.aggs.values[self.accessor_idx].value_accessors;
+        let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
+            .agg
+            .agg
+            .as_top_hits()
+            .expect("aggregation request must be of type top hits");
+
+        let intermediate_result = IntermediateMetricResult::TopHits(
+            self.into_top_hits_collector(value_accessors, tophits_req),
+        );
+        results.push(
+            name,
+            IntermediateAggregationResult::Metric(intermediate_result),
+        )
+    }
+
+    /// TODO: Consider a caching layer to reduce the call overhead
+    fn collect(
+        &mut self,
+        doc_id: crate::DocId,
+        agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
+    ) -> crate::Result<()> {
+        let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
+            .agg
+            .agg
+            .as_top_hits()
+            .expect("aggregation request must be of type top hits");
+        let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
+        self.collect_with(doc_id, tophits_req, accessors)?;
+        Ok(())
+    }
+
     fn collect_block(
         &mut self,
         docs: &[crate::DocId],
         agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
     ) -> crate::Result<()> {
+        let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
+            .agg
+            .agg
+            .as_top_hits()
+            .expect("aggregation request must be of type top hits");
+        let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
         // TODO: Consider getting fields with the column block accessor.
         for doc in docs {
-            self.collect(*doc, agg_with_accessor)?;
+            self.collect_with(*doc, tophits_req, accessors)?;
         }
         Ok(())
     }

View File

@@ -11,12 +11,12 @@ use super::agg_req_with_accessor::{AggregationWithAccessor, AggregationsWithAcce
 use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
 use super::intermediate_agg_result::IntermediateAggregationResults;
 use super::metric::{
-    AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
+    AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
     SegmentPercentilesCollector, SegmentStatsCollector, SegmentStatsType, StatsAggregation,
     SumAggregation,
 };
 use crate::aggregation::bucket::TermMissingAgg;
-use crate::aggregation::metric::TopHitsSegmentCollector;
+use crate::aggregation::metric::{SegmentExtendedStatsCollector, TopHitsSegmentCollector};

 pub(crate) trait SegmentAggregationCollector: CollectorClone + Debug {
     fn add_intermediate_aggregation_result(
@@ -148,6 +148,9 @@ pub(crate) fn build_single_agg_segment_collector(
             accessor_idx,
             *missing,
         ))),
+        ExtendedStats(ExtendedStatsAggregation { missing, sigma, .. }) => Ok(Box::new(
+            SegmentExtendedStatsCollector::from_req(req.field_type, *sigma, accessor_idx, *missing),
+        )),
         Sum(SumAggregation { missing, .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
             req.field_type,
             SegmentStatsType::Sum,

View File

@@ -871,7 +871,10 @@ mod tests {
     use crate::schema::{Field, Schema, FAST, STORED, TEXT};
     use crate::time::format_description::well_known::Rfc3339;
     use crate::time::OffsetDateTime;
-    use crate::{DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score, SegmentReader};
+    use crate::{
+        assert_nearly_equals, DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score,
+        SegmentReader,
+    };

     fn make_index() -> crate::Result<Index> {
         let mut schema_builder = Schema::builder();

View File

@@ -195,7 +195,7 @@ mod tests {
         let (tx, rx) = crossbeam_channel::bounded::<()>(0);
         let rx = Arc::new(rx);
         let executor = Executor::multi_thread(3, "search-test").unwrap();
-        for i in 0..1000 {
+        for _ in 0..1000 {
            let counter_clone: Arc<AtomicU64> = counter.clone();
            let other_counter_clone: Arc<AtomicU64> = other_counter.clone();
@@ -203,18 +203,18 @@ mod tests {
             let rx_clone2 = rx.clone();

             let fut = executor.spawn_blocking(move || {
                 counter_clone.fetch_add(1, Ordering::SeqCst);
-                let () = rx_clone.recv().unwrap();
+                let _ = rx_clone.recv();
             });
             futures.push(fut);

             let other_fut = executor.spawn_blocking(move || {
                 other_counter_clone.fetch_add(1, Ordering::SeqCst);
-                let () = rx_clone2.recv().unwrap();
+                let _ = rx_clone2.recv();
             });
             other_futures.push(other_fut);
         }

         // We execute 100 futures.
-        for i in 0..100 {
+        for _ in 0..100 {
             tx.send(()).unwrap();
         }
@@ -226,7 +226,7 @@ mod tests {
         drop(other_futures);

         // We execute 100 futures.
-        for i in 0..100 {
+        for _ in 0..100 {
             tx.send(()).unwrap();
         }

View File

@@ -787,6 +787,8 @@ impl IndexMerger {
 mod tests {

     use columnar::Column;
+    use proptest::prop_oneof;
+    use proptest::strategy::Strategy;
     use schema::FAST;

     use crate::collector::tests::{
@@ -794,6 +796,7 @@ mod tests {
     };
     use crate::collector::{Count, FacetCollector};
     use crate::index::{Index, SegmentId};
+    use crate::indexer::NoMergePolicy;
     use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
     use crate::schema::{
         Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term,
@@ -1531,6 +1534,112 @@ mod tests {
         Ok(())
     }
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
enum IndexingOp {
ZeroVal,
OneVal { val: u64 },
TwoVal { val: u64 },
Commit,
}
fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![
(0u64..1u64).prop_map(|_| IndexingOp::ZeroVal),
(0u64..1u64).prop_map(|val| IndexingOp::OneVal { val }),
(0u64..1u64).prop_map(|val| IndexingOp::TwoVal { val }),
(0u64..1u64).prop_map(|_| IndexingOp::Commit),
]
}
use proptest::prelude::*;
proptest! {
#[test]
fn test_merge_columnar_int_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..20)) {
assert!(test_merge_int_fields(&ops[..]).is_ok());
}
}
fn test_merge_int_fields(ops: &[IndexingOp]) -> crate::Result<()> {
if ops.iter().all(|op| *op == IndexingOp::Commit) {
return Ok(());
}
let expected_doc_and_vals: Vec<(u32, Vec<u64>)> = ops
.iter()
.filter(|op| *op != &IndexingOp::Commit)
.map(|op| match op {
IndexingOp::ZeroVal => vec![],
IndexingOp::OneVal { val } => vec![*val],
IndexingOp::TwoVal { val } => vec![*val, *val],
IndexingOp::Commit => unreachable!(),
})
.enumerate()
.map(|(id, val)| (id as u32, val))
.collect();
let mut schema_builder = schema::Schema::builder();
let int_options = NumericOptions::default().set_fast().set_indexed();
let int_field = schema_builder.add_u64_field("intvals", int_options);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
let mut doc = TantivyDocument::default();
for &val in int_vals {
doc.add_u64(int_field, val);
}
index_writer.add_document(doc).unwrap();
};
for op in ops {
match op {
IndexingOp::ZeroVal => index_doc(&mut index_writer, &[]),
IndexingOp::OneVal { val } => index_doc(&mut index_writer, &[*val]),
IndexingOp::TwoVal { val } => index_doc(&mut index_writer, &[*val, *val]),
IndexingOp::Commit => {
index_writer.commit().expect("commit failed");
}
}
}
index_writer.commit().expect("commit failed");
}
{
let mut segment_ids = index.searchable_segment_ids()?;
segment_ids.sort();
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
let reader = index.reader()?;
reader.reload()?;
let mut vals: Vec<u64> = Vec::new();
let mut test_vals = move |col: &Column<u64>, doc: DocId, expected: &[u64]| {
vals.clear();
vals.extend(col.values_for_doc(doc));
assert_eq!(&vals[..], expected);
};
let mut test_col = move |col: &Column<u64>, column_expected: &[(u32, Vec<u64>)]| {
for (doc_id, vals) in column_expected.iter() {
test_vals(col, *doc_id, vals);
}
};
{
let searcher = reader.searcher();
let segment = searcher.segment_reader(0u32);
let col = segment
.fast_fields()
.column_opt::<u64>("intvals")
.unwrap()
.unwrap();
test_col(&col, &expected_doc_and_vals);
}
Ok(())
}
    #[test]
    fn test_merge_multivalued_int_fields_simple() -> crate::Result<()> {
        let mut schema_builder = schema::Schema::builder();

View File

@@ -397,16 +397,20 @@ pub mod tests {
 #[macro_export]
 macro_rules! assert_nearly_equals {
     ($left:expr, $right:expr) => {{
-        match (&$left, &$right) {
-            (left_val, right_val) => {
+        assert_nearly_equals!($left, $right, 0.0005);
+    }};
+    ($left:expr, $right:expr, $epsilon:expr) => {{
+        match (&$left, &$right, &$epsilon) {
+            (left_val, right_val, epsilon_val) => {
                 let diff = (left_val - right_val).abs();
-                let add = left_val.abs() + right_val.abs();
-                if diff > 0.0005 * add {
+                if diff > *epsilon_val {
                     panic!(
-                        r#"assertion failed: `(left ~= right)`
+                        r#"assertion failed: `abs(left-right)>epsilon`
   left: `{:?}`,
-  right: `{:?}`"#,
-                        &*left_val, &*right_val
+  right: `{:?}`,
+  epsilon: `{:?}`"#,
+                        &*left_val, &*right_val, &*epsilon_val
                     )
                 }
             }
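
A self-contained replica of the reworked macro (panic message simplified) together with example call sites for both arities:

macro_rules! assert_nearly_equals {
    ($left:expr, $right:expr) => {{
        assert_nearly_equals!($left, $right, 0.0005);
    }};
    ($left:expr, $right:expr, $epsilon:expr) => {{
        match (&$left, &$right, &$epsilon) {
            (left_val, right_val, epsilon_val) => {
                let diff = (left_val - right_val).abs();
                if diff > *epsilon_val {
                    panic!(
                        "assertion failed: abs(left-right)>epsilon, left: {left_val:?}, \
                         right: {right_val:?}, epsilon: {epsilon_val:?}"
                    );
                }
            }
        }
    }};
}

fn main() {
    // two-argument form falls through to the default epsilon of 0.0005
    assert_nearly_equals!(0.1f64 + 0.2f64, 0.3f64);
    // explicit absolute epsilon for a tighter check
    assert_nearly_equals!(2.0f64 / 3.0f64, 0.666_666_7f64, 1e-6);
}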

View File

@@ -2,7 +2,7 @@ use crate::docset::{DocSet, TERMINATED};
 use crate::fieldnorm::FieldNormReader;
 use crate::postings::Postings;
 use crate::query::bm25::Bm25Weight;
-use crate::query::phrase_query::{intersection_count, PhraseScorer};
+use crate::query::phrase_query::{intersection_count, intersection_exists, PhraseScorer};
 use crate::query::Scorer;
 use crate::{DocId, Score};
@@ -92,14 +92,17 @@ impl<TPostings: Postings> Scorer for PhraseKind<TPostings> {
     }
 }

-pub struct PhrasePrefixScorer<TPostings: Postings> {
+pub struct PhrasePrefixScorer<TPostings: Postings, const SCORING_ENABLED: bool> {
     phrase_scorer: PhraseKind<TPostings>,
     suffixes: Vec<TPostings>,
     suffix_offset: u32,
     phrase_count: u32,
+    suffix_position_buffer: Vec<u32>,
 }

-impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
+impl<TPostings: Postings, const SCORING_ENABLED: bool>
+    PhrasePrefixScorer<TPostings, SCORING_ENABLED>
+{
     // If similarity_weight is None, then scoring is disabled.
     pub fn new(
         mut term_postings: Vec<(usize, TPostings)>,
@@ -107,7 +110,7 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
         fieldnorm_reader: FieldNormReader,
         suffixes: Vec<TPostings>,
         suffix_pos: usize,
-    ) -> PhrasePrefixScorer<TPostings> {
+    ) -> PhrasePrefixScorer<TPostings, SCORING_ENABLED> {
         // correct indices so we can merge with our suffix term the PhraseScorer doesn't know about
         let max_offset = term_postings
             .iter()
@@ -140,6 +143,7 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
             suffixes,
             suffix_offset: (max_offset - suffix_pos) as u32,
             phrase_count: 0,
+            suffix_position_buffer: Vec::with_capacity(100),
         };
         if phrase_prefix_scorer.doc() != TERMINATED && !phrase_prefix_scorer.matches_prefix() {
             phrase_prefix_scorer.advance();
@@ -153,7 +157,6 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {

     fn matches_prefix(&mut self) -> bool {
         let mut count = 0;
-        let mut positions = Vec::new();
         let current_doc = self.doc();
         let pos_matching = self.phrase_scorer.get_intersection();
         for suffix in &mut self.suffixes {
@@ -162,16 +165,27 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
             }
             let doc = suffix.seek(current_doc);
             if doc == current_doc {
-                suffix.positions_with_offset(self.suffix_offset, &mut positions);
-                count += intersection_count(pos_matching, &positions);
+                suffix.positions_with_offset(self.suffix_offset, &mut self.suffix_position_buffer);
+                if SCORING_ENABLED {
+                    count += intersection_count(pos_matching, &self.suffix_position_buffer);
+                } else {
+                    if intersection_exists(pos_matching, &self.suffix_position_buffer) {
+                        return true;
+                    }
+                }
             }
         }
+        if !SCORING_ENABLED {
+            return false;
+        }
         self.phrase_count = count as u32;
         count != 0
     }
 }

-impl<TPostings: Postings> DocSet for PhrasePrefixScorer<TPostings> {
+impl<TPostings: Postings, const SCORING_ENABLED: bool> DocSet
+    for PhrasePrefixScorer<TPostings, SCORING_ENABLED>
+{
     fn advance(&mut self) -> DocId {
         loop {
             let doc = self.phrase_scorer.advance();
@@ -198,9 +212,15 @@ impl<TPostings: Postings> DocSet for PhrasePrefixScorer<TPostings> {
     }
 }

-impl<TPostings: Postings> Scorer for PhrasePrefixScorer<TPostings> {
+impl<TPostings: Postings, const SCORING_ENABLED: bool> Scorer
+    for PhrasePrefixScorer<TPostings, SCORING_ENABLED>
+{
     fn score(&mut self) -> Score {
+        if SCORING_ENABLED {
+            self.phrase_scorer.score()
+        } else {
+            1.0f32
+        }
         // TODO modify score??
-        self.phrase_scorer.score()
     }
 }
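
A minimal sketch, not tantivy code, of the const-generic trick used above: one type compiles into a scoring and a non-scoring variant, and the `if SCORING_ENABLED` branches are resolved at compile time, so the existence-only variant carries no counting overhead.

struct Matcher<const SCORING: bool> {
    count: u32,
}

impl<const SCORING: bool> Matcher<SCORING> {
    fn check(&mut self, positions: &[u32], needle: u32) -> bool {
        if SCORING {
            // scoring variant: count every match
            self.count += positions.iter().filter(|&&p| p == needle).count() as u32;
            self.count != 0
        } else {
            // non-scoring variant: stop at the first match
            positions.contains(&needle)
        }
    }
}

fn main() {
    let mut scoring = Matcher::<true> { count: 0 };
    let mut existence = Matcher::<false> { count: 0 };
    let positions = [1, 4, 4, 9];
    assert!(scoring.check(&positions, 4));
    assert_eq!(scoring.count, 2);
    assert!(existence.check(&positions, 4));
    assert_eq!(existence.count, 0);
}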

View File

@@ -42,11 +42,11 @@ impl PhrasePrefixWeight {
         Ok(FieldNormReader::constant(reader.max_doc(), 1))
     }

-    pub(crate) fn phrase_scorer(
+    pub(crate) fn phrase_prefix_scorer<const SCORING_ENABLED: bool>(
         &self,
         reader: &SegmentReader,
         boost: Score,
-    ) -> crate::Result<Option<PhrasePrefixScorer<SegmentPostings>>> {
+    ) -> crate::Result<Option<PhrasePrefixScorer<SegmentPostings, SCORING_ENABLED>>> {
         let similarity_weight_opt = self
             .similarity_weight_opt
             .as_ref()
@@ -128,15 +128,20 @@ impl PhrasePrefixWeight {
 impl Weight for PhrasePrefixWeight {
     fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
-        if let Some(scorer) = self.phrase_scorer(reader, boost)? {
-            Ok(Box::new(scorer))
+        if self.similarity_weight_opt.is_some() {
+            if let Some(scorer) = self.phrase_prefix_scorer::<true>(reader, boost)? {
+                return Ok(Box::new(scorer));
+            }
         } else {
-            Ok(Box::new(EmptyScorer))
+            if let Some(scorer) = self.phrase_prefix_scorer::<false>(reader, boost)? {
+                return Ok(Box::new(scorer));
+            }
         }
+        Ok(Box::new(EmptyScorer))
     }

     fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
-        let scorer_opt = self.phrase_scorer(reader, 1.0)?;
+        let scorer_opt = self.phrase_prefix_scorer::<true>(reader, 1.0)?;
         if scorer_opt.is_none() {
             return Err(does_not_match(doc));
         }
@@ -200,7 +205,7 @@ mod tests {
             .unwrap()
             .unwrap();
         let mut phrase_scorer = phrase_weight
-            .phrase_scorer(searcher.segment_reader(0u32), 1.0)?
+            .phrase_prefix_scorer::<true>(searcher.segment_reader(0u32), 1.0)?
             .unwrap();
         assert_eq!(phrase_scorer.doc(), 1);
         assert_eq!(phrase_scorer.phrase_count(), 2);
@@ -211,6 +216,38 @@ mod tests {
         Ok(())
     }

+    #[test]
+    pub fn test_phrase_no_count() -> crate::Result<()> {
+        let index = create_index(&[
+            "aa bb dd cc",
+            "aa aa bb c dd aa bb cc aa bb dc",
+            " aa bb cd",
+        ])?;
+        let schema = index.schema();
+        let text_field = schema.get_field("text").unwrap();
+        let searcher = index.reader()?.searcher();
+        let phrase_query = PhrasePrefixQuery::new(vec![
+            Term::from_field_text(text_field, "aa"),
+            Term::from_field_text(text_field, "bb"),
+            Term::from_field_text(text_field, "c"),
+        ]);
+        let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
+        let phrase_weight = phrase_query
+            .phrase_prefix_query_weight(enable_scoring)
+            .unwrap()
+            .unwrap();
+        let mut phrase_scorer = phrase_weight
+            .phrase_prefix_scorer::<false>(searcher.segment_reader(0u32), 1.0)?
+            .unwrap();
+        assert_eq!(phrase_scorer.doc(), 1);
+        assert_eq!(phrase_scorer.phrase_count(), 0);
+        assert_eq!(phrase_scorer.advance(), 2);
+        assert_eq!(phrase_scorer.doc(), 2);
+        assert_eq!(phrase_scorer.phrase_count(), 0);
+        assert_eq!(phrase_scorer.advance(), TERMINATED);
+        Ok(())
+    }
+
     #[test]
     pub fn test_phrase_count_mid() -> crate::Result<()> {
         let index = create_index(&["aa dd cc", "aa aa bb c dd aa bb cc aa dc", " aa bb cd"])?;
@@ -227,7 +264,7 @@ mod tests {
             .unwrap()
             .unwrap();
         let mut phrase_scorer = phrase_weight
-            .phrase_scorer(searcher.segment_reader(0u32), 1.0)?
+            .phrase_prefix_scorer::<true>(searcher.segment_reader(0u32), 1.0)?
             .unwrap();
         assert_eq!(phrase_scorer.doc(), 1);
         assert_eq!(phrase_scorer.phrase_count(), 2);

View File

@@ -3,8 +3,8 @@ mod phrase_scorer;
 mod phrase_weight;

 pub use self::phrase_query::PhraseQuery;
-pub(crate) use self::phrase_scorer::intersection_count;
 pub use self::phrase_scorer::PhraseScorer;
+pub(crate) use self::phrase_scorer::{intersection_count, intersection_exists};
 pub use self::phrase_weight::PhraseWeight;

 #[cfg(test)]

View File

@@ -58,7 +58,7 @@ pub struct PhraseScorer<TPostings: Postings> {
 }

 /// Returns true if and only if the two sorted arrays contain a common element
-fn intersection_exists(left: &[u32], right: &[u32]) -> bool {
+pub(crate) fn intersection_exists(left: &[u32], right: &[u32]) -> bool {
     let mut left_index = 0;
     let mut right_index = 0;
     while left_index < left.len() && right_index < right.len() {