Issue/1070 (#1071)

Add a boolean flag in the Query::query_terms informing on whether
position information is required.

Closes #1070
This commit is contained in:
Paul Masurel
2021-06-03 22:33:20 +09:00
committed by GitHub
parent 2aad0ced77
commit 6e4b61154f
10 changed files with 83 additions and 81 deletions

View File

@@ -13,12 +13,14 @@ Tantivy 0.15.0
- Moved bitpacking to bitpacker subcrate and add BlockedBitpacker, which bitpacks blocks of 128 elements (@PSeitz) #1030
- Added support for more-like-this query in tantivy (@evanxg852000) #1011
- Added support for sorting an index, e.g presorting documents in an index by a timestamp field. This can heavily improve performance for certain scenarios, by utilizing the sorted data (Top-n optimizations)(@PSeitz). #1026
- Add iterator over documents in doc store (@PSeitz). #1044
- Fix log merge policy (@PSeitz). #1043
- Add detection to avoid small doc store blocks on merge (@PSeitz). #1054
- Make doc store compression dynamic (@PSeitz). #1060
- Switch to json for footer version handling (@PSeitz). #1060
- Add iterator over documents in doc store (@PSeitz). #1044
- Fix log merge policy (@PSeitz). #1043
- Add detection to avoid small doc store blocks on merge (@PSeitz). #1054
- Make doc store compression dynamic (@PSeitz). #1060
- Switch to json for footer version handling (@PSeitz). #1060
- Updated TermMerger implementation to rely on the union feature of the FST (@scampi) #469
- Add boolean marking whether position is required in the query_terms API call (@fulmicoton). #1070
Tantivy 0.14.0
=========================

View File

@@ -412,9 +412,9 @@ impl IndexMerger {
Ok(everything_is_in_order)
}
pub(crate) fn get_sort_field_accessor<'b>(
pub(crate) fn get_sort_field_accessor(
reader: &SegmentReader,
sort_by_field: &'b IndexSortByField,
sort_by_field: &IndexSortByField,
) -> crate::Result<impl FastFieldReader<u64>> {
let field_id = expect_field_id_for_sort_field(&reader.schema(), &sort_by_field)?; // for now expect fastfield, but not strictly required
let value_accessor = reader.fast_fields().u64_lenient(field_id)?;
@@ -456,22 +456,22 @@ impl IndexMerger {
// Loading the field accessor on demand causes a 15x regression
// create iterators over segment/sort_accessor/doc_id tuple
let doc_id_reader_pair = reader_and_field_accessors
.iter()
.map(|reader_and_field_accessor| {
reader_and_field_accessor
.0
.reader
.doc_ids_alive()
.map(move |doc_id| {
(
doc_id,
reader_and_field_accessor.0,
&reader_and_field_accessor.1,
)
})
})
.collect::<Vec<_>>();
let doc_id_reader_pair =
reader_and_field_accessors
.iter()
.map(|reader_and_field_accessor| {
reader_and_field_accessor
.0
.reader
.doc_ids_alive()
.map(move |doc_id| {
(
doc_id,
reader_and_field_accessor.0,
&reader_and_field_accessor.1,
)
})
});
// create iterator tuple of (old doc_id, reader) in order of the new doc_ids
let sorted_doc_ids: Vec<(DocId, SegmentReaderWithOrdinal)> = doc_id_reader_pair
@@ -1017,12 +1017,12 @@ impl IndexMerger {
if reader.num_deleted_docs() > 0
// If there is not enough data in the store, we avoid stacking in order to
// avoid creating many small blocks in the doc store. Once we have 5 full blocks,
// we start stacking. In the worst case 2/7 of the blocks would be very small.
// [segment 1 - {1 doc}][segment 2 - {fullblock * 5}{1doc}]
// we start stacking. In the worst case 2/7 of the blocks would be very small.
// [segment 1 - {1 doc}][segment 2 - {fullblock * 5}{1doc}]
// => 5 * full blocks, 2 * 1 document blocks
//
// In a more realistic scenario the segments are of the same size, so 1/6 of
// the doc stores would be on average half full, given total randomness (which
// the doc stores would be on average half full, given total randomness (which
// is not the case here, but not sure how it behaves exactly).
//
// https://github.com/tantivy-search/tantivy/issues/1053

View File

@@ -6,7 +6,7 @@ use crate::query::Weight;
use crate::schema::IndexRecordOption;
use crate::schema::Term;
use crate::Searcher;
use std::collections::BTreeSet;
use std::collections::BTreeMap;
/// The boolean query returns a set of documents
/// that matches the Boolean combination of constituent subqueries.
@@ -159,9 +159,9 @@ impl Query for BooleanQuery {
Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled)))
}
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
for (_occur, subquery) in &self.subqueries {
subquery.query_terms(term_set);
subquery.query_terms(terms);
}
}
}

View File

@@ -2,7 +2,7 @@ use crate::fastfield::DeleteBitSet;
use crate::query::explanation::does_not_match;
use crate::query::{Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Score, Searcher, SegmentReader, Term};
use std::collections::BTreeSet;
use std::collections::BTreeMap;
use std::fmt;
/// `BoostQuery` is a wrapper over a query used to boost its score.
@@ -48,8 +48,8 @@ impl Query for BoostQuery {
Ok(boosted_weight)
}
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
self.query.query_terms(term_set)
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
self.query.query_terms(terms)
}
}

View File

@@ -66,7 +66,7 @@ mod tests {
use crate::schema::{Schema, TEXT};
use crate::Index;
use crate::Term;
use std::collections::BTreeSet;
use std::collections::BTreeMap;
#[test]
fn test_query_terms() {
@@ -78,49 +78,49 @@ mod tests {
let term_a = Term::from_field_text(text_field, "a");
let term_b = Term::from_field_text(text_field, "b");
{
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("a")
.unwrap()
.query_terms(&mut terms_set);
let terms: Vec<&Term> = terms_set.iter().collect();
assert_eq!(vec![&term_a], terms);
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &false)], terms);
}
{
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("a b")
.unwrap()
.query_terms(&mut terms_set);
let terms: Vec<&Term> = terms_set.iter().collect();
assert_eq!(vec![&term_a, &term_b], terms);
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &false), (&term_b, &false)], terms);
}
{
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("\"a b\"")
.unwrap()
.query_terms(&mut terms_set);
let terms: Vec<&Term> = terms_set.iter().collect();
assert_eq!(vec![&term_a, &term_b], terms);
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &true), (&term_b, &true)], terms);
}
{
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("a a a a a")
.unwrap()
.query_terms(&mut terms_set);
let terms: Vec<&Term> = terms_set.iter().collect();
assert_eq!(vec![&term_a], terms);
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &false)], terms);
}
{
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("a -b")
.unwrap()
.query_terms(&mut terms_set);
let terms: Vec<&Term> = terms_set.iter().collect();
assert_eq!(vec![&term_a, &term_b], terms);
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &false), (&term_b, &false)], terms);
}
}
}

View File

@@ -233,10 +233,9 @@ impl MoreLikeThis {
}
FieldType::U64(_) => {
for field_value in field_values {
let val = field_value
.value()
.u64_value()
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?;
let val = field_value.value().u64_value().ok_or_else(|| {
TantivyError::InvalidArgument("invalid value".to_string())
})?;
if !self.is_noise_word(val.to_string()) {
let term = Term::from_field_u64(field, val);
*term_frequencies.entry(term).or_insert(0) += 1;
@@ -249,7 +248,7 @@ impl MoreLikeThis {
let val = field_value
.value()
.date_value()
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?
.ok_or_else(|| TantivyError::InvalidArgument("invalid value".to_string()))?
.timestamp();
if !self.is_noise_word(val.to_string()) {
let term = Term::from_field_i64(field, val);
@@ -259,10 +258,9 @@ impl MoreLikeThis {
}
FieldType::I64(_) => {
for field_value in field_values {
let val = field_value
.value()
.i64_value()
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?;
let val = field_value.value().i64_value().ok_or_else(|| {
TantivyError::InvalidArgument("invalid value".to_string())
})?;
if !self.is_noise_word(val.to_string()) {
let term = Term::from_field_i64(field, val);
*term_frequencies.entry(term).or_insert(0) += 1;
@@ -271,10 +269,9 @@ impl MoreLikeThis {
}
FieldType::F64(_) => {
for field_value in field_values {
let val = field_value
.value()
.f64_value()
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?;
let val = field_value.value().f64_value().ok_or_else(|| {
TantivyError::InvalidArgument("invalid value".to_string())
})?;
if !self.is_noise_word(val.to_string()) {
let term = Term::from_field_f64(field, val);
*term_frequencies.entry(term).or_insert(0) += 1;
@@ -306,7 +303,7 @@ impl MoreLikeThis {
{
return true;
}
return self.stop_words.contains(&word);
self.stop_words.contains(&word)
}
/// Couputes the score for each term while ignoring not useful terms

View File

@@ -1,3 +1,5 @@
use std::collections::BTreeMap;
use super::PhraseWeight;
use crate::core::searcher::Searcher;
use crate::query::bm25::Bm25Weight;
@@ -5,7 +7,6 @@ use crate::query::Query;
use crate::query::Weight;
use crate::schema::IndexRecordOption;
use crate::schema::{Field, Term};
use std::collections::BTreeSet;
/// `PhraseQuery` matches a specific sequence of words.
///
@@ -113,9 +114,9 @@ impl Query for PhraseQuery {
Ok(Box::new(phrase_weight))
}
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
for (_, query_term) in &self.phrase_terms {
term_set.insert(query_term.clone());
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
for (_, term) in &self.phrase_terms {
terms.insert(term.clone(), true);
}
}
}

View File

@@ -4,7 +4,7 @@ use crate::query::Explanation;
use crate::DocAddress;
use crate::Term;
use downcast_rs::impl_downcast;
use std::collections::BTreeSet;
use std::collections::BTreeMap;
use std::fmt;
/// The `Query` trait defines a set of documents and a scoring method
@@ -68,7 +68,10 @@ pub trait Query: QueryClone + Send + Sync + downcast_rs::Downcast + fmt::Debug {
/// Extract all of the terms associated to the query and insert them in the
/// term set given in arguments.
fn query_terms(&self, _term_set: &mut BTreeSet<Term>) {}
///
/// Each term is associated with a boolean indicating whether
/// Positions are required or not.
fn query_terms(&self, _term_set: &mut BTreeMap<Term, bool>) {}
}
/// Implements `box_clone`.
@@ -95,8 +98,8 @@ impl Query for Box<dyn Query> {
self.as_ref().count(searcher)
}
fn query_terms(&self, term_set: &mut BTreeSet<Term<Vec<u8>>>) {
self.as_ref().query_terms(term_set);
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
self.as_ref().query_terms(terms);
}
}

View File

@@ -5,7 +5,7 @@ use crate::query::{Explanation, Query};
use crate::schema::IndexRecordOption;
use crate::Searcher;
use crate::Term;
use std::collections::BTreeSet;
use std::collections::BTreeMap;
use std::fmt;
/// A Term query matches all of the documents
@@ -127,7 +127,7 @@ impl Query for TermQuery {
self.specialized_weight(searcher, scoring_enabled)?,
))
}
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
term_set.insert(self.term.clone());
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
terms.insert(self.term.clone(), false);
}
}

View File

@@ -7,7 +7,6 @@ use crate::{Document, Score};
use htmlescape::encode_minimal;
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::ops::Range;
const DEFAULT_MAX_NUM_CHARS: usize = 150;
@@ -239,10 +238,10 @@ impl SnippetGenerator {
query: &dyn Query,
field: Field,
) -> crate::Result<SnippetGenerator> {
let mut terms = BTreeSet::new();
let mut terms = BTreeMap::new();
query.query_terms(&mut terms);
let mut terms_text: BTreeMap<String, Score> = Default::default();
for term in terms {
for (term, _) in terms {
if term.field() != field {
continue;
}