mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-05 01:50:42 +00:00
Issue/1070 (#1071)
Add a boolean flag to Query::query_terms indicating whether position information is required. Closes #1070
This commit is contained in:
12
CHANGELOG.md
12
CHANGELOG.md
@@ -13,12 +13,14 @@ Tantivy 0.15.0
|
||||
- Moved bitpacking to bitpacker subcrate and add BlockedBitpacker, which bitpacks blocks of 128 elements (@PSeitz) #1030
|
||||
- Added support for more-like-this query in tantivy (@evanxg852000) #1011
|
||||
- Added support for sorting an index, e.g. presorting documents in an index by a timestamp field. This can heavily improve performance for certain scenarios, by utilizing the sorted data (Top-n optimizations) (@PSeitz). #1026
|
||||
- Add iterator over documents in doc store (@PSeitz). #1044
|
||||
- Fix log merge policy (@PSeitz). #1043
|
||||
- Add detection to avoid small doc store blocks on merge (@PSeitz). #1054
|
||||
- Make doc store compression dynamic (@PSeitz). #1060
|
||||
- Switch to json for footer version handling (@PSeitz). #1060
|
||||
- Add iterator over documents in doc store (@PSeitz). #1044
|
||||
- Fix log merge policy (@PSeitz). #1043
|
||||
- Add detection to avoid small doc store blocks on merge (@PSeitz). #1054
|
||||
- Make doc store compression dynamic (@PSeitz). #1060
|
||||
- Switch to json for footer version handling (@PSeitz). #1060
|
||||
- Updated TermMerger implementation to rely on the union feature of the FST (@scampi) #469
|
||||
- Add boolean marking whether position is required in the query_terms API call (@fulmicoton). #1070
|
||||
|
||||
|
||||
Tantivy 0.14.0
|
||||
=========================
|
||||
|
||||
@@ -412,9 +412,9 @@ impl IndexMerger {
|
||||
Ok(everything_is_in_order)
|
||||
}
|
||||
|
||||
pub(crate) fn get_sort_field_accessor<'b>(
|
||||
pub(crate) fn get_sort_field_accessor(
|
||||
reader: &SegmentReader,
|
||||
sort_by_field: &'b IndexSortByField,
|
||||
sort_by_field: &IndexSortByField,
|
||||
) -> crate::Result<impl FastFieldReader<u64>> {
|
||||
let field_id = expect_field_id_for_sort_field(&reader.schema(), &sort_by_field)?; // for now expect fastfield, but not strictly required
|
||||
let value_accessor = reader.fast_fields().u64_lenient(field_id)?;
|
||||
@@ -456,22 +456,22 @@ impl IndexMerger {
|
||||
// Loading the field accessor on demand causes a 15x regression
|
||||
|
||||
// create iterators over segment/sort_accessor/doc_id tuple
|
||||
let doc_id_reader_pair = reader_and_field_accessors
|
||||
.iter()
|
||||
.map(|reader_and_field_accessor| {
|
||||
reader_and_field_accessor
|
||||
.0
|
||||
.reader
|
||||
.doc_ids_alive()
|
||||
.map(move |doc_id| {
|
||||
(
|
||||
doc_id,
|
||||
reader_and_field_accessor.0,
|
||||
&reader_and_field_accessor.1,
|
||||
)
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let doc_id_reader_pair =
|
||||
reader_and_field_accessors
|
||||
.iter()
|
||||
.map(|reader_and_field_accessor| {
|
||||
reader_and_field_accessor
|
||||
.0
|
||||
.reader
|
||||
.doc_ids_alive()
|
||||
.map(move |doc_id| {
|
||||
(
|
||||
doc_id,
|
||||
reader_and_field_accessor.0,
|
||||
&reader_and_field_accessor.1,
|
||||
)
|
||||
})
|
||||
});
|
||||
|
||||
// create iterator tuple of (old doc_id, reader) in order of the new doc_ids
|
||||
let sorted_doc_ids: Vec<(DocId, SegmentReaderWithOrdinal)> = doc_id_reader_pair
|
||||
@@ -1017,12 +1017,12 @@ impl IndexMerger {
|
||||
if reader.num_deleted_docs() > 0
|
||||
// If there is not enough data in the store, we avoid stacking in order to
|
||||
// avoid creating many small blocks in the doc store. Once we have 5 full blocks,
|
||||
// we start stacking. In the worst case 2/7 of the blocks would be very small.
|
||||
// [segment 1 - {1 doc}][segment 2 - {fullblock * 5}{1doc}]
|
||||
// we start stacking. In the worst case 2/7 of the blocks would be very small.
|
||||
// [segment 1 - {1 doc}][segment 2 - {fullblock * 5}{1doc}]
|
||||
// => 5 * full blocks, 2 * 1 document blocks
|
||||
//
|
||||
// In a more realistic scenario the segments are of the same size, so 1/6 of
|
||||
// the doc stores would be on average half full, given total randomness (which
|
||||
// the doc stores would be on average half full, given total randomness (which
|
||||
// is not the case here, but not sure how it behaves exactly).
|
||||
//
|
||||
// https://github.com/tantivy-search/tantivy/issues/1053
|
||||
|
||||
@@ -6,7 +6,7 @@ use crate::query::Weight;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::Term;
|
||||
use crate::Searcher;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
/// The boolean query returns a set of documents
|
||||
/// that matches the Boolean combination of constituent subqueries.
|
||||
@@ -159,9 +159,9 @@ impl Query for BooleanQuery {
|
||||
Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled)))
|
||||
}
|
||||
|
||||
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
|
||||
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
|
||||
for (_occur, subquery) in &self.subqueries {
|
||||
subquery.query_terms(term_set);
|
||||
subquery.query_terms(terms);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ use crate::fastfield::DeleteBitSet;
|
||||
use crate::query::explanation::does_not_match;
|
||||
use crate::query::{Explanation, Query, Scorer, Weight};
|
||||
use crate::{DocId, DocSet, Score, Searcher, SegmentReader, Term};
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
|
||||
/// `BoostQuery` is a wrapper over a query used to boost its score.
|
||||
@@ -48,8 +48,8 @@ impl Query for BoostQuery {
|
||||
Ok(boosted_weight)
|
||||
}
|
||||
|
||||
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
|
||||
self.query.query_terms(term_set)
|
||||
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
|
||||
self.query.query_terms(terms)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -66,7 +66,7 @@ mod tests {
|
||||
use crate::schema::{Schema, TEXT};
|
||||
use crate::Index;
|
||||
use crate::Term;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
#[test]
|
||||
fn test_query_terms() {
|
||||
@@ -78,49 +78,49 @@ mod tests {
|
||||
let term_a = Term::from_field_text(text_field, "a");
|
||||
let term_b = Term::from_field_text(text_field, "b");
|
||||
{
|
||||
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
|
||||
let mut terms: BTreeMap<Term, bool> = Default::default();
|
||||
query_parser
|
||||
.parse_query("a")
|
||||
.unwrap()
|
||||
.query_terms(&mut terms_set);
|
||||
let terms: Vec<&Term> = terms_set.iter().collect();
|
||||
assert_eq!(vec![&term_a], terms);
|
||||
.query_terms(&mut terms);
|
||||
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
|
||||
assert_eq!(vec![(&term_a, &false)], terms);
|
||||
}
|
||||
{
|
||||
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
|
||||
let mut terms: BTreeMap<Term, bool> = Default::default();
|
||||
query_parser
|
||||
.parse_query("a b")
|
||||
.unwrap()
|
||||
.query_terms(&mut terms_set);
|
||||
let terms: Vec<&Term> = terms_set.iter().collect();
|
||||
assert_eq!(vec![&term_a, &term_b], terms);
|
||||
.query_terms(&mut terms);
|
||||
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
|
||||
assert_eq!(vec![(&term_a, &false), (&term_b, &false)], terms);
|
||||
}
|
||||
{
|
||||
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
|
||||
let mut terms: BTreeMap<Term, bool> = Default::default();
|
||||
query_parser
|
||||
.parse_query("\"a b\"")
|
||||
.unwrap()
|
||||
.query_terms(&mut terms_set);
|
||||
let terms: Vec<&Term> = terms_set.iter().collect();
|
||||
assert_eq!(vec![&term_a, &term_b], terms);
|
||||
.query_terms(&mut terms);
|
||||
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
|
||||
assert_eq!(vec![(&term_a, &true), (&term_b, &true)], terms);
|
||||
}
|
||||
{
|
||||
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
|
||||
let mut terms: BTreeMap<Term, bool> = Default::default();
|
||||
query_parser
|
||||
.parse_query("a a a a a")
|
||||
.unwrap()
|
||||
.query_terms(&mut terms_set);
|
||||
let terms: Vec<&Term> = terms_set.iter().collect();
|
||||
assert_eq!(vec![&term_a], terms);
|
||||
.query_terms(&mut terms);
|
||||
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
|
||||
assert_eq!(vec![(&term_a, &false)], terms);
|
||||
}
|
||||
{
|
||||
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
|
||||
let mut terms: BTreeMap<Term, bool> = Default::default();
|
||||
query_parser
|
||||
.parse_query("a -b")
|
||||
.unwrap()
|
||||
.query_terms(&mut terms_set);
|
||||
let terms: Vec<&Term> = terms_set.iter().collect();
|
||||
assert_eq!(vec![&term_a, &term_b], terms);
|
||||
.query_terms(&mut terms);
|
||||
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
|
||||
assert_eq!(vec![(&term_a, &false), (&term_b, &false)], terms);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -233,10 +233,9 @@ impl MoreLikeThis {
|
||||
}
|
||||
FieldType::U64(_) => {
|
||||
for field_value in field_values {
|
||||
let val = field_value
|
||||
.value()
|
||||
.u64_value()
|
||||
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?;
|
||||
let val = field_value.value().u64_value().ok_or_else(|| {
|
||||
TantivyError::InvalidArgument("invalid value".to_string())
|
||||
})?;
|
||||
if !self.is_noise_word(val.to_string()) {
|
||||
let term = Term::from_field_u64(field, val);
|
||||
*term_frequencies.entry(term).or_insert(0) += 1;
|
||||
@@ -249,7 +248,7 @@ impl MoreLikeThis {
|
||||
let val = field_value
|
||||
.value()
|
||||
.date_value()
|
||||
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?
|
||||
.ok_or_else(|| TantivyError::InvalidArgument("invalid value".to_string()))?
|
||||
.timestamp();
|
||||
if !self.is_noise_word(val.to_string()) {
|
||||
let term = Term::from_field_i64(field, val);
|
||||
@@ -259,10 +258,9 @@ impl MoreLikeThis {
|
||||
}
|
||||
FieldType::I64(_) => {
|
||||
for field_value in field_values {
|
||||
let val = field_value
|
||||
.value()
|
||||
.i64_value()
|
||||
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?;
|
||||
let val = field_value.value().i64_value().ok_or_else(|| {
|
||||
TantivyError::InvalidArgument("invalid value".to_string())
|
||||
})?;
|
||||
if !self.is_noise_word(val.to_string()) {
|
||||
let term = Term::from_field_i64(field, val);
|
||||
*term_frequencies.entry(term).or_insert(0) += 1;
|
||||
@@ -271,10 +269,9 @@ impl MoreLikeThis {
|
||||
}
|
||||
FieldType::F64(_) => {
|
||||
for field_value in field_values {
|
||||
let val = field_value
|
||||
.value()
|
||||
.f64_value()
|
||||
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?;
|
||||
let val = field_value.value().f64_value().ok_or_else(|| {
|
||||
TantivyError::InvalidArgument("invalid value".to_string())
|
||||
})?;
|
||||
if !self.is_noise_word(val.to_string()) {
|
||||
let term = Term::from_field_f64(field, val);
|
||||
*term_frequencies.entry(term).or_insert(0) += 1;
|
||||
@@ -306,7 +303,7 @@ impl MoreLikeThis {
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return self.stop_words.contains(&word);
|
||||
self.stop_words.contains(&word)
|
||||
}
|
||||
|
||||
/// Computes the score for each term while ignoring terms that are not useful
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use super::PhraseWeight;
|
||||
use crate::core::searcher::Searcher;
|
||||
use crate::query::bm25::Bm25Weight;
|
||||
@@ -5,7 +7,6 @@ use crate::query::Query;
|
||||
use crate::query::Weight;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::{Field, Term};
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
/// `PhraseQuery` matches a specific sequence of words.
|
||||
///
|
||||
@@ -113,9 +114,9 @@ impl Query for PhraseQuery {
|
||||
Ok(Box::new(phrase_weight))
|
||||
}
|
||||
|
||||
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
|
||||
for (_, query_term) in &self.phrase_terms {
|
||||
term_set.insert(query_term.clone());
|
||||
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
|
||||
for (_, term) in &self.phrase_terms {
|
||||
terms.insert(term.clone(), true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ use crate::query::Explanation;
|
||||
use crate::DocAddress;
|
||||
use crate::Term;
|
||||
use downcast_rs::impl_downcast;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
|
||||
/// The `Query` trait defines a set of documents and a scoring method
|
||||
@@ -68,7 +68,10 @@ pub trait Query: QueryClone + Send + Sync + downcast_rs::Downcast + fmt::Debug {
|
||||
|
||||
/// Extract all of the terms associated to the query and insert them in the
|
||||
/// term set given in arguments.
|
||||
fn query_terms(&self, _term_set: &mut BTreeSet<Term>) {}
|
||||
///
|
||||
/// Each term is associated with a boolean indicating whether
|
||||
/// positions are required or not.
|
||||
fn query_terms(&self, _term_set: &mut BTreeMap<Term, bool>) {}
|
||||
}
|
||||
|
||||
/// Implements `box_clone`.
|
||||
@@ -95,8 +98,8 @@ impl Query for Box<dyn Query> {
|
||||
self.as_ref().count(searcher)
|
||||
}
|
||||
|
||||
fn query_terms(&self, term_set: &mut BTreeSet<Term<Vec<u8>>>) {
|
||||
self.as_ref().query_terms(term_set);
|
||||
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
|
||||
self.as_ref().query_terms(terms);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ use crate::query::{Explanation, Query};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::Searcher;
|
||||
use crate::Term;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
|
||||
/// A Term query matches all of the documents
|
||||
@@ -127,7 +127,7 @@ impl Query for TermQuery {
|
||||
self.specialized_weight(searcher, scoring_enabled)?,
|
||||
))
|
||||
}
|
||||
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
|
||||
term_set.insert(self.term.clone());
|
||||
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
|
||||
terms.insert(self.term.clone(), false);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,6 @@ use crate::{Document, Score};
|
||||
use htmlescape::encode_minimal;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::BTreeSet;
|
||||
use std::ops::Range;
|
||||
|
||||
const DEFAULT_MAX_NUM_CHARS: usize = 150;
|
||||
@@ -239,10 +238,10 @@ impl SnippetGenerator {
|
||||
query: &dyn Query,
|
||||
field: Field,
|
||||
) -> crate::Result<SnippetGenerator> {
|
||||
let mut terms = BTreeSet::new();
|
||||
let mut terms = BTreeMap::new();
|
||||
query.query_terms(&mut terms);
|
||||
let mut terms_text: BTreeMap<String, Score> = Default::default();
|
||||
for term in terms {
|
||||
for (term, _) in terms {
|
||||
if term.field() != field {
|
||||
continue;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user