mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-07 01:32:53 +00:00
* add RegexPhraseQuery. RegexPhraseQuery supports phrase queries with regex. It supports regex and wildcards. E.g. a query with wildcards: "b* b* wolf" matches "big bad wolf". Slop is supported as well: "b* wolf"~2 matches "big bad wolf". Regex queries may match a lot of terms, where we still need to keep track of which term hit in order to load the positions. The phrase query algorithm groups terms by their frequency together in the union to prefilter groups early. This PR comes with some new data structures: SimpleUnion - A union docset for a list of docsets. It doesn't do any caching and is therefore well suited for datasets with lots of skipping (phrase search, but intersections in general). LoadedPostings - Like SegmentPostings, but all docs and positions are loaded in memory. SegmentPostings uses 1840 bytes per instance with its caches, which is equivalent to 460 docids. LoadedPostings is used for terms which have fewer than 100 docs. LoadedPostings is only used to reduce memory consumption. BitSetPostingUnion - Creates a `Posting` that uses the bitset for docid hits and the docsets for positions. The BitSet is the precalculated union of the docsets. In the RegexPhraseQuery there is a size limit of 512 docsets per PreAggregatedUnion, before creating a new one. Renamed Union to BufferedUnionScorer. Added proptests to test different union types. * cleanup * use Box instead of Vec * use RefCell instead of term_freq(&mut) * remove wildcard mode * move RefCell to outer * clippy
117 lines
4.0 KiB
Rust
117 lines
4.0 KiB
Rust
mod all_query;
|
|
mod automaton_weight;
|
|
mod bitset;
|
|
mod bm25;
|
|
mod boolean_query;
|
|
mod boost_query;
|
|
mod const_score_query;
|
|
mod disjunction;
|
|
mod disjunction_max_query;
|
|
mod empty_query;
|
|
mod exclude;
|
|
mod exist_query;
|
|
mod explanation;
|
|
mod fuzzy_query;
|
|
mod intersection;
|
|
mod more_like_this;
|
|
mod phrase_prefix_query;
|
|
mod phrase_query;
|
|
mod query;
|
|
mod query_parser;
|
|
mod range_query;
|
|
mod regex_query;
|
|
mod reqopt_scorer;
|
|
mod scorer;
|
|
mod set_query;
|
|
mod term_query;
|
|
mod union;
|
|
mod weight;
|
|
|
|
#[cfg(test)]
|
|
mod vec_docset;
|
|
|
|
pub(crate) mod score_combiner;
|
|
pub use query_grammar::Occur;
|
|
|
|
pub use self::all_query::{AllQuery, AllScorer, AllWeight};
|
|
pub use self::automaton_weight::AutomatonWeight;
|
|
pub use self::bitset::BitSetDocSet;
|
|
pub use self::bm25::{Bm25StatisticsProvider, Bm25Weight};
|
|
pub use self::boolean_query::{BooleanQuery, BooleanWeight};
|
|
pub use self::boost_query::{BoostQuery, BoostWeight};
|
|
pub use self::const_score_query::{ConstScoreQuery, ConstScorer};
|
|
pub use self::disjunction_max_query::DisjunctionMaxQuery;
|
|
pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight};
|
|
pub use self::exclude::Exclude;
|
|
pub use self::exist_query::ExistsQuery;
|
|
pub use self::explanation::Explanation;
|
|
#[cfg(test)]
|
|
pub(crate) use self::fuzzy_query::DfaWrapper;
|
|
pub use self::fuzzy_query::FuzzyTermQuery;
|
|
pub use self::intersection::{intersect_scorers, Intersection};
|
|
pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
|
|
pub use self::phrase_prefix_query::PhrasePrefixQuery;
|
|
pub use self::phrase_query::regex_phrase_query::{wildcard_query_to_regex_str, RegexPhraseQuery};
|
|
pub use self::phrase_query::PhraseQuery;
|
|
pub use self::query::{EnableScoring, Query, QueryClone};
|
|
pub use self::query_parser::{QueryParser, QueryParserError};
|
|
pub use self::range_query::*;
|
|
pub use self::regex_query::RegexQuery;
|
|
pub use self::reqopt_scorer::RequiredOptionalScorer;
|
|
pub use self::score_combiner::{DisjunctionMaxCombiner, ScoreCombiner, SumCombiner};
|
|
pub use self::scorer::Scorer;
|
|
pub use self::set_query::TermSetQuery;
|
|
pub use self::term_query::TermQuery;
|
|
pub use self::union::BufferedUnionScorer;
|
|
#[cfg(test)]
|
|
pub use self::vec_docset::VecDocSet;
|
|
pub use self::weight::Weight;
|
|
|
|
#[cfg(test)]
mod tests {
    use crate::query::QueryParser;
    use crate::schema::{Schema, TEXT};
    use crate::{Index, Term};

    /// Parses a handful of query strings against a single-text-field index and
    /// checks the `(term, flag)` pairs that `query_terms` hands to its visitor.
    ///
    /// In the cases below the flag is `true` only for the terms of the quoted
    /// phrase query (`"a b"`); plain, repeated and excluded terms are all
    /// reported with `false`.
    // NOTE(review): the flag presumably signals that positions are required
    // for the term -- confirm against the `Query::query_terms` documentation.
    #[test]
    fn test_query_terms() {
        // Minimal in-RAM index with one text field that the parser targets by default.
        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let query_parser = QueryParser::for_index(&index, vec![text_field]);

        let term_a = Term::from_field_text(text_field, "a");
        let term_b = Term::from_field_text(text_field, "b");

        // (query string, pairs expected from the visitor, in visiting order).
        let cases: Vec<(&str, Vec<(&Term, bool)>)> = vec![
            ("a", vec![(&term_a, false)]),
            ("a b", vec![(&term_a, false), (&term_b, false)]),
            ("\"a b\"", vec![(&term_a, true), (&term_b, true)]),
            // Repeated terms are reported once per occurrence.
            ("a a a a a", vec![(&term_a, false); 5]),
            // The excluded term `b` is still visited.
            ("a -b", vec![(&term_a, false), (&term_b, false)]),
        ];

        for (query_str, expected) in cases {
            let query = query_parser.parse_query(query_str).unwrap();
            let mut visited = Vec::new();
            query.query_terms(&mut |term, flag| visited.push((term, flag)));
            assert_eq!(expected, visited);
        }
    }
}
|