Files
tantivy/src/query/regex_query.rs
PSeitz 2d7390341c increase min memory to 15MB for indexing (#2176)
With tantivy 0.20 the minimum memory consumption per SegmentWriter increased to
12MB. 7MB are for the different fast field collectors types (they could be
lazily created). Increase the minimum memory from 3MB to 15MB.

Change memory variable naming from arena to budget.

closes #2156
2023-09-13 07:38:34 +02:00

180 lines
5.6 KiB
Rust

use std::clone::Clone;
use std::sync::Arc;
use tantivy_fst::Regex;
use crate::error::TantivyError;
use crate::query::{AutomatonWeight, EnableScoring, Query, Weight};
use crate::schema::Field;
/// A Regex Query matches all of the documents
/// containing a specific term that matches
/// a regex pattern.
///
/// Wildcard queries (e.g. ho*se) can be achieved
/// by converting them to their regex counterparts.
///
/// ```rust
/// use tantivy::collector::Count;
/// use tantivy::query::RegexQuery;
/// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, Index, Term};
///
/// # fn test() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(15_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ))?;
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ))?;
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// ))?;
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// ))?;
/// index_writer.commit()?;
/// }
///
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// let term = Term::from_field_text(title, "Diary");
/// let query = RegexQuery::from_pattern("d[ai]{2}ry", title)?;
/// let count = searcher.search(&query, &Count)?;
/// assert_eq!(count, 3);
/// Ok(())
/// # }
/// # assert!(test().is_ok());
/// ```
#[derive(Debug, Clone)]
pub struct RegexQuery {
regex: Arc<Regex>,
field: Field,
}
impl RegexQuery {
/// Creates a new RegexQuery from a given pattern
pub fn from_pattern(regex_pattern: &str, field: Field) -> crate::Result<Self> {
let regex = Regex::new(regex_pattern)
.map_err(|_| TantivyError::InvalidArgument(regex_pattern.to_string()))?;
Ok(RegexQuery::from_regex(regex, field))
}
/// Creates a new RegexQuery from a fully built Regex
pub fn from_regex<T: Into<Arc<Regex>>>(regex: T, field: Field) -> Self {
RegexQuery {
regex: regex.into(),
field,
}
}
fn specialized_weight(&self) -> AutomatonWeight<Regex> {
AutomatonWeight::new(self.field, self.regex.clone())
}
}
impl Query for RegexQuery {
fn weight(&self, _enabled_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
Ok(Box::new(self.specialized_weight()))
}
}
#[cfg(test)]
mod test {
use std::sync::Arc;
use tantivy_fst::Regex;
use super::RegexQuery;
use crate::collector::TopDocs;
use crate::schema::{Field, Schema, TEXT};
use crate::{assert_nearly_equals, Index, IndexReader};
fn build_test_index() -> crate::Result<(IndexReader, Field)> {
let mut schema_builder = Schema::builder();
let country_field = schema_builder.add_text_field("country", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(
country_field => "japan",
))?;
index_writer.add_document(doc!(
country_field => "korea",
))?;
index_writer.commit()?;
}
let reader = index.reader()?;
Ok((reader, country_field))
}
fn verify_regex_query(
query_matching_one: RegexQuery,
query_matching_zero: RegexQuery,
reader: IndexReader,
) {
let searcher = reader.searcher();
{
let scored_docs = searcher
.search(&query_matching_one, &TopDocs::with_limit(2))
.unwrap();
assert_eq!(scored_docs.len(), 1, "Expected only 1 document");
let (score, _) = scored_docs[0];
assert_nearly_equals!(1.0, score);
}
let top_docs = searcher
.search(&query_matching_zero, &TopDocs::with_limit(2))
.unwrap();
assert!(top_docs.is_empty(), "Expected ZERO document");
}
#[test]
pub fn test_regex_query() -> crate::Result<()> {
let (reader, field) = build_test_index()?;
let matching_one = RegexQuery::from_pattern("jap[ao]n", field)?;
let matching_zero = RegexQuery::from_pattern("jap[A-Z]n", field)?;
verify_regex_query(matching_one, matching_zero, reader);
Ok(())
}
#[test]
pub fn test_construct_from_regex() -> crate::Result<()> {
let (reader, field) = build_test_index()?;
let matching_one = RegexQuery::from_regex(Regex::new("jap[ao]n").unwrap(), field);
let matching_zero = RegexQuery::from_regex(Regex::new("jap[A-Z]n").unwrap(), field);
verify_regex_query(matching_one, matching_zero, reader);
Ok(())
}
#[test]
pub fn test_construct_from_reused_regex() -> crate::Result<()> {
let r1 = Arc::new(Regex::new("jap[ao]n").unwrap());
let r2 = Arc::new(Regex::new("jap[A-Z]n").unwrap());
let (reader, field) = build_test_index()?;
let matching_one = RegexQuery::from_regex(r1.clone(), field);
let matching_zero = RegexQuery::from_regex(r2.clone(), field);
verify_regex_query(matching_one, matching_zero, reader.clone());
let matching_one = RegexQuery::from_regex(r1, field);
let matching_zero = RegexQuery::from_regex(r2, field);
verify_regex_query(matching_one, matching_zero, reader);
Ok(())
}
}