Files
tantivy/src/query/phrase_query/mod.rs
Harrison Burt 1c7c6fd591 POC: Tantivy documents as a trait (#2071)
* fix windows build (#1)

* Fix windows build

* Add doc traits

* Add field value iter

* Add value and serialization

* Adjust order

* Fix bug

* Correct type

* Fix generic bugs

* Reformat code

* Add generic to index writer which I forgot about

* Fix missing generics on single segment writer

* Add missing type export

* Add default methods for convenience

* Cleanup

* Fix more-like-this query to use standard types

* Update API and fix tests

* Add doc traits

* Add field value iter

* Add value and serialization

* Adjust order

* Fix bug

* Correct type

* Rebase main and fix conflicts

* Reformat code

* Merge upstream

* Fix missing generics on single segment writer

* Add missing type export

* Add default methods for convenience

* Cleanup

* Fix more-like-this query to use standard types

* Update API and fix tests

* Add tokenizer improvements from previous commits

* Add tokenizer improvements from previous commits

* Reformat

* Fix unit tests

* Fix unit tests

* Use enum in changes

* Stage changes

* Add new deserializer logic

* Add serializer integration

* Add document deserializer

* Implement new (de)serialization api for existing types

* Fix bugs and type errors

* Add helper implementations

* Fix errors

* Reformat code

* Add unit tests and some code organisation for serialization

* Add unit tests to deserializer

* Add some small docs

* Add support for deserializing serde values

* Reformat

* Fix typo

* Fix typo

* Change repr of facet

* Remove unused trait methods

* Add child value type

* Resolve comments

* Fix build

* Fix more build errors

* Fix more build errors

* Fix the tests I missed

* Fix examples

* fix numerical order, serialize PreTok Str

* fix coverage

* rename Document to TantivyDocument, rename DocumentAccess to Document

add Binary prefix to binary de/serialization

* fix coverage

---------

Co-authored-by: Pascal Seitz <pascal.seitz@gmail.com>
2023-10-02 10:01:16 +02:00

398 lines
15 KiB
Rust

// Private submodules of this module; selected items are re-exported below.
mod phrase_query;
mod phrase_scorer;
mod phrase_weight;
// Public re-export of `PhraseQuery`.
pub use self::phrase_query::PhraseQuery;
// `intersection_count` is re-exported with crate-only visibility.
pub(crate) use self::phrase_scorer::intersection_count;
// Public re-exports of `PhraseScorer` and `PhraseWeight`.
pub use self::phrase_scorer::PhraseScorer;
pub use self::phrase_weight::PhraseWeight;
#[cfg(test)]
pub mod tests {
use serde_json::json;
use super::*;
use crate::collector::tests::{TEST_COLLECTOR_WITHOUT_SCORE, TEST_COLLECTOR_WITH_SCORE};
use crate::core::Index;
use crate::query::{EnableScoring, QueryParser, Weight};
use crate::schema::{Schema, Term, TEXT};
use crate::{assert_nearly_equals, DocAddress, DocId, IndexWriter, TERMINATED};
/// Builds an in-RAM index with a single `TEXT` field named `"text"`.
///
/// One document is added per entry of `texts`, in order, and the writer is
/// committed before the index is returned.
///
/// # Errors
///
/// Propagates any error raised while creating the writer, adding a document
/// or committing.
pub fn create_index(texts: &[&'static str]) -> crate::Result<Index> {
    let mut builder = Schema::builder();
    let body_field = builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(builder.build());

    let mut writer: IndexWriter = index.writer_for_tests()?;
    for text in texts.iter().copied() {
        writer.add_document(doc!(body_field => text))?;
    }
    writer.commit()?;
    // Release the writer before handing the index back to the caller.
    drop(writer);

    Ok(index)
}
/// Runs phrase queries (no slop set) through the scoring test collector and
/// checks the doc ids that come back for a five-document corpus.
#[test]
pub fn test_phrase_query() -> crate::Result<()> {
    let corpus = [
        "b b b d c g c",
        "a b b d c g c",
        "a b a b c",
        "c a b a d ga a",
        "a b c",
    ];
    let index = create_index(&corpus)?;
    let text_field = index.schema().get_field("text").unwrap();
    let searcher = index.reader()?.searcher();

    // Doc ids reported by the collector for the phrase made of `words`.
    let matching_ids = |words: &[&str]| -> Vec<DocId> {
        let terms: Vec<Term> = words
            .iter()
            .map(|&word| Term::from_field_text(text_field, word))
            .collect();
        let query = PhraseQuery::new(terms);
        let fruits = searcher
            .search(&query, &TEST_COLLECTOR_WITH_SCORE)
            .unwrap();
        fruits.docs().iter().map(|address| address.doc_id).collect()
    };

    assert_eq!(matching_ids(&["a", "b"]), vec![1, 2, 3, 4]);
    assert_eq!(matching_ids(&["a", "b", "c"]), vec![2, 4]);
    assert_eq!(matching_ids(&["b", "b"]), vec![0, 1]);
    // A term absent from the corpus, then two terms that are never adjacent.
    assert!(matching_ids(&["g", "ewrwer"]).is_empty());
    assert!(matching_ids(&["g", "a"]).is_empty());
    Ok(())
}
/// Drives the phrase scorer by hand on segment 0 with scoring disabled:
/// the phrase "a b c" must land on doc 1 and then terminate.
#[test]
pub fn test_phrase_query_simple() -> crate::Result<()> {
    let index = create_index(&["a b b d c g c", "a b a b c"])?;
    let field = index.schema().get_field("text").unwrap();
    let searcher = index.reader()?.searcher();

    let query = PhraseQuery::new(vec![
        Term::from_field_text(field, "a"),
        Term::from_field_text(field, "b"),
        Term::from_field_text(field, "c"),
    ]);
    let scoring = EnableScoring::disabled_from_schema(searcher.schema());
    let weight = query.phrase_weight(scoring)?;
    let mut scorer = weight.scorer(searcher.segment_reader(0), 1.0)?;

    // The scorer starts positioned on its first match.
    assert_eq!(scorer.doc(), 1);
    // There is no second match.
    assert_eq!(scorer.advance(), TERMINATED);
    Ok(())
}
/// Same corpus and expectations as the scoring variant of this test, but the
/// search goes through `TEST_COLLECTOR_WITHOUT_SCORE`.
#[test]
pub fn test_phrase_query_no_score() -> crate::Result<()> {
    let corpus = [
        "b b b d c g c",
        "a b b d c g c",
        "a b a b c",
        "c a b a d ga a",
        "a b c",
    ];
    let index = create_index(&corpus)?;
    let text_field = index.schema().get_field("text").unwrap();
    let searcher = index.reader()?.searcher();

    // Doc ids reported by the score-less collector for the phrase `words`.
    let matching_ids = |words: &[&str]| -> Vec<DocId> {
        let terms: Vec<Term> = words
            .iter()
            .map(|&word| Term::from_field_text(text_field, word))
            .collect();
        let query = PhraseQuery::new(terms);
        let fruits = searcher
            .search(&query, &TEST_COLLECTOR_WITHOUT_SCORE)
            .unwrap();
        fruits.docs().iter().map(|address| address.doc_id).collect()
    };

    assert_eq!(matching_ids(&["a", "b", "c"]), vec![2, 4]);
    assert_eq!(matching_ids(&["a", "b"]), vec![1, 2, 3, 4]);
    assert_eq!(matching_ids(&["b", "b"]), vec![0, 1]);
    assert!(matching_ids(&["g", "ewrwer"]).is_empty());
    assert!(matching_ids(&["g", "a"]).is_empty());
    Ok(())
}
/// A phrase query on a field indexed with `IndexRecordOption::WithFreqs`
/// must fail with a `SchemaError` carrying the expected message.
#[test]
pub fn test_phrase_query_no_positions() -> crate::Result<()> {
    use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions};

    let indexing = TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs);
    let freqs_only = TextOptions::default().set_indexing_options(indexing);

    let mut schema_builder = Schema::builder();
    let text_field = schema_builder.add_text_field("text", freqs_only);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer: IndexWriter = index.writer_for_tests()?;
    writer.add_document(doc!(text_field => "a b c"))?;
    writer.commit()?;
    drop(writer);

    let searcher = index.reader()?.searcher();
    let query = PhraseQuery::new(vec![
        Term::from_field_text(text_field, "a"),
        Term::from_field_text(text_field, "b"),
    ]);

    let failure = searcher.search(&query, &TEST_COLLECTOR_WITH_SCORE).err();
    let is_expected_error = matches!(
        failure,
        Some(crate::TantivyError::SchemaError(msg))
            if msg == "Applied phrase query on field \"text\", which does not have positions \
                       indexed"
    );
    assert!(is_expected_error);
    Ok(())
}
/// Checks the scores returned for the phrase "a b" with slop 0 on a
/// two-document corpus.
#[test]
pub fn test_phrase_score() -> crate::Result<()> {
    let corpus = ["a b c", "a b c a b"];
    let phrase_scores = test_query(0, &create_index(&corpus)?, vec!["a", "b"]);
    assert_nearly_equals!(phrase_scores[0], 0.40618482);
    assert_nearly_equals!(phrase_scores[1], 0.46844664);
    Ok(())
}
/// Score check for the phrase "a b" with slop 1. Currently marked
/// `#[ignore]`, so it does not run by default.
#[ignore]
#[test]
pub fn test_phrase_score_with_slop() -> crate::Result<()> {
    let corpus = ["a c b", "a b c a b"];
    let phrase_scores = test_query(1, &create_index(&corpus)?, vec!["a", "b"]);
    assert_nearly_equals!(phrase_scores[0], 0.40618482);
    assert_nearly_equals!(phrase_scores[1], 0.46844664);
    Ok(())
}
/// Regression test: with slop 1, the phrase "captain wendy" must yield
/// exactly one hit on this two-document corpus.
#[test]
pub fn test_phrase_score_with_slop_bug() -> crate::Result<()> {
    let corpus = ["asdf asdf Captain Subject Wendy", "Captain"];
    let index = create_index(&corpus)?;
    let hit_count = test_query(1, &index, vec!["captain", "wendy"]).len();
    assert_eq!(hit_count, 1);
    Ok(())
}
/// Regression test: with slop 2, the phrase "a b c" must match exactly one
/// document in each of two small corpora.
///
/// NOTE(review): this test used to carry a bare `// fails` remark although it
/// is not `#[ignore]`d — confirm whether that remark was stale.
#[test]
pub fn test_phrase_score_with_slop_bug_2() -> crate::Result<()> {
    // Both corpora share the first document; only the second one differs.
    for &second_doc in &["a a c", "b c c"] {
        let index = create_index(&["a x b x c", second_doc])?;
        let phrase_scores = test_query(2, &index, vec!["a", "b", "c"]);
        assert_eq!(phrase_scores.len(), 1);
    }
    Ok(())
}
/// Searches `index` for the phrase made of `texts` (on the `"text"` field)
/// with the given `slop`, and returns the scores gathered by
/// `TEST_COLLECTOR_WITH_SCORE`.
///
/// # Panics
///
/// Panics if the `"text"` field is missing, the reader cannot be opened, or
/// the search fails.
fn test_query(slop: u32, index: &Index, texts: Vec<&str>) -> Vec<f32> {
    let field = index.schema().get_field("text").unwrap();
    let searcher = index.reader().unwrap().searcher();

    let terms = texts
        .iter()
        .map(|word| Term::from_field_text(field, word))
        .collect::<Vec<Term>>();
    let mut query = PhraseQuery::new(terms);
    query.set_slop(slop);

    let fruits = searcher
        .search(&query, &TEST_COLLECTOR_WITH_SCORE)
        .expect("search should succeed");
    fruits.scores().to_vec()
}
/// With slop 1, the phrase "wendy subject captain" must yield exactly one
/// hit when the middle term is repeated in the document.
#[test]
pub fn test_phrase_score_with_slop_repeating() -> crate::Result<()> {
    let corpus = ["wendy subject subject captain", "Captain"];
    let index = create_index(&corpus)?;
    let hit_count = test_query(1, &index, vec!["wendy", "subject", "captain"]).len();
    assert_eq!(hit_count, 1);
    Ok(())
}
/// With slop 3, the phrase "a c" must produce two hits with the expected
/// scores on this three-document corpus.
#[test]
pub fn test_phrase_score_with_slop_size() -> crate::Result<()> {
    let corpus = ["a b e c", "a e e e c", "a e e e e c"];
    let index = create_index(&corpus)?;
    let phrase_scores = test_query(3, &index, vec!["a", "c"]);
    assert_eq!(phrase_scores.len(), 2);
    assert_nearly_equals!(phrase_scores[0], 0.29086056);
    assert_nearly_equals!(phrase_scores[1], 0.26706287);
    Ok(())
}
/// Checks how many hits a sloppy phrase query yields on single-document
/// indexes, for a few (document, slop, phrase) combinations.
#[test]
pub fn test_phrase_slop() -> crate::Result<()> {
    // Indexes `text` as the only document and counts the hits for `words`.
    let hit_count = |text: &'static str, slop: u32, words: Vec<&str>| -> crate::Result<usize> {
        let index = create_index(&[text])?;
        Ok(test_query(slop, &index, words).len())
    };

    assert_eq!(hit_count("a x b c", 1, vec!["a", "b", "c"])?, 1);
    assert_eq!(hit_count("a x b x c", 1, vec!["a", "b", "c"])?, 0);
    // Reversed term order: no hit with slop 1, one hit with slop 2.
    assert_eq!(hit_count("a b", 1, vec!["b", "a"])?, 0);
    assert_eq!(hit_count("a b", 2, vec!["b", "a"])?, 1);
    Ok(())
}
/// Spot-checks the scores returned for the phrase "a b c" with slop 3 on a
/// six-document corpus.
#[test]
pub fn test_phrase_score_with_slop_ordering() -> crate::Result<()> {
    let corpus = [
        "a e b e c",
        "a e e e e e b e e e e c",
        "a c b", // also matches
        "a c e b e",
        "a e c b",
        "a e b c",
    ];
    let index = create_index(&corpus)?;
    let phrase_scores = test_query(3, &index, vec!["a", "b", "c"]);
    // Only the scores at result positions 0, 1 and 3 are checked.
    assert_nearly_equals!(phrase_scores[0], 0.23091172);
    assert_nearly_equals!(phrase_scores[1], 0.27310878);
    assert_nearly_equals!(phrase_scores[3], 0.25024384);
    Ok(())
}
/// The order of the terms in the phrase decides which document matches:
/// "a b" hits doc 1 and "b a" hits doc 2.
#[test] // motivated by #234
pub fn test_phrase_query_docfreq_order() -> crate::Result<()> {
    let mut builder = Schema::builder();
    let text_field = builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(builder.build());

    let mut writer: IndexWriter = index.writer_for_tests()?;
    for &text in &["b", "a b", "b a"] {
        writer.add_document(doc!(text_field => text))?;
    }
    writer.commit()?;
    drop(writer);

    let searcher = index.reader()?.searcher();
    // Doc addresses reported by the collector for the phrase `words`.
    let matching_addresses = |words: &[&str]| -> Vec<DocAddress> {
        let terms: Vec<Term> = words
            .iter()
            .map(|&word| Term::from_field_text(text_field, word))
            .collect();
        let query = PhraseQuery::new(terms);
        let fruits = searcher
            .search(&query, &TEST_COLLECTOR_WITH_SCORE)
            .expect("search should succeed");
        fruits.docs().to_vec()
    };

    assert_eq!(matching_addresses(&["a", "b"]), vec![DocAddress::new(0, 1)]);
    assert_eq!(matching_addresses(&["b", "a"]), vec![DocAddress::new(0, 2)]);
    Ok(())
}
/// Exercises `PhraseQuery::new_with_offset` with explicit (offset, term)
/// pairs against the single document "a b c d e f g h".
#[test] // motivated by #234
pub fn test_phrase_query_non_trivial_offsets() -> crate::Result<()> {
    let mut builder = Schema::builder();
    let text_field = builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(builder.build());

    let mut writer: IndexWriter = index.writer_for_tests()?;
    writer.add_document(doc!(text_field => "a b c d e f g h"))?;
    writer.commit()?;
    drop(writer);

    let searcher = index.reader().unwrap().searcher();
    // Doc ids reported for the phrase described by `(offset, text)` pairs.
    let matching_ids = |pairs: &[(usize, &str)]| -> Vec<DocId> {
        let terms: Vec<(usize, Term)> = pairs
            .iter()
            .map(|&(offset, text)| (offset, Term::from_field_text(text_field, text)))
            .collect();
        let query = PhraseQuery::new_with_offset(terms);
        let fruits = searcher
            .search(&query, &TEST_COLLECTOR_WITH_SCORE)
            .expect("search should succeed");
        fruits.docs().iter().map(|address| address.doc_id).collect()
    };

    assert_eq!(matching_ids(&[(0, "a"), (1, "b")]), vec![0]);
    // The order in which the pairs are listed does not matter here.
    assert_eq!(matching_ids(&[(1, "b"), (0, "a")]), vec![0]);
    assert!(matching_ids(&[(0, "a"), (2, "b")]).is_empty());
    assert_eq!(matching_ids(&[(0, "a"), (2, "c")]), vec![0]);
    assert_eq!(matching_ids(&[(0, "a"), (2, "c"), (3, "d")]), vec![0]);
    assert_eq!(matching_ids(&[(0, "a"), (2, "c"), (4, "e")]), vec![0]);
    assert_eq!(matching_ids(&[(4, "e"), (0, "a"), (2, "c")]), vec![0]);
    assert!(matching_ids(&[(0, "a"), (2, "d")]).is_empty());
    // Offsets shifted by one still match.
    assert_eq!(matching_ids(&[(1, "a"), (3, "c")]), vec![0]);
    Ok(())
}
/// Parses queries against a JSON field and collects the matching doc ids by
/// walking the scorer of segment 0 by hand (scoring disabled).
#[test]
pub fn test_phrase_query_on_json() -> crate::Result<()> {
    let mut builder = Schema::builder();
    let json_field = builder.add_json_field("json", TEXT);
    let index = Index::create_in_ram(builder.build());

    let mut writer: IndexWriter = index.writer_for_tests()?;
    let json_docs = vec![
        json!({
            "text": "elliot smith the happy who"
        }),
        json!({
            "text": "the who elliot smith"
        }),
        json!({
            "arr": [{"text":"the who"}, {"text":"elliot smith"}]
        }),
        json!({
            "text2": "the smith"
        }),
    ];
    for json_doc in json_docs {
        writer.add_document(doc!(json_field => json_doc))?;
    }
    writer.commit()?;
    drop(writer);

    let searcher = index.reader()?.searcher();
    // Doc ids of segment 0 matched by `query`, in scorer order.
    let matching_docs = |query: &str| {
        let parser = QueryParser::for_index(&index, vec![json_field]);
        let parsed = parser.parse_query(query).unwrap();
        let weight = parsed
            .weight(EnableScoring::disabled_from_schema(searcher.schema()))
            .unwrap();
        let mut scorer = weight
            .scorer(searcher.segment_reader(0), 1.0f32)
            .unwrap();

        let mut docs = Vec::new();
        let mut current = scorer.doc();
        while current != TERMINATED {
            docs.push(current);
            scorer.advance();
            current = scorer.doc();
        }
        docs
    };

    assert!(matching_docs(r#"text:"the smith""#).is_empty());
    assert_eq!(matching_docs(r#"text:the"#), vec![0u32, 1u32]);
    assert_eq!(matching_docs(r#"text:"the""#), vec![0u32, 1u32]);
    assert_eq!(matching_docs(r#"text:"smith""#), vec![0u32, 1u32]);
    assert_eq!(matching_docs(r#"text:"elliot smith""#), vec![0u32, 1u32]);
    assert_eq!(matching_docs(r#"text2:"the smith""#), vec![3u32]);
    // "the" and "smith" sit in different array elements, so no phrase match.
    assert!(matching_docs(r#"arr.text:"the smith""#).is_empty());
    assert_eq!(matching_docs(r#"arr.text:"elliot smith""#), vec![2]);
    Ok(())
}
}