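//! Phrase query support: matching documents in which a sequence of terms
//! occurs at consecutive positions (optionally relaxed by a "slop").
//!
//! The module exposes `PhraseQuery` together with its `PhraseWeight` and
//! `PhraseScorer` building blocks.
//!
//! A minimal usage sketch, assuming the default `TEXT` tokenization:
//!
//! ```rust
//! use tantivy::collector::Count;
//! use tantivy::query::PhraseQuery;
//! use tantivy::schema::{Schema, TEXT};
//! use tantivy::{doc, Index, IndexWriter, Term};
//!
//! fn main() -> tantivy::Result<()> {
//!     let mut schema_builder = Schema::builder();
//!     let text = schema_builder.add_text_field("text", TEXT);
//!     let index = Index::create_in_ram(schema_builder.build());
//!
//!     let mut writer: IndexWriter = index.writer(50_000_000)?;
//!     writer.add_document(doc!(text => "the quick brown fox"))?;
//!     writer.commit()?;
//!
//!     // Matches only documents where "quick" is immediately followed by "brown".
//!     let query = PhraseQuery::new(vec![
//!         Term::from_field_text(text, "quick"),
//!         Term::from_field_text(text, "brown"),
//!     ]);
//!     let searcher = index.reader()?.searcher();
//!     assert_eq!(searcher.search(&query, &Count)?, 1);
//!     Ok(())
//! }
//! ```
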
mod phrase_query;
mod phrase_scorer;
mod phrase_weight;

pub use self::phrase_query::PhraseQuery;
pub(crate) use self::phrase_scorer::intersection_count;
pub use self::phrase_scorer::PhraseScorer;
pub use self::phrase_weight::PhraseWeight;

#[cfg(test)]
pub mod tests {

    use serde_json::json;

    use super::*;
    use crate::collector::tests::{TEST_COLLECTOR_WITHOUT_SCORE, TEST_COLLECTOR_WITH_SCORE};
    use crate::core::Index;
    use crate::query::{EnableScoring, QueryParser, Weight};
    use crate::schema::{Schema, Term, TEXT};
    use crate::{assert_nearly_equals, DocAddress, DocId, IndexWriter, TERMINATED};

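    /// Builds an in-RAM index with a single `TEXT` field named "text", adding one
    /// document per input string and committing once.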
    pub fn create_index(texts: &[&'static str]) -> crate::Result<Index> {
        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        {
            let mut index_writer: IndexWriter = index.writer_for_tests()?;
            for &text in texts {
                let doc = doc!(text_field=>text);
                index_writer.add_document(doc)?;
            }
            index_writer.commit()?;
        }
        Ok(index)
    }

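    /// Exact phrase matching: a document matches only if the query terms appear
    /// consecutively and in order in the "text" field.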
    #[test]
    pub fn test_phrase_query() -> crate::Result<()> {
        let index = create_index(&[
            "b b b d c g c",
            "a b b d c g c",
            "a b a b c",
            "c a b a d ga a",
            "a b c",
        ])?;
        let schema = index.schema();
        let text_field = schema.get_field("text").unwrap();
        let searcher = index.reader()?.searcher();
        let test_query = |texts: Vec<&str>| {
            let terms: Vec<Term> = texts
                .iter()
                .map(|text| Term::from_field_text(text_field, text))
                .collect();
            let phrase_query = PhraseQuery::new(terms);
            let test_fruits = searcher
                .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
                .unwrap();
            test_fruits
                .docs()
                .iter()
                .map(|docaddr| docaddr.doc_id)
                .collect::<Vec<_>>()
        };
        assert_eq!(test_query(vec!["a", "b"]), vec![1, 2, 3, 4]);
        assert_eq!(test_query(vec!["a", "b", "c"]), vec![2, 4]);
        assert_eq!(test_query(vec!["b", "b"]), vec![0, 1]);
        assert!(test_query(vec!["g", "ewrwer"]).is_empty());
        assert!(test_query(vec!["g", "a"]).is_empty());
        Ok(())
    }

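    /// Builds the `PhraseWeight` and `PhraseScorer` by hand and steps through the
    /// matching documents directly instead of going through a collector.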
    #[test]
    pub fn test_phrase_query_simple() -> crate::Result<()> {
        let index = create_index(&["a b b d c g c", "a b a b c"])?;
        let text_field = index.schema().get_field("text").unwrap();
        let searcher = index.reader()?.searcher();
        let terms: Vec<Term> = ["a", "b", "c"]
            .iter()
            .map(|text| Term::from_field_text(text_field, text))
            .collect();
        let phrase_query = PhraseQuery::new(terms);
        let phrase_weight =
            phrase_query.phrase_weight(EnableScoring::disabled_from_schema(searcher.schema()))?;
        let mut phrase_scorer = phrase_weight.scorer(searcher.segment_reader(0), 1.0)?;
        assert_eq!(phrase_scorer.doc(), 1);
        assert_eq!(phrase_scorer.advance(), TERMINATED);
        Ok(())
    }

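    /// Same matches as `test_phrase_query`, collected without scores.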
    #[test]
    pub fn test_phrase_query_no_score() -> crate::Result<()> {
        let index = create_index(&[
            "b b b d c g c",
            "a b b d c g c",
            "a b a b c",
            "c a b a d ga a",
            "a b c",
        ])?;
        let schema = index.schema();
        let text_field = schema.get_field("text").unwrap();
        let searcher = index.reader()?.searcher();
        let test_query = |texts: Vec<&str>| {
            let terms: Vec<Term> = texts
                .iter()
                .map(|text| Term::from_field_text(text_field, text))
                .collect();
            let phrase_query = PhraseQuery::new(terms);
            let test_fruits = searcher
                .search(&phrase_query, &TEST_COLLECTOR_WITHOUT_SCORE)
                .unwrap();
            test_fruits
                .docs()
                .iter()
                .map(|docaddr| docaddr.doc_id)
                .collect::<Vec<_>>()
        };
        assert_eq!(test_query(vec!["a", "b", "c"]), vec![2, 4]);
        assert_eq!(test_query(vec!["a", "b"]), vec![1, 2, 3, 4]);
        assert_eq!(test_query(vec!["b", "b"]), vec![0, 1]);
        assert!(test_query(vec!["g", "ewrwer"]).is_empty());
        assert!(test_query(vec!["g", "a"]).is_empty());
        Ok(())
    }

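    /// Phrase queries require positions: a field indexed with
    /// `IndexRecordOption::WithFreqs` (no positions) must produce a `SchemaError`.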
    #[test]
    pub fn test_phrase_query_no_positions() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
        let no_positions = TextOptions::default().set_indexing_options(
            TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
        );

        let text_field = schema_builder.add_text_field("text", no_positions);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        {
            let mut index_writer: IndexWriter = index.writer_for_tests()?;
            index_writer.add_document(doc!(text_field=>"a b c"))?;
            index_writer.commit()?;
        }
        let searcher = index.reader()?.searcher();
        let phrase_query = PhraseQuery::new(vec![
            Term::from_field_text(text_field, "a"),
            Term::from_field_text(text_field, "b"),
        ]);

        let search_error = searcher
            .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
            .err();
        assert!(matches!(
            search_error,
            Some(crate::TantivyError::SchemaError(msg))
            if msg == "Applied phrase query on field \"text\", which does not have positions \
                       indexed"
        ));
        Ok(())
    }

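    /// Scores of exact phrase matches; the expected values are pinned
    /// regression numbers.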
    #[test]
    pub fn test_phrase_score() -> crate::Result<()> {
        let index = create_index(&["a b c", "a b c a b"])?;
        let scores = test_query(0, &index, vec!["a", "b"]);
        assert_nearly_equals!(scores[0], 0.40618482);
        assert_nearly_equals!(scores[1], 0.46844664);
        Ok(())
    }

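    /// Scoring with a non-zero slop; the `*_with_slop_*` tests below are
    /// regression tests for slop matching and scoring edge cases.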
    #[ignore]
    #[test]
    pub fn test_phrase_score_with_slop() -> crate::Result<()> {
        let index = create_index(&["a c b", "a b c a b"])?;
        let scores = test_query(1, &index, vec!["a", "b"]);
        assert_nearly_equals!(scores[0], 0.40618482);
        assert_nearly_equals!(scores[1], 0.46844664);
        Ok(())
    }

    #[test]
    pub fn test_phrase_score_with_slop_bug() -> crate::Result<()> {
        let index = create_index(&["asdf asdf Captain Subject Wendy", "Captain"])?;
        let scores = test_query(1, &index, vec!["captain", "wendy"]);
        assert_eq!(scores.len(), 1);
        Ok(())
    }

    #[test]
    pub fn test_phrase_score_with_slop_bug_2() -> crate::Result<()> {
        // fails
        let index = create_index(&["a x b x c", "a a c"])?;
        let scores = test_query(2, &index, vec!["a", "b", "c"]);
        assert_eq!(scores.len(), 1);

        let index = create_index(&["a x b x c", "b c c"])?;
        let scores = test_query(2, &index, vec!["a", "b", "c"]);
        assert_eq!(scores.len(), 1);

        Ok(())
    }

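    /// Runs a `PhraseQuery` with the given `slop` over the "text" field of
    /// `index` and returns the scores of the matching documents.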
    fn test_query(slop: u32, index: &Index, texts: Vec<&str>) -> Vec<f32> {
        let text_field = index.schema().get_field("text").unwrap();
        let searcher = index.reader().unwrap().searcher();
        let terms: Vec<Term> = texts
            .iter()
            .map(|text| Term::from_field_text(text_field, text))
            .collect();
        let mut phrase_query = PhraseQuery::new(terms);
        phrase_query.set_slop(slop);
        searcher
            .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
            .expect("search should succeed")
            .scores()
            .to_vec()
    }

    #[test]
    pub fn test_phrase_score_with_slop_repeating() -> crate::Result<()> {
        let index = create_index(&["wendy subject subject captain", "Captain"])?;
        let scores = test_query(1, &index, vec!["wendy", "subject", "captain"]);
        assert_eq!(scores.len(), 1);
        Ok(())
    }

    #[test]
    pub fn test_phrase_score_with_slop_size() -> crate::Result<()> {
        let index = create_index(&["a b e c", "a e e e c", "a e e e e c"])?;
        let scores = test_query(3, &index, vec!["a", "c"]);
        assert_eq!(scores.len(), 2);
        assert_nearly_equals!(scores[0], 0.29086056);
        assert_nearly_equals!(scores[1], 0.26706287);
        Ok(())
    }

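    /// Slop is a budget over the whole phrase: "a x b c" matches ["a", "b", "c"]
    /// with slop 1, "a x b x c" does not, and matching the transposed
    /// ["b", "a"] against "a b" requires slop 2.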
    #[test]
    pub fn test_phrase_slop() -> crate::Result<()> {
        let index = create_index(&["a x b c"])?;
        let scores = test_query(1, &index, vec!["a", "b", "c"]);
        assert_eq!(scores.len(), 1);

        let index = create_index(&["a x b x c"])?;
        let scores = test_query(1, &index, vec!["a", "b", "c"]);
        assert_eq!(scores.len(), 0);

        let index = create_index(&["a b"])?;
        let scores = test_query(1, &index, vec!["b", "a"]);
        assert_eq!(scores.len(), 0);

        let index = create_index(&["a b"])?;
        let scores = test_query(2, &index, vec!["b", "a"]);
        assert_eq!(scores.len(), 1);

        Ok(())
    }

    #[test]
    pub fn test_phrase_score_with_slop_ordering() -> crate::Result<()> {
        let index = create_index(&[
            "a e b e c",
            "a e e e e e b e e e e c",
            "a c b", // also matches
            "a c e b e",
            "a e c b",
            "a e b c",
        ])?;
        let scores = test_query(3, &index, vec!["a", "b", "c"]);
        // The first and last matches.
        assert_nearly_equals!(scores[0], 0.23091172);
        assert_nearly_equals!(scores[1], 0.27310878);
        assert_nearly_equals!(scores[3], 0.25024384);
        Ok(())
    }

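    /// Regression test for #234: "a b" and "b a" are different phrases, and the
    /// result must not depend on which term has the higher document frequency.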
    #[test] // motivated by #234
    pub fn test_phrase_query_docfreq_order() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        {
            let mut index_writer: IndexWriter = index.writer_for_tests()?;
            index_writer.add_document(doc!(text_field=>"b"))?;
            index_writer.add_document(doc!(text_field=>"a b"))?;
            index_writer.add_document(doc!(text_field=>"b a"))?;
            index_writer.commit()?;
        }

        let searcher = index.reader()?.searcher();
        let test_query = |texts: Vec<&str>| {
            let terms: Vec<Term> = texts
                .iter()
                .map(|text| Term::from_field_text(text_field, text))
                .collect();
            let phrase_query = PhraseQuery::new(terms);
            searcher
                .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
                .expect("search should succeed")
                .docs()
                .to_vec()
        };
        assert_eq!(test_query(vec!["a", "b"]), vec![DocAddress::new(0, 1)]);
        assert_eq!(test_query(vec!["b", "a"]), vec![DocAddress::new(0, 2)]);
        Ok(())
    }

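    /// `PhraseQuery::new_with_offset` takes explicit `(offset, term)` pairs, so
    /// [(0, "a"), (2, "c")] means "a", any single token, then "c".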
    #[test] // motivated by #234
    pub fn test_phrase_query_non_trivial_offsets() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        {
            let mut index_writer: IndexWriter = index.writer_for_tests()?;
            index_writer.add_document(doc!(text_field=>"a b c d e f g h"))?;
            index_writer.commit()?;
        }
        let searcher = index.reader().unwrap().searcher();
        let test_query = |texts: Vec<(usize, &str)>| {
            let terms: Vec<(usize, Term)> = texts
                .iter()
                .map(|(offset, text)| (*offset, Term::from_field_text(text_field, text)))
                .collect();
            let phrase_query = PhraseQuery::new_with_offset(terms);
            searcher
                .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
                .expect("search should succeed")
                .docs()
                .iter()
                .map(|doc_address| doc_address.doc_id)
                .collect::<Vec<DocId>>()
        };
        assert_eq!(test_query(vec![(0, "a"), (1, "b")]), vec![0]);
        assert_eq!(test_query(vec![(1, "b"), (0, "a")]), vec![0]);
        assert!(test_query(vec![(0, "a"), (2, "b")]).is_empty());
        assert_eq!(test_query(vec![(0, "a"), (2, "c")]), vec![0]);
        assert_eq!(test_query(vec![(0, "a"), (2, "c"), (3, "d")]), vec![0]);
        assert_eq!(test_query(vec![(0, "a"), (2, "c"), (4, "e")]), vec![0]);
        assert_eq!(test_query(vec![(4, "e"), (0, "a"), (2, "c")]), vec![0]);
        assert!(test_query(vec![(0, "a"), (2, "d")]).is_empty());
        assert_eq!(test_query(vec![(1, "a"), (3, "c")]), vec![0]);
        Ok(())
    }

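    /// Phrase queries on a JSON field: the query parser scopes each term by its
    /// JSON path, e.g. `text:"elliot smith"` vs. `arr.text:"elliot smith"`.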
    #[test]
    pub fn test_phrase_query_on_json() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let json_field = schema_builder.add_json_field("json", TEXT);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        {
            let mut index_writer: IndexWriter = index.writer_for_tests()?;
            index_writer.add_document(doc!(json_field=>json!({
                "text": "elliot smith the happy who"
            })))?;
            index_writer.add_document(doc!(json_field=>json!({
                "text": "the who elliot smith"
            })))?;
            index_writer.add_document(doc!(json_field=>json!({
                "arr": [{"text":"the who"}, {"text":"elliot smith"}]
            })))?;
            index_writer.add_document(doc!(json_field=>json!({
                "text2": "the smith"
            })))?;
            index_writer.commit()?;
        }
        let searcher = index.reader()?.searcher();
        let matching_docs = |query: &str| {
            let query_parser = QueryParser::for_index(&index, vec![json_field]);
            let phrase_query = query_parser.parse_query(query).unwrap();
            let phrase_weight = phrase_query
                .weight(EnableScoring::disabled_from_schema(searcher.schema()))
                .unwrap();
            let mut phrase_scorer = phrase_weight
                .scorer(searcher.segment_reader(0), 1.0f32)
                .unwrap();
            let mut docs = Vec::new();
            loop {
                let doc = phrase_scorer.doc();
                if doc == TERMINATED {
                    break;
                }
                docs.push(doc);
                phrase_scorer.advance();
            }
            docs
        };
        assert!(matching_docs(r#"text:"the smith""#).is_empty());
        assert_eq!(&matching_docs(r#"text:the"#), &[0u32, 1u32]);
        assert_eq!(&matching_docs(r#"text:"the""#), &[0u32, 1u32]);
        assert_eq!(&matching_docs(r#"text:"smith""#), &[0u32, 1u32]);
        assert_eq!(&matching_docs(r#"text:"elliot smith""#), &[0u32, 1u32]);
        assert_eq!(&matching_docs(r#"text2:"the smith""#), &[3u32]);
        assert!(&matching_docs(r#"arr.text:"the smith""#).is_empty());
        assert_eq!(&matching_docs(r#"arr.text:"elliot smith""#), &[2]);
        Ok(())
    }
}