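//! BooleanQuery combines sub-queries with `Occur::Must`, `Occur::Should`
//! and `Occur::MustNot` semantics.
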
mod boolean_query;
mod boolean_weight;

pub use self::boolean_query::BooleanQuery;

#[cfg(test)]
mod tests {

    use super::*;
    use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
    use crate::query::score_combiner::SumWithCoordsCombiner;
    use crate::query::term_query::TermScorer;
    use crate::query::Intersection;
    use crate::query::Occur;
    use crate::query::Query;
    use crate::query::QueryParser;
    use crate::query::RequiredOptionalScorer;
    use crate::query::Scorer;
    use crate::query::TermQuery;
    use crate::schema::*;
    use crate::Index;
    use crate::{DocAddress, DocId};

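    /// Builds a small index in RAM with five single-field documents:
    /// "a b c", "a c", "b c", "a b c d" and "d". All the tests below
    /// query this corpus.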
    fn aux_test_helper() -> (Index, Field) {
        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        {
            // writing the segment
            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
            {
                let doc = doc!(text_field => "a b c");
                index_writer.add_document(doc);
            }
            {
                let doc = doc!(text_field => "a c");
                index_writer.add_document(doc);
            }
            {
                let doc = doc!(text_field => "b c");
                index_writer.add_document(doc);
            }
            {
                let doc = doc!(text_field => "a b c d");
                index_writer.add_document(doc);
            }
            {
                let doc = doc!(text_field => "d");
                index_writer.add_document(doc);
            }
            assert!(index_writer.commit().is_ok());
        }
        (index, text_field)
    }
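
    // "(+a +b) d" is a disjunction between the conjunction (a AND b) and the
    // bare term d: it matches docs 0, 3 and 4 of the corpus, hence a count of 3.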
    #[test]
    pub fn test_boolean_non_all_term_disjunction() {
        let (index, text_field) = aux_test_helper();
        let query_parser = QueryParser::for_index(&index, vec![text_field]);
        let query = query_parser.parse_query("(+a +b) d").unwrap();
        let searcher = index.reader().unwrap().searcher();
        assert_eq!(query.count(&searcher).unwrap(), 3);
    }
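
    // A boolean query reduced to a single Must term clause should be handled
    // by a plain TermScorer rather than a boolean combination scorer.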
    #[test]
    pub fn test_boolean_single_must_clause() {
        let (index, text_field) = aux_test_helper();
        let query_parser = QueryParser::for_index(&index, vec![text_field]);
        let query = query_parser.parse_query("+a").unwrap();
        let searcher = index.reader().unwrap().searcher();
        let weight = query.weight(&searcher, true).unwrap();
        let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
        assert!(scorer.is::<TermScorer>());
    }
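
    // A pure conjunction of term clauses gets the specialized
    // Intersection<TermScorer>; once a clause is itself a nested boolean
    // query, the scorers have to be boxed.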
    #[test]
    pub fn test_boolean_termonly_intersection() {
        let (index, text_field) = aux_test_helper();
        let query_parser = QueryParser::for_index(&index, vec![text_field]);
        let searcher = index.reader().unwrap().searcher();
        {
            let query = query_parser.parse_query("+a +b +c").unwrap();
            let weight = query.weight(&searcher, true).unwrap();
            let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
            assert!(scorer.is::<Intersection<TermScorer>>());
        }
        {
            let query = query_parser.parse_query("+a +(b c)").unwrap();
            let weight = query.weight(&searcher, true).unwrap();
            let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
            assert!(scorer.is::<Intersection<Box<dyn Scorer>>>());
        }
    }
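
    // "+a b" has one required and one optional clause. With scoring enabled,
    // the optional clause still contributes to the score, so a
    // RequiredOptionalScorer is needed; with scoring disabled, matching only
    // depends on the required clause and a plain TermScorer suffices.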
    #[test]
    pub fn test_boolean_reqopt() {
        let (index, text_field) = aux_test_helper();
        let query_parser = QueryParser::for_index(&index, vec![text_field]);
        let searcher = index.reader().unwrap().searcher();
        {
            let query = query_parser.parse_query("+a b").unwrap();
            let weight = query.weight(&searcher, true).unwrap();
            let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
            assert!(scorer.is::<RequiredOptionalScorer<
                Box<dyn Scorer>,
                Box<dyn Scorer>,
                SumWithCoordsCombiner,
            >>());
        }
        {
            let query = query_parser.parse_query("+a b").unwrap();
            let weight = query.weight(&searcher, false).unwrap();
            let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
            assert!(scorer.is::<TermScorer>());
        }
    }
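
    // Builds BooleanQuery values directly from (Occur, query) pairs and
    // checks the matching doc ids for each combination of Must, Should and
    // MustNot clauses.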
    #[test]
    pub fn test_boolean_query() {
        let (index, text_field) = aux_test_helper();

        let make_term_query = |text: &str| {
            let term_query = TermQuery::new(
                Term::from_field_text(text_field, text),
                IndexRecordOption::Basic,
            );
            let query: Box<dyn Query> = Box::new(term_query);
            query
        };

        let reader = index.reader().unwrap();

        let matching_docs = |boolean_query: &dyn Query| {
            reader
                .searcher()
                .search(boolean_query, &TEST_COLLECTOR_WITH_SCORE)
                .unwrap()
                .docs()
                .iter()
                .cloned()
                .map(|doc| doc.1)
                .collect::<Vec<DocId>>()
        };
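
        // Reminder: doc 0 is "a b c", doc 1 "a c", doc 2 "b c",
        // doc 3 "a b c d" and doc 4 "d".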
        {
            let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a"))]);
            assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
        }
        {
            let boolean_query = BooleanQuery::from(vec![(Occur::Should, make_term_query("a"))]);
            assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
        }
        {
            let boolean_query = BooleanQuery::from(vec![
                (Occur::Should, make_term_query("a")),
                (Occur::Should, make_term_query("b")),
            ]);
            assert_eq!(matching_docs(&boolean_query), vec![0, 1, 2, 3]);
        }
        {
            let boolean_query = BooleanQuery::from(vec![
                (Occur::Must, make_term_query("a")),
                (Occur::Should, make_term_query("b")),
            ]);
            assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
        }
        {
            let boolean_query = BooleanQuery::from(vec![
                (Occur::Must, make_term_query("a")),
                (Occur::Should, make_term_query("b")),
                (Occur::MustNot, make_term_query("d")),
            ]);
            assert_eq!(matching_docs(&boolean_query), vec![0, 1]);
        }
        {
            let boolean_query = BooleanQuery::from(vec![(Occur::MustNot, make_term_query("d"))]);
            assert_eq!(matching_docs(&boolean_query), Vec::<u32>::new());
        }
    }
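
    // The two Must clauses intersect down to docs 0 and 3; the expected
    // values are the BM25 scores tantivy computes for those two hits.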
    #[test]
    pub fn test_intersection_score() {
        let (index, text_field) = aux_test_helper();

        let make_term_query = |text: &str| {
            let term_query = TermQuery::new(
                Term::from_field_text(text_field, text),
                IndexRecordOption::Basic,
            );
            let query: Box<dyn Query> = Box::new(term_query);
            query
        };
        let reader = index.reader().unwrap();
        let score_docs = |boolean_query: &dyn Query| {
            let fruit = reader
                .searcher()
                .search(boolean_query, &TEST_COLLECTOR_WITH_SCORE)
                .unwrap();
            fruit.scores().to_vec()
        };

        {
            let boolean_query = BooleanQuery::from(vec![
                (Occur::Must, make_term_query("a")),
                (Occur::Must, make_term_query("b")),
            ]);
            assert_eq!(score_docs(&boolean_query), vec![0.977973, 0.84699446]);
        }
    }

    // motivated by #554
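    // BM25 keeps separate statistics per field (idf, field length, average
    // field length), so the same query terms score differently in "title"
    // and "text". The `tf = x y` comments below give the frequencies of the
    // query terms "Оксана" and "Лифенко" in each field; the pinned
    // explanation JSON checks the per-field values for the best match.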
    #[test]
    fn test_bm25_several_fields() {
        let mut schema_builder = Schema::builder();
        let title = schema_builder.add_text_field("title", TEXT);
        let text = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);

        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
        index_writer.add_document(doc!(
            // tf = 1 0
            title => "Законы притяжения Оксана Кулакова",
            // tf = 1 0
            text => "Законы притяжения Оксана Кулакова] \n\nТема: Сексуальное искусство, Женственность\nТип товара: Запись вебинара (аудио)\nПродолжительность: 1,5 часа\n\nСсылка на вебинар:\n ",
        ));
        index_writer.add_document(doc!(
            // tf = 1 0
            title => "Любимые русские пироги (Оксана Путан)",
            // tf = 2 0
            text => "http://i95.fastpic.ru/big/2017/0628/9a/615b9c8504d94a3893d7f496ac53539a.jpg \n\nОт издателя\nОксана Путан профессиональный повар, автор кулинарных книг и известный кулинарный блогер. Ее рецепты отличаются практичностью, доступностью и пользуются огромной популярностью в русскоязычном интернете. Это третья книга автора о самом вкусном и ароматном настоящих русских пирогах и выпечке!\nДаже новички на кухне легко готовят по ее рецептам. Оксана описывает процесс приготовления настолько подробно и понятно, что вам остается только наслаждаться готовкой и не тратить время на лишние усилия. Готовьте легко и просто!\n\nhttps://www.ozon.ru/context/detail/id/139872462/"
        ));
        index_writer.add_document(doc!(
            // tf = 1 1
            title => "PDF Мастер Класс \"Морячок\" (Оксана Лифенко)",
            // tf = 0 0
            text => "https://i.ibb.co/pzvHrDN/I3d U T6 Gg TM.jpg\nhttps://i.ibb.co/NFrb6v6/N0ls Z9nwjb U.jpg\nВ описание входит штаны, кофта, берет, матросский воротник. Описание продается в формате PDF, состоит из 12 страниц формата А4 и может быть напечатано на любом принтере.\nОписание предназначено для кукол BJD RealPuki от FairyLand, но может подойти и другим подобным куклам. Также вы можете вязать этот наряд из обычной пряжи, и он подойдет для куколок побольше.\nhttps://vk.com/market 95724412?w=product 95724412_2212"
        ));
        for _ in 0..1_000 {
            index_writer.add_document(doc!(
                title => "a b d e f g",
                text => "maitre corbeau sur un arbre perche tenait dans son bec un fromage Maitre rnard par lodeur alleche lui tint a peu pres ce langage."
            ));
        }
        index_writer.commit().unwrap();
        let reader = index.reader().unwrap();
        let searcher = reader.searcher();
        let query_parser = QueryParser::for_index(&index, vec![title, text]);
        let query = query_parser.parse_query("Оксана Лифенко").unwrap();
        let weight = query.weight(&searcher, true).unwrap();
        let mut scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
        scorer.advance();

        let explanation = query.explain(&searcher, DocAddress(0u32, 0u32)).unwrap();
        assert_eq!(
            explanation.to_pretty_json(),
            r#"{
  "value": 12.997711,
  "description": "BooleanClause. Sum of ...",
  "details": [
    {
      "value": 12.997711,
      "description": "BooleanClause. Sum of ...",
      "details": [
        {
          "value": 6.551476,
          "description": "TermQuery, product of...",
          "details": [
            {
              "value": 2.2,
              "description": "(K1+1)"
            },
            {
              "value": 5.658984,
              "description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))",
              "details": [
                {
                  "value": 3.0,
                  "description": "n, number of docs containing this term"
                },
                {
                  "value": 1003.0,
                  "description": "N, total number of docs"
                }
              ]
            },
            {
              "value": 0.5262329,
              "description": "freq / (freq + k1 * (1 - b + b * dl / avgdl))",
              "details": [
                {
                  "value": 1.0,
                  "description": "freq, occurrences of term within document"
                },
                {
                  "value": 1.2,
                  "description": "k1, term saturation parameter"
                },
                {
                  "value": 0.75,
                  "description": "b, length normalization parameter"
                },
                {
                  "value": 4.0,
                  "description": "dl, length of field"
                },
                {
                  "value": 5.997009,
                  "description": "avgdl, average length of field"
                }
              ]
            }
          ]
        },
        {
          "value": 6.446235,
          "description": "TermQuery, product of...",
          "details": [
            {
              "value": 2.2,
              "description": "(K1+1)"
            },
            {
              "value": 5.9954567,
              "description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))",
              "details": [
                {
                  "value": 2.0,
                  "description": "n, number of docs containing this term"
                },
                {
                  "value": 1003.0,
                  "description": "N, total number of docs"
                }
              ]
            },
            {
              "value": 0.4887212,
              "description": "freq / (freq + k1 * (1 - b + b * dl / avgdl))",
              "details": [
                {
                  "value": 1.0,
                  "description": "freq, occurrences of term within document"
                },
                {
                  "value": 1.2,
                  "description": "k1, term saturation parameter"
                },
                {
                  "value": 0.75,
                  "description": "b, length normalization parameter"
                },
                {
                  "value": 20.0,
                  "description": "dl, length of field"
                },
                {
                  "value": 24.123629,
                  "description": "avgdl, average length of field"
                }
              ]
            }
          ]
        }
      ]
    }
  ]
}"#
        );
    }
}