Explanation as a struct

This commit is contained in:
Paul Masurel
2016-08-06 00:46:50 +09:00
parent 140f6fd2b4
commit bc54db6872
10 changed files with 147 additions and 79 deletions

View File

@@ -1,4 +1,4 @@
query explain, complete, proper term names
pass over offset from previous block
error management
add merge policy
@@ -15,7 +15,6 @@ intersection
masks for union
lenient mode for query parser
WAND
query explain
rethink query iteration mechanics / API (should we setScorer, should
collector take different objects?)
Dig issue monoids idea

View File

@@ -42,7 +42,7 @@ struct Serp {
struct Hit {
title: String,
body: String,
explain: Option<String>,
explain: String,
score: Score,
}
@@ -75,12 +75,12 @@ impl IndexServer {
}
}
fn create_hit(&self, doc: &Document, score: Score, explain: Explanation) -> Hit {
fn create_hit(&self, doc: &Document, explain: Explanation) -> Hit {
Hit {
title: String::from(doc.get_first(self.title_field).unwrap().text()),
body: String::from(doc.get_first(self.body_field).unwrap().text().clone()),
explain: explain.to_string(),
score: score,
explain: format!("{:?}", explain),
score: explain.val(),
}
}
@@ -100,8 +100,8 @@ impl IndexServer {
.iter()
.map(|doc_address| {
let doc: Document = searcher.doc(doc_address).unwrap();
let (score, explanation): (Score, Explanation) = query.explain(&searcher, doc_address).unwrap().unwrap();
self.create_hit(&doc, score, explanation)
let explanation = query.explain(&searcher, doc_address).unwrap();
self.create_hit(&doc, explanation)
})
.collect();
Ok(Serp {

View File

@@ -41,6 +41,7 @@ mod compression;
mod fastfield;
mod store;
mod common;
mod error;
pub mod postings;
pub mod query;
@@ -48,9 +49,9 @@ pub mod directory;
pub mod datastruct;
pub mod analyzer;
pub mod collector;
pub mod schema;
pub use directory::Directory;
pub use core::searcher::Searcher;
pub use core::index::Index;

View File

@@ -2,8 +2,7 @@ use DocId;
use postings::{Postings, DocSet};
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use query::MultiTermScorer;
use postings::ScoredDocSet;
use query::MultiTermAccumulator;
use fastfield::U32FastFieldReader;
use std::iter;
@@ -22,18 +21,18 @@ impl Ord for HeapItem {
}
}
pub struct UnionPostings<TPostings: Postings, TScorer: MultiTermScorer> {
pub struct UnionPostings<TPostings: Postings, TAccumulator: MultiTermAccumulator> {
fieldnorms_readers: Vec<U32FastFieldReader>,
postings: Vec<TPostings>,
term_frequencies: Vec<u32>,
queue: BinaryHeap<HeapItem>,
doc: DocId,
scorer: TScorer
scorer: TAccumulator
}
impl<TPostings: Postings, TScorer: MultiTermScorer> UnionPostings<TPostings, TScorer> {
impl<TPostings: Postings, TAccumulator: MultiTermAccumulator> UnionPostings<TPostings, TAccumulator> {
pub fn new(fieldnorms_reader: Vec<U32FastFieldReader>, mut postings: Vec<TPostings>, scorer: TScorer) -> UnionPostings<TPostings, TScorer> {
pub fn new(fieldnorms_reader: Vec<U32FastFieldReader>, mut postings: Vec<TPostings>, scorer: TAccumulator) -> UnionPostings<TPostings, TAccumulator> {
let num_postings = postings.len();
assert_eq!(fieldnorms_reader.len(), num_postings);
for posting in &mut postings {
@@ -63,7 +62,7 @@ impl<TPostings: Postings, TScorer: MultiTermScorer> UnionPostings<TPostings, TSc
}
pub fn scorer(&self,) -> &TScorer {
pub fn scorer(&self,) -> &TAccumulator {
&self.scorer
}
@@ -86,7 +85,7 @@ impl<TPostings: Postings, TScorer: MultiTermScorer> UnionPostings<TPostings, TSc
}
impl<TPostings: Postings, TScorer: MultiTermScorer> DocSet for UnionPostings<TPostings, TScorer> {
impl<TPostings: Postings, TAccumulator: MultiTermAccumulator> DocSet for UnionPostings<TPostings, TAccumulator> {
fn next(&mut self,) -> bool {
self.scorer.clear();
@@ -134,19 +133,13 @@ impl<TPostings: Postings, TScorer: MultiTermScorer> DocSet for UnionPostings<TPo
}
}
impl<TPostings: Postings, TScorer: MultiTermScorer> ScoredDocSet for UnionPostings<TPostings, TScorer> {
fn score(&self,) -> f32 {
self.scorer.score()
}
}
#[cfg(test)]
mod tests {
use super::*;
use postings::{DocSet, VecPostings, ScoredDocSet};
use query::MultiTermScorer;
use postings::{DocSet, VecPostings};
use query::TfIdfScorer;
use query::Scorer;
use directory::ReadOnlySource;
use directory::SharedVec;
use schema::Field;
@@ -176,7 +169,7 @@ mod tests {
let right_fieldnorms = create_u32_fastfieldreader(Field(2), vec!(15,25,35));
let left = VecPostings::from(vec!(1, 2, 3));
let right = VecPostings::from(vec!(1, 3, 8));
let multi_term_scorer = TfIdfScorer::new(vec!(1f32, 2f32), vec!(1f32, 4f32));
let multi_term_scorer = TfIdfScorer::new(vec!(0f32, 1f32, 2f32), vec!(1f32, 4f32));
let mut union = UnionPostings::new(
vec!(left_fieldnorms, right_fieldnorms),
vec!(left, right),
@@ -184,14 +177,14 @@ mod tests {
);
assert!(union.next());
assert_eq!(union.doc(), 1);
assert!(abs_diff(union.score(), 2.182179f32) < 0.001);
assert!(abs_diff(union.scorer().score(), 2.182179f32) < 0.001);
assert!(union.next());
assert_eq!(union.doc(), 2);
assert!(abs_diff(union.score(), 0.2236068) < 0.001f32);
assert!(abs_diff(union.scorer().score(), 0.2236068) < 0.001f32);
assert!(union.next());
assert_eq!(union.doc(), 3);
assert!(union.next());
assert!(abs_diff(union.score(), 0.8944272f32) < 0.001f32);
assert!(abs_diff(union.scorer().score(), 0.8944272f32) < 0.001f32);
assert_eq!(union.doc(), 8);
assert!(!union.next());
}

View File

@@ -1,14 +1,65 @@
#[derive(RustcDecodable, Debug)]
pub enum Explanation {
NotImplementedYet,
Explanation(String),
use std::fmt;
use std::iter;
/// Tree-shaped account of how a value (typically a document score) was
/// computed.
///
/// Each node carries its own value plus optional free-form text, and owns a
/// list of named sub-explanations that `add_child` appends to.
#[derive(RustcDecodable)]
pub struct Explanation {
// Numeric value being explained; exposed through `val()`.
val: f32,
// Free-form text printed right after the value when the node is formatted.
description: String,
// Textual formula; only printed when it is not empty.
formula: String,
// Named sub-explanations, kept in insertion order.
children: Vec<(String, Explanation)>,
}
impl Explanation {
pub fn to_string(&self,) -> Option<String> {
match self {
&Explanation::Explanation(ref expl) => Some(expl.clone()),
&Explanation::NotImplementedYet => None
pub fn with_val(val: f32) -> Explanation {
Explanation {
val: val,
description: String::new(),
formula: String::new(),
children: Vec::new(),
}
}
pub fn val(&self,) -> f32 {
self.val
}
/// Replaces the description text of this node with `description`.
///
/// Despite the getter-like name this is a setter: the previous text is
/// discarded and the new one is copied in.
pub fn description(&mut self, description: &str) {
    let text = &mut self.description;
    // Truncating to zero length empties the buffer before the new text is appended.
    text.truncate(0);
    text.push_str(description);
}
/// Replaces the formula text of this node with `formula`.
///
/// An empty formula is simply not printed when the node is formatted.
pub fn set_formula(&mut self, formula: &str) {
    let target = &mut self.formula;
    target.truncate(0);
    target.push_str(formula);
}
/// Appends a new child explanation called `name` with value `val`.
///
/// Returns a mutable reference to the freshly created child so the caller
/// can keep filling it in (formula, description, grand-children).
pub fn add_child(&mut self, name: &str, val: f32) -> &mut Explanation {
    let entry = (String::from(name), Explanation::with_val(val));
    self.children.push(entry);
    // The vector cannot be empty right after a push, so `unwrap` never fires.
    &mut self.children.last_mut().unwrap().1
}
/// Writes this explanation, and recursively its children, to `f`.
///
/// Every line belonging to this node is prefixed with `indent` spaces:
/// first `<val>: <description>`, then `: <formula>` when a formula is set,
/// then one `- <child name>:` header per child. Each child is rendered two
/// columns deeper than its parent.
///
/// Returns the first formatting error reported by `f`, if any.
pub fn format_with_indent(&self, f: &mut fmt::Formatter, indent: usize) -> fmt::Result {
    let padding: String = iter::repeat(' ').take(indent).collect();
    try!(writeln!(f, "{}{}: {}", padding, self.val, self.description));
    if !self.formula.is_empty() {
        try!(writeln!(f, "{}: {}", padding, self.formula));
    }
    for &(ref child_name, ref child) in &self.children {
        // The header is padded like the rest of this node; without the
        // padding, headers of nested children would all start at column 0
        // while their bodies are indented.
        try!(writeln!(f, "{}- {}:", padding, child_name));
        try!(child.format_with_indent(f, indent + 2));
    }
    Ok(())
}
}
impl fmt::Debug for Explanation {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
self.format_with_indent(f, 0)
}
}

View File

@@ -12,4 +12,5 @@ pub use self::multi_term_scorer::TfIdfScorer;
pub use self::multi_term_scorer::MultiTermExplainScorer;
pub use self::scorer::Scorer;
pub use self::query_parser::QueryParser;
pub use self::explanation::Explanation;
pub use self::explanation::Explanation;
pub use self::multi_term_scorer::MultiTermAccumulator;

View File

@@ -1,6 +1,5 @@
use schema::Term;
use query::Query;
use query::Scorer;
use common::TimerTree;
use common::OpenTimer;
use std::io;
@@ -11,15 +10,14 @@ use core::SegmentReader;
use query::MultiTermExplainScorer;
use postings::SegmentPostings;
use postings::UnionPostings;
use postings::ScoredDocSet;
use postings::DocSet;
use query::TfIdfScorer;
use postings::SkipResult;
use fastfield::U32FastFieldReader;
use ScoredDoc;
use query::MultiTermScorer;
use query::Scorer;
use query::MultiTermAccumulator;
use DocAddress;
use Score;
use query::Explanation;
#[derive(Eq, PartialEq, Debug)]
@@ -32,7 +30,7 @@ impl Query for MultiTermQuery {
fn explain(
&self,
searcher: &Searcher,
doc_address: &DocAddress) -> Result<Option<(Score, Explanation)>, io::Error> {
doc_address: &DocAddress) -> Result<Explanation, String> {
let segment_reader = &searcher.segments()[doc_address.segment_ord() as usize];
let multi_term_scorer = MultiTermExplainScorer::from(self.scorer(searcher));
let mut timer_tree = TimerTree::new();
@@ -44,11 +42,13 @@ impl Query for MultiTermQuery {
SkipResult::Reached => {
let scorer = postings.scorer();
let explanation = scorer.explain_score();
let result = (scorer.score(), explanation);
Ok(Some(result))
Ok(explanation)
}
_ => Ok(None)
}
_ => {
// TODO return some kind of Error
panic!("could not compute explain");
}
}
}
fn search<C: Collector>(
@@ -73,7 +73,7 @@ impl Query for MultiTermQuery {
{
let _collection_timer = segment_search_timer.open("collection");
while postings.next() {
let scored_doc = ScoredDoc(postings.score(), postings.doc());
let scored_doc = ScoredDoc(postings.scorer().score(), postings.doc());
collector.collect(scored_doc);
}
}
@@ -86,7 +86,6 @@ impl Query for MultiTermQuery {
impl MultiTermQuery {
/// Returns how many terms this query holds.
pub fn num_terms(&self,) -> usize {
    let terms = &self.terms;
    terms.len()
}
@@ -107,7 +106,14 @@ impl MultiTermQuery {
let query_coords = (0..self.terms.len() + 1)
.map(|i| (i as f32) / (self.terms.len() as f32))
.collect();
TfIdfScorer::new(query_coords, idfs)
// TODO have the actual terms in these names
let term_names = self.terms
.iter()
.map(|term| format!("{:?}", term))
.collect();
let mut tfidf_scorer = TfIdfScorer::new(query_coords, idfs);
tfidf_scorer.set_term_names(term_names);
tfidf_scorer
}
pub fn new(terms: Vec<Term>) -> MultiTermQuery {
@@ -116,7 +122,7 @@ impl MultiTermQuery {
}
}
fn search_segment<'a, 'b, TScorer: MultiTermScorer>(&'b self, reader: &'b SegmentReader, multi_term_scorer: TScorer, mut timer: OpenTimer<'a>) -> UnionPostings<SegmentPostings, TScorer> {
fn search_segment<'a, 'b, TScorer: MultiTermAccumulator>(&'b self, reader: &'b SegmentReader, multi_term_scorer: TScorer, mut timer: OpenTimer<'a>) -> UnionPostings<SegmentPostings, TScorer> {
let mut segment_postings: Vec<SegmentPostings> = Vec::with_capacity(self.terms.len());
let mut fieldnorms_readers: Vec<U32FastFieldReader> = Vec::with_capacity(self.terms.len());
{

View File

@@ -1,10 +1,12 @@
use query::Scorer;
use query::Explanation;
use query::Explanation;
pub trait MultiTermScorer: Scorer {
pub trait MultiTermAccumulator {
fn update(&mut self, term_ord: usize, term_freq: u32, fieldnorm: u32);
fn clear(&mut self,);
}
/// A scorer that accumulates per-term contributions (`MultiTermAccumulator`),
/// exposes the resulting score (`Scorer`) and can additionally explain it.
pub trait MultiTermScorer: Scorer + MultiTermAccumulator {
/// Builds an `Explanation` from `vals`.
///
/// Each tuple is `(term_ord, term_freq, fieldnorm)`, i.e. the arguments of
/// one `update` call as recorded by `MultiTermExplainScorer`.
fn explain(&self, vals: &Vec<(usize, u32, u32)>) -> Explanation;
}
@@ -14,6 +16,7 @@ pub struct TfIdfScorer {
idf: Vec<f32>,
score: f32,
num_fields: usize,
term_names: Option<Vec<String>>, //< only here for explain
}
pub struct MultiTermExplainScorer<TScorer: MultiTermScorer + Sized> {
@@ -36,14 +39,7 @@ impl<TScorer: MultiTermScorer + Sized> From<TScorer> for MultiTermExplainScorer<
}
}
impl<TScorer: MultiTermScorer + Sized> Scorer for MultiTermExplainScorer<TScorer> {
fn score(&self,) -> f32 {
self.scorer.score()
}
}
impl<TScorer: MultiTermScorer + Sized> MultiTermScorer for MultiTermExplainScorer<TScorer> {
impl<TScorer: MultiTermScorer + Sized> MultiTermAccumulator for MultiTermExplainScorer<TScorer> {
fn update(&mut self, term_ord: usize, term_freq: u32, fieldnorm: u32) {
self.vals.push((term_ord, term_freq, fieldnorm));
self.scorer.update(term_ord, term_freq, fieldnorm);
@@ -52,24 +48,38 @@ impl<TScorer: MultiTermScorer + Sized> MultiTermScorer for MultiTermExplainScore
self.vals.clear();
self.scorer.clear();
}
fn explain(&self, vals: &Vec<(usize, u32, u32)>) -> Explanation {
self.scorer.explain(vals)
}
}
impl TfIdfScorer {
pub fn new(mut coords: Vec<f32>, idf: Vec<f32>) -> TfIdfScorer {
pub fn new(coords: Vec<f32>, idf: Vec<f32>) -> TfIdfScorer {
TfIdfScorer {
coords: coords,
idf: idf,
score: 0f32,
num_fields: 0,
term_names: None,
}
}
/// Returns the coordination factor for the current accumulation state.
///
/// `coords` is indexed by `num_fields`, which `update` increments on every
/// call. Panics if `coords` has no entry at that index.
fn coord(&self,) -> f32 {
    let matched = self.num_fields;
    self.coords[matched]
}
/// Registers human-readable names for the terms, indexed by term ordinal.
///
/// The names are only read back by `term_name`.
pub fn set_term_names(&mut self, term_names: Vec<String>) {
    let names = Some(term_names);
    self.term_names = names;
}
/// Returns the display name of the term with ordinal `ord`.
///
/// Falls back to `Field(<ord>)` when no names were registered. Panics if
/// names were registered but `ord` is out of range.
fn term_name(&self, ord: usize) -> String {
    if let Some(ref names) = self.term_names {
        names[ord].clone()
    } else {
        format!("Field({})", ord)
    }
}
/// Computes the contribution of a single term:
/// `sqrt(term_freq / field_norm) * idf[term_ord]`.
///
/// Panics if `term_ord` is not a valid index into `idf`.
fn term_score(&self, term_ord: usize, term_freq: u32, field_norm: u32) -> f32 {
    let freq_over_norm = term_freq as f32 / field_norm as f32;
    let dampened_tf = freq_over_norm.sqrt();
    let idf = self.idf[term_ord];
    dampened_tf * idf
}
}
impl Scorer for TfIdfScorer {
@@ -81,19 +91,28 @@ impl Scorer for TfIdfScorer {
impl MultiTermScorer for TfIdfScorer {
fn explain(&self, vals: &Vec<(usize, u32, u32)>) -> Explanation {
let mut explain = String::new();
let score = self.score();
let mut explanation = Explanation::with_val(score);
let formula_components: Vec<String> = vals.iter()
.map(|&(ord, _, _)| ord)
.map(|ord| format!("<score for ({}>", self.term_name(ord)))
.collect();
let formula = format!("<coord> * ({})", formula_components.join(" + "));
explanation.set_formula(&formula);
for &(ord, term_freq, field_norm) in vals.iter() {
explain += &format!("{} {} {}.\n", ord, term_freq, field_norm);
let term_score = self.term_score(ord, term_freq, field_norm);
let term_explanation = explanation.add_child(&self.term_name(ord), term_score);
term_explanation.set_formula(" sqrt(<term_freq> / <field_norm>) * <idf>");
}
let count = vals.len();
explain += &format!("coord({}) := {}", count, self.coords[count]);
Explanation::Explanation(explain)
explanation
}
}
impl MultiTermAccumulator for TfIdfScorer {
fn update(&mut self, term_ord: usize, term_freq: u32, fieldnorm: u32) {
assert!(term_freq != 0u32);
self.score += (term_freq as f32 / fieldnorm as f32).sqrt() * self.idf[term_ord];
self.score += self.term_score(term_ord, term_freq, fieldnorm);
self.num_fields += 1;
}
@@ -101,7 +120,6 @@ impl MultiTermScorer for TfIdfScorer {
self.score = 0f32;
self.num_fields = 0;
}
}
@@ -118,7 +136,7 @@ mod tests {
#[test]
pub fn test_multiterm_scorer() {
let mut tfidf_scorer = TfIdfScorer::new(vec!(1f32, 2f32), vec!(1f32, 4f32));
let mut tfidf_scorer = TfIdfScorer::new(vec!(0f32, 1f32, 2f32), vec!(1f32, 4f32));
{
tfidf_scorer.update(0, 1, 1);
assert!(abs_diff(tfidf_scorer.score(), 1f32) < 0.001f32);

View File

@@ -4,7 +4,6 @@ use core::searcher::Searcher;
use common::TimerTree;
use DocAddress;
use query::Explanation;
use Score;
pub trait Query {
@@ -16,7 +15,7 @@ pub trait Query {
fn explain(
&self,
searcher: &Searcher,
doc_address: &DocAddress) -> Result<Option<(Score, Explanation)>, io::Error> {
doc_address: &DocAddress) -> Result<Explanation, String> {
// TODO check that the document is there or return an error.
panic!("Not implemented");
}

View File

@@ -9,7 +9,6 @@ use schema::{Term, Field};
use analyzer::SimpleTokenizer;
use analyzer::StreamingIterator;
use DocAddress;
use Score;
use query::Explanation;
#[derive(Debug)]
@@ -50,13 +49,14 @@ impl Query for StandardQuery {
fn explain(
&self,
searcher: &Searcher,
doc_address: &DocAddress) -> Result<Option<(Score, Explanation)>, io::Error> {
doc_address: &DocAddress) -> Result<Explanation, String> {
match self {
&StandardQuery::MultiTerm(ref q) => q.explain(searcher, doc_address)
}
}
}
fn compute_terms(field: Field, text: &str) -> Vec<Term> {
let tokenizer = SimpleTokenizer::new();
let mut tokens = Vec::new();