diff --git a/TODO.md b/TODO.md index 80168c5ec..2c8a3b620 100644 --- a/TODO.md +++ b/TODO.md @@ -1,4 +1,4 @@ - +query explain, complete, proper term names pass over offset from previous block error management add merge policy @@ -15,7 +15,6 @@ intersection masks for union lenient mode for query parser WAND -query explain rethink query iteration mechanics / API (should we setScorer, should collector take different objects?) Dig issue monoids idea diff --git a/src/cli/serve.rs b/src/cli/serve.rs index a82ecbf45..d1789c1b3 100644 --- a/src/cli/serve.rs +++ b/src/cli/serve.rs @@ -42,7 +42,7 @@ struct Serp { struct Hit { title: String, body: String, - explain: Option, + explain: String, score: Score, } @@ -75,12 +75,12 @@ impl IndexServer { } } - fn create_hit(&self, doc: &Document, score: Score, explain: Explanation) -> Hit { + fn create_hit(&self, doc: &Document, explain: Explanation) -> Hit { Hit { title: String::from(doc.get_first(self.title_field).unwrap().text()), body: String::from(doc.get_first(self.body_field).unwrap().text().clone()), - explain: explain.to_string(), - score: score, + explain: format!("{:?}", explain), + score: explain.val(), } } @@ -100,8 +100,8 @@ impl IndexServer { .iter() .map(|doc_address| { let doc: Document = searcher.doc(doc_address).unwrap(); - let (score, explanation): (Score, Explanation) = query.explain(&searcher, doc_address).unwrap().unwrap(); - self.create_hit(&doc, score, explanation) + let explanation = query.explain(&searcher, doc_address).unwrap(); + self.create_hit(&doc, explanation) }) .collect(); Ok(Serp { diff --git a/src/lib.rs b/src/lib.rs index c51b80abf..49b2ddb90 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -41,6 +41,7 @@ mod compression; mod fastfield; mod store; mod common; +mod error; pub mod postings; pub mod query; @@ -48,9 +49,9 @@ pub mod directory; pub mod datastruct; pub mod analyzer; pub mod collector; - pub mod schema; + pub use directory::Directory; pub use core::searcher::Searcher; pub use core::index::Index; diff --git a/src/postings/union_postings.rs b/src/postings/union_postings.rs index 04f98402b..86b2b865f 100644 --- a/src/postings/union_postings.rs +++ b/src/postings/union_postings.rs @@ -2,8 +2,7 @@ use DocId; use postings::{Postings, DocSet}; use std::cmp::Ordering; use std::collections::BinaryHeap; -use query::MultiTermScorer; -use postings::ScoredDocSet; +use query::MultiTermAccumulator; use fastfield::U32FastFieldReader; use std::iter; @@ -22,18 +21,18 @@ impl Ord for HeapItem { } } -pub struct UnionPostings { +pub struct UnionPostings { fieldnorms_readers: Vec, postings: Vec, term_frequencies: Vec, queue: BinaryHeap, doc: DocId, - scorer: TScorer + scorer: TAccumulator } -impl UnionPostings { +impl UnionPostings { - pub fn new(fieldnorms_reader: Vec, mut postings: Vec, scorer: TScorer) -> UnionPostings { + pub fn new(fieldnorms_reader: Vec, mut postings: Vec, scorer: TAccumulator) -> UnionPostings { let num_postings = postings.len(); assert_eq!(fieldnorms_reader.len(), num_postings); for posting in &mut postings { @@ -63,7 +62,7 @@ impl UnionPostings &TScorer { + pub fn scorer(&self,) -> &TAccumulator { &self.scorer } @@ -86,7 +85,7 @@ impl UnionPostings DocSet for UnionPostings { +impl DocSet for UnionPostings { fn next(&mut self,) -> bool { self.scorer.clear(); @@ -134,19 +133,13 @@ impl DocSet for UnionPostings ScoredDocSet for UnionPostings { - fn score(&self,) -> f32 { - self.scorer.score() - } -} - #[cfg(test)] mod tests { use super::*; - use postings::{DocSet, VecPostings, ScoredDocSet}; - use query::MultiTermScorer; + use postings::{DocSet, VecPostings}; use query::TfIdfScorer; + use query::Scorer; use directory::ReadOnlySource; use directory::SharedVec; use schema::Field; @@ -176,7 +169,7 @@ mod tests { let right_fieldnorms = create_u32_fastfieldreader(Field(2), vec!(15,25,35)); let left = VecPostings::from(vec!(1, 2, 3)); let right = VecPostings::from(vec!(1, 3, 8)); - let multi_term_scorer = TfIdfScorer::new(vec!(1f32, 2f32), vec!(1f32, 4f32)); + let multi_term_scorer = TfIdfScorer::new(vec!(0f32, 1f32, 2f32), vec!(1f32, 4f32)); let mut union = UnionPostings::new( vec!(left_fieldnorms, right_fieldnorms), vec!(left, right), @@ -184,14 +177,14 @@ mod tests { ); assert!(union.next()); assert_eq!(union.doc(), 1); - assert!(abs_diff(union.score(), 2.182179f32) < 0.001); + assert!(abs_diff(union.scorer().score(), 2.182179f32) < 0.001); assert!(union.next()); assert_eq!(union.doc(), 2); - assert!(abs_diff(union.score(), 0.2236068) < 0.001f32); + assert!(abs_diff(union.scorer().score(), 0.2236068) < 0.001f32); assert!(union.next()); assert_eq!(union.doc(), 3); assert!(union.next()); - assert!(abs_diff(union.score(), 0.8944272f32) < 0.001f32); + assert!(abs_diff(union.scorer().score(), 0.8944272f32) < 0.001f32); assert_eq!(union.doc(), 8); assert!(!union.next()); } diff --git a/src/query/explanation.rs b/src/query/explanation.rs index 63d3bd90c..4ebe86df5 100644 --- a/src/query/explanation.rs +++ b/src/query/explanation.rs @@ -1,14 +1,65 @@ -#[derive(RustcDecodable, Debug)] -pub enum Explanation { - NotImplementedYet, - Explanation(String), +use std::fmt; +use std::iter; + +#[derive(RustcDecodable)] +pub struct Explanation { + val: f32, + description: String, + formula: String, + children: Vec<(String, Explanation)>, } + impl Explanation { - pub fn to_string(&self,) -> Option { - match self { - &Explanation::Explanation(ref expl) => Some(expl.clone()), - &Explanation::NotImplementedYet => None + + pub fn with_val(val: f32) -> Explanation { + Explanation { + val: val, + description: String::new(), + formula: String::new(), + children: Vec::new(), } } + + pub fn val(&self,) -> f32 { + self.val + } + + + pub fn description(&mut self, description: &str) { + self.description.clear(); + self.description.push_str(description); + } + + pub fn set_formula(&mut self, formula: &str) { + self.formula.clear(); + self.formula.push_str(formula); + } + + pub fn add_child(&mut self, name: &str, val: f32) -> &mut Explanation { + let explanation = Explanation::with_val(val); + let name = String::from(name); + self.children.push((name, explanation)); + let &mut (_, ref mut child_experience) = self.children.last_mut().unwrap(); + child_experience + } + + pub fn format_with_indent(&self, f: &mut fmt::Formatter, indent: usize) -> fmt::Result { + let padding: String = iter::repeat(' ').take(indent).collect(); + try!(write!(f, "{}{}: {}\n", padding, self.val, self.description)); + if !self.formula.is_empty() { + try!(write!(f, "{}: {}\n", padding, self.formula)); + } + for &(ref child_name, ref child) in &self.children { + try!(write!(f, "- {}:\n", child_name)); + try!(child.format_with_indent(f, indent + 2)); + } + Ok(()) + } +} + +impl fmt::Debug for Explanation { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.format_with_indent(f, 0) + } } \ No newline at end of file diff --git a/src/query/mod.rs b/src/query/mod.rs index e121e256d..075db6b27 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -12,4 +12,5 @@ pub use self::multi_term_scorer::TfIdfScorer; pub use self::multi_term_scorer::MultiTermExplainScorer; pub use self::scorer::Scorer; pub use self::query_parser::QueryParser; -pub use self::explanation::Explanation; \ No newline at end of file +pub use self::explanation::Explanation; +pub use self::multi_term_scorer::MultiTermAccumulator; \ No newline at end of file diff --git a/src/query/multi_term_query.rs b/src/query/multi_term_query.rs index 2a8fe4be9..d03c3e313 100644 --- a/src/query/multi_term_query.rs +++ b/src/query/multi_term_query.rs @@ -1,6 +1,5 @@ use schema::Term; use query::Query; -use query::Scorer; use common::TimerTree; use common::OpenTimer; use std::io; @@ -11,15 +10,14 @@ use core::SegmentReader; use query::MultiTermExplainScorer; use postings::SegmentPostings; use postings::UnionPostings; -use postings::ScoredDocSet; use postings::DocSet; use query::TfIdfScorer; use postings::SkipResult; use fastfield::U32FastFieldReader; use ScoredDoc; -use query::MultiTermScorer; +use query::Scorer; +use query::MultiTermAccumulator; use DocAddress; -use Score; use query::Explanation; #[derive(Eq, PartialEq, Debug)] @@ -32,7 +30,7 @@ impl Query for MultiTermQuery { fn explain( &self, searcher: &Searcher, - doc_address: &DocAddress) -> Result, io::Error> { + doc_address: &DocAddress) -> Result { let segment_reader = &searcher.segments()[doc_address.segment_ord() as usize]; let multi_term_scorer = MultiTermExplainScorer::from(self.scorer(searcher)); let mut timer_tree = TimerTree::new(); @@ -44,11 +42,13 @@ impl Query for MultiTermQuery { SkipResult::Reached => { let scorer = postings.scorer(); let explanation = scorer.explain_score(); - let result = (scorer.score(), explanation); - Ok(Some(result)) + Ok(explanation) } - _ => Ok(None) - } + _ => { + // TODO return some kind of Error + panic!("could not compute explain"); + } + } } fn search( @@ -73,7 +73,7 @@ impl Query for MultiTermQuery { { let _collection_timer = segment_search_timer.open("collection"); while postings.next() { - let scored_doc = ScoredDoc(postings.score(), postings.doc()); + let scored_doc = ScoredDoc(postings.scorer().score(), postings.doc()); collector.collect(scored_doc); } } @@ -86,7 +86,6 @@ impl Query for MultiTermQuery { impl MultiTermQuery { - pub fn num_terms(&self,) -> usize { self.terms.len() } @@ -107,7 +106,14 @@ impl MultiTermQuery { let query_coords = (0..self.terms.len() + 1) .map(|i| (i as f32) / (self.terms.len() as f32)) .collect(); - TfIdfScorer::new(query_coords, idfs) + // TODO have the actual terms in these names + let term_names = self.terms + .iter() + .map(|term| format!("{:?}", term)) + .collect(); + let mut tfidf_scorer = TfIdfScorer::new(query_coords, idfs); + tfidf_scorer.set_term_names(term_names); + tfidf_scorer } pub fn new(terms: Vec) -> MultiTermQuery { @@ -116,7 +122,7 @@ impl MultiTermQuery { } } - fn search_segment<'a, 'b, TScorer: MultiTermScorer>(&'b self, reader: &'b SegmentReader, multi_term_scorer: TScorer, mut timer: OpenTimer<'a>) -> UnionPostings { + fn search_segment<'a, 'b, TScorer: MultiTermAccumulator>(&'b self, reader: &'b SegmentReader, multi_term_scorer: TScorer, mut timer: OpenTimer<'a>) -> UnionPostings { let mut segment_postings: Vec = Vec::with_capacity(self.terms.len()); let mut fieldnorms_readers: Vec = Vec::with_capacity(self.terms.len()); { diff --git a/src/query/multi_term_scorer.rs b/src/query/multi_term_scorer.rs index ae0993bb1..ca52e3e3d 100644 --- a/src/query/multi_term_scorer.rs +++ b/src/query/multi_term_scorer.rs @@ -1,10 +1,12 @@ use query::Scorer; -use query::Explanation; +use query::Explanation; - -pub trait MultiTermScorer: Scorer { +pub trait MultiTermAccumulator { fn update(&mut self, term_ord: usize, term_freq: u32, fieldnorm: u32); fn clear(&mut self,); +} + +pub trait MultiTermScorer: Scorer + MultiTermAccumulator { fn explain(&self, vals: &Vec<(usize, u32, u32)>) -> Explanation; } @@ -14,6 +16,7 @@ pub struct TfIdfScorer { idf: Vec, score: f32, num_fields: usize, + term_names: Option>, //< only here for explain } pub struct MultiTermExplainScorer { @@ -36,14 +39,7 @@ impl From for MultiTermExplainScorer< } } -impl Scorer for MultiTermExplainScorer { - fn score(&self,) -> f32 { - self.scorer.score() - } -} - - -impl MultiTermScorer for MultiTermExplainScorer { +impl MultiTermAccumulator for MultiTermExplainScorer { fn update(&mut self, term_ord: usize, term_freq: u32, fieldnorm: u32) { self.vals.push((term_ord, term_freq, fieldnorm)); self.scorer.update(term_ord, term_freq, fieldnorm); @@ -52,24 +48,38 @@ impl MultiTermScorer for MultiTermExplainScore self.vals.clear(); self.scorer.clear(); } - fn explain(&self, vals: &Vec<(usize, u32, u32)>) -> Explanation { - self.scorer.explain(vals) - } } impl TfIdfScorer { - pub fn new(mut coords: Vec, idf: Vec) -> TfIdfScorer { + pub fn new(coords: Vec, idf: Vec) -> TfIdfScorer { TfIdfScorer { coords: coords, idf: idf, score: 0f32, num_fields: 0, + term_names: None, } } fn coord(&self,) -> f32 { self.coords[self.num_fields] } + + pub fn set_term_names(&mut self, term_names: Vec) { + self.term_names = Some(term_names); + } + + fn term_name(&self, ord: usize) -> String { + match &self.term_names { + &Some(ref term_names_vec) => term_names_vec[ord].clone(), + &None => format!("Field({})", ord) + } + + } + + fn term_score(&self, term_ord: usize, term_freq: u32, field_norm: u32) -> f32 { + (term_freq as f32 / field_norm as f32).sqrt() * self.idf[term_ord] + } } impl Scorer for TfIdfScorer { @@ -81,19 +91,28 @@ impl Scorer for TfIdfScorer { impl MultiTermScorer for TfIdfScorer { fn explain(&self, vals: &Vec<(usize, u32, u32)>) -> Explanation { - let mut explain = String::new(); + let score = self.score(); + let mut explanation = Explanation::with_val(score); + let formula_components: Vec = vals.iter() + .map(|&(ord, _, _)| ord) + .map(|ord| format!("", self.term_name(ord))) + .collect(); + let formula = format!(" * ({})", formula_components.join(" + ")); + explanation.set_formula(&formula); for &(ord, term_freq, field_norm) in vals.iter() { - explain += &format!("{} {} {}.\n", ord, term_freq, field_norm); + let term_score = self.term_score(ord, term_freq, field_norm); + let term_explanation = explanation.add_child(&self.term_name(ord), term_score); + term_explanation.set_formula(" sqrt( / ) * "); } - let count = vals.len(); - explain += &format!("coord({}) := {}", count, self.coords[count]); - Explanation::Explanation(explain) - + explanation } +} +impl MultiTermAccumulator for TfIdfScorer { + fn update(&mut self, term_ord: usize, term_freq: u32, fieldnorm: u32) { assert!(term_freq != 0u32); - self.score += (term_freq as f32 / fieldnorm as f32).sqrt() * self.idf[term_ord]; + self.score += self.term_score(term_ord, term_freq, fieldnorm); self.num_fields += 1; } @@ -101,7 +120,6 @@ impl MultiTermScorer for TfIdfScorer { self.score = 0f32; self.num_fields = 0; } - } @@ -118,7 +136,7 @@ mod tests { #[test] pub fn test_multiterm_scorer() { - let mut tfidf_scorer = TfIdfScorer::new(vec!(1f32, 2f32), vec!(1f32, 4f32)); + let mut tfidf_scorer = TfIdfScorer::new(vec!(0f32, 1f32, 2f32), vec!(1f32, 4f32)); { tfidf_scorer.update(0, 1, 1); assert!(abs_diff(tfidf_scorer.score(), 1f32) < 0.001f32); diff --git a/src/query/query.rs b/src/query/query.rs index eda10b717..5b06ce89c 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -4,7 +4,6 @@ use core::searcher::Searcher; use common::TimerTree; use DocAddress; use query::Explanation; -use Score; pub trait Query { @@ -16,7 +15,7 @@ pub trait Query { fn explain( &self, searcher: &Searcher, - doc_address: &DocAddress) -> Result, io::Error> { + doc_address: &DocAddress) -> Result { // TODO check that the document is there or return an error. panic!("Not implemented"); } diff --git a/src/query/query_parser.rs b/src/query/query_parser.rs index 9e8082c1d..645e32634 100644 --- a/src/query/query_parser.rs +++ b/src/query/query_parser.rs @@ -9,7 +9,6 @@ use schema::{Term, Field}; use analyzer::SimpleTokenizer; use analyzer::StreamingIterator; use DocAddress; -use Score; use query::Explanation; #[derive(Debug)] @@ -50,13 +49,14 @@ impl Query for StandardQuery { fn explain( &self, searcher: &Searcher, - doc_address: &DocAddress) -> Result, io::Error> { + doc_address: &DocAddress) -> Result { match self { &StandardQuery::MultiTerm(ref q) => q.explain(searcher, doc_address) } } } + fn compute_terms(field: Field, text: &str) -> Vec { let tokenizer = SimpleTokenizer::new(); let mut tokens = Vec::new();