mirror of https://github.com/quickwit-oss/tantivy.git

Commit: "explanation kind of working in an ugly way."
Cargo.toml

@@ -25,7 +25,6 @@ lz4 = "1.13.131"
time = "0.1.34"
uuid = "0.1"
persistent="*"

iron = "0.4"
staticfile = "0.3.0"

@@ -47,6 +46,10 @@ path = "src/cli/bench.rs"
name = "tantivy_serve"
path = "src/cli/serve.rs"

+[[bin]]
+name = "tantivy_index"
+path = "src/cli/index.rs"
+
[profile.release]
debug = true
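The new [[bin]] block registers the indexing CLI added below as a second binary target next to tantivy_serve; with a standard Cargo setup it is launched with `cargo run --bin tantivy_index`, and `debug = true` under [profile.release] keeps debug symbols in release builds, which is handy for profiling the indexer.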
README.md

@@ -1,5 +1,6 @@
[](https://travis-ci.org/fulmicoton/tantivy)
[](https://coveralls.io/github/fulmicoton/tantivy?branch=master)
[](https://opensource.org/licenses/MIT)

# Tantivy
TODO.md

@@ -1,19 +1,26 @@
pass over offset from previous block
error management
split directory
add doc values
implement serial for segment reader
merge segments
add merge policy
find a solution to the "I have a docaddress but the segment does not exist anymore" problem
doc values for other types
documentation
split postings into blocks
add term frequency
use skip list for each block
find a clear way to put the tokenized/untokenized thing upstream
index frequent bigrams
clean up compression
reconsider the first byte == field in the [u8] repr of a term.

add field norms
good cli
good cli based demo
intersection
masks for union
lenient mode for query parser
WAND
query explain
rethink query iteration mechanics / API (should we setScorer, should collector take different objects?)
dig into the monoids idea
phrase queries
sort by fast field
date
geo search
deletes
@@ -1,5 +1,3 @@
extern crate argparse;
extern crate tantivy;
src/cli/index.rs (new file)

@@ -0,0 +1,78 @@
extern crate rustc_serialize;
extern crate tantivy;
extern crate time;
#[macro_use]
extern crate lazy_static;

extern crate regex;

use std::fs::File;
use tantivy::Index;
use std::io::BufReader;
use std::io::BufRead;
use rustc_serialize::json;
use std::convert::From;
use std::path::PathBuf;
use rustc_serialize::json::DecodeResult;
use time::PreciseTime;
use tantivy::schema::*;

#[derive(Clone,Debug,RustcDecodable,RustcEncodable)]
pub struct WikiArticle {
    pub url: String,
    pub title: String,
    pub body: String,
}

fn create_schema() -> Schema {
    let mut schema = Schema::new();
    schema.add_text_field("url", STRING | STORED);
    schema.add_text_field("title", TEXT | STORED);
    schema.add_text_field("body", TEXT | STORED);
    schema
}

fn main() {
    let articles = BufReader::new(File::open(&PathBuf::from("wiki-articles-1000.json")).unwrap());

    let schema = create_schema();
    let directory_path = PathBuf::from("/Users/pmasurel/wiki-index");
    let index = Index::create(&directory_path, schema.clone()).unwrap();
    let mut index_writer = index.writer_with_num_threads(1).unwrap();

    let mut num_docs = 0;
    let mut cur = PreciseTime::now();
    let group_count = 10000;

    let title = schema.get_field("title").unwrap();
    let url = schema.get_field("url").unwrap();
    let body = schema.get_field("body").unwrap();

    for article_line_res in articles.lines() {
        let article_line = article_line_res.unwrap();
        let article_res: DecodeResult<WikiArticle> = json::decode(&article_line);
        match article_res {
            Ok(article) => {
                let mut doc = Document::new();
                doc.add_text(title, &article.title);
                doc.add_text(body, &article.body);
                doc.add_text(url, &article.url);
                index_writer.add_document(doc).unwrap();
            }
            Err(_) => {}
        }

        if num_docs > 0 && (num_docs % group_count == 0) {
            println!("{} Docs", num_docs);
            let new = PreciseTime::now();
            let elapsed = cur.to(new);
            println!("{:?} docs / hour", group_count * 3600 * 1e6 as u64 / (elapsed.num_microseconds().unwrap() as u64));
            cur = new;
        }

        num_docs += 1;
    }

    index_writer.wait().unwrap();
}
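The throughput log in the loop above folds the unit conversion into a single expression; since `as` binds tighter than `*` in Rust, `3600 * 1e6 as u64` is 3600 * 1_000_000. A sketch of the same arithmetic (the function name is illustrative, not part of the commit):

```rust
// Docs-per-hour from a batch of `group_count` docs indexed in `elapsed_us`
// microseconds: scale the batch rate (docs/us) up to an hour (3.6e9 us).
fn docs_per_hour(group_count: u64, elapsed_us: u64) -> u64 {
    group_count * 3600 * 1_000_000 / elapsed_us
}

fn main() {
    // 10_000 docs in 2 seconds comes out at 18 million docs/hour.
    assert_eq!(docs_per_hour(10_000, 2_000_000), 18_000_000);
    println!("ok");
}
```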
src/cli/serve.rs

@@ -11,6 +11,7 @@ use tantivy::schema::Field
use tantivy::collector::CountCollector;
use tantivy::Index;
use tantivy::collector;
+use tantivy::Score;
use urlencoded::UrlEncodedQuery;
use iron::status;
use rustc_serialize::json::as_pretty_json;

@@ -20,6 +21,7 @@ use iron::mime::Mime;
use mount::Mount;
use tantivy::query::Query;
use tantivy::query::QueryParser;
+use tantivy::query::Explanation;
use tantivy::Document;
use tantivy::collector::TopCollector;
use persistent::Read;

@@ -40,6 +42,8 @@ struct Serp {
struct Hit {
    title: String,
    body: String,
+   explain: Option<String>,
+   score: Score,
}

#[derive(RustcDecodable, RustcEncodable)]

@@ -71,11 +75,12 @@ impl IndexServer {
        }
    }

-   fn create_hit(&self, doc: &Document) -> Hit {
+   fn create_hit(&self, doc: &Document, score: Score, explain: Explanation) -> Hit {
        Hit {
            title: String::from(doc.get_first(self.title_field).unwrap().text()),
            body: String::from(doc.get_first(self.body_field).unwrap().text().clone()),
+           explain: explain.to_string(),
+           score: score,
        }
    }

@@ -91,13 +96,14 @@ impl IndexServer {
            .add(&mut count_collector);
        try!(query.search(&searcher, &mut chained_collector));
    }
-   let hits: Vec<Hit> = try!(
-       top_collector.docs()
+   let hits: Vec<Hit> = top_collector.docs()
        .iter()
-       .map(|doc_address| searcher.doc(doc_address))
-       .map(|doc_result| doc_result.map(|doc| self.create_hit(&doc) ))
-       .collect()
-   );
+       .map(|doc_address| {
+           let doc: Document = searcher.doc(doc_address).unwrap();
+           let (score, explanation): (Score, Explanation) = query.explain(&searcher, doc_address).unwrap().unwrap();
+           self.create_hit(&doc, score, explanation)
+       })
+       .collect();
    Ok(Serp {
        q: q,
        hits: hits,

@@ -141,7 +147,7 @@ fn search(req: &mut Request) -> IronResult<Response> {

fn main() {
    let mut mount = Mount::new();
-   let server = IndexServer::load(&Path::new("/data/wiki-index/"));
+   let server = IndexServer::load(&Path::new("/Users/pmasurel/wiki-index/"));

    mount.mount("/api", search);
    mount.mount("/", Static::new(Path::new("static/")));
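The handler above unwraps twice because explain returns a Result wrapping an Option: the Result carries I/O errors, while the Option covers "this doc address no longer matches a live document". A standalone sketch of handling that shape (stand-in types, not tantivy's API):

```rust
// Stub with the same Result<Option<...>> shape as Query::explain in this
// commit; the tuple stands in for (Score, Explanation).
fn explain_stub(found: bool) -> Result<Option<(f32, String)>, std::io::Error> {
    if found {
        Ok(Some((1.0, String::from("tf-idf ..."))))
    } else {
        Ok(None)
    }
}

fn main() {
    // The serve handler uses .unwrap().unwrap(); a defensive caller matches.
    match explain_stub(true) {
        Ok(Some((score, explanation))) => println!("{}: {}", score, explanation),
        Ok(None) => println!("doc not matched (e.g. segment went away)"),
        Err(e) => println!("io error: {}", e),
    }
}
```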
src/lib.rs

@@ -71,6 +71,7 @@ pub type Score = f32;
/// It only makes sense for a given searcher.
pub type SegmentLocalId = u32;

#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct DocAddress(pub SegmentLocalId, pub DocId);
src/postings/intersection.rs

@@ -59,8 +59,8 @@ mod tests {
    #[test]
    fn test_intersection() {
        {
-           let left = Box::new(VecPostings::new(vec!(1, 3, 9)));
-           let right = Box::new(VecPostings::new(vec!(3, 4, 9, 18)));
+           let left = Box::new(VecPostings::from(vec!(1, 3, 9)));
+           let right = Box::new(VecPostings::from(vec!(3, 4, 9, 18)));
            let mut intersection = IntersectionDocSet::new(vec!(left, right));
            assert!(intersection.next());
            assert_eq!(intersection.doc(), 3);

@@ -69,9 +69,9 @@ mod tests {
            assert!(!intersection.next());
        }
        {
-           let a = Box::new(VecPostings::new(vec!(1, 3, 9)));
-           let b = Box::new(VecPostings::new(vec!(3, 4, 9, 18)));
-           let c = Box::new(VecPostings::new(vec!(1, 5, 9, 111)));
+           let a = Box::new(VecPostings::from(vec!(1, 3, 9)));
+           let b = Box::new(VecPostings::from(vec!(3, 4, 9, 18)));
+           let c = Box::new(VecPostings::from(vec!(1, 5, 9, 111)));
            let mut intersection = IntersectionDocSet::new(vec!(a, b, c));
            assert!(intersection.next());
            assert_eq!(intersection.doc(), 9);
src/postings/union_postings.rs

@@ -4,7 +4,6 @@ use std::cmp::Ordering;
use std::collections::BinaryHeap;
use query::MultiTermScorer;
use postings::ScoredDocSet;
use query::Scorer;
use fastfield::U32FastFieldReader;
use std::iter;

@@ -23,21 +22,20 @@ impl Ord for HeapItem {
    }
}

-pub struct UnionPostings<TPostings: Postings> {
+pub struct UnionPostings<TPostings: Postings, TScorer: MultiTermScorer> {
    fieldnorms_readers: Vec<U32FastFieldReader>,
    postings: Vec<TPostings>,
    term_frequencies: Vec<u32>,
    queue: BinaryHeap<HeapItem>,
    doc: DocId,
-   scorer: MultiTermScorer
+   scorer: TScorer
}

-impl<TPostings: Postings> UnionPostings<TPostings> {
+impl<TPostings: Postings, TScorer: MultiTermScorer> UnionPostings<TPostings, TScorer> {

-   pub fn new(fieldnorms_reader: Vec<U32FastFieldReader>, mut postings: Vec<TPostings>, multi_term_scorer: MultiTermScorer) -> UnionPostings<TPostings> {
+   pub fn new(fieldnorms_reader: Vec<U32FastFieldReader>, mut postings: Vec<TPostings>, scorer: TScorer) -> UnionPostings<TPostings, TScorer> {
        let num_postings = postings.len();
        assert_eq!(fieldnorms_reader.len(), num_postings);

        for posting in &mut postings {
            assert!(posting.next());
        }

@@ -60,10 +58,15 @@ impl<TPostings: Postings> UnionPostings<TPostings> {
            term_frequencies: term_frequencies,
            queue: BinaryHeap::from(heap_items),
            doc: 0,
-           scorer: multi_term_scorer
+           scorer: scorer
        }
    }

+   pub fn scorer(&self,) -> &TScorer {
+       &self.scorer
+   }

    fn advance_head(&mut self,) {
        let ord = self.queue.peek().unwrap().1 as usize;
        let cur_postings = &mut self.postings[ord];

@@ -83,9 +86,8 @@ impl<TPostings: Postings> UnionPostings<TPostings> {
}

-impl<TPostings: Postings> DocSet for UnionPostings<TPostings> {
+impl<TPostings: Postings, TScorer: MultiTermScorer> DocSet for UnionPostings<TPostings, TScorer> {

    fn next(&mut self,) -> bool {
        self.scorer.clear();
        match self.queue.peek() {

@@ -94,8 +96,7 @@ impl<TPostings: Postings> DocSet for UnionPostings<TPostings> {
                let ord: usize = ord as usize;
                let fieldnorm = self.get_field_norm(ord, doc);
                let tf = self.term_frequencies[ord];
                self.scorer.update(ord, tf, fieldnorm);
            }
            None => {
                return false;

@@ -133,7 +134,7 @@ impl<TPostings: Postings> DocSet for UnionPostings<TPostings> {
    }
}

-impl<TPostings: Postings> ScoredDocSet for UnionPostings<TPostings> {
+impl<TPostings: Postings, TScorer: MultiTermScorer> ScoredDocSet for UnionPostings<TPostings, TScorer> {
    fn score(&self,) -> f32 {
        self.scorer.score()
    }

@@ -145,6 +146,7 @@ mod tests {
    use super::*;
    use postings::{DocSet, VecPostings, ScoredDocSet};
    use query::MultiTermScorer;
+   use query::TfIdfScorer;
    use directory::ReadOnlySource;
    use directory::SharedVec;
    use schema::Field;

@@ -172,9 +174,9 @@ mod tests {
    pub fn test_union_postings() {
        let left_fieldnorms = create_u32_fastfieldreader(Field(1), vec!(100,200,300));
        let right_fieldnorms = create_u32_fastfieldreader(Field(2), vec!(15,25,35));
-       let left = VecPostings::new(vec!(1, 2, 3));
-       let right = VecPostings::new(vec!(1, 3, 8));
-       let multi_term_scorer = MultiTermScorer::new(vec!(1f32, 2f32), vec!(1f32, 4f32));
+       let left = VecPostings::from(vec!(1, 2, 3));
+       let right = VecPostings::from(vec!(1, 3, 8));
+       let multi_term_scorer = TfIdfScorer::new(vec!(1f32, 2f32), vec!(1f32, 4f32));
        let mut union = UnionPostings::new(
            vec!(left_fieldnorms, right_fieldnorms),
            vec!(left, right),
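The change running through this file is one refactor: UnionPostings stops hard-coding a concrete scorer struct and becomes generic over any type implementing the new MultiTermScorer trait, and a scorer() accessor is added so callers like explain() can read the scorer back after iterating. A minimal sketch of the pattern, with simplified stand-in types:

```rust
// Stand-in for the scorer bound; the real trait also has update()/clear()/explain().
trait MultiTermScorer {
    fn score(&self) -> f32;
}

struct ConstScorer(f32);

impl MultiTermScorer for ConstScorer {
    fn score(&self) -> f32 {
        self.0
    }
}

// Generic over the scorer: the same DocSet can carry a plain tf-idf scorer
// or an explain-recording wrapper, chosen at the call site.
struct UnionPostings<TScorer: MultiTermScorer> {
    scorer: TScorer,
}

impl<TScorer: MultiTermScorer> UnionPostings<TScorer> {
    // Accessor so the driver of the DocSet can read the final score back.
    fn scorer(&self) -> &TScorer {
        &self.scorer
    }
}

fn main() {
    let union = UnionPostings { scorer: ConstScorer(1.5) };
    assert_eq!(union.scorer().score(), 1.5);
}
```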
src/postings/vec_postings.rs

@@ -10,8 +10,8 @@ pub struct VecPostings {
    cursor: Wrapping<usize>,
}

-impl VecPostings {
-   pub fn new(doc_ids: Vec<DocId>) -> VecPostings {
+impl From<Vec<DocId>> for VecPostings {
+   fn from(doc_ids: Vec<DocId>) -> VecPostings {
        VecPostings {
            doc_ids: doc_ids,
            cursor: Wrapping(usize::max_value()),

@@ -107,7 +107,7 @@ pub mod tests {
    #[test]
    pub fn test_vec_postings() {
        let doc_ids: Vec<DocId> = (0u32..1024u32).map(|e| e*3).collect();
-       let mut postings = VecPostings::new(doc_ids);
+       let mut postings = VecPostings::from(doc_ids);
        assert!(postings.next());
        assert_eq!(postings.doc(), 0u32);
        assert!(postings.next());
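VecPostings::new is replaced by a From<Vec<DocId>> impl, which also gives callers .into() for free. A sketch of the idiom (the cursor is simplified here; the committed code uses Wrapping<usize> to start one step before 0):

```rust
type DocId = u32;

struct VecPostings {
    doc_ids: Vec<DocId>,
    cursor: usize, // simplified; the committed code uses Wrapping<usize>
}

impl From<Vec<DocId>> for VecPostings {
    fn from(doc_ids: Vec<DocId>) -> VecPostings {
        VecPostings { doc_ids: doc_ids, cursor: 0 }
    }
}

fn main() {
    // Both spellings work once From is implemented.
    let a = VecPostings::from(vec![1, 3, 9]);
    let b: VecPostings = vec![3, 4, 9, 18].into();
    assert_eq!(a.doc_ids.len(), 3);
    assert_eq!(b.cursor, 0);
}
```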
src/query/explanation.rs (new file)

@@ -0,0 +1,14 @@
#[derive(RustcDecodable, Debug)]
pub enum Explanation {
    NotImplementedYet,
    Explanation(String),
}

impl Explanation {
    pub fn to_string(&self,) -> Option<String> {
        match self {
            &Explanation::Explanation(ref expl) => Some(expl.clone()),
            &Explanation::NotImplementedYet => None
        }
    }
}
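The new Explanation enum doubles as an "explain not supported here" marker: to_string() yields Some only for a real explanation, which is exactly what serve.rs stores into Hit.explain. A standalone sketch of its behavior (the RustcDecodable derive is omitted here):

```rust
#[derive(Debug)]
pub enum Explanation {
    NotImplementedYet,
    Explanation(String),
}

impl Explanation {
    // Some(..) for a real explanation, None when explain is unsupported.
    pub fn to_string(&self) -> Option<String> {
        match self {
            &Explanation::Explanation(ref expl) => Some(expl.clone()),
            &Explanation::NotImplementedYet => None,
        }
    }
}

fn main() {
    let e = Explanation::Explanation(String::from("0 1 1.\ncoord(1) := 1"));
    assert_eq!(e.to_string(), Some(String::from("0 1 1.\ncoord(1) := 1")));
    assert!(Explanation::NotImplementedYet.to_string().is_none());
}
```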
src/query/mod.rs

@@ -3,9 +3,13 @@ mod multi_term_query;
mod multi_term_scorer;
mod scorer;
mod query_parser;
+mod explanation;

pub use self::query::Query;
pub use self::multi_term_query::MultiTermQuery;
pub use self::multi_term_scorer::MultiTermScorer;
+pub use self::multi_term_scorer::TfIdfScorer;
+pub use self::multi_term_scorer::MultiTermExplainScorer;
pub use self::scorer::Scorer;
pub use self::query_parser::QueryParser;
+pub use self::explanation::Explanation;
src/query/multi_term_query.rs

@@ -1,5 +1,6 @@
use schema::Term;
use query::Query;
+use query::Scorer;
use common::TimerTree;
use common::OpenTimer;
use std::io;

@@ -7,14 +8,19 @@ use core::searcher::Searcher;
use collector::Collector;
use SegmentLocalId;
use core::SegmentReader;
+use query::MultiTermExplainScorer;
use postings::SegmentPostings;
use postings::UnionPostings;
use postings::ScoredDocSet;
use postings::DocSet;
use query::MultiTermScorer;
+use query::TfIdfScorer;
+use postings::SkipResult;
use fastfield::U32FastFieldReader;
use ScoredDoc;
+use DocAddress;
+use Score;
+use query::Explanation;

#[derive(Eq, PartialEq, Debug)]
pub struct MultiTermQuery {

@@ -23,7 +29,32 @@ pub struct MultiTermQuery {

impl Query for MultiTermQuery {

+   fn explain(
+       &self,
+       searcher: &Searcher,
+       doc_address: &DocAddress) -> Result<Option<(Score, Explanation)>, io::Error> {
+       let segment_reader = &searcher.segments()[doc_address.segment_ord() as usize];
+       let multi_term_scorer = MultiTermExplainScorer::from(self.scorer(searcher));
+       let mut timer_tree = TimerTree::new();
+       let mut postings = self.search_segment(
+           segment_reader,
+           multi_term_scorer,
+           timer_tree.open("explain"));
+       match postings.skip_next(doc_address.doc()) {
+           SkipResult::Reached => {
+               let scorer = postings.scorer();
+               let explanation = scorer.explain_score();
+               let result = (scorer.score(), explanation);
+               Ok(Some(result))
+           }
+           _ => Ok(None)
+       }
+   }
+
-   fn search<C: Collector>(&self, searcher: &Searcher, collector: &mut C) -> io::Result<TimerTree> {
+   fn search<C: Collector>(
+       &self,
+       searcher: &Searcher,
+       collector: &mut C) -> io::Result<TimerTree> {
        let mut timer_tree = TimerTree::new();

        let multi_term_scorer = self.scorer(searcher);

@@ -60,7 +91,7 @@ impl MultiTermQuery {
        self.terms.len()
    }

-   fn scorer(&self, searcher: &Searcher) -> MultiTermScorer {
+   fn scorer(&self, searcher: &Searcher) -> TfIdfScorer {
        let num_docs = searcher.num_docs() as f32;
        let idfs: Vec<f32> = self.terms.iter()
            .map(|term| searcher.doc_freq(term))

@@ -74,9 +105,9 @@ impl MultiTermQuery {
            })
            .collect();
        let query_coords = (0..self.terms.len() + 1)
-           .map(|i| i as f32 / self.terms.len() as f32)
+           .map(|i| (i as f32) / (self.terms.len() as f32))
            .collect();
-       MultiTermScorer::new(query_coords, idfs)
+       TfIdfScorer::new(query_coords, idfs)
    }

    pub fn new(terms: Vec<Term>) -> MultiTermQuery {

@@ -85,7 +116,7 @@ impl MultiTermQuery {
        }
    }

-   fn search_segment<'a, 'b>(&'b self, reader: &'b SegmentReader, multi_term_scorer: MultiTermScorer, mut timer: OpenTimer<'a>) -> UnionPostings<SegmentPostings> {
+   fn search_segment<'a, 'b, TScorer: MultiTermScorer>(&'b self, reader: &'b SegmentReader, multi_term_scorer: TScorer, mut timer: OpenTimer<'a>) -> UnionPostings<SegmentPostings, TScorer> {
        let mut segment_postings: Vec<SegmentPostings> = Vec::with_capacity(self.terms.len());
        let mut fieldnorms_readers: Vec<U32FastFieldReader> = Vec::with_capacity(self.terms.len());
        {
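explain() re-runs the query against only the segment holding the target document, with the explain-recording scorer plugged in, then skips the resulting DocSet straight to that document: only SkipResult::Reached yields a score, anything else returns Ok(None). A self-contained sketch of that skip-then-read flow (types simplified; variant names other than Reached are assumptions):

```rust
// Simplified stand-in for tantivy's skip result.
enum SkipResult { Reached, OverStep, End }

struct Postings { docs: Vec<u32>, pos: usize }

impl Postings {
    // Advance the cursor until the target doc is reached or passed.
    fn skip_next(&mut self, target: u32) -> SkipResult {
        while self.pos < self.docs.len() {
            if self.docs[self.pos] == target { return SkipResult::Reached; }
            if self.docs[self.pos] > target { return SkipResult::OverStep; }
            self.pos += 1;
        }
        SkipResult::End
    }
}

fn main() {
    let mut postings = Postings { docs: vec![1, 3, 9], pos: 0 };
    match postings.skip_next(3) {
        SkipResult::Reached => println!("doc matched: would score + explain here"),
        _ => println!("doc not in this posting list: explain() returns Ok(None)"),
    }
}
```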
src/query/multi_term_scorer.rs

@@ -1,48 +1,108 @@
use query::Scorer;
+use query::Explanation;

+pub trait MultiTermScorer: Scorer {
+   fn update(&mut self, term_ord: usize, term_freq: u32, fieldnorm: u32);
+   fn clear(&mut self,);
+   fn explain(&self, vals: &Vec<(usize, u32, u32)>) -> Explanation;
+}

#[derive(Clone)]
-pub struct MultiTermScorer {
+pub struct TfIdfScorer {
    coords: Vec<f32>,
    idf: Vec<f32>,
    score: f32,
    num_fields: usize,
}

-impl MultiTermScorer {
-   pub fn new(mut coords: Vec<f32>, idf: Vec<f32>) -> MultiTermScorer {
-       coords.insert(0, 0f32);
-       MultiTermScorer {
+pub struct MultiTermExplainScorer<TScorer: MultiTermScorer + Sized> {
+   scorer: TScorer,
+   vals: Vec<(usize, u32, u32)>,
+}

+impl<TScorer: MultiTermScorer + Sized> MultiTermExplainScorer<TScorer> {
+   pub fn explain_score(&self,) -> Explanation {
+       self.scorer.explain(&self.vals)
+   }
+}

+impl<TScorer: MultiTermScorer + Sized> From<TScorer> for MultiTermExplainScorer<TScorer> {
+   fn from(multi_term_scorer: TScorer) -> MultiTermExplainScorer<TScorer> {
+       MultiTermExplainScorer {
+           scorer: multi_term_scorer,
+           vals: Vec::new(),
+       }
+   }
+}

+impl<TScorer: MultiTermScorer + Sized> Scorer for MultiTermExplainScorer<TScorer> {
+   fn score(&self,) -> f32 {
+       self.scorer.score()
+   }
+}

+impl<TScorer: MultiTermScorer + Sized> MultiTermScorer for MultiTermExplainScorer<TScorer> {
+   fn update(&mut self, term_ord: usize, term_freq: u32, fieldnorm: u32) {
+       self.vals.push((term_ord, term_freq, fieldnorm));
+       self.scorer.update(term_ord, term_freq, fieldnorm);
+   }
+   fn clear(&mut self,) {
+       self.vals.clear();
+       self.scorer.clear();
+   }
+   fn explain(&self, vals: &Vec<(usize, u32, u32)>) -> Explanation {
+       self.scorer.explain(vals)
+   }
+}

+impl TfIdfScorer {
+   pub fn new(mut coords: Vec<f32>, idf: Vec<f32>) -> TfIdfScorer {
+       TfIdfScorer {
            coords: coords,
            idf: idf,
            score: 0f32,
            num_fields: 0,
        }
    }

-   pub fn update(&mut self, term_ord: usize, term_freq: u32, fieldnorm: u32) {
-       assert!(term_freq != 0u32);
-       self.score += (term_freq as f32 / fieldnorm as f32).sqrt() * self.idf[term_ord];
-       self.num_fields += 1;
-   }

    fn coord(&self,) -> f32 {
        self.coords[self.num_fields]
    }

-   pub fn clear(&mut self,) {
-       self.score = 0f32;
-       self.num_fields = 0;
-   }

}

-impl Scorer for MultiTermScorer {
+impl Scorer for TfIdfScorer {
    fn score(&self, ) -> f32 {
        self.score * self.coord()
    }
}

+impl MultiTermScorer for TfIdfScorer {

+   fn explain(&self, vals: &Vec<(usize, u32, u32)>) -> Explanation {
+       let mut explain = String::new();
+       for &(ord, term_freq, field_norm) in vals.iter() {
+           explain += &format!("{} {} {}.\n", ord, term_freq, field_norm);
+       }
+       let count = vals.len();
+       explain += &format!("coord({}) := {}", count, self.coords[count]);
+       Explanation::Explanation(explain)
+   }

+   fn update(&mut self, term_ord: usize, term_freq: u32, fieldnorm: u32) {
+       assert!(term_freq != 0u32);
+       self.score += (term_freq as f32 / fieldnorm as f32).sqrt() * self.idf[term_ord];
+       self.num_fields += 1;
+   }

+   fn clear(&mut self,) {
+       self.score = 0f32;
+       self.num_fields = 0;
+   }
+}

#[cfg(test)]

@@ -58,28 +118,28 @@ mod tests {

    #[test]
    pub fn test_multiterm_scorer() {
-       let mut multi_term_scorer = MultiTermScorer::new(vec!(1f32, 2f32), vec!(1f32, 4f32));
+       let mut tfidf_scorer = TfIdfScorer::new(vec!(1f32, 2f32), vec!(1f32, 4f32));
        {
-           multi_term_scorer.update(0, 1, 1);
-           assert!(abs_diff(multi_term_scorer.score(), 1f32) < 0.001f32);
-           multi_term_scorer.clear();
+           tfidf_scorer.update(0, 1, 1);
+           assert!(abs_diff(tfidf_scorer.score(), 1f32) < 0.001f32);
+           tfidf_scorer.clear();
        }
        {
-           multi_term_scorer.update(1, 1, 1);
-           assert_eq!(multi_term_scorer.score(), 4f32);
-           multi_term_scorer.clear();
+           tfidf_scorer.update(1, 1, 1);
+           assert_eq!(tfidf_scorer.score(), 4f32);
+           tfidf_scorer.clear();
        }
        {
-           multi_term_scorer.update(0, 2, 1);
-           assert!(abs_diff(multi_term_scorer.score(), 1.4142135) < 0.001f32);
-           multi_term_scorer.clear();
+           tfidf_scorer.update(0, 2, 1);
+           assert!(abs_diff(tfidf_scorer.score(), 1.4142135) < 0.001f32);
+           tfidf_scorer.clear();
        }
        {
-           multi_term_scorer.update(0, 1, 1);
-           multi_term_scorer.update(1, 1, 1);
-           assert_eq!(multi_term_scorer.score(), 10f32);
-           multi_term_scorer.clear();
+           tfidf_scorer.update(0, 1, 1);
+           tfidf_scorer.update(1, 1, 1);
+           assert_eq!(tfidf_scorer.score(), 10f32);
+           tfidf_scorer.clear();
        }
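A minimal sketch of the decorator introduced in this file: the explain wrapper records every update() call so it can replay them into a human-readable explanation, while delegating the actual scoring to the wrapped scorer. The numbers mirror the tests above, with coords written out to already include the leading 0 entry: update(0,1,1) and update(1,1,1) give (sqrt(1/1)*1 + sqrt(1/1)*4) * coord(2) = 5 * 2 = 10. Names here are simplified stand-ins, not the committed API.

```rust
trait Scorer { fn score(&self) -> f32; }

struct TfIdf { coords: Vec<f32>, idf: Vec<f32>, score: f32, num_fields: usize }

impl TfIdf {
    fn update(&mut self, ord: usize, tf: u32, fieldnorm: u32) {
        self.score += (tf as f32 / fieldnorm as f32).sqrt() * self.idf[ord];
        self.num_fields += 1;
    }
}

impl Scorer for TfIdf {
    fn score(&self) -> f32 { self.score * self.coords[self.num_fields] }
}

// Decorator: records the (ord, tf, fieldnorm) triples it forwards.
struct ExplainScorer { inner: TfIdf, vals: Vec<(usize, u32, u32)> }

impl ExplainScorer {
    fn update(&mut self, ord: usize, tf: u32, fieldnorm: u32) {
        self.vals.push((ord, tf, fieldnorm)); // remember for the explanation
        self.inner.update(ord, tf, fieldnorm);
    }
    fn explain(&self) -> String {
        format!("{:?}, coord({})", self.vals, self.vals.len())
    }
}

fn main() {
    let tfidf = TfIdf { coords: vec![0.0, 1.0, 2.0], idf: vec![1.0, 4.0], score: 0.0, num_fields: 0 };
    let mut explaining = ExplainScorer { inner: tfidf, vals: Vec::new() };
    explaining.update(0, 1, 1);
    explaining.update(1, 1, 1);
    assert_eq!(explaining.inner.score(), 10.0);
    println!("{}", explaining.explain());
}
```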
src/query/query.rs

@@ -2,7 +2,22 @@ use std::io;
use collector::Collector;
use core::searcher::Searcher;
use common::TimerTree;
+use DocAddress;
+use query::Explanation;
+use Score;

pub trait Query {
-   fn search<C: Collector>(&self, searcher: &Searcher, collector: &mut C) -> io::Result<TimerTree>;
+   fn search<C: Collector>(
+       &self,
+       searcher: &Searcher,
+       collector: &mut C) -> io::Result<TimerTree>;
+
+   fn explain(
+       &self,
+       searcher: &Searcher,
+       doc_address: &DocAddress) -> Result<Option<(Score, Explanation)>, io::Error> {
+       // TODO check that the document is there or return an error.
+       panic!("Not implemented");
+   }
}
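Query::explain ships with a default body, so existing Query implementors keep compiling and can override it incrementally; here the default simply panics. A sketch of that trait-default pattern (toy trait, not tantivy's):

```rust
trait Query {
    fn search(&self) -> String;

    // Default body: implementors that haven't opted in yet still compile.
    fn explain(&self) -> String {
        panic!("Not implemented");
    }
}

struct MatchAll;

impl Query for MatchAll {
    fn search(&self) -> String { String::from("all docs") }
    // explain() falls back to the panicking default.
}

fn main() {
    println!("{}", MatchAll.search());
}
```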
src/query/query_parser.rs

@@ -8,6 +8,9 @@ use schema::Schema;
use schema::{Term, Field};
use analyzer::SimpleTokenizer;
use analyzer::StreamingIterator;
+use DocAddress;
+use Score;
+use query::Explanation;

#[derive(Debug)]
pub enum ParsingError {

@@ -43,8 +46,16 @@ impl Query for StandardQuery {
            }
        }
    }
+
+   fn explain(
+       &self,
+       searcher: &Searcher,
+       doc_address: &DocAddress) -> Result<Option<(Score, Explanation)>, io::Error> {
+       match self {
+           &StandardQuery::MultiTerm(ref q) => q.explain(searcher, doc_address)
+       }
+   }
}

fn compute_terms(field: Field, text: &str) -> Vec<Term> {
    let tokenizer = SimpleTokenizer::new();
src/query/scorer.rs

@@ -1,3 +1,3 @@
pub trait Scorer {
    fn score(&self,) -> f32;
}
src/schema/term.rs

@@ -3,7 +3,6 @@ use common::BinarySerializable;
use std::io::Read;
use std::io::Write;
use schema::Field;
use schema::Term;

const TEXT_CODE: u8 = 0;
const U32_CODE: u8 = 1;
static/ (search results template)

@@ -16,7 +16,7 @@
</ul>
<ul class='hits'>
{{#hits}}
-<li>{{title}}</li>
+<li>{{score}} - {{title}} - {{explain}}</li>
{{/hits}}
</ul>