mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-31 07:30:39 +00:00
Merge branch 'issue/50'
This commit is contained in:
@@ -8,7 +8,6 @@ use tantivy::Index;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::collector::TopCollector;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::query::Query;
|
||||
|
||||
fn main() {
|
||||
// Let's create a temporary directory for the
|
||||
|
||||
@@ -2,7 +2,8 @@ use collector::Collector;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use std::io;
|
||||
use ScoredDoc;
|
||||
use DocId;
|
||||
use Score;
|
||||
|
||||
|
||||
/// Collector that does nothing.
|
||||
@@ -15,7 +16,7 @@ impl Collector for DoNothingCollector {
|
||||
Ok(())
|
||||
}
|
||||
#[inline]
|
||||
fn collect(&mut self, _: ScoredDoc) {}
|
||||
fn collect(&mut self, _doc: DocId, _score: Score) {}
|
||||
}
|
||||
|
||||
/// Zero-cost abstraction used to collect on multiple collectors.
|
||||
@@ -43,9 +44,9 @@ impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Rig
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, scored_doc: ScoredDoc) {
|
||||
self.left.collect(scored_doc);
|
||||
self.right.collect(scored_doc);
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
self.left.collect(doc, score);
|
||||
self.right.collect(doc, score);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -62,7 +63,6 @@ pub fn chain() -> ChainedCollector<DoNothingCollector, DoNothingCollector> {
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use ScoredDoc;
|
||||
use collector::{Collector, CountCollector, TopCollector};
|
||||
|
||||
#[test]
|
||||
@@ -73,9 +73,9 @@ mod tests {
|
||||
let mut collectors = chain()
|
||||
.push(&mut top_collector)
|
||||
.push(&mut count_collector);
|
||||
collectors.collect(ScoredDoc(0.2, 1));
|
||||
collectors.collect(ScoredDoc(0.1, 2));
|
||||
collectors.collect(ScoredDoc(0.5, 3));
|
||||
collectors.collect(1, 0.2);
|
||||
collectors.collect(2, 0.1);
|
||||
collectors.collect(3, 0.5);
|
||||
}
|
||||
assert_eq!(count_collector.count(), 3);
|
||||
assert!(top_collector.at_capacity());
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::io;
|
||||
use super::Collector;
|
||||
use ScoredDoc;
|
||||
use DocId;
|
||||
use Score;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
|
||||
@@ -31,7 +32,7 @@ impl Collector for CountCollector {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, _: ScoredDoc) {
|
||||
fn collect(&mut self, _: DocId, _: Score) {
|
||||
self.count += 1;
|
||||
}
|
||||
}
|
||||
@@ -41,16 +42,14 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
use test::Bencher;
|
||||
use ScoredDoc;
|
||||
use collector::Collector;
|
||||
|
||||
#[bench]
|
||||
fn build_collector(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let mut count_collector = CountCollector::default();
|
||||
let docs: Vec<u32> = (0..1_000_000).collect();
|
||||
for doc in docs {
|
||||
count_collector.collect(ScoredDoc(1f32, doc));
|
||||
for doc in 0..1_000_000 {
|
||||
count_collector.collect(doc, 1f32);
|
||||
}
|
||||
count_collector.count()
|
||||
});
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use ScoredDoc;
|
||||
use DocId;
|
||||
use Score;
|
||||
use std::io;
|
||||
|
||||
mod count_collector;
|
||||
@@ -49,7 +50,7 @@ pub trait Collector {
|
||||
/// on this segment.
|
||||
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()>;
|
||||
/// The query pushes the scored document to the collector via this method.
|
||||
fn collect(&mut self, scored_doc: ScoredDoc);
|
||||
fn collect(&mut self, doc: DocId, score: Score);
|
||||
}
|
||||
|
||||
|
||||
@@ -58,8 +59,8 @@ impl<'a, C: Collector> Collector for &'a mut C {
|
||||
(*self).set_segment(segment_local_id, segment)
|
||||
}
|
||||
/// The query pushes the scored document to the collector via this method.
|
||||
fn collect(&mut self, scored_doc: ScoredDoc) {
|
||||
(*self).collect(scored_doc);
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
(*self).collect(doc, score);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -69,8 +70,8 @@ pub mod tests {
|
||||
|
||||
use super::*;
|
||||
use test::Bencher;
|
||||
use ScoredDoc;
|
||||
use DocId;
|
||||
use Score;
|
||||
use core::SegmentReader;
|
||||
use std::io;
|
||||
use SegmentLocalId;
|
||||
@@ -112,8 +113,8 @@ pub mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, scored_doc: ScoredDoc) {
|
||||
self.docs.push(scored_doc.doc() + self.offset);
|
||||
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||
self.docs.push(doc + self.offset);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -150,8 +151,8 @@ pub mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, scored_doc: ScoredDoc) {
|
||||
let val = self.ff_reader.as_ref().unwrap().get(scored_doc.doc());
|
||||
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||
let val = self.ff_reader.as_ref().unwrap().get(doc);
|
||||
self.vals.push(val);
|
||||
}
|
||||
}
|
||||
@@ -163,7 +164,7 @@ pub mod tests {
|
||||
let mut count_collector = CountCollector::default();
|
||||
let docs: Vec<u32> = (0..1_000_000).collect();
|
||||
for doc in docs {
|
||||
count_collector.collect(ScoredDoc(1f32, doc));
|
||||
count_collector.collect(doc, 1f32);
|
||||
}
|
||||
count_collector.count()
|
||||
});
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::io;
|
||||
use super::Collector;
|
||||
use ScoredDoc;
|
||||
use DocId;
|
||||
use Score;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
|
||||
@@ -31,9 +32,9 @@ impl<'a> Collector for MultiCollector<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, scored_doc: ScoredDoc) {
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
for collector in &mut self.collectors {
|
||||
collector.collect(scored_doc);
|
||||
collector.collect(doc, score);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -44,7 +45,6 @@ impl<'a> Collector for MultiCollector<'a> {
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use ScoredDoc;
|
||||
use collector::{Collector, CountCollector, TopCollector};
|
||||
|
||||
#[test]
|
||||
@@ -53,9 +53,9 @@ mod tests {
|
||||
let mut count_collector = CountCollector::default();
|
||||
{
|
||||
let mut collectors = MultiCollector::from(vec!(&mut top_collector, &mut count_collector));
|
||||
collectors.collect(ScoredDoc(0.2, 1));
|
||||
collectors.collect(ScoredDoc(0.1, 2));
|
||||
collectors.collect(ScoredDoc(0.5, 3));
|
||||
collectors.collect(1, 0.2);
|
||||
collectors.collect(2, 0.1);
|
||||
collectors.collect(3, 0.5);
|
||||
}
|
||||
assert_eq!(count_collector.count(), 3);
|
||||
assert!(top_collector.at_capacity());
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
use std::io;
|
||||
use super::Collector;
|
||||
use ScoredDoc;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use DocAddress;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::cmp::Ordering;
|
||||
use DocId;
|
||||
use Score;
|
||||
|
||||
// Rust heap is a max-heap and we need a min heap.
|
||||
@@ -13,6 +13,7 @@ use Score;
|
||||
struct GlobalScoredDoc {
|
||||
score: Score,
|
||||
doc_address: DocAddress
|
||||
|
||||
}
|
||||
|
||||
impl PartialOrd for GlobalScoredDoc {
|
||||
@@ -109,20 +110,20 @@ impl Collector for TopCollector {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, scored_doc: ScoredDoc) {
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
if self.at_capacity() {
|
||||
// It's ok to unwrap as long as a limit of 0 is forbidden.
|
||||
let limit_doc: GlobalScoredDoc = *self.heap.peek().expect("Top collector with size 0 is forbidden");
|
||||
if limit_doc.score < scored_doc.score() {
|
||||
if limit_doc.score < score {
|
||||
let mut mut_head = self.heap.peek_mut().expect("Top collector with size 0 is forbidden");
|
||||
mut_head.score = scored_doc.score();
|
||||
mut_head.doc_address = DocAddress(self.segment_id, scored_doc.doc());
|
||||
mut_head.score = score;
|
||||
mut_head.doc_address = DocAddress(self.segment_id, doc);
|
||||
}
|
||||
}
|
||||
else {
|
||||
let wrapped_doc = GlobalScoredDoc {
|
||||
score: scored_doc.score(),
|
||||
doc_address: DocAddress(self.segment_id, scored_doc.doc())
|
||||
score: score,
|
||||
doc_address: DocAddress(self.segment_id, doc)
|
||||
};
|
||||
self.heap.push(wrapped_doc);
|
||||
}
|
||||
@@ -135,7 +136,6 @@ impl Collector for TopCollector {
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use ScoredDoc;
|
||||
use DocId;
|
||||
use Score;
|
||||
use collector::Collector;
|
||||
@@ -143,9 +143,9 @@ mod tests {
|
||||
#[test]
|
||||
fn test_top_collector_not_at_capacity() {
|
||||
let mut top_collector = TopCollector::with_limit(4);
|
||||
top_collector.collect(ScoredDoc(0.8, 1));
|
||||
top_collector.collect(ScoredDoc(0.2, 3));
|
||||
top_collector.collect(ScoredDoc(0.3, 5));
|
||||
top_collector.collect(1, 0.8);
|
||||
top_collector.collect(3, 0.2);
|
||||
top_collector.collect(5, 0.3);
|
||||
assert!(!top_collector.at_capacity());
|
||||
let score_docs: Vec<(Score, DocId)> = top_collector.score_docs()
|
||||
.into_iter()
|
||||
@@ -159,11 +159,11 @@ mod tests {
|
||||
#[test]
|
||||
fn test_top_collector_at_capacity() {
|
||||
let mut top_collector = TopCollector::with_limit(4);
|
||||
top_collector.collect(ScoredDoc(0.8, 1));
|
||||
top_collector.collect(ScoredDoc(0.2, 3));
|
||||
top_collector.collect(ScoredDoc(0.3, 5));
|
||||
top_collector.collect(ScoredDoc(0.9, 7));
|
||||
top_collector.collect(ScoredDoc(-0.2, 9));
|
||||
top_collector.collect(1, 0.8);
|
||||
top_collector.collect(3, 0.2);
|
||||
top_collector.collect(5, 0.3);
|
||||
top_collector.collect(7, 0.9);
|
||||
top_collector.collect(9, -0.2);
|
||||
assert!(top_collector.at_capacity());
|
||||
{
|
||||
let score_docs: Vec<(Score, DocId)> = top_collector
|
||||
|
||||
@@ -9,7 +9,6 @@ pub use self::timer::OpenTimer;
|
||||
pub use self::vint::VInt;
|
||||
use std::io;
|
||||
|
||||
|
||||
pub fn make_io_err(msg: String) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, msg)
|
||||
}
|
||||
@@ -25,3 +24,4 @@ pub trait HasLen {
|
||||
self.len() == 0
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -39,9 +39,7 @@ fn compute_num_bits(amplitude: u32) -> u8 {
|
||||
mod tests {
|
||||
|
||||
use super::compute_num_bits;
|
||||
use super::U32FastFieldsReader;
|
||||
use super::U32FastFieldsWriter;
|
||||
use super::FastFieldSerializer;
|
||||
use super::*;
|
||||
use schema::Field;
|
||||
use std::path::Path;
|
||||
use directory::{Directory, WritePtr, RAMDirectory};
|
||||
@@ -81,6 +79,17 @@ mod tests {
|
||||
doc.add_u32(field, value);
|
||||
fast_field_writers.add_document(&doc);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_fastfield() {
|
||||
let test_fastfield = U32FastFieldReader::from(vec!(100,200,300));
|
||||
println!("{}", test_fastfield.get(0));
|
||||
println!("{}", test_fastfield.get(1));
|
||||
println!("{}", test_fastfield.get(2));
|
||||
assert_eq!(test_fastfield.get(0), 100);
|
||||
assert_eq!(test_fastfield.get(1), 200);
|
||||
assert_eq!(test_fastfield.get(2), 300);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intfastfield_small() {
|
||||
|
||||
@@ -5,10 +5,22 @@ use std::ops::Deref;
|
||||
use directory::ReadOnlySource;
|
||||
use common::BinarySerializable;
|
||||
use DocId;
|
||||
use schema::Field;
|
||||
|
||||
use schema::{Field, SchemaBuilder};
|
||||
use std::path::Path;
|
||||
use schema::FAST;
|
||||
use directory::{WritePtr, RAMDirectory, Directory};
|
||||
use fastfield::FastFieldSerializer;
|
||||
use fastfield::U32FastFieldsWriter;
|
||||
use super::compute_num_bits;
|
||||
|
||||
|
||||
lazy_static! {
|
||||
static ref U32_FAST_FIELD_EMPTY: ReadOnlySource = {
|
||||
let u32_fast_field = U32FastFieldReader::from(Vec::new());
|
||||
u32_fast_field._data.clone()
|
||||
};
|
||||
}
|
||||
|
||||
pub struct U32FastFieldReader {
|
||||
_data: ReadOnlySource,
|
||||
data_ptr: *const u8,
|
||||
@@ -20,6 +32,10 @@ pub struct U32FastFieldReader {
|
||||
|
||||
impl U32FastFieldReader {
|
||||
|
||||
pub fn empty() -> U32FastFieldReader {
|
||||
U32FastFieldReader::open(U32_FAST_FIELD_EMPTY.clone()).expect("should always work.")
|
||||
}
|
||||
|
||||
pub fn min_val(&self,) -> u32 {
|
||||
self.min_val
|
||||
}
|
||||
@@ -62,6 +78,31 @@ impl U32FastFieldReader {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl From<Vec<u32>> for U32FastFieldReader {
|
||||
fn from(vals: Vec<u32>) -> U32FastFieldReader {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let field = schema_builder.add_u32_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let path = Path::new("test");
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&schema);
|
||||
for val in vals {
|
||||
let mut fast_field_writer = fast_field_writers.get_field_writer(field).unwrap();
|
||||
fast_field_writer.add_val(val);
|
||||
}
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
|
||||
fast_field_readers.get_field(field).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct U32FastFieldsReader {
|
||||
source: ReadOnlySource,
|
||||
field_offsets: HashMap<Field, (u32, u32)>,
|
||||
|
||||
83
src/lib.rs
83
src/lib.rs
@@ -2,6 +2,7 @@
|
||||
#![allow(unknown_lints)] // for the clippy lint options
|
||||
#![allow(module_inception)]
|
||||
|
||||
#![feature(box_syntax)]
|
||||
#![feature(optin_builtin_traits)]
|
||||
#![feature(conservative_impl_trait)]
|
||||
#![cfg_attr(test, feature(test))]
|
||||
@@ -58,11 +59,16 @@ mod macros {
|
||||
|
||||
macro_rules! doc(
|
||||
($($field:ident => $value:expr),*) => {{
|
||||
let mut document = Document::default();
|
||||
$(
|
||||
document.add(FieldValue::new($field, $value.into()));
|
||||
)*
|
||||
document
|
||||
#[allow(unused_mut)] // avoid emitting a warning for `doc!()`
|
||||
{
|
||||
let mut document = Document::default();
|
||||
$(
|
||||
document.add(FieldValue::new($field, $value.into()));
|
||||
)*
|
||||
document
|
||||
}
|
||||
|
||||
|
||||
}};
|
||||
);
|
||||
}
|
||||
@@ -138,22 +144,6 @@ impl DocAddress {
|
||||
}
|
||||
}
|
||||
|
||||
/// A scored doc is simply a couple `(score, doc_id)`
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct ScoredDoc(Score, DocId);
|
||||
|
||||
impl ScoredDoc {
|
||||
|
||||
/// Returns the score
|
||||
pub fn score(&self,) -> Score {
|
||||
self.0
|
||||
}
|
||||
|
||||
/// Returns the doc
|
||||
pub fn doc(&self,) -> DocId {
|
||||
self.1
|
||||
}
|
||||
}
|
||||
|
||||
/// `DocAddress` contains all the necessary information
|
||||
/// to identify a document given a `Searcher` object.
|
||||
@@ -188,18 +178,15 @@ mod tests {
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "af b");
|
||||
let doc = doc!(text_field=>"af b");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a b c");
|
||||
let doc = doc!(text_field=>"a b c");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a b c d");
|
||||
let doc = doc!(text_field=>"a b c d");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
@@ -214,27 +201,22 @@ mod tests {
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a b c");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
index_writer.add_document(doc!(text_field=>"a b c")).unwrap();
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
{
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a");
|
||||
let doc = doc!(text_field=>"a");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a a");
|
||||
let doc = doc!(text_field=>"a a");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "c");
|
||||
let doc = doc!(text_field=>"c");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
@@ -260,17 +242,15 @@ mod tests {
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a b c");
|
||||
let doc = doc!(text_field=>"a b c");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
{
|
||||
let doc = Document::default();
|
||||
let doc = doc!();
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a b");
|
||||
let doc = doc!(text_field=>"a b");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
@@ -296,8 +276,7 @@ mod tests {
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "af af af bc bc");
|
||||
let doc = doc!(text_field=>"af af af bc bc");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
@@ -325,18 +304,15 @@ mod tests {
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "af af af b");
|
||||
let doc = doc!(text_field=>"af af af b");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a b c");
|
||||
let doc = doc!(text_field=>"a b c");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a b c d");
|
||||
let doc = doc!(text_field=>"a b c d");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
@@ -394,18 +370,15 @@ mod tests {
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "af b");
|
||||
let doc = doc!(text_field=>"af b");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a b c");
|
||||
let doc = doc!(text_field=>"a b c");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a b c d");
|
||||
let doc = doc!(text_field=>"a b c d");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
|
||||
@@ -17,53 +17,56 @@ pub enum SkipResult {
|
||||
}
|
||||
|
||||
|
||||
/// Represents an iterable set of sorted doc ids.
|
||||
/// Represents an iterable set of sorted doc ids.
|
||||
pub trait DocSet {
|
||||
/// Goes to the next element.
|
||||
/// `.advance(...)` needs to be called a first time to point to the correct
|
||||
/// element.
|
||||
fn advance(&mut self,) -> bool;
|
||||
|
||||
fn advance(&mut self) -> bool;
|
||||
|
||||
/// After skipping, position the iterator in such a way that `.doc()`
|
||||
/// will return a value greater than or equal to target.
|
||||
///
|
||||
///
|
||||
/// SkipResult expresses whether the `target value` was reached, overstepped,
|
||||
/// or if the `DocSet` was entirely consumed without finding any value
|
||||
/// greater or equal to the `target`.
|
||||
/// greater or equal to the `target`.
|
||||
///
|
||||
/// WARNING: Calling skip always advances the docset.
|
||||
/// More specifically, if the docset is already positionned on the target
|
||||
/// skipping will advance to the next position and return SkipResult::Overstep.
|
||||
///
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
self.advance();
|
||||
loop {
|
||||
match self.doc().cmp(&target) {
|
||||
Ordering::Less => {
|
||||
if !self.advance() {
|
||||
return SkipResult::End;
|
||||
}
|
||||
},
|
||||
Ordering::Equal => { return SkipResult::Reached },
|
||||
Ordering::Greater => { return SkipResult::OverStep },
|
||||
}
|
||||
Ordering::Equal => return SkipResult::Reached,
|
||||
Ordering::Greater => return SkipResult::OverStep,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Returns the current document
|
||||
fn doc(&self,) -> DocId;
|
||||
|
||||
fn doc(&self) -> DocId;
|
||||
|
||||
/// Advances the cursor to the next document
|
||||
/// None is returned if the iterator has `DocSet`
|
||||
/// has already been entirely consumed.
|
||||
fn next(&mut self,) -> Option<DocId> {
|
||||
/// None is returned if the iterator has `DocSet`
|
||||
/// has already been entirely consumed.
|
||||
fn next(&mut self) -> Option<DocId> {
|
||||
if self.advance() {
|
||||
Some(self.doc())
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<TDocSet: DocSet> DocSet for Box<TDocSet> {
|
||||
|
||||
fn advance(&mut self,) -> bool {
|
||||
impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
|
||||
fn advance(&mut self) -> bool {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.advance()
|
||||
}
|
||||
@@ -73,28 +76,25 @@ impl<TDocSet: DocSet> DocSet for Box<TDocSet> {
|
||||
unboxed.skip_next(target)
|
||||
}
|
||||
|
||||
fn doc(&self,) -> DocId {
|
||||
fn doc(&self) -> DocId {
|
||||
let unboxed: &TDocSet = self.borrow();
|
||||
unboxed.doc()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet {
|
||||
|
||||
fn advance(&mut self,) -> bool {
|
||||
fn advance(&mut self) -> bool {
|
||||
let unref: &mut TDocSet = *self;
|
||||
unref.advance()
|
||||
}
|
||||
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
let unref: &mut TDocSet = *self;
|
||||
unref.skip_next(target)
|
||||
}
|
||||
|
||||
fn doc(&self,) -> DocId {
|
||||
fn doc(&self) -> DocId {
|
||||
let unref: &TDocSet = *self;
|
||||
unref.doc()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ pub struct FreqHandler {
|
||||
|
||||
|
||||
fn read_positions(data: &[u8]) -> Vec<u32> {
|
||||
let mut composite_reader = CompositeDecoder::new();
|
||||
let mut composite_reader = CompositeDecoder::new();
|
||||
let mut readable: &[u8] = data;
|
||||
let uncompressed_len = VInt::deserialize(&mut readable).unwrap().0 as usize;
|
||||
composite_reader.uncompress_unsorted(readable, uncompressed_len);
|
||||
@@ -27,17 +27,16 @@ fn read_positions(data: &[u8]) -> Vec<u32> {
|
||||
|
||||
|
||||
impl FreqHandler {
|
||||
|
||||
/// Returns a `FreqHandler` that just decodes `DocId`s.
|
||||
pub fn new_without_freq() -> FreqHandler {
|
||||
FreqHandler {
|
||||
freq_decoder: SIMDBlockDecoder::with_val(1u32),
|
||||
positions: Vec::new(),
|
||||
positions: Vec::new(),
|
||||
option: SegmentPostingsOption::NoFreq,
|
||||
positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1],
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Returns a `FreqHandler` that decodes `DocId`s and term frequencies.
|
||||
pub fn new_with_freq() -> FreqHandler {
|
||||
FreqHandler {
|
||||
@@ -54,15 +53,15 @@ impl FreqHandler {
|
||||
let positions = read_positions(position_data);
|
||||
FreqHandler {
|
||||
freq_decoder: SIMDBlockDecoder::new(),
|
||||
positions: positions,
|
||||
positions: positions,
|
||||
option: SegmentPostingsOption::FreqAndPositions,
|
||||
positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1],
|
||||
}
|
||||
}
|
||||
|
||||
fn fill_positions_offset(&mut self,) {
|
||||
|
||||
fn fill_positions_offset(&mut self) {
|
||||
let mut cur_position: usize = self.positions_offsets[NUM_DOCS_PER_BLOCK];
|
||||
let mut i: usize = 0;
|
||||
let mut i: usize = 0;
|
||||
self.positions_offsets[i] = cur_position;
|
||||
let mut last_cur_position = cur_position;
|
||||
for &doc_freq in self.freq_decoder.output_array() {
|
||||
@@ -78,16 +77,16 @@ impl FreqHandler {
|
||||
last_cur_position = cur_position;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/// Accessor to term frequency
|
||||
///
|
||||
/// idx is the offset of the current doc in the block.
|
||||
/// It takes value between 0 and 128.
|
||||
pub fn freq(&self, idx: usize)-> u32 {
|
||||
pub fn freq(&self, idx: usize) -> u32 {
|
||||
self.freq_decoder.output(idx)
|
||||
}
|
||||
|
||||
|
||||
/// Accessor to the positions
|
||||
///
|
||||
/// idx is the offset of the current doc in the block.
|
||||
@@ -95,18 +94,14 @@ impl FreqHandler {
|
||||
pub fn positions(&self, idx: usize) -> &[u32] {
|
||||
let start = self.positions_offsets[idx];
|
||||
let stop = self.positions_offsets[idx + 1];
|
||||
&self.positions[start..stop]
|
||||
&self.positions[start..stop]
|
||||
}
|
||||
|
||||
|
||||
/// Decompresses a complete frequency block
|
||||
pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
|
||||
match self.option {
|
||||
SegmentPostingsOption::NoFreq => {
|
||||
data
|
||||
}
|
||||
SegmentPostingsOption::Freq => {
|
||||
self.freq_decoder.uncompress_block_unsorted(data)
|
||||
}
|
||||
SegmentPostingsOption::NoFreq => data,
|
||||
SegmentPostingsOption::Freq => self.freq_decoder.uncompress_block_unsorted(data),
|
||||
SegmentPostingsOption::FreqAndPositions => {
|
||||
let remaining: &'a [u8] = self.freq_decoder.uncompress_block_unsorted(data);
|
||||
self.fill_positions_offset();
|
||||
@@ -114,7 +109,7 @@ impl FreqHandler {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Decompresses an incomplete frequency block
|
||||
pub fn read_freq_vint(&mut self, data: &[u8], num_els: usize) {
|
||||
match self.option {
|
||||
@@ -128,5 +123,4 @@ impl FreqHandler {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,90 +1,80 @@
|
||||
use postings::DocSet;
|
||||
use std::cmp::Ordering;
|
||||
use postings::SkipResult;
|
||||
use DocId;
|
||||
|
||||
// TODO Find a way to specialize `IntersectionDocSet`
|
||||
|
||||
/// Creates a `DocSet` that iterator through the intersection of two `DocSet`s.
|
||||
pub struct IntersectionDocSet<'a> {
|
||||
left: Box<DocSet + 'a>,
|
||||
right: Box<DocSet + 'a>,
|
||||
finished: bool,
|
||||
pub struct IntersectionDocSet<TDocSet: DocSet> {
|
||||
docsets: Vec<TDocSet>,
|
||||
finished: bool,
|
||||
doc: DocId,
|
||||
}
|
||||
|
||||
impl<'a> IntersectionDocSet<'a> {
|
||||
|
||||
/// Intersect two `DocSet`s
|
||||
fn from_pair(left: Box<DocSet + 'a>, right: Box<DocSet + 'a>) -> IntersectionDocSet<'a> {
|
||||
impl<TDocSet: DocSet> From<Vec<TDocSet>> for IntersectionDocSet<TDocSet> {
|
||||
fn from(docsets: Vec<TDocSet>) -> IntersectionDocSet<TDocSet> {
|
||||
assert!(docsets.len() >= 2);
|
||||
IntersectionDocSet {
|
||||
left: left,
|
||||
right: right,
|
||||
docsets: docsets,
|
||||
finished: false,
|
||||
}
|
||||
doc: DocId::max_value(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Intersect a list of `DocSet`s
|
||||
pub fn new(mut postings: Vec<Box<DocSet + 'a>>) -> IntersectionDocSet<'a> {
|
||||
let left = postings.pop().unwrap();
|
||||
let right =
|
||||
if postings.len() == 1 {
|
||||
postings.pop().unwrap()
|
||||
}
|
||||
else {
|
||||
Box::new(IntersectionDocSet::new(postings))
|
||||
};
|
||||
IntersectionDocSet::from_pair(left, right)
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> IntersectionDocSet<TDocSet> {
|
||||
/// Returns an array to the underlying `DocSet`s of the intersection.
|
||||
/// These `DocSet` are in the same position as the `IntersectionDocSet`,
|
||||
/// so that user can access their `docfreq` and `positions`.
|
||||
pub fn docsets(&self) -> &[TDocSet] {
|
||||
&self.docsets[..]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<'a> DocSet for IntersectionDocSet<'a> {
|
||||
|
||||
fn advance(&mut self,) -> bool {
|
||||
impl<TDocSet: DocSet> DocSet for IntersectionDocSet<TDocSet> {
|
||||
fn advance(&mut self) -> bool {
|
||||
if self.finished {
|
||||
return false;
|
||||
}
|
||||
|
||||
if !self.left.advance() {
|
||||
self.finished = true;
|
||||
return false;
|
||||
}
|
||||
if !self.right.advance() {
|
||||
self.finished = true;
|
||||
return false;
|
||||
}
|
||||
let num_docsets = self.docsets.len();
|
||||
let mut count_matching = 1;
|
||||
let mut doc_candidate = {
|
||||
let mut first_docset = &mut self.docsets[0];
|
||||
if !first_docset.advance() {
|
||||
self.finished = true;
|
||||
return false;
|
||||
}
|
||||
first_docset.doc()
|
||||
};
|
||||
let mut ord = 1;
|
||||
loop {
|
||||
match self.left.doc().cmp(&self.right.doc()) {
|
||||
Ordering::Equal => {
|
||||
return true;
|
||||
}
|
||||
Ordering::Less => {
|
||||
if !self.left.advance() {
|
||||
self.finished = true;
|
||||
return false;
|
||||
let mut doc_set = &mut self.docsets[ord];
|
||||
match doc_set.skip_next(doc_candidate) {
|
||||
SkipResult::Reached => {
|
||||
count_matching += 1;
|
||||
if count_matching == num_docsets {
|
||||
self.doc = doc_candidate;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
Ordering::Greater => {
|
||||
if !self.right.advance() {
|
||||
self.finished = true;
|
||||
return false;
|
||||
}
|
||||
SkipResult::End => {
|
||||
self.finished = true;
|
||||
return false;
|
||||
}
|
||||
SkipResult::OverStep => {
|
||||
count_matching = 1;
|
||||
doc_candidate = doc_set.doc();
|
||||
}
|
||||
}
|
||||
ord += 1;
|
||||
if ord == num_docsets {
|
||||
ord = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn doc(&self,) -> DocId {
|
||||
self.left.doc()
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.doc
|
||||
}
|
||||
}
|
||||
|
||||
/// Intersects a `Vec` of `DocSets`
|
||||
pub fn intersection<'a, TDocSet: DocSet + 'a>(postings: Vec<TDocSet>) -> IntersectionDocSet<'a> {
|
||||
let boxed_postings: Vec<Box<DocSet + 'a>> = postings
|
||||
.into_iter()
|
||||
.map(|postings: TDocSet| {
|
||||
Box::new(postings) as Box<DocSet + 'a>
|
||||
})
|
||||
.collect();
|
||||
IntersectionDocSet::new(boxed_postings)
|
||||
}
|
||||
|
||||
@@ -29,12 +29,11 @@ pub use self::postings::Postings;
|
||||
|
||||
#[cfg(test)]
|
||||
pub use self::vec_postings::VecPostings;
|
||||
|
||||
pub use self::chained_postings::ChainedPostings;
|
||||
pub use self::segment_postings::SegmentPostings;
|
||||
pub use self::intersection::intersection;
|
||||
pub use self::intersection::IntersectionDocSet;
|
||||
pub use self::freq_handler::FreqHandler;
|
||||
|
||||
pub use self::segment_postings_option::SegmentPostingsOption;
|
||||
pub use common::HasLen;
|
||||
|
||||
@@ -49,6 +48,7 @@ mod tests {
|
||||
use core::Index;
|
||||
use std::iter;
|
||||
use datastruct::stacker::Heap;
|
||||
use query::TermQuery;
|
||||
|
||||
|
||||
#[test]
|
||||
@@ -72,7 +72,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_position_and_fieldnorm_write_fullstack() {
|
||||
pub fn test_position_and_fieldnorm() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
@@ -153,12 +153,43 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_position_and_fieldnorm2() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "g b b d c g c");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "g a b b a d c g c");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
let term_query = TermQuery::from(Term::from_field_text(text_field, "a"));
|
||||
let searcher = index.searcher();
|
||||
let mut term_weight = term_query.specialized_weight(&*searcher);
|
||||
term_weight.segment_postings_options = SegmentPostingsOption::FreqAndPositions;
|
||||
let segment_reader = &searcher.segment_readers()[0];
|
||||
let mut term_scorer = term_weight.specialized_scorer(segment_reader).unwrap();
|
||||
assert!(term_scorer.advance());
|
||||
assert_eq!(term_scorer.doc(), 1u32);
|
||||
assert_eq!(term_scorer.postings().positions(), &[1u32, 4]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intersection() {
|
||||
{
|
||||
let left = Box::new(VecPostings::from(vec!(1, 3, 9)));
|
||||
let right = Box::new(VecPostings::from(vec!(3, 4, 9, 18)));
|
||||
let mut intersection = IntersectionDocSet::new(vec!(left, right));
|
||||
let left = VecPostings::from(vec!(1, 3, 9));
|
||||
let right = VecPostings::from(vec!(3, 4, 9, 18));
|
||||
let mut intersection = IntersectionDocSet::from(vec!(left, right));
|
||||
assert!(intersection.advance());
|
||||
assert_eq!(intersection.doc(), 3);
|
||||
assert!(intersection.advance());
|
||||
@@ -166,10 +197,10 @@ mod tests {
|
||||
assert!(!intersection.advance());
|
||||
}
|
||||
{
|
||||
let a = Box::new(VecPostings::from(vec!(1, 3, 9)));
|
||||
let b = Box::new(VecPostings::from(vec!(3, 4, 9, 18)));
|
||||
let c = Box::new(VecPostings::from(vec!(1, 5, 9, 111)));
|
||||
let mut intersection = IntersectionDocSet::new(vec!(a, b, c));
|
||||
let a = VecPostings::from(vec!(1, 3, 9));
|
||||
let b = VecPostings::from(vec!(3, 4, 9, 18));
|
||||
let c = VecPostings::from(vec!(1, 5, 9, 111));
|
||||
let mut intersection = IntersectionDocSet::from(vec!(a, b, c));
|
||||
assert!(intersection.advance());
|
||||
assert_eq!(intersection.doc(), 9);
|
||||
assert!(!intersection.advance());
|
||||
|
||||
@@ -15,7 +15,6 @@ pub struct OffsetPostings<'a> {
|
||||
}
|
||||
|
||||
impl<'a> OffsetPostings<'a> {
|
||||
|
||||
/// Constructor
|
||||
pub fn new(underlying: SegmentPostings<'a>, offset: DocId) -> OffsetPostings {
|
||||
OffsetPostings {
|
||||
@@ -26,38 +25,35 @@ impl<'a> OffsetPostings<'a> {
|
||||
}
|
||||
|
||||
impl<'a> DocSet for OffsetPostings<'a> {
|
||||
fn advance(&mut self,) -> bool {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.underlying.advance()
|
||||
}
|
||||
|
||||
fn doc(&self,) -> DocId {
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.underlying.doc() + self.offset
|
||||
}
|
||||
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
if target >= self.offset {
|
||||
SkipResult::OverStep
|
||||
}
|
||||
else {
|
||||
self.underlying.skip_next(target - self.offset)
|
||||
} else {
|
||||
self.underlying.skip_next(target - self.offset)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> HasLen for OffsetPostings<'a> {
|
||||
fn len(&self,) -> usize {
|
||||
fn len(&self) -> usize {
|
||||
self.underlying.len()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Postings for OffsetPostings<'a> {
|
||||
|
||||
fn term_freq(&self,) -> u32 {
|
||||
fn term_freq(&self) -> u32 {
|
||||
self.underlying.term_freq()
|
||||
}
|
||||
|
||||
|
||||
fn positions(&self) -> &[u32] {
|
||||
self.underlying.positions()
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,8 +1,5 @@
|
||||
use std::borrow::Borrow;
|
||||
use postings::docset::DocSet;
|
||||
use common::HasLen;
|
||||
|
||||
|
||||
|
||||
/// Postings (also called inverted list)
|
||||
///
|
||||
@@ -10,58 +7,38 @@ use common::HasLen;
|
||||
/// containing the term. Optionally, for each document,
|
||||
/// it may also give access to the term frequency
|
||||
/// as well as the list of term positions.
|
||||
///
|
||||
///
|
||||
/// Its main implementation is `SegmentPostings`,
|
||||
/// but other implementations mocking `SegmentPostings` exist,
|
||||
/// for merging segments or for testing.
|
||||
pub trait Postings: DocSet {
|
||||
/// Returns the term frequency
|
||||
fn term_freq(&self,) -> u32;
|
||||
fn term_freq(&self) -> u32;
|
||||
/// Returns the list of positions of the term, expressed as a list of
|
||||
/// token ordinals.
|
||||
fn positions(&self) -> &[u32];
|
||||
}
|
||||
|
||||
impl<TPostings: Postings> Postings for Box<TPostings> {
|
||||
|
||||
fn term_freq(&self,) -> u32 {
|
||||
fn term_freq(&self) -> u32 {
|
||||
let unboxed: &TPostings = self.borrow();
|
||||
unboxed.term_freq()
|
||||
}
|
||||
|
||||
|
||||
fn positions(&self) -> &[u32] {
|
||||
let unboxed: &TPostings = self.borrow();
|
||||
unboxed.positions()
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
impl<'a, TPostings: Postings> Postings for &'a mut TPostings {
|
||||
|
||||
fn term_freq(&self,) -> u32 {
|
||||
fn term_freq(&self) -> u32 {
|
||||
let unref: &TPostings = *self;
|
||||
unref.term_freq()
|
||||
}
|
||||
|
||||
|
||||
fn positions(&self) -> &[u32] {
|
||||
let unref: &TPostings = *self;
|
||||
unref.positions()
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
impl<THasLen: HasLen> HasLen for Box<THasLen> {
|
||||
fn len(&self,) -> usize {
|
||||
let unboxed: &THasLen = self.borrow();
|
||||
unboxed.borrow().len()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> HasLen for &'a HasLen {
|
||||
fn len(&self,) -> usize {
|
||||
let unref: &HasLen = *self;
|
||||
unref.len()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,12 +9,11 @@ use schema::Field;
|
||||
use analyzer::StreamingIterator;
|
||||
use datastruct::stacker::{HashMap, Heap};
|
||||
|
||||
/// The `PostingsWriter` is in charge of receiving documenting
|
||||
/// The `PostingsWriter` is in charge of receiving documenting
|
||||
/// and building a `Segment` in anonymous memory.
|
||||
///
|
||||
/// `PostingsWriter` writes in a `Heap`.
|
||||
pub trait PostingsWriter {
|
||||
|
||||
/// Record that a document contains a term at a given position.
|
||||
///
|
||||
/// * doc - the document id
|
||||
@@ -22,17 +21,22 @@ pub trait PostingsWriter {
|
||||
/// * term - the term
|
||||
/// * heap - heap used to store the postings informations as well as the terms
|
||||
/// in the hashmap.
|
||||
fn suscribe(&mut self, doc: DocId, pos: u32, term: &Term, heap: &Heap);
|
||||
|
||||
fn suscribe(&mut self, doc: DocId, pos: u32, term: &Term, heap: &Heap);
|
||||
|
||||
/// Serializes the postings on disk.
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
|
||||
|
||||
|
||||
/// Closes all of the currently open `Recorder`'s.
|
||||
fn close(&mut self, heap: &Heap);
|
||||
|
||||
|
||||
/// Tokenize a text and suscribe all of its token.
|
||||
fn index_text<'a>(&mut self, doc_id: DocId, field: Field, field_values: &[&'a FieldValue], heap: &Heap) -> u32 {
|
||||
fn index_text<'a>(&mut self,
|
||||
doc_id: DocId,
|
||||
field: Field,
|
||||
field_values: &[&'a FieldValue],
|
||||
heap: &Heap)
|
||||
-> u32 {
|
||||
let mut pos = 0u32;
|
||||
let mut num_tokens: u32 = 0u32;
|
||||
let mut term = Term::allocate(field, 100);
|
||||
@@ -65,7 +69,7 @@ fn hashmap_size_in_bits(heap_capacity: u32) -> usize {
|
||||
let num_buckets_usable = heap_capacity / 100;
|
||||
let hash_table_size = num_buckets_usable * 2;
|
||||
let mut pow = 512;
|
||||
for num_bits in 10 .. 32 {
|
||||
for num_bits in 10..32 {
|
||||
pow <<= 1;
|
||||
if pow > hash_table_size {
|
||||
return num_bits;
|
||||
@@ -75,31 +79,26 @@ fn hashmap_size_in_bits(heap_capacity: u32) -> usize {
|
||||
}
|
||||
|
||||
impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
|
||||
|
||||
/// constructor
|
||||
pub fn new(heap: &'a Heap) -> SpecializedPostingsWriter<'a, Rec> {
|
||||
let capacity = heap.capacity();
|
||||
let hashmap_size = hashmap_size_in_bits(capacity);
|
||||
SpecializedPostingsWriter {
|
||||
term_index: HashMap::new(hashmap_size, heap),
|
||||
}
|
||||
SpecializedPostingsWriter { term_index: HashMap::new(hashmap_size, heap) }
|
||||
}
|
||||
|
||||
|
||||
/// Builds a `SpecializedPostingsWriter` storing its data in a heap.
|
||||
pub fn new_boxed(heap: &'a Heap) -> Box<PostingsWriter + 'a> {
|
||||
Box::new(SpecializedPostingsWriter::<Rec>::new(heap))
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
|
||||
|
||||
fn close(&mut self, heap: &Heap) {
|
||||
for recorder in self.term_index.values_mut() {
|
||||
recorder.close_doc(heap);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[inline]
|
||||
fn suscribe(&mut self, doc: DocId, position: u32, term: &Term, heap: &Heap) {
|
||||
let mut recorder = self.term_index.get_or_create(term);
|
||||
@@ -112,9 +111,9 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
|
||||
}
|
||||
recorder.record_position(position, heap);
|
||||
}
|
||||
|
||||
|
||||
fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()> {
|
||||
let mut term_offsets: Vec<(&[u8], (u32, &Rec))> = self.term_index
|
||||
let mut term_offsets: Vec<(&[u8], (u32, &Rec))> = self.term_index
|
||||
.iter()
|
||||
.collect();
|
||||
term_offsets.sort_by_key(|&(k, _v)| k);
|
||||
@@ -128,8 +127,6 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -4,32 +4,36 @@ use postings::PostingsSerializer;
|
||||
use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable};
|
||||
|
||||
const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
|
||||
const POSITION_END: u32 = 4294967295;
|
||||
const POSITION_END: u32 = 4294967295;
|
||||
|
||||
/// Recorder is in charge of recording relevant information about
|
||||
/// the presence of a term in a document.
|
||||
///
|
||||
/// Depending on the `TextIndexingOptions` associated to the
|
||||
/// Depending on the `TextIndexingOptions` associated to the
|
||||
/// field, the recorder may records
|
||||
/// * the document frequency
|
||||
/// * the document id
|
||||
/// * the document id
|
||||
/// * the term frequency
|
||||
/// * the term positions
|
||||
pub trait Recorder: HeapAllocable {
|
||||
/// Returns the current document
|
||||
fn current_doc(&self,) -> u32;
|
||||
fn current_doc(&self) -> u32;
|
||||
/// Starts recording information about a new document
|
||||
/// This method shall only be called if the term is within the document.
|
||||
/// This method shall only be called if the term is within the document.
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap);
|
||||
/// Record the position of a term. For each document,
|
||||
/// Record the position of a term. For each document,
|
||||
/// this method will be called `term_freq` times.
|
||||
fn record_position(&mut self, position: u32, heap: &Heap);
|
||||
/// Close the document. It will help record the term frequency.
|
||||
/// Close the document. It will help record the term frequency.
|
||||
fn close_doc(&mut self, heap: &Heap);
|
||||
/// Returns the number of document that have been seen so far
|
||||
fn doc_freq(&self,) -> u32;
|
||||
fn doc_freq(&self) -> u32;
|
||||
/// Pushes the postings information to the serializer.
|
||||
fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
|
||||
fn serialize(&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut PostingsSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()>;
|
||||
}
|
||||
|
||||
/// Only records the doc ids
|
||||
@@ -51,11 +55,10 @@ impl HeapAllocable for NothingRecorder {
|
||||
}
|
||||
|
||||
impl Recorder for NothingRecorder {
|
||||
|
||||
fn current_doc(&self,) -> DocId {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
|
||||
self.current_doc = doc;
|
||||
self.stack.push(doc, heap);
|
||||
@@ -66,17 +69,20 @@ impl Recorder for NothingRecorder {
|
||||
|
||||
fn close_doc(&mut self, _heap: &Heap) {}
|
||||
|
||||
fn doc_freq(&self,) -> u32 {
|
||||
fn doc_freq(&self) -> u32 {
|
||||
self.doc_freq
|
||||
}
|
||||
|
||||
fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()> {
|
||||
|
||||
fn serialize(&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut PostingsSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()> {
|
||||
for doc in self.stack.iter(self_addr, heap) {
|
||||
try!(serializer.write_doc(doc, 0u32, &EMPTY_ARRAY));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// Recorder encoding document ids, and term frequencies
|
||||
@@ -94,16 +100,13 @@ impl HeapAllocable for TermFrequencyRecorder {
|
||||
stack: ExpUnrolledLinkedList::with_addr(addr),
|
||||
current_doc: u32::max_value(),
|
||||
current_tf: 0u32,
|
||||
doc_freq: 0u32
|
||||
}
|
||||
doc_freq: 0u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Recorder for TermFrequencyRecorder {
|
||||
|
||||
|
||||
|
||||
fn current_doc(&self,) -> DocId {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
@@ -112,22 +115,26 @@ impl Recorder for TermFrequencyRecorder {
|
||||
self.current_doc = doc;
|
||||
self.stack.push(doc, heap);
|
||||
}
|
||||
|
||||
|
||||
fn record_position(&mut self, _position: u32, _heap: &Heap) {
|
||||
self.current_tf += 1;
|
||||
}
|
||||
|
||||
|
||||
fn close_doc(&mut self, heap: &Heap) {
|
||||
debug_assert!(self.current_tf > 0);
|
||||
self.stack.push(self.current_tf, heap);
|
||||
self.current_tf = 0;
|
||||
}
|
||||
|
||||
fn doc_freq(&self,) -> u32 {
|
||||
|
||||
fn doc_freq(&self) -> u32 {
|
||||
self.doc_freq
|
||||
}
|
||||
|
||||
fn serialize(&self, self_addr:u32, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()> {
|
||||
|
||||
fn serialize(&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut PostingsSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()> {
|
||||
let mut doc_iter = self.stack.iter(self_addr, heap);
|
||||
loop {
|
||||
if let Some(doc) = doc_iter.next() {
|
||||
@@ -140,7 +147,6 @@ impl Recorder for TermFrequencyRecorder {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// Recorder encoding term frequencies as well as positions.
|
||||
@@ -162,12 +168,10 @@ impl HeapAllocable for TFAndPositionRecorder {
|
||||
}
|
||||
|
||||
impl Recorder for TFAndPositionRecorder {
|
||||
|
||||
|
||||
fn current_doc(&self,) -> DocId {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
|
||||
self.doc_freq += 1;
|
||||
self.current_doc = doc;
|
||||
@@ -177,16 +181,20 @@ impl Recorder for TFAndPositionRecorder {
|
||||
fn record_position(&mut self, position: u32, heap: &Heap) {
|
||||
self.stack.push(position, heap);
|
||||
}
|
||||
|
||||
|
||||
fn close_doc(&mut self, heap: &Heap) {
|
||||
self.stack.push(POSITION_END, heap);
|
||||
}
|
||||
|
||||
fn doc_freq(&self,) -> u32 {
|
||||
|
||||
fn doc_freq(&self) -> u32 {
|
||||
self.doc_freq
|
||||
}
|
||||
|
||||
fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()> {
|
||||
|
||||
fn serialize(&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut PostingsSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()> {
|
||||
let mut doc_positions = Vec::with_capacity(100);
|
||||
let mut positions_iter = self.stack.iter(self_addr, heap);
|
||||
while let Some(doc) = positions_iter.next() {
|
||||
@@ -197,8 +205,7 @@ impl Recorder for TFAndPositionRecorder {
|
||||
Some(position) => {
|
||||
if position == POSITION_END {
|
||||
break;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
doc_positions.push(position - prev_position);
|
||||
prev_position = position;
|
||||
}
|
||||
@@ -212,7 +219,4 @@ impl Recorder for TFAndPositionRecorder {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -4,11 +4,11 @@ use postings::{Postings, FreqHandler, DocSet, HasLen};
|
||||
use std::num::Wrapping;
|
||||
|
||||
|
||||
const EMPTY_DATA: [u8; 0] = [0u8; 0];
|
||||
|
||||
|
||||
/// `SegmentPostings` represents the inverted list or postings associated to
|
||||
/// `SegmentPostings` represents the inverted list or postings associated to
|
||||
/// a term in a `Segment`.
|
||||
///
|
||||
///
|
||||
/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded.
|
||||
/// Positions on the other hand, are optionally entirely decoded upfront.
|
||||
pub struct SegmentPostings<'a> {
|
||||
@@ -21,16 +21,16 @@ pub struct SegmentPostings<'a> {
|
||||
}
|
||||
|
||||
impl<'a> SegmentPostings<'a> {
|
||||
|
||||
fn load_next_block(&mut self,) {
|
||||
fn load_next_block(&mut self) {
|
||||
let num_remaining_docs = self.len - self.cur.0;
|
||||
if num_remaining_docs >= NUM_DOCS_PER_BLOCK {
|
||||
self.remaining_data = self.block_decoder.uncompress_block_sorted(self.remaining_data, self.doc_offset);
|
||||
self.remaining_data = self.block_decoder
|
||||
.uncompress_block_sorted(self.remaining_data, self.doc_offset);
|
||||
self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data);
|
||||
self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
|
||||
}
|
||||
else {
|
||||
self.remaining_data = self.block_decoder.uncompress_vint_sorted(self.remaining_data, self.doc_offset, num_remaining_docs);
|
||||
} else {
|
||||
self.remaining_data = self.block_decoder
|
||||
.uncompress_vint_sorted(self.remaining_data, self.doc_offset, num_remaining_docs);
|
||||
self.freq_handler.read_freq_vint(self.remaining_data, num_remaining_docs);
|
||||
}
|
||||
}
|
||||
@@ -39,7 +39,7 @@ impl<'a> SegmentPostings<'a> {
|
||||
///
|
||||
/// * `len` - number of document in the posting lists.
|
||||
/// * `data` - data array. The complete data is not necessarily used.
|
||||
/// * `freq_handler` - the freq handler is in charge of decoding
|
||||
/// * `freq_handler` - the freq handler is in charge of decoding
|
||||
/// frequencies and/or positions
|
||||
pub fn from_data(len: u32, data: &'a [u8], freq_handler: FreqHandler) -> SegmentPostings<'a> {
|
||||
SegmentPostings {
|
||||
@@ -51,22 +51,32 @@ impl<'a> SegmentPostings<'a> {
|
||||
cur: Wrapping(usize::max_value()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Index within a block is used as an address when
|
||||
/// interacting with the `FreqHandler`
|
||||
fn index_within_block(&self,) -> usize {
|
||||
self.cur.0 % NUM_DOCS_PER_BLOCK
|
||||
|
||||
/// Returns an empty segment postings object
|
||||
pub fn empty() -> SegmentPostings<'static> {
|
||||
SegmentPostings {
|
||||
len: 0,
|
||||
doc_offset: 0,
|
||||
block_decoder: SIMDBlockDecoder::new(),
|
||||
freq_handler: FreqHandler::new_without_freq(),
|
||||
remaining_data: &EMPTY_DATA,
|
||||
cur: Wrapping(usize::max_value()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Index within a block is used as an address when
|
||||
/// interacting with the `FreqHandler`
|
||||
fn index_within_block(&self) -> usize {
|
||||
self.cur.0 % NUM_DOCS_PER_BLOCK
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<'a> DocSet for SegmentPostings<'a> {
|
||||
|
||||
// goes to the next element.
|
||||
// next needs to be called a first time to point to the correct element.
|
||||
#[inline]
|
||||
fn advance(&mut self,) -> bool {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.cur += Wrapping(1);
|
||||
if self.cur.0 >= self.len {
|
||||
return false;
|
||||
@@ -76,27 +86,25 @@ impl<'a> DocSet for SegmentPostings<'a> {
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
|
||||
#[inline]
|
||||
fn doc(&self,) -> DocId {
|
||||
fn doc(&self) -> DocId {
|
||||
self.block_decoder.output(self.index_within_block())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
impl<'a> HasLen for SegmentPostings<'a> {
|
||||
fn len(&self,) -> usize {
|
||||
fn len(&self) -> usize {
|
||||
self.len
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Postings for SegmentPostings<'a> {
|
||||
fn term_freq(&self,) -> u32 {
|
||||
fn term_freq(&self) -> u32 {
|
||||
self.freq_handler.freq(self.index_within_block())
|
||||
}
|
||||
|
||||
|
||||
fn positions(&self) -> &[u32] {
|
||||
self.freq_handler.positions(self.index_within_block())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2,10 +2,11 @@
|
||||
|
||||
/// Object describing the amount of information required when reading a postings.
|
||||
///
|
||||
/// Since decoding information is not free, this makes it possible to
|
||||
/// Since decoding information is not free, this makes it possible to
|
||||
/// avoid this extra cost when the information is not required.
|
||||
/// For instance, positions are useful when running phrase queries
|
||||
/// but useless in other queries,
|
||||
/// but useless in other queries.
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum SegmentPostingsOption {
|
||||
/// Only the doc ids are decoded
|
||||
NoFreq,
|
||||
@@ -13,4 +14,4 @@ pub enum SegmentPostingsOption {
|
||||
Freq,
|
||||
/// DocIds, term frequencies and positions will be decoded.
|
||||
FreqAndPositions,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,14 +19,14 @@ use common::BinarySerializable;
|
||||
|
||||
|
||||
/// `PostingsSerializer` is in charge of serializing
|
||||
/// postings on disk, in the
|
||||
/// postings on disk, in the
|
||||
/// * `.idx` (inverted index)
|
||||
/// * `.pos` (positions file)
|
||||
/// * `.term` (term dictionary)
|
||||
///
|
||||
/// `PostingsWriter` are in charge of pushing the data to the
|
||||
///
|
||||
/// `PostingsWriter` are in charge of pushing the data to the
|
||||
/// serializer.
|
||||
///
|
||||
///
|
||||
/// The serializer expects to receive the following calls
|
||||
/// in this order :
|
||||
///
|
||||
@@ -45,10 +45,10 @@ use common::BinarySerializable;
|
||||
/// Terms have to be pushed in a lexicographically-sorted order.
|
||||
/// Within a term, document have to be pushed in increasing order.
|
||||
///
|
||||
/// A description of the serialization format is
|
||||
/// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html).
|
||||
/// A description of the serialization format is
|
||||
/// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html).
|
||||
pub struct PostingsSerializer {
|
||||
terms_fst_builder: FstMapBuilder<WritePtr, TermInfo>, // TODO find an alternative to work around the "move"
|
||||
terms_fst_builder: FstMapBuilder<WritePtr, TermInfo>, /* TODO find an alternative to work around the "move" */
|
||||
postings_write: WritePtr,
|
||||
positions_write: WritePtr,
|
||||
written_bytes_postings: usize,
|
||||
@@ -65,14 +65,13 @@ pub struct PostingsSerializer {
|
||||
}
|
||||
|
||||
impl PostingsSerializer {
|
||||
|
||||
/// Open a new `PostingsSerializer` for the given segment
|
||||
pub fn open(segment: &mut Segment) -> Result<PostingsSerializer> {
|
||||
let terms_write = try!(segment.open_write(SegmentComponent::TERMS));
|
||||
/// Open a new `PostingsSerializer` for the given segment
|
||||
pub fn new(terms_write: WritePtr,
|
||||
postings_write: WritePtr,
|
||||
positions_write: WritePtr,
|
||||
schema: Schema)
|
||||
-> Result<PostingsSerializer> {
|
||||
let terms_fst_builder = try!(FstMapBuilder::new(terms_write));
|
||||
let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS));
|
||||
let positions_write = try!(segment.open_write(SegmentComponent::POSITIONS));
|
||||
let schema = segment.schema();
|
||||
Ok(PostingsSerializer {
|
||||
terms_fst_builder: terms_fst_builder,
|
||||
postings_write: postings_write,
|
||||
@@ -90,27 +89,36 @@ impl PostingsSerializer {
|
||||
term_open: false,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Open a new `PostingsSerializer` for the given segment
|
||||
pub fn open(segment: &mut Segment) -> Result<PostingsSerializer> {
|
||||
let terms_write = try!(segment.open_write(SegmentComponent::TERMS));
|
||||
let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS));
|
||||
let positions_write = try!(segment.open_write(SegmentComponent::POSITIONS));
|
||||
PostingsSerializer::new(terms_write,
|
||||
postings_write,
|
||||
positions_write,
|
||||
segment.schema())
|
||||
}
|
||||
|
||||
fn load_indexing_options(&mut self, field: Field) {
|
||||
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
|
||||
self.text_indexing_options = match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => {
|
||||
text_options.get_indexing_options()
|
||||
}
|
||||
FieldType::Str(ref text_options) => text_options.get_indexing_options(),
|
||||
FieldType::U32(ref u32_options) => {
|
||||
if u32_options.is_indexed() {
|
||||
TextIndexingOptions::Unindexed
|
||||
}
|
||||
else {
|
||||
TextIndexingOptions::Untokenized
|
||||
} else {
|
||||
TextIndexingOptions::Untokenized
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
/// Starts the postings for a new term.
|
||||
/// * term - the term. It needs to come after the previous term according
|
||||
/// to the lexicographical order.
|
||||
/// to the lexicographical order.
|
||||
/// * doc_freq - return the number of document containing the term.
|
||||
pub fn new_term(&mut self, term: &Term, doc_freq: DocId) -> io::Result<()> {
|
||||
if self.term_open {
|
||||
@@ -130,31 +138,34 @@ impl PostingsSerializer {
|
||||
self.terms_fst_builder
|
||||
.insert(term.as_slice(), &term_info)
|
||||
}
|
||||
|
||||
|
||||
/// Finish the serialization for this term postings.
|
||||
///
|
||||
/// If the current block is incomplete, it need to be encoded
|
||||
/// using `VInt` encoding.
|
||||
pub fn close_term(&mut self,) -> io::Result<()> {
|
||||
/// using `VInt` encoding.
|
||||
pub fn close_term(&mut self) -> io::Result<()> {
|
||||
if self.term_open {
|
||||
if !self.doc_ids.is_empty() {
|
||||
// we have doc ids waiting to be written
|
||||
// this happens when the number of doc ids is
|
||||
// this happens when the number of doc ids is
|
||||
// not a perfect multiple of our block size.
|
||||
//
|
||||
// In that case, the remaining part is encoded
|
||||
// using variable int encoding.
|
||||
{
|
||||
let block_encoded = self.block_encoder.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
let block_encoded = self.block_encoder
|
||||
.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
self.written_bytes_postings += block_encoded.len();
|
||||
try!(self.postings_write.write_all(block_encoded));
|
||||
self.doc_ids.clear();
|
||||
}
|
||||
// ... Idem for term frequencies
|
||||
// ... Idem for term frequencies
|
||||
if self.text_indexing_options.is_termfreq_enabled() {
|
||||
let block_encoded = self.block_encoder.compress_vint_unsorted(&self.term_freqs[..]);
|
||||
let block_encoded = self.block_encoder
|
||||
.compress_vint_unsorted(&self.term_freqs[..]);
|
||||
for num in block_encoded {
|
||||
self.written_bytes_postings += try!(num.serialize(&mut self.postings_write));
|
||||
self.written_bytes_postings +=
|
||||
try!(num.serialize(&mut self.postings_write));
|
||||
}
|
||||
self.term_freqs.clear();
|
||||
}
|
||||
@@ -162,8 +173,10 @@ impl PostingsSerializer {
|
||||
// On the other hand, positions are entirely buffered until the
|
||||
// end of the term, at which point they are compressed and written.
|
||||
if self.text_indexing_options.is_position_enabled() {
|
||||
self.written_bytes_positions += try!(VInt(self.position_deltas.len() as u64).serialize(&mut self.positions_write));
|
||||
let positions_encoded: &[u8] = self.positions_encoder.compress_unsorted(&self.position_deltas[..]);
|
||||
self.written_bytes_positions += try!(VInt(self.position_deltas.len() as u64)
|
||||
.serialize(&mut self.positions_write));
|
||||
let positions_encoded: &[u8] = self.positions_encoder
|
||||
.compress_unsorted(&self.position_deltas[..]);
|
||||
try!(self.positions_write.write_all(positions_encoded));
|
||||
self.written_bytes_positions += positions_encoded.len();
|
||||
self.position_deltas.clear();
|
||||
@@ -172,8 +185,8 @@ impl PostingsSerializer {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/// Serialize the information that a document contains the current term,
|
||||
/// its term frequency, and the position deltas.
|
||||
///
|
||||
@@ -183,7 +196,11 @@ impl PostingsSerializer {
|
||||
///
|
||||
/// Term frequencies and positions may be ignored by the serializer depending
|
||||
/// on the configuration of the field in the `Schema`.
|
||||
pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32, position_deltas: &[u32]) -> io::Result<()> {
|
||||
pub fn write_doc(&mut self,
|
||||
doc_id: DocId,
|
||||
term_freq: u32,
|
||||
position_deltas: &[u32])
|
||||
-> io::Result<()> {
|
||||
self.doc_ids.push(doc_id);
|
||||
if self.text_indexing_options.is_termfreq_enabled() {
|
||||
self.term_freqs.push(term_freq as u32);
|
||||
@@ -194,14 +211,16 @@ impl PostingsSerializer {
|
||||
if self.doc_ids.len() == NUM_DOCS_PER_BLOCK {
|
||||
{
|
||||
// encode the doc ids
|
||||
let block_encoded: &[u8] = self.block_encoder.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
let block_encoded: &[u8] = self.block_encoder
|
||||
.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1];
|
||||
try!(self.postings_write.write_all(block_encoded));
|
||||
self.written_bytes_postings += block_encoded.len();
|
||||
}
|
||||
if self.text_indexing_options.is_termfreq_enabled() {
|
||||
// encode the term_freqs
|
||||
let block_encoded: &[u8] = self.block_encoder.compress_block_unsorted(&self.term_freqs);
|
||||
let block_encoded: &[u8] = self.block_encoder
|
||||
.compress_block_unsorted(&self.term_freqs);
|
||||
try!(self.postings_write.write_all(block_encoded));
|
||||
self.written_bytes_postings += block_encoded.len();
|
||||
self.term_freqs.clear();
|
||||
@@ -210,9 +229,9 @@ impl PostingsSerializer {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Closes the serializer.
|
||||
pub fn close(mut self,) -> io::Result<()> {
|
||||
pub fn close(mut self) -> io::Result<()> {
|
||||
try!(self.close_term());
|
||||
try!(self.terms_fst_builder.finish());
|
||||
try!(self.postings_write.flush());
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
#![allow(dead_code)]
|
||||
|
||||
use DocId;
|
||||
use postings::{Postings, DocSet, SkipResult, HasLen};
|
||||
use postings::{Postings, DocSet, HasLen};
|
||||
use std::num::Wrapping;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
const EMPTY_ARRAY: [u32; 0] = [];
|
||||
const EMPTY_ARRAY: [u32; 0] = [];
|
||||
|
||||
/// Simulate a `Postings` objects from a `VecPostings`.
|
||||
/// `VecPostings` only exist for testing purposes.
|
||||
@@ -27,99 +26,43 @@ impl From<Vec<DocId>> for VecPostings {
|
||||
}
|
||||
|
||||
impl DocSet for VecPostings {
|
||||
fn advance(&mut self,) -> bool {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.cursor += Wrapping(1);
|
||||
self.doc_ids.len() > self.cursor.0
|
||||
}
|
||||
|
||||
fn doc(&self,) -> DocId {
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.doc_ids[self.cursor.0]
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
let mut start: usize = self.cursor.0;
|
||||
match self.doc_ids[start].cmp(&target) {
|
||||
Ordering::Equal => {
|
||||
return SkipResult::Reached;
|
||||
}
|
||||
Ordering::Greater => {
|
||||
if self.cursor.0 < self.doc_ids.len() {
|
||||
return SkipResult::OverStep;
|
||||
}
|
||||
else {
|
||||
return SkipResult::End;
|
||||
}
|
||||
}
|
||||
Ordering::Less => {
|
||||
// see below
|
||||
}
|
||||
}
|
||||
|
||||
let mut end = self.doc_ids.len();
|
||||
|
||||
while end - start > 1 {
|
||||
// find an upper bound
|
||||
let mut jump = 1;
|
||||
loop {
|
||||
let jump_dest = start + jump;
|
||||
if jump_dest >= end {
|
||||
// we jump out of bounds
|
||||
break;
|
||||
}
|
||||
match self.doc_ids[jump_dest].cmp(&target) {
|
||||
Ordering::Less => {
|
||||
// still below the target, let's keep jumping.
|
||||
start = jump_dest;
|
||||
jump *= 2;
|
||||
}
|
||||
Ordering::Equal => {
|
||||
self.cursor = Wrapping(jump_dest);
|
||||
return SkipResult::Reached;
|
||||
}
|
||||
Ordering::Greater => {
|
||||
end = jump_dest;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
self.cursor = Wrapping(start + 1);
|
||||
if self.cursor.0 < self.doc_ids.len() {
|
||||
SkipResult::OverStep
|
||||
}
|
||||
else {
|
||||
SkipResult::End
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl HasLen for VecPostings {
|
||||
fn len(&self,) -> usize {
|
||||
fn len(&self) -> usize {
|
||||
self.doc_ids.len()
|
||||
}
|
||||
}
|
||||
|
||||
impl Postings for VecPostings {
|
||||
fn term_freq(&self,) -> u32 {
|
||||
fn term_freq(&self) -> u32 {
|
||||
1u32
|
||||
}
|
||||
|
||||
|
||||
fn positions(&self) -> &[u32] {
|
||||
&EMPTY_ARRAY
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
|
||||
use super::*;
|
||||
use DocId;
|
||||
use postings::{Postings, SkipResult, DocSet};
|
||||
|
||||
|
||||
use DocId;
|
||||
use postings::{Postings, SkipResult, DocSet};
|
||||
|
||||
|
||||
#[test]
|
||||
pub fn test_vec_postings() {
|
||||
let doc_ids: Vec<DocId> = (0u32..1024u32).map(|e| e*3).collect();
|
||||
let doc_ids: Vec<DocId> = (0u32..1024u32).map(|e| e * 3).collect();
|
||||
let mut postings = VecPostings::from(doc_ids);
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 0u32);
|
||||
@@ -132,5 +75,5 @@ pub mod tests {
|
||||
assert_eq!(postings.doc(), 300u32);
|
||||
assert_eq!(postings.skip_next(6000u32), SkipResult::End);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
48
src/query/boolean_query/boolean_query.rs
Normal file
48
src/query/boolean_query/boolean_query.rs
Normal file
@@ -0,0 +1,48 @@
|
||||
use Result;
|
||||
use std::any::Any;
|
||||
use super::boolean_weight::BooleanWeight;
|
||||
use query::Weight;
|
||||
use Searcher;
|
||||
use query::Query;
|
||||
use query::Occur;
|
||||
use query::OccurFilter;
|
||||
|
||||
/// The boolean query combines a set of queries
|
||||
///
|
||||
/// The documents matched by the boolean query are
|
||||
/// those which
|
||||
/// * match all of the sub queries associated with the
|
||||
/// `Must` occurence
|
||||
/// * match none of the sub queries associated with the
|
||||
/// `MustNot` occurence.
|
||||
/// * match at least one of the subqueries that is not
|
||||
/// a `MustNot` occurence.
|
||||
#[derive(Debug)]
|
||||
pub struct BooleanQuery {
|
||||
subqueries: Vec<(Occur, Box<Query>)>
|
||||
}
|
||||
|
||||
impl From<Vec<(Occur, Box<Query>)>> for BooleanQuery {
|
||||
fn from(subqueries: Vec<(Occur, Box<Query>)>) -> BooleanQuery {
|
||||
BooleanQuery { subqueries: subqueries }
|
||||
}
|
||||
}
|
||||
|
||||
impl Query for BooleanQuery {
|
||||
fn as_any(&self) -> &Any {
|
||||
self
|
||||
}
|
||||
|
||||
fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>> {
|
||||
let sub_weights = try!(self.subqueries
|
||||
.iter()
|
||||
.map(|&(ref _occur, ref subquery)| subquery.weight(searcher))
|
||||
.collect());
|
||||
let occurs: Vec<Occur> = self.subqueries
|
||||
.iter()
|
||||
.map(|&(ref occur, ref _subquery)| *occur)
|
||||
.collect();
|
||||
let filter = OccurFilter::new(&occurs);
|
||||
Ok(box BooleanWeight::new(sub_weights, filter))
|
||||
}
|
||||
}
|
||||
149
src/query/boolean_query/boolean_scorer.rs
Normal file
149
src/query/boolean_query/boolean_scorer.rs
Normal file
@@ -0,0 +1,149 @@
|
||||
use query::Scorer;
|
||||
use DocId;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::cmp::Ordering;
|
||||
use postings::DocSet;
|
||||
use query::OccurFilter;
|
||||
use query::boolean_query::ScoreCombiner;
|
||||
|
||||
|
||||
/// Each `HeapItem` represents the head of
|
||||
/// one of scorer being merged.
|
||||
///
|
||||
/// * `doc` - is the current doc id for the given segment postings
|
||||
/// * `ord` - is the ordinal used to identify to which segment postings
|
||||
/// this heap item belong to.
|
||||
#[derive(Eq, PartialEq)]
|
||||
struct HeapItem {
|
||||
doc: DocId,
|
||||
ord: u32,
|
||||
}
|
||||
|
||||
/// `HeapItem` are ordered by the document
|
||||
impl PartialOrd for HeapItem {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for HeapItem {
|
||||
fn cmp(&self, other:&Self) -> Ordering {
|
||||
(other.doc).cmp(&self.doc)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct BooleanScorer<TScorer: Scorer> {
|
||||
scorers: Vec<TScorer>,
|
||||
queue: BinaryHeap<HeapItem>,
|
||||
doc: DocId,
|
||||
score_combiner: ScoreCombiner,
|
||||
occur_filter: OccurFilter,
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer> BooleanScorer<TScorer> {
|
||||
|
||||
pub fn scorers(&self) -> &[TScorer] {
|
||||
&self.scorers
|
||||
}
|
||||
|
||||
pub fn new(scorers: Vec<TScorer>,
|
||||
occur_filter: OccurFilter) -> BooleanScorer<TScorer> {
|
||||
let score_combiner = ScoreCombiner::default_for_num_scorers(scorers.len());
|
||||
let mut non_empty_scorers: Vec<TScorer> = Vec::new();
|
||||
for mut posting in scorers {
|
||||
let non_empty = posting.advance();
|
||||
if non_empty {
|
||||
non_empty_scorers.push(posting);
|
||||
}
|
||||
}
|
||||
let heap_items: Vec<HeapItem> = non_empty_scorers
|
||||
.iter()
|
||||
.map(|posting| posting.doc())
|
||||
.enumerate()
|
||||
.map(|(ord, doc)| {
|
||||
HeapItem {
|
||||
doc: doc,
|
||||
ord: ord as u32
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
BooleanScorer {
|
||||
scorers: non_empty_scorers,
|
||||
queue: BinaryHeap::from(heap_items),
|
||||
doc: 0u32,
|
||||
score_combiner: score_combiner,
|
||||
occur_filter: occur_filter,
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/// Advances the head of our heap (the segment posting with the lowest doc)
|
||||
/// It will also update the new current `DocId` as well as the term frequency
|
||||
/// associated with the segment postings.
|
||||
///
|
||||
/// After advancing the `SegmentPosting`, the postings is removed from the heap
|
||||
/// if it has been entirely consumed, or pushed back into the heap.
|
||||
///
|
||||
/// # Panics
|
||||
/// This method will panic if the head `SegmentPostings` is not empty.
|
||||
fn advance_head(&mut self,) {
|
||||
{
|
||||
let mut mutable_head = self.queue.peek_mut().unwrap();
|
||||
let cur_scorers = &mut self.scorers[mutable_head.ord as usize];
|
||||
if cur_scorers.advance() {
|
||||
mutable_head.doc = cur_scorers.doc();
|
||||
return;
|
||||
}
|
||||
}
|
||||
self.queue.pop();
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer> DocSet for BooleanScorer<TScorer> {
|
||||
fn advance(&mut self,) -> bool {
|
||||
loop {
|
||||
self.score_combiner.clear();
|
||||
let mut ord_bitset = 0u64;
|
||||
match self.queue.peek() {
|
||||
Some(heap_item) => {
|
||||
let ord = heap_item.ord as usize;
|
||||
self.doc = heap_item.doc;
|
||||
let score = self.scorers[ord].score();
|
||||
self.score_combiner.update(score);
|
||||
ord_bitset |= 1 << ord;
|
||||
}
|
||||
None => {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
self.advance_head();
|
||||
while let Some(&HeapItem {doc, ord}) = self.queue.peek() {
|
||||
if doc == self.doc {
|
||||
let ord = ord as usize;
|
||||
let score = self.scorers[ord].score();
|
||||
self.score_combiner.update(score);
|
||||
ord_bitset |= 1 << ord;
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
self.advance_head();
|
||||
}
|
||||
if self.occur_filter.accept(ord_bitset) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn doc(&self,) -> DocId {
|
||||
self.doc
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer> Scorer for BooleanScorer<TScorer> {
|
||||
|
||||
fn score(&self,) -> f32 {
|
||||
self.score_combiner.score()
|
||||
}
|
||||
}
|
||||
|
||||
32
src/query/boolean_query/boolean_weight.rs
Normal file
32
src/query/boolean_query/boolean_weight.rs
Normal file
@@ -0,0 +1,32 @@
|
||||
use query::Weight;
|
||||
use core::SegmentReader;
|
||||
use query::Scorer;
|
||||
use super::BooleanScorer;
|
||||
use query::OccurFilter;
|
||||
use Result;
|
||||
|
||||
pub struct BooleanWeight {
|
||||
weights: Vec<Box<Weight>>,
|
||||
occur_filter: OccurFilter,
|
||||
}
|
||||
|
||||
impl BooleanWeight {
|
||||
pub fn new(weights: Vec<Box<Weight>>, occur_filter: OccurFilter) -> BooleanWeight {
|
||||
BooleanWeight {
|
||||
weights: weights,
|
||||
occur_filter: occur_filter,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl Weight for BooleanWeight {
|
||||
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
|
||||
let sub_scorers: Vec<Box<Scorer + 'a>> = try!(self.weights
|
||||
.iter()
|
||||
.map(|weight| weight.scorer(reader))
|
||||
.collect());
|
||||
let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter);
|
||||
Ok(box boolean_scorer)
|
||||
}
|
||||
}
|
||||
158
src/query/boolean_query/mod.rs
Normal file
158
src/query/boolean_query/mod.rs
Normal file
@@ -0,0 +1,158 @@
|
||||
mod boolean_query;
|
||||
mod boolean_scorer;
|
||||
mod boolean_weight;
|
||||
mod score_combiner;
|
||||
|
||||
pub use self::boolean_query::BooleanQuery;
|
||||
pub use self::boolean_scorer::BooleanScorer;
|
||||
pub use self::score_combiner::ScoreCombiner;
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use postings::{DocSet, VecPostings};
|
||||
use query::Scorer;
|
||||
use query::OccurFilter;
|
||||
use query::term_query::TermScorer;
|
||||
use query::Occur;
|
||||
use query::Query;
|
||||
use query::TermQuery;
|
||||
use collector::tests::TestCollector;
|
||||
use Index;
|
||||
use schema::*;
|
||||
use fastfield::{U32FastFieldReader};
|
||||
|
||||
fn abs_diff(left: f32, right: f32) -> f32 {
|
||||
(right - left).abs()
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
pub fn test_boolean_query() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_from_tempdir(schema).unwrap();
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
{
|
||||
let doc = doc!(text_field => "a b c");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
{
|
||||
let doc = doc!(text_field => "a c");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
{
|
||||
let doc = doc!(text_field => "b c");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
{
|
||||
let doc = doc!(text_field => "a b c d");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
{
|
||||
let doc = doc!(text_field => "d");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
let make_term_query = |text: &str| {
|
||||
let term_query = TermQuery::from(Term::from_field_text(text_field, text));
|
||||
let query: Box<Query> = box term_query;
|
||||
query
|
||||
};
|
||||
|
||||
|
||||
let matching_docs = |boolean_query: &Query| {
|
||||
let searcher = index.searcher();
|
||||
let mut test_collector = TestCollector::default();
|
||||
boolean_query.search(&*searcher, &mut test_collector).unwrap();
|
||||
test_collector.docs()
|
||||
};
|
||||
{
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")) ]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec!(0, 1, 3));
|
||||
}
|
||||
{
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Should, make_term_query("a")) ]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec!(0, 1, 3));
|
||||
}
|
||||
{
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Should, make_term_query("a")), (Occur::Should, make_term_query("b"))]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec!(0, 1, 2, 3));
|
||||
}
|
||||
{
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")), (Occur::Should, make_term_query("b"))]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec!(0, 1, 3));
|
||||
}
|
||||
{
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")),
|
||||
(Occur::Should, make_term_query("b")),
|
||||
(Occur::MustNot, make_term_query("d")),
|
||||
]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec!(0, 1));
|
||||
}
|
||||
{
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::MustNot, make_term_query("d")),]);
|
||||
// TODO optimize this use case : only MustNot subqueries... no need
|
||||
// to read any postings.
|
||||
assert_eq!(matching_docs(&boolean_query), Vec::new());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_boolean_scorer() {
|
||||
let occurs = vec!(Occur::Should, Occur::Should);
|
||||
let occur_filter = OccurFilter::new(&occurs);
|
||||
|
||||
let left_fieldnorms = U32FastFieldReader::from(vec!(100,200,300));
|
||||
|
||||
let left = VecPostings::from(vec!(1, 2, 3));
|
||||
let left_scorer = TermScorer {
|
||||
idf: 1f32,
|
||||
fieldnorm_reader: left_fieldnorms,
|
||||
postings: left,
|
||||
};
|
||||
|
||||
let right_fieldnorms = U32FastFieldReader::from(vec!(15,25,35));
|
||||
let right = VecPostings::from(vec!(1, 3, 8));
|
||||
|
||||
let right_scorer = TermScorer {
|
||||
idf: 4f32,
|
||||
fieldnorm_reader: right_fieldnorms,
|
||||
postings: right,
|
||||
};
|
||||
|
||||
let mut boolean_scorer = BooleanScorer::new(vec!(left_scorer, right_scorer), occur_filter);
|
||||
assert_eq!(boolean_scorer.next(), Some(1u32));
|
||||
assert!(abs_diff(boolean_scorer.score(), 0.8707107) < 0.001);
|
||||
assert_eq!(boolean_scorer.next(), Some(2u32));
|
||||
assert!(abs_diff(boolean_scorer.score(), 0.028867513) < 0.001f32);
|
||||
assert_eq!(boolean_scorer.next(), Some(3u32));
|
||||
assert_eq!(boolean_scorer.next(), Some(8u32));
|
||||
assert!(abs_diff(boolean_scorer.score(), 0.5163978) < 0.001f32);
|
||||
assert!(!boolean_scorer.advance());
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
pub fn test_term_scorer() {
|
||||
let left_fieldnorms = U32FastFieldReader::from(vec!(10, 4));
|
||||
assert_eq!(left_fieldnorms.get(0), 10);
|
||||
assert_eq!(left_fieldnorms.get(1), 4);
|
||||
let left = VecPostings::from(vec!(1));
|
||||
let mut left_scorer = TermScorer {
|
||||
idf: 0.30685282,
|
||||
fieldnorm_reader: left_fieldnorms,
|
||||
postings: left,
|
||||
};
|
||||
left_scorer.advance();
|
||||
assert!(abs_diff(left_scorer.score(), 0.15342641) < 0.001f32);
|
||||
}
|
||||
|
||||
}
|
||||
46
src/query/boolean_query/score_combiner.rs
Normal file
46
src/query/boolean_query/score_combiner.rs
Normal file
@@ -0,0 +1,46 @@
|
||||
use Score;
|
||||
|
||||
pub struct ScoreCombiner {
|
||||
coords: Vec<Score>,
|
||||
num_fields: usize,
|
||||
score: Score,
|
||||
}
|
||||
|
||||
impl ScoreCombiner {
|
||||
|
||||
pub fn update(&mut self, score: Score) {
|
||||
self.score += score;
|
||||
self.num_fields += 1;
|
||||
}
|
||||
|
||||
pub fn clear(&mut self,) {
|
||||
self.score = 0f32;
|
||||
self.num_fields = 0;
|
||||
}
|
||||
|
||||
/// Compute the coord term
|
||||
fn coord(&self,) -> f32 {
|
||||
self.coords[self.num_fields]
|
||||
}
|
||||
|
||||
pub fn score(&self, ) -> Score {
|
||||
self.score * self.coord()
|
||||
}
|
||||
|
||||
pub fn default_for_num_scorers(num_scorers: usize) -> ScoreCombiner {
|
||||
let query_coords: Vec<Score> = (0..num_scorers + 1)
|
||||
.map(|i| (i as Score) / (num_scorers as Score))
|
||||
.collect();
|
||||
ScoreCombiner::from(query_coords)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Vec<Score>> for ScoreCombiner {
|
||||
fn from(coords: Vec<Score>) -> ScoreCombiner {
|
||||
ScoreCombiner {
|
||||
coords: coords,
|
||||
num_fields: 0,
|
||||
score: 0f32,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,285 +0,0 @@
|
||||
use DocId;
|
||||
use postings::{Postings, DocSet};
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
use query::MultiTermAccumulator;
|
||||
use query::Similarity;
|
||||
use fastfield::U32FastFieldReader;
|
||||
use query::Occur;
|
||||
use std::iter;
|
||||
use super::Scorer;
|
||||
use Score;
|
||||
|
||||
/// Each `HeapItem` represents the head of
|
||||
/// a segment postings being merged.
|
||||
///
|
||||
/// * `doc` - is the current doc id for the given segment postings
|
||||
/// * `ord` - is the ordinal used to identify to which segment postings
|
||||
/// this heap item belong to.
|
||||
#[derive(Eq, PartialEq)]
|
||||
struct HeapItem {
|
||||
doc: DocId,
|
||||
ord: u32,
|
||||
}
|
||||
|
||||
/// `HeapItem` are ordered by the document
|
||||
impl PartialOrd for HeapItem {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for HeapItem {
|
||||
fn cmp(&self, other:&Self) -> Ordering {
|
||||
(other.doc).cmp(&self.doc)
|
||||
}
|
||||
}
|
||||
|
||||
struct Filter {
|
||||
and_mask: u64,
|
||||
result: u64,
|
||||
}
|
||||
|
||||
impl Filter {
|
||||
fn accept(&self, ord_set: u64) -> bool {
|
||||
(self.and_mask & ord_set) == self.result
|
||||
}
|
||||
|
||||
fn new(occurs: &[Occur]) -> Filter {
|
||||
let mut and_mask = 0u64;
|
||||
let mut result = 0u64;
|
||||
for (i, occur) in occurs.iter().enumerate() {
|
||||
let shift = 1 << i;
|
||||
match *occur {
|
||||
Occur::Must => {
|
||||
and_mask |= shift;
|
||||
result |= shift;
|
||||
},
|
||||
Occur::MustNot => {
|
||||
and_mask |= shift;
|
||||
},
|
||||
Occur::Should => {},
|
||||
}
|
||||
}
|
||||
Filter {
|
||||
and_mask: and_mask,
|
||||
result: result
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Document-At-A-Time multi term scorer.
|
||||
///
|
||||
/// The scorer merges multiple segment postings and pushes
|
||||
/// term information to the score accumulator.
|
||||
pub struct DAATMultiTermScorer<TPostings: Postings, TAccumulator: MultiTermAccumulator> {
|
||||
fieldnorm_readers: Vec<U32FastFieldReader>,
|
||||
postings: Vec<TPostings>,
|
||||
term_frequencies: Vec<u32>,
|
||||
queue: BinaryHeap<HeapItem>,
|
||||
doc: DocId,
|
||||
similarity: TAccumulator,
|
||||
filter: Filter,
|
||||
}
|
||||
|
||||
impl<TPostings: Postings, TAccumulator: MultiTermAccumulator> DAATMultiTermScorer<TPostings, TAccumulator> {
|
||||
|
||||
fn new_non_empty(
|
||||
|
||||
fieldnorm_readers: Vec<U32FastFieldReader>,
|
||||
postings: Vec<TPostings>,
|
||||
similarity: TAccumulator,
|
||||
filter: Filter
|
||||
) -> DAATMultiTermScorer<TPostings, TAccumulator> {
|
||||
let mut term_frequencies: Vec<u32> = iter::repeat(0u32).take(postings.len()).collect();
|
||||
let heap_items: Vec<HeapItem> = postings
|
||||
.iter()
|
||||
.map(|posting| {
|
||||
(posting.doc(), posting.term_freq())
|
||||
})
|
||||
.enumerate()
|
||||
.map(|(ord, (doc, tf))| {
|
||||
term_frequencies[ord] = tf;
|
||||
HeapItem {
|
||||
doc: doc,
|
||||
ord: ord as u32
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
DAATMultiTermScorer {
|
||||
fieldnorm_readers: fieldnorm_readers,
|
||||
postings: postings,
|
||||
term_frequencies: term_frequencies,
|
||||
queue: BinaryHeap::from(heap_items),
|
||||
doc: 0,
|
||||
similarity: similarity,
|
||||
filter: filter
|
||||
}
|
||||
}
|
||||
|
||||
/// Constructor
|
||||
pub fn new(postings_and_fieldnorms: Vec<(Occur, TPostings, U32FastFieldReader)>, similarity: TAccumulator) -> DAATMultiTermScorer<TPostings, TAccumulator> {
|
||||
let mut postings = Vec::new();
|
||||
let mut fieldnorm_readers = Vec::new();
|
||||
let mut occurs = Vec::new();
|
||||
for (occur, mut posting, fieldnorm_reader) in postings_and_fieldnorms {
|
||||
if posting.advance() {
|
||||
postings.push(posting);
|
||||
fieldnorm_readers.push(fieldnorm_reader);
|
||||
occurs.push(occur);
|
||||
}
|
||||
}
|
||||
let filter = Filter::new(&occurs);
|
||||
DAATMultiTermScorer::new_non_empty(fieldnorm_readers, postings, similarity, filter)
|
||||
}
|
||||
|
||||
/// Returns the scorer
|
||||
pub fn scorer(&self,) -> &TAccumulator {
|
||||
&self.similarity
|
||||
}
|
||||
|
||||
/// Advances the head of our heap (the segment postings with the lowest doc)
|
||||
/// It will also update the new current `DocId` as well as the term frequency
|
||||
/// associated with the segment postings.
|
||||
///
|
||||
/// After advancing the `SegmentPosting`, the postings is removed from the heap
|
||||
/// if it has been entirely consumed, or pushed back into the heap.
|
||||
///
|
||||
/// # Panics
|
||||
/// This method will panic if the head `SegmentPostings` is not empty.
|
||||
fn advance_head(&mut self,) {
|
||||
|
||||
{
|
||||
let mut mutable_head = self.queue.peek_mut().unwrap();
|
||||
let cur_postings = &mut self.postings[mutable_head.ord as usize];
|
||||
if cur_postings.advance() {
|
||||
let doc = cur_postings.doc();
|
||||
self.term_frequencies[mutable_head.ord as usize] = cur_postings.term_freq();
|
||||
mutable_head.doc = doc;
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
self.queue.pop();
|
||||
}
|
||||
|
||||
/// Returns the field norm for the segment postings with the given ordinal,
|
||||
/// and the given document.
|
||||
fn get_field_norm(&self, ord:usize, doc:DocId) -> u32 {
|
||||
self.fieldnorm_readers[ord].get(doc)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
impl<TPostings: Postings, TSimilarity: Similarity> Scorer for DAATMultiTermScorer<TPostings, TSimilarity> {
|
||||
fn score(&self,) -> Score {
|
||||
self.similarity.score()
|
||||
}
|
||||
}
|
||||
|
||||
impl<TPostings: Postings, TAccumulator: MultiTermAccumulator> DocSet for DAATMultiTermScorer<TPostings, TAccumulator> {
|
||||
|
||||
fn advance(&mut self,) -> bool {
|
||||
loop {
|
||||
self.similarity.clear();
|
||||
let mut ord_bitset = 0u64;
|
||||
match self.queue.peek() {
|
||||
Some(heap_item) => {
|
||||
self.doc = heap_item.doc;
|
||||
let ord: usize = heap_item.ord as usize;
|
||||
let fieldnorm = self.get_field_norm(ord, heap_item.doc);
|
||||
let tf = self.term_frequencies[ord];
|
||||
self.similarity.update(ord, tf, fieldnorm);
|
||||
ord_bitset |= 1 << ord;
|
||||
}
|
||||
None => {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
self.advance_head();
|
||||
while let Some(&HeapItem {doc, ord}) = self.queue.peek() {
|
||||
if doc == self.doc {
|
||||
let peek_ord: usize = ord as usize;
|
||||
let peek_tf = self.term_frequencies[peek_ord];
|
||||
let peek_fieldnorm = self.get_field_norm(peek_ord, doc);
|
||||
self.similarity.update(peek_ord, peek_tf, peek_fieldnorm);
|
||||
ord_bitset |= 1 << peek_ord;
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
self.advance_head();
|
||||
}
|
||||
if self.filter.accept(ord_bitset) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn doc(&self,) -> DocId {
|
||||
self.doc
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use postings::{DocSet, VecPostings};
|
||||
use query::TfIdf;
|
||||
use query::Scorer;
|
||||
use directory::Directory;
|
||||
use directory::RAMDirectory;
|
||||
use schema::Field;
|
||||
use std::path::Path;
|
||||
use query::Occur;
|
||||
use fastfield::{U32FastFieldReader, U32FastFieldWriter, FastFieldSerializer};
|
||||
|
||||
|
||||
pub fn create_u32_fastfieldreader(field: Field, vals: Vec<u32>) -> U32FastFieldReader {
|
||||
let mut u32_field_writer = U32FastFieldWriter::new(field);
|
||||
for val in vals {
|
||||
u32_field_writer.add_val(val);
|
||||
}
|
||||
let path = Path::new("some_path");
|
||||
let mut directory = RAMDirectory::create();
|
||||
{
|
||||
let write = directory.open_write(&path).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
u32_field_writer.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let read = directory.open_read(&path).unwrap();
|
||||
U32FastFieldReader::open(read).unwrap()
|
||||
}
|
||||
|
||||
fn abs_diff(left: f32, right: f32) -> f32 {
|
||||
(right - left).abs()
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_daat_scorer() {
|
||||
let left_fieldnorms = create_u32_fastfieldreader(Field(1), vec!(100,200,300));
|
||||
let right_fieldnorms = create_u32_fastfieldreader(Field(2), vec!(15,25,35));
|
||||
let left = VecPostings::from(vec!(1, 2, 3));
|
||||
let right = VecPostings::from(vec!(1, 3, 8));
|
||||
let tfidf = TfIdf::new(vec!(0f32, 1f32, 2f32), vec!(1f32, 4f32));
|
||||
let mut daat_scorer = DAATMultiTermScorer::new(
|
||||
vec!(
|
||||
(Occur::Should, left, left_fieldnorms),
|
||||
(Occur::Should, right, right_fieldnorms),
|
||||
),
|
||||
tfidf
|
||||
);
|
||||
assert_eq!(daat_scorer.next(), Some(1u32));
|
||||
assert!(abs_diff(daat_scorer.score(), 2.182179f32) < 0.001);
|
||||
assert_eq!(daat_scorer.next(), Some(2u32));
|
||||
assert!(abs_diff(daat_scorer.score(), 0.2236068) < 0.001f32);
|
||||
assert_eq!(daat_scorer.next(), Some(3u32));
|
||||
assert_eq!(daat_scorer.next(), Some(8u32));
|
||||
assert!(abs_diff(daat_scorer.score(), 0.8944272f32) < 0.001f32);
|
||||
assert!(!daat_scorer.advance());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -4,29 +4,29 @@
|
||||
///
|
||||
|
||||
mod query;
|
||||
mod boolean_query;
|
||||
mod multi_term_query;
|
||||
mod multi_term_accumulator;
|
||||
mod similarity_explainer;
|
||||
mod phrase_query;
|
||||
mod scorer;
|
||||
mod query_parser;
|
||||
mod explanation;
|
||||
mod tfidf;
|
||||
mod occur;
|
||||
mod daat_multiterm_scorer;
|
||||
mod similarity;
|
||||
mod weight;
|
||||
mod occur_filter;
|
||||
mod term_query;
|
||||
|
||||
pub use self::similarity::Similarity;
|
||||
|
||||
pub use self::daat_multiterm_scorer::DAATMultiTermScorer;
|
||||
|
||||
pub use self::occur_filter::OccurFilter;
|
||||
pub use self::boolean_query::BooleanQuery;
|
||||
pub use self::occur::Occur;
|
||||
pub use self::query::Query;
|
||||
pub use self::term_query::TermQuery;
|
||||
pub use self::phrase_query::PhraseQuery;
|
||||
pub use self::multi_term_query::MultiTermQuery;
|
||||
pub use self::similarity_explainer::SimilarityExplainer;
|
||||
pub use self::tfidf::TfIdf;
|
||||
|
||||
pub use self::multi_term_query::MultiTermWeight;
|
||||
pub use self::scorer::Scorer;
|
||||
pub use self::scorer::EmptyScorer;
|
||||
pub use self::query_parser::QueryParser;
|
||||
pub use self::explanation::Explanation;
|
||||
pub use self::multi_term_accumulator::MultiTermAccumulator;
|
||||
pub use self::query_parser::ParsingError;
|
||||
pub use self::query_parser::ParsingError;
|
||||
pub use self::weight::Weight;
|
||||
|
||||
@@ -1,14 +0,0 @@
|
||||
|
||||
/// Accumulator of the matching terms information
|
||||
pub trait MultiTermAccumulator {
|
||||
/// Update the accumulator given the information of a term.
|
||||
/// - term_ord is the term_ordinal
|
||||
/// - term_freq is the frequency of the term within the document
|
||||
/// - fieldnorm is the number of tokens associated to the field for this document
|
||||
///
|
||||
/// The term's update do not have to arrive in a specific order.
|
||||
/// Terms that are not present in the document will not be updated.
|
||||
fn update(&mut self, term_ord: usize, term_freq: u32, fieldnorm: u32);
|
||||
/// Resets the accumulator
|
||||
fn clear(&mut self,);
|
||||
}
|
||||
@@ -1,175 +0,0 @@
|
||||
use Result;
|
||||
use Error;
|
||||
use schema::Term;
|
||||
use query::Query;
|
||||
use common::TimerTree;
|
||||
use common::OpenTimer;
|
||||
use core::searcher::Searcher;
|
||||
use collector::Collector;
|
||||
use SegmentLocalId;
|
||||
use core::SegmentReader;
|
||||
use query::SimilarityExplainer;
|
||||
use postings::SegmentPostings;
|
||||
use postings::DocSet;
|
||||
use query::TfIdf;
|
||||
use postings::SkipResult;
|
||||
use ScoredDoc;
|
||||
use query::Scorer;
|
||||
use query::MultiTermAccumulator;
|
||||
use DocAddress;
|
||||
use query::Explanation;
|
||||
use query::occur::Occur;
|
||||
use postings::SegmentPostingsOption;
|
||||
use query::DAATMultiTermScorer;
|
||||
|
||||
|
||||
/// Query involving one or more terms.
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
pub struct MultiTermQuery {
|
||||
occur_terms: Vec<(Occur, Term)>,
|
||||
}
|
||||
|
||||
impl MultiTermQuery {
|
||||
|
||||
/// Accessor for the number of terms
|
||||
pub fn num_terms(&self,) -> usize {
|
||||
self.occur_terms.len()
|
||||
}
|
||||
|
||||
/// Builds the similitude object
|
||||
fn similitude(&self, searcher: &Searcher) -> TfIdf {
|
||||
let num_terms = self.num_terms();
|
||||
let num_docs = searcher.num_docs() as f32;
|
||||
let idfs: Vec<f32> = self.occur_terms
|
||||
.iter()
|
||||
.map(|&(_, ref term)| searcher.doc_freq(term))
|
||||
.map(|doc_freq| {
|
||||
if doc_freq == 0 {
|
||||
1.
|
||||
}
|
||||
else {
|
||||
1. + ( num_docs / (doc_freq as f32) ).ln()
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
let query_coords = (0..num_terms + 1)
|
||||
.map(|i| (i as f32) / (num_terms as f32))
|
||||
.collect();
|
||||
// TODO have the actual terms in these names
|
||||
let term_names = self.occur_terms
|
||||
.iter()
|
||||
.map(|&(_, ref term)| format!("{:?}", &term))
|
||||
.collect();
|
||||
let mut tfidf = TfIdf::new(query_coords, idfs);
|
||||
tfidf.set_term_names(term_names);
|
||||
tfidf
|
||||
}
|
||||
|
||||
|
||||
/// Search the segment.
|
||||
fn search_segment<'a, 'b, TAccumulator: MultiTermAccumulator>(
|
||||
&'b self,
|
||||
reader: &'b SegmentReader,
|
||||
accumulator: TAccumulator,
|
||||
mut timer: OpenTimer<'a>) -> Result<DAATMultiTermScorer<SegmentPostings, TAccumulator>> {
|
||||
let mut postings_and_fieldnorms = Vec::with_capacity(self.num_terms());
|
||||
{
|
||||
let mut decode_timer = timer.open("decode_all");
|
||||
for &(occur, ref term) in &self.occur_terms {
|
||||
let _decode_one_timer = decode_timer.open("decode_one");
|
||||
if let Some(postings) = reader.read_postings(term, SegmentPostingsOption::Freq) {
|
||||
let field = term.field();
|
||||
let fieldnorm_reader = try!(reader.get_fieldnorms_reader(field));
|
||||
postings_and_fieldnorms.push((occur, postings, fieldnorm_reader));
|
||||
}
|
||||
}
|
||||
}
|
||||
if postings_and_fieldnorms.len() > 64 {
|
||||
// TODO putting the SHOULD at the end of the list should push the limit.
|
||||
return Err(Error::InvalidArgument(String::from("Limit of 64 terms was exceeded.")));
|
||||
}
|
||||
Ok(DAATMultiTermScorer::new(postings_and_fieldnorms, accumulator))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl From<Vec<(Occur, Term)>> for MultiTermQuery {
|
||||
fn from(occur_terms: Vec<(Occur, Term)>) -> MultiTermQuery {
|
||||
MultiTermQuery {
|
||||
occur_terms: occur_terms,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Vec<Term>> for MultiTermQuery {
|
||||
fn from(terms: Vec<Term>) -> MultiTermQuery {
|
||||
let should_terms = terms
|
||||
.into_iter()
|
||||
.map(|term| (Occur::Should, term))
|
||||
.collect();
|
||||
MultiTermQuery {
|
||||
occur_terms: should_terms,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Query for MultiTermQuery {
|
||||
|
||||
fn explain(
|
||||
&self,
|
||||
searcher: &Searcher,
|
||||
doc_address: &DocAddress) -> Result<Explanation> {
|
||||
let segment_reader = searcher.segment_reader(doc_address.segment_ord() as usize);
|
||||
let similitude = SimilarityExplainer::from(self.similitude(searcher));
|
||||
let mut timer_tree = TimerTree::default();
|
||||
let mut postings = try!(
|
||||
self.search_segment(
|
||||
segment_reader,
|
||||
similitude,
|
||||
timer_tree.open("explain"))
|
||||
);
|
||||
Ok(match postings.skip_next(doc_address.doc()) {
|
||||
SkipResult::Reached => {
|
||||
let scorer = postings.scorer();
|
||||
scorer.explain_score()
|
||||
}
|
||||
_ => {
|
||||
let mut explanation = Explanation::with_val(0f32);
|
||||
explanation.description(&format!("Failed to run explain: the document {:?} does not match", doc_address));
|
||||
explanation
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn search<C: Collector>(
|
||||
&self,
|
||||
searcher: &Searcher,
|
||||
collector: &mut C) -> Result<TimerTree> {
|
||||
let mut timer_tree = TimerTree::default();
|
||||
{
|
||||
let mut search_timer = timer_tree.open("search");
|
||||
for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {
|
||||
let mut segment_search_timer = search_timer.open("segment_search");
|
||||
{
|
||||
let _ = segment_search_timer.open("set_segment");
|
||||
try!(collector.set_segment(segment_ord as SegmentLocalId, &segment_reader));
|
||||
}
|
||||
let mut postings = try!(
|
||||
self.search_segment(
|
||||
segment_reader,
|
||||
self.similitude(searcher),
|
||||
segment_search_timer.open("get_postings"))
|
||||
);
|
||||
{
|
||||
let _collection_timer = segment_search_timer.open("collection");
|
||||
while postings.advance() {
|
||||
let scored_doc = ScoredDoc(postings.score(), postings.doc());
|
||||
collector.collect(scored_doc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(timer_tree)
|
||||
}
|
||||
}
|
||||
|
||||
5
src/query/multi_term_query/mod.rs
Normal file
5
src/query/multi_term_query/mod.rs
Normal file
@@ -0,0 +1,5 @@
|
||||
mod multi_term_query;
|
||||
mod multi_term_weight;
|
||||
|
||||
pub use self::multi_term_query::MultiTermQuery;
|
||||
pub use self::multi_term_weight::MultiTermWeight;
|
||||
77
src/query/multi_term_query/multi_term_query.rs
Normal file
77
src/query/multi_term_query/multi_term_query.rs
Normal file
@@ -0,0 +1,77 @@
|
||||
use Result;
|
||||
use query::Weight;
|
||||
use std::any::Any;
|
||||
use schema::Term;
|
||||
use query::MultiTermWeight;
|
||||
use query::Query;
|
||||
use core::searcher::Searcher;
|
||||
use query::occur::Occur;
|
||||
use query::occur_filter::OccurFilter;
|
||||
use query::term_query::TermQuery;
|
||||
use postings::SegmentPostingsOption;
|
||||
|
||||
|
||||
/// Query involving one or more terms.
|
||||
#[derive(Eq, Clone, PartialEq, Debug)]
|
||||
pub struct MultiTermQuery {
|
||||
// TODO need a better Debug
|
||||
occur_terms: Vec<(Occur, Term)>,
|
||||
}
|
||||
|
||||
impl MultiTermQuery {
|
||||
/// Accessor for the number of terms
|
||||
pub fn num_terms(&self) -> usize {
|
||||
self.occur_terms.len()
|
||||
}
|
||||
|
||||
/// Same as `weight()`, except that rather than a boxed trait,
|
||||
/// `specialized_weight` returns a specific type of the weight, allowing for
|
||||
/// compile-time optimization.
|
||||
pub fn specialized_weight(&self, searcher: &Searcher) -> MultiTermWeight {
|
||||
let term_queries: Vec<TermQuery> = self.occur_terms
|
||||
.iter()
|
||||
.map(|&(_, ref term)| TermQuery::from(term.clone()))
|
||||
.collect();
|
||||
let occurs: Vec<Occur> = self.occur_terms
|
||||
.iter()
|
||||
.map(|&(occur, _)| occur.clone())
|
||||
.collect();
|
||||
let occur_filter = OccurFilter::new(&occurs);
|
||||
let weights = term_queries.iter()
|
||||
.map(|term_query| {
|
||||
let mut term_weight = term_query.specialized_weight(searcher);
|
||||
term_weight.segment_postings_options = SegmentPostingsOption::FreqAndPositions;
|
||||
term_weight
|
||||
})
|
||||
.collect();
|
||||
MultiTermWeight::new(weights, occur_filter)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
impl Query for MultiTermQuery {
|
||||
fn as_any(&self) -> &Any {
|
||||
self
|
||||
}
|
||||
|
||||
fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>> {
|
||||
Ok(box self.specialized_weight(searcher))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl From<Vec<(Occur, Term)>> for MultiTermQuery {
|
||||
fn from(occur_terms: Vec<(Occur, Term)>) -> MultiTermQuery {
|
||||
MultiTermQuery { occur_terms: occur_terms }
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Vec<Term>> for MultiTermQuery {
|
||||
fn from(terms: Vec<Term>) -> MultiTermQuery {
|
||||
let should_terms: Vec<(Occur, Term)> = terms.into_iter()
|
||||
.map(|term| (Occur::Should, term))
|
||||
.collect();
|
||||
MultiTermQuery::from(should_terms)
|
||||
}
|
||||
}
|
||||
45
src/query/multi_term_query/multi_term_weight.rs
Normal file
45
src/query/multi_term_query/multi_term_weight.rs
Normal file
@@ -0,0 +1,45 @@
|
||||
use Result;
|
||||
use query::Weight;
|
||||
use core::SegmentReader;
|
||||
use query::Scorer;
|
||||
use query::occur_filter::OccurFilter;
|
||||
use postings::SegmentPostings;
|
||||
use query::term_query::{TermWeight, TermScorer};
|
||||
use query::boolean_query::BooleanScorer;
|
||||
|
||||
/// Weight object associated to a [`MultiTermQuery`](./struct.MultiTermQuery.html).
|
||||
pub struct MultiTermWeight {
|
||||
weights: Vec<TermWeight>,
|
||||
occur_filter: OccurFilter,
|
||||
}
|
||||
|
||||
impl MultiTermWeight {
|
||||
/// MultiTermWeigh constructor.
|
||||
/// The `OccurFilter` is tied with the weights order.
|
||||
pub fn new(weights: Vec<TermWeight>, occur_filter: OccurFilter) -> MultiTermWeight {
|
||||
MultiTermWeight {
|
||||
weights: weights,
|
||||
occur_filter: occur_filter,
|
||||
}
|
||||
}
|
||||
|
||||
/// Same as `scorer()`, except that rather than a boxed trait,
|
||||
/// `specialized_scorer` returns a specific type of the scorer, allowing for
|
||||
/// compile-time optimization.
|
||||
pub fn specialized_scorer<'a>(&'a self,
|
||||
reader: &'a SegmentReader)
|
||||
-> Result<BooleanScorer<TermScorer<SegmentPostings<'a>>>> {
|
||||
let mut term_scorers: Vec<TermScorer<_>> = Vec::new();
|
||||
for term_weight in &self.weights {
|
||||
let term_scorer = try!(term_weight.specialized_scorer(reader));
|
||||
term_scorers.push(term_scorer);
|
||||
}
|
||||
Ok(BooleanScorer::new(term_scorers, self.occur_filter))
|
||||
}
|
||||
}
|
||||
|
||||
impl Weight for MultiTermWeight {
|
||||
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
|
||||
Ok(box try!(self.specialized_scorer(reader)))
|
||||
}
|
||||
}
|
||||
@@ -2,9 +2,9 @@
|
||||
/// should be present or must not be present.
|
||||
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
|
||||
pub enum Occur {
|
||||
/// The term should be present in the document.
|
||||
/// Document without the term will be considered
|
||||
/// in scoring as well.
|
||||
/// For a given document to be considered for scoring,
|
||||
/// at least one of the document with the Should or the Must
|
||||
/// Occur constraint must be within the document.
|
||||
Should,
|
||||
/// Document without the term are excluded from the search.
|
||||
Must,
|
||||
|
||||
44
src/query/occur_filter.rs
Normal file
44
src/query/occur_filter.rs
Normal file
@@ -0,0 +1,44 @@
|
||||
use query::Occur;
|
||||
|
||||
|
||||
/// An OccurFilter represents a filter over a bitset of
|
||||
// at most 64 elements.
|
||||
///
|
||||
/// It wraps some simple bitmask to compute the filter
|
||||
/// rapidly.
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct OccurFilter {
|
||||
and_mask: u64,
|
||||
result: u64,
|
||||
}
|
||||
|
||||
impl OccurFilter {
|
||||
|
||||
/// Returns true if the bitset is matching the occur list.
|
||||
pub fn accept(&self, ord_set: u64) -> bool {
|
||||
(self.and_mask & ord_set) == self.result
|
||||
}
|
||||
|
||||
/// Builds an `OccurFilter` from a list of `Occur`.
|
||||
pub fn new(occurs: &[Occur]) -> OccurFilter {
|
||||
let mut and_mask = 0u64;
|
||||
let mut result = 0u64;
|
||||
for (i, occur) in occurs.iter().enumerate() {
|
||||
let shift = 1 << i;
|
||||
match *occur {
|
||||
Occur::Must => {
|
||||
and_mask |= shift;
|
||||
result |= shift;
|
||||
},
|
||||
Occur::MustNot => {
|
||||
and_mask |= shift;
|
||||
},
|
||||
Occur::Should => {},
|
||||
}
|
||||
}
|
||||
OccurFilter {
|
||||
and_mask: and_mask,
|
||||
result: result
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,86 +0,0 @@
|
||||
use schema::Term;
|
||||
use query::Query;
|
||||
use common::TimerTree;
|
||||
use common::OpenTimer;
|
||||
use std::io;
|
||||
use core::searcher::Searcher;
|
||||
use collector::Collector;
|
||||
use core::searcher::SegmentLocalId;
|
||||
use core::SegmentReader;
|
||||
use postings::Postings;
|
||||
use postings::SegmentPostings;
|
||||
use postings::intersection;
|
||||
|
||||
pub struct PhraseQuery {
|
||||
terms: Vec<Term>,
|
||||
}
|
||||
|
||||
impl Query for PhraseQuery {
|
||||
|
||||
fn search<C: Collector>(&self, searcher: &Searcher, collector: &mut C) -> io::Result<TimerTree> {
|
||||
let mut timer_tree = TimerTree::default();
|
||||
{
|
||||
let mut search_timer = timer_tree.open("search");
|
||||
for (segment_ord, segment_reader) in searcher.segments().iter().enumerate() {
|
||||
let mut segment_search_timer = search_timer.open("segment_search");
|
||||
{
|
||||
let _ = segment_search_timer.open("set_segment");
|
||||
try!(collector.set_segment(segment_ord as SegmentLocalId, &segment_reader));
|
||||
}
|
||||
let mut postings = self.search_segment(segment_reader, segment_search_timer.open("get_postings"));
|
||||
{
|
||||
let _collection_timer = segment_search_timer.open("collection");
|
||||
while postings.next() {
|
||||
collector.collect(postings.doc());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(timer_tree)
|
||||
}
|
||||
}
|
||||
|
||||
impl PhraseQuery {
|
||||
pub fn new(terms: Vec<Term>) -> PhraseQuery {
|
||||
PhraseQuery {
|
||||
terms: terms,
|
||||
}
|
||||
}
|
||||
|
||||
fn search_segment<'a, 'b>(&'b self, reader: &'b SegmentReader, mut timer: OpenTimer<'a>) -> Box<Postings + 'b> {
|
||||
if self.terms.len() == 1 {
|
||||
match reader.get_term(&self.terms[0]) {
|
||||
Some(term_info) => {
|
||||
let postings: SegmentPostings<'b> = reader.read_postings(&term_info);
|
||||
Box::new(postings)
|
||||
},
|
||||
None => {
|
||||
Box::new(SegmentPostings::empty())
|
||||
},
|
||||
}
|
||||
} else {
|
||||
let mut segment_postings: Vec<SegmentPostings> = Vec::new();
|
||||
{
|
||||
let mut decode_timer = timer.open("decode_all");
|
||||
for term in self.terms.iter() {
|
||||
match reader.get_term(term) {
|
||||
Some(term_info) => {
|
||||
let _decode_one_timer = decode_timer.open("decode_one");
|
||||
let segment_posting = reader.read_postings_with_positions(&term_info);
|
||||
segment_postings.push(segment_posting);
|
||||
}
|
||||
None => {
|
||||
// currently this is a strict intersection.
|
||||
return Box::new(SegmentPostings::empty());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Box::new(intersection(segment_postings))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
70
src/query/phrase_query/mod.rs
Normal file
70
src/query/phrase_query/mod.rs
Normal file
@@ -0,0 +1,70 @@
|
||||
mod phrase_query;
|
||||
mod phrase_weight;
|
||||
mod phrase_scorer;
|
||||
|
||||
pub use self::phrase_query::PhraseQuery;
|
||||
pub use self::phrase_weight::PhraseWeight;
|
||||
pub use self::phrase_scorer::PhraseScorer;
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use query::Query;
|
||||
use core::Index;
|
||||
use schema::FieldValue;
|
||||
use schema::{Document, Term, SchemaBuilder, TEXT};
|
||||
use collector::tests::TestCollector;
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_query() {
|
||||
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
{ // 0
|
||||
let doc = doc!(text_field=>"b b b d c g c");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
{ // 1
|
||||
let doc = doc!(text_field=>"a b b d c g c");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
{ // 2
|
||||
let doc = doc!(text_field=>"a b a b c");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
{ // 3
|
||||
let doc = doc!(text_field=>"c a b a d ga a");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
{ // 4
|
||||
let doc = doc!(text_field=>"a b c");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
|
||||
let searcher = index.searcher();
|
||||
let test_query = |texts: Vec<&str>| {
|
||||
let mut test_collector = TestCollector::default();
|
||||
let terms: Vec<Term> = texts
|
||||
.iter()
|
||||
.map(|text| Term::from_field_text(text_field, text))
|
||||
.collect();
|
||||
let phrase_query = PhraseQuery::from(terms);
|
||||
phrase_query.search(&*searcher, &mut test_collector).expect("search should succeed");
|
||||
test_collector.docs()
|
||||
};
|
||||
assert_eq!(test_query(vec!("a", "b", "c")), vec!(2, 4));
|
||||
assert_eq!(test_query(vec!("a", "b")), vec!(1, 2, 3, 4));
|
||||
assert_eq!(test_query(vec!("b", "b")), vec!(0, 1));
|
||||
assert_eq!(test_query(vec!("g", "ewrwer")), vec!());
|
||||
assert_eq!(test_query(vec!("g", "a")), vec!());
|
||||
}
|
||||
|
||||
}
|
||||
54
src/query/phrase_query/phrase_query.rs
Normal file
54
src/query/phrase_query/phrase_query.rs
Normal file
@@ -0,0 +1,54 @@
|
||||
use schema::Term;
|
||||
use query::Query;
|
||||
use core::searcher::Searcher;
|
||||
use super::PhraseWeight;
|
||||
use std::any::Any;
|
||||
use query::Weight;
|
||||
use Result;
|
||||
|
||||
|
||||
/// `PhraseQuery` matches a specific sequence of word.
|
||||
/// For instance the phrase query for `"part time"` will match
|
||||
/// the sentence
|
||||
///
|
||||
/// **Alan just got a part time job.**
|
||||
///
|
||||
/// On the other hand it will not match the sentence.
|
||||
///
|
||||
/// **This is my favorite part of the job.**
|
||||
///
|
||||
/// Using a `PhraseQuery` on a field requires positions
|
||||
/// to be indexed for this field.
|
||||
///
|
||||
#[derive(Debug)]
|
||||
pub struct PhraseQuery {
|
||||
phrase_terms: Vec<Term>,
|
||||
}
|
||||
|
||||
impl Query for PhraseQuery {
|
||||
|
||||
|
||||
/// Used to make it possible to cast Box<Query>
|
||||
/// into a specific type. This is mostly useful for unit tests.
|
||||
fn as_any(&self) -> &Any {
|
||||
self
|
||||
}
|
||||
|
||||
/// Create the weight associated to a query.
|
||||
///
|
||||
/// See [Weight](./trait.Weight.html).
|
||||
fn weight(&self, _searcher: &Searcher) -> Result<Box<Weight>> {
|
||||
Ok(box PhraseWeight::from(self.phrase_terms.clone()))
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
impl From<Vec<Term>> for PhraseQuery {
|
||||
fn from(phrase_terms: Vec<Term>) -> PhraseQuery {
|
||||
assert!(phrase_terms.len() > 1);
|
||||
PhraseQuery {
|
||||
phrase_terms: phrase_terms,
|
||||
}
|
||||
}
|
||||
}
|
||||
81
src/query/phrase_query/phrase_scorer.rs
Normal file
81
src/query/phrase_query/phrase_scorer.rs
Normal file
@@ -0,0 +1,81 @@
|
||||
use query::Scorer;
|
||||
use DocSet;
|
||||
use postings::SegmentPostings;
|
||||
use postings::Postings;
|
||||
use postings::IntersectionDocSet;
|
||||
use DocId;
|
||||
|
||||
pub struct PhraseScorer<'a> {
|
||||
pub intersection_docset: IntersectionDocSet<SegmentPostings<'a>>,
|
||||
}
|
||||
|
||||
|
||||
impl<'a> PhraseScorer<'a> {
|
||||
fn phrase_match(&self) -> bool {
|
||||
let mut positions_arr: Vec<&[u32]> = self.intersection_docset
|
||||
.docsets()
|
||||
.iter()
|
||||
.map(|posting| {
|
||||
posting.positions()
|
||||
})
|
||||
.collect();
|
||||
|
||||
let num_postings = positions_arr.len() as u32;
|
||||
|
||||
let mut ord = 1u32;
|
||||
let mut pos_candidate = positions_arr[0][0];
|
||||
positions_arr[0] = &(positions_arr[0])[1..];
|
||||
let mut count_matching = 1;
|
||||
|
||||
'outer: loop {
|
||||
let target = pos_candidate + ord;
|
||||
let positions = positions_arr[ord as usize];
|
||||
for i in 0..positions.len() {
|
||||
let pos_i = positions[i];
|
||||
if pos_i < target {
|
||||
continue;
|
||||
}
|
||||
if pos_i == target {
|
||||
count_matching += 1;
|
||||
if count_matching == num_postings {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
else if pos_i > target {
|
||||
count_matching = 1;
|
||||
pos_candidate = positions[i] - ord;
|
||||
positions_arr[ord as usize] = &(positions_arr[ord as usize])[(i+1)..];
|
||||
}
|
||||
ord += 1;
|
||||
if ord == num_postings {
|
||||
ord = 0;
|
||||
}
|
||||
continue 'outer;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DocSet for PhraseScorer<'a> {
|
||||
fn advance(&mut self,) -> bool {
|
||||
while self.intersection_docset.advance() {
|
||||
if self.phrase_match() {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
fn doc(&self,) -> DocId {
|
||||
self.intersection_docset.doc()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<'a> Scorer for PhraseScorer<'a> {
|
||||
fn score(&self,) -> f32 {
|
||||
1f32
|
||||
}
|
||||
|
||||
}
|
||||
39
src/query/phrase_query/phrase_weight.rs
Normal file
39
src/query/phrase_query/phrase_weight.rs
Normal file
@@ -0,0 +1,39 @@
|
||||
use query::Weight;
|
||||
use query::Scorer;
|
||||
use schema::Term;
|
||||
use postings::SegmentPostingsOption;
|
||||
use core::SegmentReader;
|
||||
use super::PhraseScorer;
|
||||
use postings::IntersectionDocSet;
|
||||
use query::EmptyScorer;
|
||||
use Result;
|
||||
|
||||
pub struct PhraseWeight {
|
||||
phrase_terms: Vec<Term>,
|
||||
}
|
||||
|
||||
impl From<Vec<Term>> for PhraseWeight {
|
||||
fn from(phrase_terms: Vec<Term>) -> PhraseWeight {
|
||||
PhraseWeight {
|
||||
phrase_terms: phrase_terms
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Weight for PhraseWeight {
|
||||
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
|
||||
let mut term_postings_list = Vec::new();
|
||||
for term in &self.phrase_terms {
|
||||
let term_postings_option = reader.read_postings(term, SegmentPostingsOption::FreqAndPositions);
|
||||
if let Some(term_postings) = term_postings_option {
|
||||
term_postings_list.push(term_postings);
|
||||
}
|
||||
else {
|
||||
return Ok(box EmptyScorer);
|
||||
}
|
||||
}
|
||||
Ok(box PhraseScorer {
|
||||
intersection_docset: IntersectionDocSet::from(term_postings_list),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -2,25 +2,84 @@ use Result;
|
||||
use collector::Collector;
|
||||
use core::searcher::Searcher;
|
||||
use common::TimerTree;
|
||||
use DocAddress;
|
||||
use query::Explanation;
|
||||
use SegmentLocalId;
|
||||
use super::Weight;
|
||||
use std::fmt;
|
||||
use std::any::Any;
|
||||
|
||||
|
||||
/// Queries represent the query of the user, and are in charge
|
||||
/// of the logic defining the set of documents that should be
|
||||
/// sent to the collector, as well as the way to score the
|
||||
/// documents.
|
||||
pub trait Query {
|
||||
/// Query trait are in charge of defining :
|
||||
///
|
||||
/// - a set of documents
|
||||
/// - a way to score these documents
|
||||
///
|
||||
/// When performing a [search](#method.search), these documents will then
|
||||
/// be pushed to a [Collector](../collector/trait.Collector.html),
|
||||
/// which will in turn be in charge of deciding what to do with them.
|
||||
///
|
||||
/// Concretely, this scored docset is represented by the
|
||||
/// [`Scorer`](./trait.Scorer.html) trait.
|
||||
///
|
||||
/// Because our index is actually split into segments, the
|
||||
/// query does not actually directly creates `DocSet` object.
|
||||
/// Instead, the query creates a [`Weight`](./trait.Weight.html)
|
||||
/// object for a given searcher.
|
||||
///
|
||||
/// The weight object, in turn, makes it possible to create
|
||||
/// a scorer for a specific [`SegmentReader`](../struct.SegmentReader.html).
|
||||
///
|
||||
/// So to sum it up :
|
||||
/// - a `Query` is recipe to define a set of documents as well the way to score them.
|
||||
/// - a `Weight` is this recipe tied to a specific `Searcher`. It may for instance
|
||||
/// hold statistics about the different term of the query. It is created by the query.
|
||||
/// - a `Scorer` is a cursor over the set of matching documents, for a specific
|
||||
/// [`SegmentReader`](../struct.SegmentReader.html). It is created by the [`Weight`](./trait.Weight.html).
|
||||
///
|
||||
/// When implementing a new type of `Query`, it is normal to implement a
|
||||
/// dedicated `Query`, `Weight` and `Scorer`.
|
||||
pub trait Query: fmt::Debug {
|
||||
|
||||
/// Perform the search operation
|
||||
fn search<C: Collector>(
|
||||
/// Used to make it possible to cast Box<Query>
|
||||
/// into a specific type. This is mostly useful for unit tests.
|
||||
fn as_any(&self) -> &Any;
|
||||
|
||||
/// Create the weight associated to a query.
|
||||
///
|
||||
/// See [Weight](./trait.Weight.html).
|
||||
fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>>;
|
||||
|
||||
/// Search works as follows :
|
||||
///
|
||||
/// First the weight object associated to the query is created.
|
||||
///
|
||||
/// Then, the query loops over the segments and for each segment :
|
||||
/// - setup the collector and informs it that the segment being processed has changed.
|
||||
/// - creates a `Scorer` object associated for this segment
|
||||
/// - iterate throw the matched documents and push them to the collector.
|
||||
///
|
||||
fn search(
|
||||
&self,
|
||||
searcher: &Searcher,
|
||||
collector: &mut C) -> Result<TimerTree>;
|
||||
collector: &mut Collector) -> Result<TimerTree> {
|
||||
|
||||
let mut timer_tree = TimerTree::default();
|
||||
let weight = try!(self.weight(searcher));
|
||||
|
||||
/// Explain the score of a specific document
|
||||
fn explain(
|
||||
&self,
|
||||
searcher: &Searcher,
|
||||
doc_address: &DocAddress) -> Result<Explanation>;
|
||||
{
|
||||
let mut search_timer = timer_tree.open("search");
|
||||
for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {
|
||||
let mut segment_search_timer = search_timer.open("segment_search");
|
||||
{
|
||||
let _ = segment_search_timer.open("set_segment");
|
||||
try!(collector.set_segment(segment_ord as SegmentLocalId, &segment_reader));
|
||||
}
|
||||
let mut scorer = try!(weight.scorer(segment_reader));
|
||||
{
|
||||
let _collection_timer = segment_search_timer.open("collection");
|
||||
scorer.collect(collector);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(timer_tree)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,14 +1,8 @@
|
||||
use Result as tantivy_Error;
|
||||
use combine::*;
|
||||
use collector::Collector;
|
||||
use core::searcher::Searcher;
|
||||
use common::TimerTree;
|
||||
use query::{Query, MultiTermQuery};
|
||||
use schema::{Schema, FieldType, Term, Field};
|
||||
use analyzer::SimpleTokenizer;
|
||||
use analyzer::StreamingIterator;
|
||||
use DocAddress;
|
||||
use query::Explanation;
|
||||
use query::Occur;
|
||||
|
||||
|
||||
@@ -61,23 +55,6 @@ pub struct QueryParser {
|
||||
}
|
||||
|
||||
|
||||
/// The `QueryParser` returns a `StandardQuery`.
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
pub enum StandardQuery {
|
||||
MultiTerm(MultiTermQuery),
|
||||
}
|
||||
|
||||
impl StandardQuery {
|
||||
/// Number of terms involved in the query.
|
||||
pub fn num_terms(&self,) -> usize {
|
||||
match *self {
|
||||
StandardQuery::MultiTerm(ref q) => {
|
||||
q.num_terms()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl QueryParser {
|
||||
/// Creates a `QueryParser`
|
||||
@@ -142,7 +119,7 @@ impl QueryParser {
|
||||
///
|
||||
/// Implementing a lenient mode for this query parser is tracked
|
||||
/// in [Issue 5](https://github.com/fulmicoton/tantivy/issues/5)
|
||||
pub fn parse_query(&self, query: &str) -> Result<StandardQuery, ParsingError> {
|
||||
pub fn parse_query(&self, query: &str) -> Result<Box<Query>, ParsingError> {
|
||||
match parser(query_language).parse(query.trim()) {
|
||||
Ok(literals) => {
|
||||
let mut terms_result: Vec<(Occur, Term)> = Vec::new();
|
||||
@@ -154,9 +131,7 @@ impl QueryParser {
|
||||
.map(|term| (occur, term) ));
|
||||
}
|
||||
Ok(
|
||||
StandardQuery::MultiTerm(
|
||||
MultiTermQuery::from(terms_result)
|
||||
)
|
||||
box MultiTermQuery::from(terms_result)
|
||||
)
|
||||
}
|
||||
Err(_) => {
|
||||
@@ -167,24 +142,6 @@ impl QueryParser {
|
||||
}
|
||||
|
||||
|
||||
impl Query for StandardQuery {
|
||||
fn search<C: Collector>(&self, searcher: &Searcher, collector: &mut C) -> tantivy_Error<TimerTree> {
|
||||
match *self {
|
||||
StandardQuery::MultiTerm(ref q) => {
|
||||
q.search(searcher, collector)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn explain(
|
||||
&self,
|
||||
searcher: &Searcher,
|
||||
doc_address: &DocAddress) -> tantivy_Error<Explanation> {
|
||||
match *self {
|
||||
StandardQuery::MultiTerm(ref q) => q.explain(searcher, doc_address)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn compute_terms(field: Field, text: &str) -> Vec<Term> {
|
||||
@@ -324,6 +281,10 @@ mod test {
|
||||
assert!(query_parser.parse("f:@e!e").is_err());
|
||||
}
|
||||
|
||||
// fn extract<T: Query>(query_parser: &QueryParser, q: &str) -> T {
|
||||
// query_parser.parse_query(q).unwrap().as_any().downcast_ref::<T>().unwrap(),
|
||||
// }
|
||||
|
||||
#[test]
|
||||
pub fn test_query_parser() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
@@ -334,9 +295,9 @@ mod test {
|
||||
assert!(query_parser.parse_query("a:b").is_err());
|
||||
{
|
||||
let terms = vec!(Term::from_field_text(title_field, "abctitle"));
|
||||
let query = StandardQuery::MultiTerm(MultiTermQuery::from(terms));
|
||||
let query = MultiTermQuery::from(terms);
|
||||
assert_eq!(
|
||||
query_parser.parse_query("title:abctitle").unwrap(),
|
||||
*query_parser.parse_query("title:abctitle").unwrap().as_any().downcast_ref::<MultiTermQuery>().unwrap(),
|
||||
query
|
||||
);
|
||||
}
|
||||
@@ -345,21 +306,21 @@ mod test {
|
||||
Term::from_field_text(text_field, "abctitle"),
|
||||
Term::from_field_text(author_field, "abctitle"),
|
||||
);
|
||||
let query = StandardQuery::MultiTerm(MultiTermQuery::from(terms));
|
||||
let query = MultiTermQuery::from(terms);
|
||||
assert_eq!(
|
||||
query_parser.parse_query("abctitle").unwrap(),
|
||||
*query_parser.parse_query("abctitle").unwrap().as_any().downcast_ref::<MultiTermQuery>().unwrap(),
|
||||
query
|
||||
);
|
||||
}
|
||||
{
|
||||
let terms = vec!(Term::from_field_text(title_field, "abctitle"));
|
||||
let query = StandardQuery::MultiTerm(MultiTermQuery::from(terms));
|
||||
let query = MultiTermQuery::from(terms);
|
||||
assert_eq!(
|
||||
query_parser.parse_query("title:abctitle ").unwrap(),
|
||||
*query_parser.parse_query("title:abctitle ").unwrap().as_any().downcast_ref::<MultiTermQuery>().unwrap(),
|
||||
query
|
||||
);
|
||||
assert_eq!(
|
||||
query_parser.parse_query(" title:abctitle").unwrap(),
|
||||
*query_parser.parse_query(" title:abctitle").unwrap().as_any().downcast_ref::<MultiTermQuery>().unwrap(),
|
||||
query
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1,13 +1,59 @@
|
||||
use DocSet;
|
||||
use DocId;
|
||||
use Score;
|
||||
use collector::Collector;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
|
||||
|
||||
/// Scored `DocSet`
|
||||
/// Scored set of documents matching a query within a specific segment.
|
||||
///
|
||||
/// See [Query](./trait.Query.html).
|
||||
pub trait Scorer: DocSet {
|
||||
|
||||
/// Returns the score.
|
||||
///
|
||||
/// This method will perform a bit of computation and is not cached.
|
||||
fn score(&self,) -> f32;
|
||||
fn score(&self,) -> Score;
|
||||
|
||||
/// Consumes the complete `DocSet` and
|
||||
/// push the scored documents to the collector.
|
||||
fn collect(&mut self, collector: &mut Collector) {
|
||||
while self.advance() {
|
||||
collector.collect(self.doc(), self.score());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<'a> Scorer for Box<Scorer + 'a> {
|
||||
fn score(&self,) -> Score {
|
||||
self.deref().score()
|
||||
}
|
||||
|
||||
fn collect(&mut self, collector: &mut Collector) {
|
||||
let scorer = self.deref_mut();
|
||||
while scorer.advance() {
|
||||
collector.collect(scorer.doc(), scorer.score());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// EmptyScorer is a dummy Scorer in which no document matches.
|
||||
///
|
||||
/// It is useful for tests and handling edge cases.
|
||||
pub struct EmptyScorer;
|
||||
|
||||
impl DocSet for EmptyScorer {
|
||||
fn advance(&mut self,) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn doc(&self,) -> DocId {
|
||||
DocId::max_value()
|
||||
}
|
||||
}
|
||||
|
||||
impl Scorer for EmptyScorer {
|
||||
fn score(&self,) -> Score {
|
||||
0f32
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
use Score;
|
||||
use query::Explanation;
|
||||
use query::MultiTermAccumulator;
|
||||
|
||||
/// Similarity score
|
||||
pub trait Similarity: MultiTermAccumulator {
|
||||
|
||||
/// Compute and returns the similarity score,
|
||||
///
|
||||
/// The results are not cached.
|
||||
fn score(&self, ) -> Score;
|
||||
|
||||
/// Explain the computation of this similarity given all of
|
||||
/// terms information.
|
||||
///
|
||||
/// `vals` is an array of `(term_ord, term_freq, field_norm)`.
|
||||
/// Terms that are not present should not appear in the array.
|
||||
fn explain(&self, vals: &[(usize, u32, u32)]) -> Explanation;
|
||||
}
|
||||
@@ -1,50 +0,0 @@
|
||||
use Score;
|
||||
use super::MultiTermAccumulator;
|
||||
use super::Similarity;
|
||||
use super::Explanation;
|
||||
|
||||
|
||||
/// Wrapper over a similarity used to run `explain`
|
||||
pub struct SimilarityExplainer<TSimilarity: Similarity + Sized> {
|
||||
scorer: TSimilarity,
|
||||
vals: Vec<(usize, u32, u32)>,
|
||||
}
|
||||
|
||||
impl<TSimilarity: Similarity + Sized> SimilarityExplainer<TSimilarity> {
|
||||
/// Returns the underlying similary's score explanation.
|
||||
pub fn explain_score(&self,) -> Explanation {
|
||||
self.scorer.explain(&self.vals)
|
||||
}
|
||||
}
|
||||
|
||||
impl<TSimilarity: Similarity + Sized> From<TSimilarity> for SimilarityExplainer<TSimilarity> {
|
||||
fn from(multi_term_scorer: TSimilarity) -> SimilarityExplainer<TSimilarity> {
|
||||
SimilarityExplainer {
|
||||
scorer: multi_term_scorer,
|
||||
vals: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<TSimilarity: Similarity + Sized> MultiTermAccumulator for SimilarityExplainer<TSimilarity> {
|
||||
fn update(&mut self, term_ord: usize, term_freq: u32, fieldnorm: u32) {
|
||||
self.vals.push((term_ord, term_freq, fieldnorm));
|
||||
self.scorer.update(term_ord, term_freq, fieldnorm);
|
||||
}
|
||||
|
||||
fn clear(&mut self,) {
|
||||
self.vals.clear();
|
||||
self.scorer.clear();
|
||||
}
|
||||
}
|
||||
|
||||
impl<TSimilarity: Similarity + Sized> Similarity for SimilarityExplainer<TSimilarity> {
|
||||
|
||||
fn score(&self,) -> Score {
|
||||
self.scorer.score()
|
||||
}
|
||||
|
||||
fn explain(&self, vals: &[(usize, u32, u32)]) -> Explanation {
|
||||
self.scorer.explain(vals)
|
||||
}
|
||||
}
|
||||
7
src/query/term_query/mod.rs
Normal file
7
src/query/term_query/mod.rs
Normal file
@@ -0,0 +1,7 @@
|
||||
mod term_query;
|
||||
mod term_weight;
|
||||
mod term_scorer;
|
||||
|
||||
pub use self::term_query::TermQuery;
|
||||
pub use self::term_weight::TermWeight;
|
||||
pub use self::term_scorer::TermScorer;
|
||||
58
src/query/term_query/term_query.rs
Normal file
58
src/query/term_query/term_query.rs
Normal file
@@ -0,0 +1,58 @@
|
||||
use Term;
|
||||
use Result;
|
||||
use super::term_weight::TermWeight;
|
||||
use query::Query;
|
||||
use query::Weight;
|
||||
use postings::SegmentPostingsOption;
|
||||
use Searcher;
|
||||
use std::any::Any;
|
||||
|
||||
/// A Term query matches all of the documents
|
||||
/// containing a specific term.
|
||||
///
|
||||
/// The score associated is defined as
|
||||
/// `idf` * sqrt(`term_freq` / `field norm`)
|
||||
/// in which :
|
||||
/// * idf - inverse document frequency.
|
||||
/// * term_freq - number of occurrences of the term in the field
|
||||
/// * field norm - number of tokens in the field.
|
||||
#[derive(Debug)]
|
||||
pub struct TermQuery {
|
||||
term: Term,
|
||||
}
|
||||
|
||||
impl TermQuery {
|
||||
|
||||
/// Returns a weight object.
|
||||
///
|
||||
/// While `.weight(...)` returns a boxed trait object,
|
||||
/// this method return a specific implementation.
|
||||
/// This is useful for optimization purpose.
|
||||
pub fn specialized_weight(&self, searcher: &Searcher) -> TermWeight {
|
||||
TermWeight {
|
||||
num_docs: searcher.num_docs(),
|
||||
doc_freq: searcher.doc_freq(&self.term),
|
||||
term: self.term.clone(),
|
||||
segment_postings_options: SegmentPostingsOption::NoFreq,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Term> for TermQuery {
|
||||
fn from(term: Term) -> TermQuery {
|
||||
TermQuery {
|
||||
term: term
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Query for TermQuery {
|
||||
fn as_any(&self) -> &Any {
|
||||
self
|
||||
}
|
||||
|
||||
fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>> {
|
||||
Ok(box self.specialized_weight(searcher))
|
||||
}
|
||||
|
||||
}
|
||||
37
src/query/term_query/term_scorer.rs
Normal file
37
src/query/term_query/term_scorer.rs
Normal file
@@ -0,0 +1,37 @@
|
||||
use Score;
|
||||
use DocId;
|
||||
use fastfield::U32FastFieldReader;
|
||||
use postings::DocSet;
|
||||
use query::Scorer;
|
||||
use postings::Postings;
|
||||
|
||||
pub struct TermScorer<TPostings> where TPostings: Postings {
|
||||
pub idf: Score,
|
||||
pub fieldnorm_reader: U32FastFieldReader,
|
||||
pub postings: TPostings,
|
||||
}
|
||||
|
||||
impl<TPostings> TermScorer<TPostings> where TPostings: Postings {
|
||||
pub fn postings(&self) -> &TPostings {
|
||||
&self.postings
|
||||
}
|
||||
}
|
||||
|
||||
impl<TPostings> DocSet for TermScorer<TPostings> where TPostings: Postings {
|
||||
fn advance(&mut self,) -> bool {
|
||||
self.postings.advance()
|
||||
}
|
||||
|
||||
fn doc(&self,) -> DocId {
|
||||
self.postings.doc()
|
||||
}
|
||||
}
|
||||
|
||||
impl<TPostings> Scorer for TermScorer<TPostings> where TPostings: Postings {
|
||||
fn score(&self,) -> Score {
|
||||
let doc = self.postings.doc();
|
||||
let field_norm = self.fieldnorm_reader.get(doc);
|
||||
self.idf * (self.postings.term_freq() as f32 / field_norm as f32).sqrt()
|
||||
}
|
||||
}
|
||||
|
||||
56
src/query/term_query/term_weight.rs
Normal file
56
src/query/term_query/term_weight.rs
Normal file
@@ -0,0 +1,56 @@
|
||||
use Term;
|
||||
use query::Weight;
|
||||
use core::SegmentReader;
|
||||
use query::Scorer;
|
||||
use postings::SegmentPostingsOption;
|
||||
use postings::SegmentPostings;
|
||||
use fastfield::U32FastFieldReader;
|
||||
use super::term_scorer::TermScorer;
|
||||
use Result;
|
||||
|
||||
pub struct TermWeight {
|
||||
pub num_docs: u32,
|
||||
pub doc_freq: u32,
|
||||
pub term: Term,
|
||||
pub segment_postings_options: SegmentPostingsOption,
|
||||
}
|
||||
|
||||
|
||||
impl Weight for TermWeight {
|
||||
|
||||
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
|
||||
let specialized_scorer = try!(self.specialized_scorer(reader));
|
||||
Ok(box specialized_scorer)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
impl TermWeight {
|
||||
|
||||
fn idf(&self) -> f32 {
|
||||
1.0 + (self.num_docs as f32 / (self.doc_freq as f32 + 1.0)).ln()
|
||||
}
|
||||
|
||||
pub fn specialized_scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<TermScorer<SegmentPostings<'a>>> {
|
||||
let field = self.term.field();
|
||||
let fieldnorm_reader = try!(reader.get_fieldnorms_reader(field));
|
||||
Ok(
|
||||
reader
|
||||
.read_postings(&self.term, self.segment_postings_options)
|
||||
.map(|segment_postings|
|
||||
TermScorer {
|
||||
idf: self.idf(),
|
||||
fieldnorm_reader: fieldnorm_reader,
|
||||
postings: segment_postings,
|
||||
}
|
||||
)
|
||||
.unwrap_or(
|
||||
TermScorer {
|
||||
idf: 1f32,
|
||||
fieldnorm_reader: U32FastFieldReader::empty(),
|
||||
postings: SegmentPostings::empty()
|
||||
})
|
||||
)
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,143 +0,0 @@
|
||||
use Score;
|
||||
use super::MultiTermAccumulator;
|
||||
use super::Explanation;
|
||||
use super::Similarity;
|
||||
|
||||
|
||||
/// `TfIdf` is the default pertinence score in tantivy.
|
||||
///
|
||||
/// See [Tf-Idf in the global documentation](https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html)
|
||||
#[derive(Clone)]
|
||||
pub struct TfIdf {
|
||||
coords: Vec<f32>,
|
||||
idf: Vec<f32>,
|
||||
score: f32,
|
||||
num_fields: usize,
|
||||
term_names: Option<Vec<String>>, //< only here for explain
|
||||
}
|
||||
|
||||
impl MultiTermAccumulator for TfIdf {
|
||||
|
||||
#[inline]
|
||||
fn update(&mut self, term_ord: usize, term_freq: u32, fieldnorm: u32) {
|
||||
assert!(term_freq != 0u32);
|
||||
self.score += self.term_score(term_ord, term_freq, fieldnorm);
|
||||
self.num_fields += 1;
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn clear(&mut self,) {
|
||||
self.score = 0f32;
|
||||
self.num_fields = 0;
|
||||
}
|
||||
}
|
||||
|
||||
impl TfIdf {
|
||||
/// Constructor
|
||||
/// * coords - Coords act as a boosting factor for queries
|
||||
/// containing many terms. The coords must have a length
|
||||
/// of `num_terms + 1`
|
||||
/// * idf - idf value for each given term. `idf` must
|
||||
/// have a length of `num_terms`.
|
||||
pub fn new(coords: Vec<f32>, idf: Vec<f32>) -> TfIdf {
|
||||
TfIdf {
|
||||
coords: coords,
|
||||
idf: idf,
|
||||
score: 0f32,
|
||||
num_fields: 0,
|
||||
term_names: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute the coord term
|
||||
fn coord(&self,) -> f32 {
|
||||
self.coords[self.num_fields]
|
||||
}
|
||||
|
||||
/// Set the term names for the explain function
|
||||
pub fn set_term_names(&mut self, term_names: Vec<String>) {
|
||||
self.term_names = Some(term_names);
|
||||
}
|
||||
|
||||
/// Return the name for the ordinal `ord`
|
||||
fn term_name(&self, ord: usize) -> String {
|
||||
match self.term_names {
|
||||
Some(ref term_names_vec) => term_names_vec[ord].clone(),
|
||||
None => format!("Field({})", ord)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn term_score(&self, term_ord: usize, term_freq: u32, field_norm: u32) -> f32 {
|
||||
(term_freq as f32 / field_norm as f32).sqrt() * self.idf[term_ord]
|
||||
}
|
||||
}
|
||||
|
||||
impl Similarity for TfIdf {
|
||||
|
||||
#[inline]
|
||||
fn score(&self, ) -> Score {
|
||||
self.score * self.coord()
|
||||
}
|
||||
|
||||
fn explain(&self, vals: &[(usize, u32, u32)]) -> Explanation {
|
||||
let score = self.score();
|
||||
let mut explanation = Explanation::with_val(score);
|
||||
let formula_components: Vec<String> = vals.iter()
|
||||
.map(|&(ord, _, _)| ord)
|
||||
.map(|ord| format!("<score for ({}>", self.term_name(ord)))
|
||||
.collect();
|
||||
let formula = format!("<coord> * ({})", formula_components.join(" + "));
|
||||
explanation.set_formula(&formula);
|
||||
for &(ord, term_freq, field_norm) in vals {
|
||||
let term_score = self.term_score(ord, term_freq, field_norm);
|
||||
let term_explanation = explanation.add_child(&self.term_name(ord), term_score);
|
||||
term_explanation.set_formula(" sqrt(<term_freq> / <field_norm>) * <idf>");
|
||||
}
|
||||
explanation
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use query::MultiTermAccumulator;
|
||||
use query::Similarity;
|
||||
|
||||
fn abs_diff(left: f32, right: f32) -> f32 {
|
||||
(right - left).abs()
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_tfidf() {
|
||||
let mut tfidf = TfIdf::new(vec!(0f32, 1f32, 2f32), vec!(1f32, 4f32));
|
||||
{
|
||||
tfidf.update(0, 1, 1);
|
||||
assert!(abs_diff(tfidf.score(), 1f32) < 0.001f32);
|
||||
tfidf.clear();
|
||||
}
|
||||
{
|
||||
tfidf.update(1, 1, 1);
|
||||
assert_eq!(tfidf.score(), 4f32);
|
||||
tfidf.clear();
|
||||
}
|
||||
{
|
||||
tfidf.update(0, 2, 1);
|
||||
assert!(abs_diff(tfidf.score(), 1.4142135) < 0.001f32);
|
||||
tfidf.clear();
|
||||
}
|
||||
{
|
||||
tfidf.update(0, 1, 1);
|
||||
tfidf.update(1, 1, 1);
|
||||
assert_eq!(tfidf.score(), 10f32);
|
||||
tfidf.clear();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
16
src/query/weight.rs
Normal file
16
src/query/weight.rs
Normal file
@@ -0,0 +1,16 @@
|
||||
use super::Scorer;
|
||||
use Result;
|
||||
use core::SegmentReader;
|
||||
|
||||
|
||||
/// A Weight is the specialization of a Query
|
||||
/// for a given set of segments.
|
||||
///
|
||||
/// See [Query](./trait.Query.html).
|
||||
pub trait Weight {
|
||||
|
||||
/// Returns the scorer for the given segment.
|
||||
/// See [Query](./trait.Query.html).
|
||||
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>>;
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user