This commit is contained in:
Paul Masurel
2016-01-31 21:11:07 +09:00
parent 178a99a5de
commit 484bafd144
4 changed files with 130 additions and 208 deletions

View File

@@ -2,176 +2,50 @@ use std::fmt;
use std::fmt::{Debug, Formatter}; use std::fmt::{Debug, Formatter};
use std::io::prelude::Read; use std::io::prelude::Read;
use core::global::DocId; use core::global::DocId;
use std::cmp::Ordering;
use std::vec; use std::vec;
//////////////////////////////////// ////////////////////////////////////
pub trait Postings: Iterator<Item=DocId> {
pub trait Postings {
type IteratorType: Iterator<Item=DocId>;
fn iter(&self) -> Self::IteratorType;
} }
impl<T: Iterator<Item=DocId>> Postings for T {}
/// A posting list backed by a cloneable byte reader `R`.
#[derive(Clone)]
pub struct SimplePostings<R>
    where R: Read + Clone
{
    /// Byte source from which the postings are read.
    reader: R,
}
/// Iterator state for walking a posting list stored in a reader `R`.
pub struct SimplePostingsIterator<R>
    where R: Read
{
    /// Byte source consumed as the iterator advances.
    reader: R,
}
/// Lets a `SimplePostings` be traversed through the `Postings` trait.
impl<R> Postings for SimplePostings<R>
    where R: Read + Clone
{
    type IteratorType = SimplePostingsIterator<R>;

    /// Builds a new iterator around a clone of the stored reader.
    fn iter(&self) -> SimplePostingsIterator<R> {
        let cloned_reader = self.reader.clone();
        SimplePostingsIterator {
            reader: cloned_reader,
        }
    }
}
/// Decodes doc ids from fixed-size 8-byte records read from the underlying reader.
impl<R: Read> Iterator for SimplePostingsIterator<R> {
    type Item = DocId;

    /// Reads the next 8-byte record and returns the doc id stored in its first
    /// four bytes.
    ///
    /// Returns `None` once a full record can no longer be read (end of data,
    /// truncated trailing record, or any I/O error).
    fn next(&mut self) -> Option<DocId> {
        let mut record = [0u8; 8];
        // `read_exact` keeps reading until the record is complete, so a short
        // read in the middle of the stream is not mistaken for the end.
        if self.reader.read_exact(&mut record).is_err() {
            return None;
        }
        // Safe decoding of the leading four bytes; the previous pointer cast
        // dereferenced the *value* of the first byte as an address.
        // NOTE(review): little-endian is assumed here (same as the
        // `LittleEndian` reads used elsewhere in this crate) — confirm against
        // the writer. The trailing four bytes of the record are not interpreted.
        let doc_id = (record[0] as DocId)
            | ((record[1] as DocId) << 8)
            | ((record[2] as DocId) << 16)
            | ((record[3] as DocId) << 24);
        Some(doc_id)
    }
}
/// Debug output lists every doc id produced by iterating the postings.
impl<R: Read + Clone> Debug for SimplePostings<R> {
    /// Writes `Postings([..])` with the collected doc ids.
    ///
    /// Any error reported by the formatter is returned to the caller instead
    /// of being discarded.
    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
        let doc_ids: Vec<DocId> = self.iter().collect();
        write!(f, "Postings({:?})", doc_ids)
    }
}
/// Pairs two borrowed posting lists so they can be traversed together.
pub struct IntersectionPostings<'a,
                                LeftPostingsType: Postings + 'static,
                                RightPostingsType: Postings + 'static>
{
    /// Borrowed left-hand posting list.
    left: &'a LeftPostingsType,
    /// Borrowed right-hand posting list.
    right: &'a RightPostingsType,
}
/// Traverses the pair of posting lists through an `IntersectionIterator`.
impl<'a, LeftPostingsType, RightPostingsType> Postings
    for IntersectionPostings<'a, LeftPostingsType, RightPostingsType>
    where LeftPostingsType: Postings + 'static,
          RightPostingsType: Postings + 'static
{
    type IteratorType = IntersectionIterator<LeftPostingsType, RightPostingsType>;

    /// Opens an iterator on each side and pre-fetches the first doc id of
    /// both, so the returned iterator starts with its look-ahead slots filled.
    fn iter(&self) -> Self::IteratorType {
        let mut left_docs = self.left.iter();
        let mut right_docs = self.right.iter();
        // Left is primed before right, matching the order the lists were opened in.
        let first_left = left_docs.next();
        let first_right = right_docs.next();
        IntersectionIterator {
            left: left_docs,
            right: right_docs,
            next_left: first_left,
            next_right: first_right,
        }
    }
}
/// Wraps two borrowed posting lists into an `IntersectionPostings`.
///
/// Nothing is read from either list here; the references are only stored.
pub fn intersection<'a, LeftPostingsType, RightPostingsType>(
    left: &'a LeftPostingsType,
    right: &'a RightPostingsType)
    -> IntersectionPostings<'a, LeftPostingsType, RightPostingsType>
    where LeftPostingsType: Postings + 'static,
          RightPostingsType: Postings + 'static
{
    let paired = IntersectionPostings {
        left: left,
        right: right,
    };
    paired
}
/// Iterator state over two posting lists, with one doc id of look-ahead
/// kept for each side.
pub struct IntersectionIterator<LeftPostingsType, RightPostingsType>
    where LeftPostingsType: Postings,
          RightPostingsType: Postings
{
    /// Remaining doc ids of the left list.
    left: LeftPostingsType::IteratorType,
    /// Remaining doc ids of the right list.
    right: RightPostingsType::IteratorType,
    /// Doc id most recently pulled from `left`, or `None` once it is exhausted.
    next_left: Option<DocId>,
    /// Doc id most recently pulled from `right`, or `None` once it is exhausted.
    next_right: Option<DocId>,
}
/// Yields the doc ids present in both underlying iterators.
///
/// Both sides are expected to produce doc ids in increasing order; the merge
/// below only ever advances the side holding the smaller id.
impl<LeftPostingsType: Postings, RightPostingsType: Postings>
Iterator for IntersectionIterator<LeftPostingsType, RightPostingsType> {

    type Item = DocId;

    /// Advances until both look-ahead slots hold the same doc id and returns
    /// it, or returns `None` as soon as either side is exhausted.
    fn next(&mut self) -> Option<DocId> {
        loop {
            let (left_val, right_val) = match (self.next_left, self.next_right) {
                (Some(left_val), Some(right_val)) => (left_val, right_val),
                // One side has run dry: no further common doc id is possible.
                _ => return None,
            };
            if left_val < right_val {
                self.next_left = self.left.next();
            }
            else if left_val > right_val {
                // Was `right_val > right_val` (always false), which made a
                // larger left id fall through to the "match" branch below.
                self.next_right = self.right.next();
            }
            else {
                // Same doc id on both sides: step past it and emit it.
                self.next_left = self.left.next();
                self.next_right = self.right.next();
                return Some(left_val);
            }
        }
    }
}
#[derive(Debug)] #[derive(Debug)]
pub struct VecPostings { pub struct VecPostings {
postings: Vec<DocId>, doc_ids: Vec<DocId>,
cursor: usize,
} }
impl VecPostings { impl VecPostings {
pub fn new(vals: Vec<DocId>) -> VecPostings { pub fn new(vals: Vec<DocId>) -> VecPostings {
VecPostings { VecPostings {
postings: vals doc_ids: vals,
cursor: -1,
} }
} }
} }
impl Postings for VecPostings {
type IteratorType = vec::IntoIter<DocId>;
fn iter(&self) -> vec::IntoIter<DocId> { impl Iterator for VecPostings {
self.postings.clone().into_iter() type Item = DocId;
fn next(&mut self,) -> Option<DocId> {
} if self.cursor + 1 >= self.doc_ids.len() {
None
}
else {
self.cursor += 1;
Some(self.doc_ids[self.cursor])
}
}
} }
impl<'a, L: Postings + 'static, R: Postings + 'static> Debug for IntersectionPostings<'a, L, R> {
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { // impl<'a, L: Postings + 'static, R: Postings + 'static> Debug for IntersectionPostings<'a, L, R> {
let posting_lists: Vec<DocId> = self.iter().collect(); // fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
write!(f, "Postings({:?})", posting_lists); // write!(f, "Postings({:?})", self.doc_ids);
Ok(()) // Ok(())
} // }
} // }

View File

@@ -1,10 +1,12 @@
use core::directory::Directory; use core::directory::Directory;
use core::directory::Segment; use core::directory::Segment;
use std::collections::BinaryHeap;
use core::schema::Term; use core::schema::Term;
use fst::Streamer; use fst::Streamer;
use fst; use fst;
use std::io; use std::io;
use fst::raw::Fst; use fst::raw::Fst;
use std::cmp::{Eq,PartialEq,Ord,PartialOrd,Ordering};
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use std::borrow::Borrow; use std::borrow::Borrow;
use std::io::Cursor; use std::io::Cursor;
@@ -26,7 +28,8 @@ pub struct SegmentReader {
pub struct SegmentPostings<'a> { pub struct SegmentPostings<'a> {
cursor: Cursor<&'a [u8]>, cursor: Cursor<&'a [u8]>,
doc_freq: usize, num_docs_remaining: usize,
current_doc_id: DocId,
} }
impl<'a> SegmentPostings<'a> { impl<'a> SegmentPostings<'a> {
@@ -36,7 +39,8 @@ impl<'a> SegmentPostings<'a> {
let doc_freq = cursor.read_u32::<LittleEndian>().unwrap() as usize; let doc_freq = cursor.read_u32::<LittleEndian>().unwrap() as usize;
SegmentPostings { SegmentPostings {
cursor: cursor, cursor: cursor,
doc_freq: doc_freq, num_docs_remaining: doc_freq,
current_doc_id: 0,
} }
} }
} }
@@ -44,20 +48,8 @@ impl<'a> SegmentPostings<'a> {
impl<'a> Iterator for SegmentPostings<'a> {
pub struct SegmentPostingsIterator<'a> {
cursor: Cursor<&'a [u8]>,
num_docs_remaining: usize,
}
impl<'a> Iterator for SegmentPostingsIterator<'a> {
type Item = DocId; type Item = DocId;
fn next(&mut self,) -> Option<DocId> { fn next(&mut self,) -> Option<DocId> {
@@ -65,53 +57,111 @@ impl<'a> Iterator for SegmentPostingsIterator<'a> {
None None
} }
else { else {
Some(self.cursor.read_u32::<LittleEndian>().unwrap() as DocId) self.current_doc_id = self.cursor.read_u32::<LittleEndian>().unwrap() as DocId;
} Some(self.current_doc_id)
}
}
impl<'a> Postings for SegmentPostings<'a> {
type IteratorType = SegmentPostingsIterator<'a>;
fn iter(&self) -> SegmentPostingsIterator<'a> {
SegmentPostingsIterator {
cursor: self.cursor.clone(),
num_docs_remaining: self.doc_freq,
} }
} }
} }
pub struct ConjunctionPostings<'a> {
segment_postings: Vec<SegmentPostings<'a>>, struct OrderedPostings<T: Postings> {
postings: T,
current_el: DocId,
} }
impl<'a> Postings for ConjunctionPostings<'a> { impl<T: Postings> OrderedPostings<T> {
type IteratorType = ConjunctionPostingsIterator<'a>;
fn iter(&self) -> ConjunctionPostingsIterator<'a> { pub fn get(&self,) -> DocId {
ConjunctionPostingsIterator { self.current_el
postings_it: self.segment_postings }
.iter()
.map(|postings| postings.iter()) pub fn from_postings(mut postings: T) -> Option<OrderedPostings<T>> {
.collect() match(postings.next()) {
Some(doc_id) => Some(OrderedPostings {
postings: postings,
current_el: doc_id,
}),
None => None
} }
} }
} }
pub struct ConjunctionPostingsIterator<'a> {
postings_it: Vec<SegmentPostingsIterator<'a>>,
}
impl<'a> Iterator for ConjunctionPostingsIterator<'a> {
impl<T: Postings> Iterator for OrderedPostings<T> {
type Item = DocId; type Item = DocId;
fn next(&mut self,) -> Option<DocId> {
match self.postings.next() {
Some(doc_id) => {
self.current_el = doc_id;
return Some(doc_id);
},
None => None
}
}
}
fn next(&mut self) -> Option<DocId> { impl<T: Postings> Ord for OrderedPostings<T> {
fn cmp(&self, other: &Self) -> Ordering {
other.current_el.cmp(&self.current_el)
}
}
/// Partial ordering by `current_el`, reversed: the smaller doc id compares
/// as the greater value.
impl<T: Postings> PartialOrd for OrderedPostings<T> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        let natural = self.current_el.cmp(&other.current_el);
        // Flip the natural order so a smaller `current_el` ranks higher.
        Some(natural.reverse())
    }
}
/// Two `OrderedPostings` are equal when they currently point at the same
/// doc id.
///
/// This mirrors the ordering impls, which compare `current_el` only, so that
/// `a == b` holds exactly when `a.cmp(&b)` is `Equal`. The previous `eq`
/// always returned `false`, which broke the reflexivity `Eq` promises.
impl<T: Postings> PartialEq for OrderedPostings<T> {
    fn eq(&self, other: &Self) -> bool {
        self.current_el == other.current_el
    }
}

/// Equality on `current_el` is a total equivalence relation, so `Eq` holds.
impl<T: Postings> Eq for OrderedPostings<T> {
}
/// State for combining several posting lists of the same type `T`.
pub struct IntersectionPostings<T>
    where T: Postings
{
    /// Heap of the underlying posting lists, each wrapped together with the
    /// doc id it currently points at.
    postings: BinaryHeap<OrderedPostings<T>>,
    /// Doc id bookkeeping slot; initialised to 0 by the constructor.
    current_doc_id: DocId,
}
impl<T: Postings> IntersectionPostings<T> {
pub fn from_postings(mut postings: Vec<T>) -> IntersectionPostings<T> {
let mut ordered_postings = Vec::new();
for posting in postings.into_iter() {
match OrderedPostings::from_postings(posting) {
Some(ordered_posting) =>{
ordered_postings.push(ordered_posting);
},
None => {
return IntersectionPostings {
postings: BinaryHeap::new(),
current_doc_id: 0,
}
}
}
}
IntersectionPostings {
postings: ordered_postings.into_iter().collect(),
current_doc_id: 0,
}
}
}
impl<T: Postings> Iterator for IntersectionPostings<T> {
type Item = DocId;
fn next(&mut self,) -> Option<DocId> {
None None
} }
} }
impl SegmentReader { impl SegmentReader {
pub fn open(segment: Segment) -> Result<SegmentReader> { pub fn open(segment: Segment) -> Result<SegmentReader> {
@@ -144,14 +194,12 @@ impl SegmentReader {
} }
} }
pub fn search<'a>(&'a self, terms: &Vec<Term>) -> ConjunctionPostings<'a> { pub fn search<'a>(&'a self, terms: &Vec<Term>) -> IntersectionPostings<SegmentPostings<'a>> {
let segment_postings = terms let segment_postings: Vec<SegmentPostings> = terms
.iter() .iter()
.map(|term| self.get_term(term).unwrap()) .map(|term| self.get_term(term).unwrap())
.collect(); .collect();
ConjunctionPostings { IntersectionPostings::from_postings(segment_postings)
segment_postings: segment_postings
}
} }
} }

View File

@@ -26,7 +26,7 @@ impl Searcher {
pub fn search(&self, terms: &Vec<Term>, collector: &mut Collector) { pub fn search(&self, terms: &Vec<Term>, collector: &mut Collector) {
for segment in &self.segments { for segment in &self.segments {
let postings = segment.search(terms); let postings = segment.search(terms);
for doc_id in postings.iter() { for doc_id in postings {
collector.collect(doc_id); collector.collect(doc_id);
} }
collector.set_segment(&segment); collector.set_segment(&segment);

View File

@@ -2,7 +2,7 @@ extern crate tantivy;
extern crate regex; extern crate regex;
extern crate tempdir; extern crate tempdir;
use tantivy::core::postings::{VecPostings, intersection}; use tantivy::core::postings::VecPostings;
use tantivy::core::postings::Postings; use tantivy::core::postings::Postings;
use tantivy::core::analyzer::tokenize; use tantivy::core::analyzer::tokenize;
use tantivy::core::collector::DisplayCollector; use tantivy::core::collector::DisplayCollector;
@@ -34,14 +34,14 @@ fn test_parse_query() {
} }
} }
#[test] // #[test]
fn test_intersection() { // fn test_intersection() {
let left = VecPostings::new(vec!(1, 3, 9)); // let left = VecPostings::new(vec!(1, 3, 9));
let right = VecPostings::new(vec!(3, 4, 9, 18)); // let right = VecPostings::new(vec!(3, 4, 9, 18));
let inter = intersection(&left, &right); // let inter = intersection(&left, &right);
let vals: Vec<DocId> = inter.iter().collect(); // let vals: Vec<DocId> = inter.iter().collect();
assert_eq!(vals, vec!(3, 9)); // assert_eq!(vals, vec!(3, 9));
} // }
#[test] #[test]
fn test_tokenizer() { fn test_tokenizer() {