mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
*werwer*
This commit is contained in:
@@ -2,176 +2,50 @@ use std::fmt;
|
|||||||
use std::fmt::{Debug, Formatter};
|
use std::fmt::{Debug, Formatter};
|
||||||
use std::io::prelude::Read;
|
use std::io::prelude::Read;
|
||||||
use core::global::DocId;
|
use core::global::DocId;
|
||||||
|
use std::cmp::Ordering;
|
||||||
use std::vec;
|
use std::vec;
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////
|
////////////////////////////////////
|
||||||
|
|
||||||
|
pub trait Postings: Iterator<Item=DocId> {
|
||||||
pub trait Postings {
|
|
||||||
type IteratorType: Iterator<Item=DocId>;
|
|
||||||
fn iter(&self) -> Self::IteratorType;
|
|
||||||
}
|
}
|
||||||
|
impl<T: Iterator<Item=DocId>> Postings for T {}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#[derive(Clone)]
|
|
||||||
pub struct SimplePostings<R: Read + Clone> {
|
|
||||||
reader: R,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct SimplePostingsIterator<R: Read> {
|
|
||||||
reader: R
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<R: Read + Clone> Postings for SimplePostings<R> {
|
|
||||||
|
|
||||||
type IteratorType = SimplePostingsIterator<R>;
|
|
||||||
|
|
||||||
fn iter(&self) -> Self::IteratorType {
|
|
||||||
SimplePostingsIterator {
|
|
||||||
reader: self.reader.clone()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
impl<R: Read> Iterator for SimplePostingsIterator<R> {
|
|
||||||
|
|
||||||
type Item=DocId;
|
|
||||||
|
|
||||||
fn next(&mut self) -> Option<DocId> {
|
|
||||||
let mut buf: [u8; 8] = [0; 8];
|
|
||||||
match self.reader.read(&mut buf) {
|
|
||||||
Ok(num_bytes) => {
|
|
||||||
if num_bytes == 8 {
|
|
||||||
unsafe {
|
|
||||||
let val = *(*buf.as_ptr() as *const u32);
|
|
||||||
return Some(val)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
return None
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Err(_) => None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
impl<R: Read + Clone> Debug for SimplePostings<R> {
|
|
||||||
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
|
|
||||||
let posting_lists: Vec<DocId> = self.iter().collect();
|
|
||||||
write!(f, "Postings({:?})", posting_lists);
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct IntersectionPostings<'a, LeftPostingsType, RightPostingsType>
|
|
||||||
where LeftPostingsType: Postings + 'static,
|
|
||||||
RightPostingsType: Postings + 'static
|
|
||||||
{
|
|
||||||
left: &'a LeftPostingsType,
|
|
||||||
right: &'a RightPostingsType,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a, LeftPostingsType, RightPostingsType> Postings for IntersectionPostings<'a, LeftPostingsType, RightPostingsType>
|
|
||||||
where LeftPostingsType: Postings + 'static,
|
|
||||||
RightPostingsType: Postings + 'static {
|
|
||||||
|
|
||||||
type IteratorType = IntersectionIterator<LeftPostingsType, RightPostingsType>;
|
|
||||||
|
|
||||||
fn iter(&self) -> IntersectionIterator<LeftPostingsType, RightPostingsType> {
|
|
||||||
let mut left_it = self.left.iter();
|
|
||||||
let mut right_it = self.right.iter();
|
|
||||||
let next_left = left_it.next();
|
|
||||||
let next_right = right_it.next();
|
|
||||||
IntersectionIterator {
|
|
||||||
left: left_it,
|
|
||||||
right: right_it,
|
|
||||||
next_left: next_left,
|
|
||||||
next_right: next_right,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
pub fn intersection<'a, LeftPostingsType, RightPostingsType> (left: &'a LeftPostingsType, right: &'a RightPostingsType) -> IntersectionPostings<'a, LeftPostingsType, RightPostingsType>
|
|
||||||
where LeftPostingsType: Postings + 'static,
|
|
||||||
RightPostingsType: Postings + 'static {
|
|
||||||
IntersectionPostings {
|
|
||||||
left: left,
|
|
||||||
right: right
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
pub struct IntersectionIterator<LeftPostingsType: Postings, RightPostingsType: Postings> {
|
|
||||||
left: LeftPostingsType::IteratorType,
|
|
||||||
right: RightPostingsType::IteratorType,
|
|
||||||
|
|
||||||
next_left: Option<DocId>,
|
|
||||||
next_right: Option<DocId>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<LeftPostingsType: Postings, RightPostingsType: Postings>
|
|
||||||
Iterator for IntersectionIterator<LeftPostingsType, RightPostingsType> {
|
|
||||||
|
|
||||||
type Item = DocId;
|
|
||||||
|
|
||||||
fn next(&mut self,) -> Option<DocId> {
|
|
||||||
loop {
|
|
||||||
match (self.next_left, self.next_right) {
|
|
||||||
(_, None) => {
|
|
||||||
return None;
|
|
||||||
},
|
|
||||||
(None, _) => {
|
|
||||||
return None;
|
|
||||||
},
|
|
||||||
(Some(left_val), Some(right_val)) => {
|
|
||||||
if left_val < right_val {
|
|
||||||
self.next_left = self.left.next();
|
|
||||||
}
|
|
||||||
else if right_val > right_val {
|
|
||||||
self.next_right = self.right.next();
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
self.next_left = self.left.next();
|
|
||||||
self.next_right = self.right.next();
|
|
||||||
return Some(left_val)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct VecPostings {
|
pub struct VecPostings {
|
||||||
postings: Vec<DocId>,
|
doc_ids: Vec<DocId>,
|
||||||
|
cursor: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl VecPostings {
|
impl VecPostings {
|
||||||
pub fn new(vals: Vec<DocId>) -> VecPostings {
|
pub fn new(vals: Vec<DocId>) -> VecPostings {
|
||||||
VecPostings {
|
VecPostings {
|
||||||
postings: vals
|
doc_ids: vals,
|
||||||
|
cursor: -1,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Postings for VecPostings {
|
|
||||||
type IteratorType = vec::IntoIter<DocId>;
|
|
||||||
|
|
||||||
fn iter(&self) -> vec::IntoIter<DocId> {
|
impl Iterator for VecPostings {
|
||||||
self.postings.clone().into_iter()
|
type Item = DocId;
|
||||||
|
fn next(&mut self,) -> Option<DocId> {
|
||||||
}
|
if self.cursor + 1 >= self.doc_ids.len() {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
self.cursor += 1;
|
||||||
|
Some(self.doc_ids[self.cursor])
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, L: Postings + 'static, R: Postings + 'static> Debug for IntersectionPostings<'a, L, R> {
|
|
||||||
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
|
// impl<'a, L: Postings + 'static, R: Postings + 'static> Debug for IntersectionPostings<'a, L, R> {
|
||||||
let posting_lists: Vec<DocId> = self.iter().collect();
|
// fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
|
||||||
write!(f, "Postings({:?})", posting_lists);
|
// write!(f, "Postings({:?})", self.doc_ids);
|
||||||
Ok(())
|
// Ok(())
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
|||||||
@@ -1,10 +1,12 @@
|
|||||||
use core::directory::Directory;
|
use core::directory::Directory;
|
||||||
use core::directory::Segment;
|
use core::directory::Segment;
|
||||||
|
use std::collections::BinaryHeap;
|
||||||
use core::schema::Term;
|
use core::schema::Term;
|
||||||
use fst::Streamer;
|
use fst::Streamer;
|
||||||
use fst;
|
use fst;
|
||||||
use std::io;
|
use std::io;
|
||||||
use fst::raw::Fst;
|
use fst::raw::Fst;
|
||||||
|
use std::cmp::{Eq,PartialEq,Ord,PartialOrd,Ordering};
|
||||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||||
use std::borrow::Borrow;
|
use std::borrow::Borrow;
|
||||||
use std::io::Cursor;
|
use std::io::Cursor;
|
||||||
@@ -26,7 +28,8 @@ pub struct SegmentReader {
|
|||||||
|
|
||||||
pub struct SegmentPostings<'a> {
|
pub struct SegmentPostings<'a> {
|
||||||
cursor: Cursor<&'a [u8]>,
|
cursor: Cursor<&'a [u8]>,
|
||||||
doc_freq: usize,
|
num_docs_remaining: usize,
|
||||||
|
current_doc_id: DocId,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> SegmentPostings<'a> {
|
impl<'a> SegmentPostings<'a> {
|
||||||
@@ -36,7 +39,8 @@ impl<'a> SegmentPostings<'a> {
|
|||||||
let doc_freq = cursor.read_u32::<LittleEndian>().unwrap() as usize;
|
let doc_freq = cursor.read_u32::<LittleEndian>().unwrap() as usize;
|
||||||
SegmentPostings {
|
SegmentPostings {
|
||||||
cursor: cursor,
|
cursor: cursor,
|
||||||
doc_freq: doc_freq,
|
num_docs_remaining: doc_freq,
|
||||||
|
current_doc_id: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -44,20 +48,8 @@ impl<'a> SegmentPostings<'a> {
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
impl<'a> Iterator for SegmentPostings<'a> {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
pub struct SegmentPostingsIterator<'a> {
|
|
||||||
cursor: Cursor<&'a [u8]>,
|
|
||||||
num_docs_remaining: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> Iterator for SegmentPostingsIterator<'a> {
|
|
||||||
type Item = DocId;
|
type Item = DocId;
|
||||||
|
|
||||||
fn next(&mut self,) -> Option<DocId> {
|
fn next(&mut self,) -> Option<DocId> {
|
||||||
@@ -65,53 +57,111 @@ impl<'a> Iterator for SegmentPostingsIterator<'a> {
|
|||||||
None
|
None
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
Some(self.cursor.read_u32::<LittleEndian>().unwrap() as DocId)
|
self.current_doc_id = self.cursor.read_u32::<LittleEndian>().unwrap() as DocId;
|
||||||
}
|
Some(self.current_doc_id)
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> Postings for SegmentPostings<'a> {
|
|
||||||
type IteratorType = SegmentPostingsIterator<'a>;
|
|
||||||
fn iter(&self) -> SegmentPostingsIterator<'a> {
|
|
||||||
SegmentPostingsIterator {
|
|
||||||
cursor: self.cursor.clone(),
|
|
||||||
num_docs_remaining: self.doc_freq,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
pub struct ConjunctionPostings<'a> {
|
|
||||||
segment_postings: Vec<SegmentPostings<'a>>,
|
struct OrderedPostings<T: Postings> {
|
||||||
|
postings: T,
|
||||||
|
current_el: DocId,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Postings for ConjunctionPostings<'a> {
|
impl<T: Postings> OrderedPostings<T> {
|
||||||
type IteratorType = ConjunctionPostingsIterator<'a>;
|
|
||||||
fn iter(&self) -> ConjunctionPostingsIterator<'a> {
|
pub fn get(&self,) -> DocId {
|
||||||
ConjunctionPostingsIterator {
|
self.current_el
|
||||||
postings_it: self.segment_postings
|
}
|
||||||
.iter()
|
|
||||||
.map(|postings| postings.iter())
|
pub fn from_postings(mut postings: T) -> Option<OrderedPostings<T>> {
|
||||||
.collect()
|
match(postings.next()) {
|
||||||
|
Some(doc_id) => Some(OrderedPostings {
|
||||||
|
postings: postings,
|
||||||
|
current_el: doc_id,
|
||||||
|
}),
|
||||||
|
None => None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct ConjunctionPostingsIterator<'a> {
|
|
||||||
postings_it: Vec<SegmentPostingsIterator<'a>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> Iterator for ConjunctionPostingsIterator<'a> {
|
|
||||||
|
|
||||||
|
impl<T: Postings> Iterator for OrderedPostings<T> {
|
||||||
type Item = DocId;
|
type Item = DocId;
|
||||||
|
fn next(&mut self,) -> Option<DocId> {
|
||||||
|
match self.postings.next() {
|
||||||
|
Some(doc_id) => {
|
||||||
|
self.current_el = doc_id;
|
||||||
|
return Some(doc_id);
|
||||||
|
},
|
||||||
|
None => None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn next(&mut self) -> Option<DocId> {
|
impl<T: Postings> Ord for OrderedPostings<T> {
|
||||||
|
fn cmp(&self, other: &Self) -> Ordering {
|
||||||
|
other.current_el.cmp(&self.current_el)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Postings> PartialOrd for OrderedPostings<T> {
|
||||||
|
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||||
|
Some(other.current_el.cmp(&self.current_el))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Postings> PartialEq for OrderedPostings<T> {
|
||||||
|
fn eq(&self, other: &Self) -> bool {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Postings> Eq for OrderedPostings<T> {
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct IntersectionPostings<T: Postings> {
|
||||||
|
postings: BinaryHeap<OrderedPostings<T>>,
|
||||||
|
current_doc_id: DocId,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Postings> IntersectionPostings<T> {
|
||||||
|
pub fn from_postings(mut postings: Vec<T>) -> IntersectionPostings<T> {
|
||||||
|
let mut ordered_postings = Vec::new();
|
||||||
|
for posting in postings.into_iter() {
|
||||||
|
match OrderedPostings::from_postings(posting) {
|
||||||
|
Some(ordered_posting) =>{
|
||||||
|
ordered_postings.push(ordered_posting);
|
||||||
|
},
|
||||||
|
None => {
|
||||||
|
return IntersectionPostings {
|
||||||
|
postings: BinaryHeap::new(),
|
||||||
|
current_doc_id: 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
IntersectionPostings {
|
||||||
|
postings: ordered_postings.into_iter().collect(),
|
||||||
|
current_doc_id: 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
impl<T: Postings> Iterator for IntersectionPostings<T> {
|
||||||
|
type Item = DocId;
|
||||||
|
fn next(&mut self,) -> Option<DocId> {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
impl SegmentReader {
|
impl SegmentReader {
|
||||||
|
|
||||||
pub fn open(segment: Segment) -> Result<SegmentReader> {
|
pub fn open(segment: Segment) -> Result<SegmentReader> {
|
||||||
@@ -144,14 +194,12 @@ impl SegmentReader {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn search<'a>(&'a self, terms: &Vec<Term>) -> ConjunctionPostings<'a> {
|
pub fn search<'a>(&'a self, terms: &Vec<Term>) -> IntersectionPostings<SegmentPostings<'a>> {
|
||||||
let segment_postings = terms
|
let segment_postings: Vec<SegmentPostings> = terms
|
||||||
.iter()
|
.iter()
|
||||||
.map(|term| self.get_term(term).unwrap())
|
.map(|term| self.get_term(term).unwrap())
|
||||||
.collect();
|
.collect();
|
||||||
ConjunctionPostings {
|
IntersectionPostings::from_postings(segment_postings)
|
||||||
segment_postings: segment_postings
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ impl Searcher {
|
|||||||
pub fn search(&self, terms: &Vec<Term>, collector: &mut Collector) {
|
pub fn search(&self, terms: &Vec<Term>, collector: &mut Collector) {
|
||||||
for segment in &self.segments {
|
for segment in &self.segments {
|
||||||
let postings = segment.search(terms);
|
let postings = segment.search(terms);
|
||||||
for doc_id in postings.iter() {
|
for doc_id in postings {
|
||||||
collector.collect(doc_id);
|
collector.collect(doc_id);
|
||||||
}
|
}
|
||||||
collector.set_segment(&segment);
|
collector.set_segment(&segment);
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ extern crate tantivy;
|
|||||||
extern crate regex;
|
extern crate regex;
|
||||||
extern crate tempdir;
|
extern crate tempdir;
|
||||||
|
|
||||||
use tantivy::core::postings::{VecPostings, intersection};
|
use tantivy::core::postings::VecPostings;
|
||||||
use tantivy::core::postings::Postings;
|
use tantivy::core::postings::Postings;
|
||||||
use tantivy::core::analyzer::tokenize;
|
use tantivy::core::analyzer::tokenize;
|
||||||
use tantivy::core::collector::DisplayCollector;
|
use tantivy::core::collector::DisplayCollector;
|
||||||
@@ -34,14 +34,14 @@ fn test_parse_query() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
// #[test]
|
||||||
fn test_intersection() {
|
// fn test_intersection() {
|
||||||
let left = VecPostings::new(vec!(1, 3, 9));
|
// let left = VecPostings::new(vec!(1, 3, 9));
|
||||||
let right = VecPostings::new(vec!(3, 4, 9, 18));
|
// let right = VecPostings::new(vec!(3, 4, 9, 18));
|
||||||
let inter = intersection(&left, &right);
|
// let inter = intersection(&left, &right);
|
||||||
let vals: Vec<DocId> = inter.iter().collect();
|
// let vals: Vec<DocId> = inter.iter().collect();
|
||||||
assert_eq!(vals, vec!(3, 9));
|
// assert_eq!(vals, vec!(3, 9));
|
||||||
}
|
// }
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_tokenizer() {
|
fn test_tokenizer() {
|
||||||
|
|||||||
Reference in New Issue
Block a user