mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-18 01:00:40 +00:00
werwer
This commit is contained in:
@@ -1,16 +1,13 @@
|
||||
use core::DocId;
|
||||
use core::reader::SegmentReader;
|
||||
use core::directory::SegmentId;
|
||||
|
||||
use core::searcher::DocAddress;
|
||||
|
||||
pub trait Collector {
|
||||
fn set_segment(&mut self, segment: &SegmentReader);
|
||||
fn collect(&mut self, doc_id: DocId);
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct DocAddress(pub SegmentId, pub DocId);
|
||||
|
||||
pub struct TestCollector {
|
||||
docs: Vec<DocAddress>,
|
||||
current_segment: Option<SegmentId>,
|
||||
@@ -29,7 +26,6 @@ impl TestCollector {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl Collector for TestCollector {
|
||||
|
||||
fn set_segment(&mut self, segment: &SegmentReader) {
|
||||
|
||||
@@ -23,7 +23,7 @@ use atomicwrites;
|
||||
use tempdir::TempDir;
|
||||
use std::io::Read;
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
|
||||
pub struct SegmentId(pub String);
|
||||
|
||||
pub fn generate_segment_name() -> SegmentId {
|
||||
|
||||
@@ -2,6 +2,8 @@ use core::directory::Directory;
|
||||
use core::directory::{Segment, SegmentId};
|
||||
use std::collections::BinaryHeap;
|
||||
use core::schema::Term;
|
||||
use core::store::StoreReader;
|
||||
use core::schema::Document;
|
||||
use fst::Streamer;
|
||||
use fst;
|
||||
use std::io;
|
||||
@@ -25,9 +27,9 @@ pub struct SegmentReader {
|
||||
segment: Segment,
|
||||
term_offsets: fst::Map,
|
||||
postings_data: MmapReadOnly,
|
||||
store_reader: StoreReader,
|
||||
}
|
||||
|
||||
|
||||
pub struct SegmentPostings {
|
||||
doc_id: usize,
|
||||
doc_ids: Vec<u32>,
|
||||
@@ -95,10 +97,6 @@ impl Iterator for SegmentPostings {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
impl SegmentReader {
|
||||
|
||||
pub fn id(&self,) -> SegmentId {
|
||||
@@ -114,14 +112,19 @@ impl SegmentReader {
|
||||
return Err(Error::FSTFormat(format!("The file {:?} does not seem to be a valid term to offset transducer.", filepath)));
|
||||
}
|
||||
};
|
||||
let store_reader = StoreReader::new(try!(segment.mmap(SegmentComponent::STORE)));
|
||||
let postings_shared_mmap = try!(segment.mmap(SegmentComponent::POSTINGS));
|
||||
Ok(SegmentReader {
|
||||
postings_data: postings_shared_mmap,
|
||||
term_offsets: term_offsets,
|
||||
segment: segment,
|
||||
store_reader: store_reader,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get_doc(&self, doc_id: &DocId) -> Document {
|
||||
self.store_reader.get(doc_id)
|
||||
}
|
||||
|
||||
pub fn read_postings(&self, offset: usize) -> SegmentPostings {
|
||||
let postings_data = unsafe {&self.postings_data.as_slice()[offset..]};
|
||||
@@ -129,12 +132,9 @@ impl SegmentReader {
|
||||
}
|
||||
|
||||
pub fn get_term<'a>(&'a self, term: &Term) -> Option<SegmentPostings> {
|
||||
match self.term_offsets.get(term.as_slice()) {
|
||||
Some(offset) => {
|
||||
Some(self.read_postings(offset as usize))
|
||||
},
|
||||
None => None,
|
||||
}
|
||||
self.term_offsets
|
||||
.get(term.as_slice())
|
||||
.map(|offset| self.read_postings(offset as usize))
|
||||
}
|
||||
|
||||
pub fn search(&self, terms: &Vec<Term>) -> IntersectionPostings<SegmentPostings> {
|
||||
@@ -143,9 +143,11 @@ impl SegmentReader {
|
||||
for term in terms.iter() {
|
||||
match self.get_term(term) {
|
||||
Some(segment_posting) => {
|
||||
println!("term found {:?}", term);
|
||||
segment_postings.push(segment_posting);
|
||||
}
|
||||
None => {
|
||||
println!("not found {:?}", term);
|
||||
segment_postings.clear();
|
||||
segment_postings.push(SegmentPostings::empty());
|
||||
break;
|
||||
|
||||
@@ -1,35 +1,56 @@
|
||||
use core::reader::SegmentReader;
|
||||
use core::directory::Directory;
|
||||
use core::directory::SegmentId;
|
||||
use core::global::DocId;
|
||||
use core::schema::Document;
|
||||
use core::directory::Segment;
|
||||
use core::collector::Collector;
|
||||
use std::collections::HashMap;
|
||||
use core::schema::Term;
|
||||
use core::postings::Postings;
|
||||
use core::error::Result;
|
||||
|
||||
pub struct Searcher {
|
||||
segments: Vec<SegmentReader>,
|
||||
segments_idx: HashMap<SegmentId, usize>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct DocAddress(pub SegmentId, pub DocId);
|
||||
|
||||
impl Searcher {
|
||||
pub fn for_directory(directory: Directory) -> Searcher {
|
||||
let mut segment_readers: Vec<SegmentReader> = Vec::new();
|
||||
for segment in directory.segments().into_iter() {
|
||||
match SegmentReader::open(segment.clone()) {
|
||||
Ok(segment_reader) => {
|
||||
segment_readers.push(segment_reader);
|
||||
}
|
||||
Err(err) => {
|
||||
// TODO return err
|
||||
println!("Error while opening {:?}, {:?}", segment, err);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_doc(&self, doc_address: &DocAddress) -> Document {
|
||||
// TODO err
|
||||
let DocAddress(ref segment_id, ref doc_id) = *doc_address;
|
||||
let segment_ord = self.segments_idx.get(&segment_id).unwrap();
|
||||
let segment_reader = &self.segments[segment_ord.clone()];
|
||||
segment_reader.get_doc(doc_id)
|
||||
}
|
||||
|
||||
fn add_segment(&mut self, segment: Segment) -> Result<()> {
|
||||
SegmentReader::open(segment.clone())
|
||||
.map(|segment_reader| {
|
||||
let segment_ord = self.segments.len();
|
||||
self.segments.push(segment_reader);
|
||||
self.segments_idx.insert(segment.id(), segment_ord);
|
||||
})
|
||||
}
|
||||
|
||||
pub fn new() -> Searcher {
|
||||
Searcher {
|
||||
segments: segment_readers
|
||||
segments: Vec::new(),
|
||||
segments_idx: HashMap::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl Searcher {
|
||||
pub fn for_directory(directory: Directory) -> Searcher {
|
||||
let mut searcher = Searcher::new();
|
||||
for segment in directory.segments().into_iter() {
|
||||
searcher.add_segment(segment);
|
||||
}
|
||||
searcher
|
||||
}
|
||||
|
||||
pub fn search(&self, terms: &Vec<Term>, collector: &mut Collector) {
|
||||
for segment in &self.segments {
|
||||
|
||||
@@ -19,6 +19,8 @@ use std::io::Seek;
|
||||
use lz4;
|
||||
use tempfile;
|
||||
|
||||
// TODO cache uncompressed pages
|
||||
|
||||
const BLOCK_SIZE: usize = 262144;
|
||||
|
||||
pub struct StoreWriter {
|
||||
@@ -103,7 +105,7 @@ impl StoreWriter {
|
||||
}
|
||||
|
||||
|
||||
struct StoreReader {
|
||||
pub struct StoreReader {
|
||||
data: MmapReadOnly,
|
||||
offsets: Vec<OffsetIndex>,
|
||||
current_block: RefCell<Vec<u8>>,
|
||||
@@ -119,10 +121,11 @@ impl StoreReader {
|
||||
Vec::deserialize(&mut cursor).unwrap()
|
||||
}
|
||||
|
||||
fn block_offset(&self, doc_id: DocId) -> OffsetIndex {
|
||||
fn block_offset(&self, doc_id: &DocId) -> OffsetIndex {
|
||||
let mut offset = OffsetIndex(0, 0);
|
||||
for &OffsetIndex(first_doc_id, block_offset) in self.offsets.iter() {
|
||||
if first_doc_id > doc_id {
|
||||
println!("First doc id {}", first_doc_id);
|
||||
if first_doc_id > *doc_id {
|
||||
break;
|
||||
}
|
||||
else {
|
||||
@@ -143,12 +146,13 @@ impl StoreReader {
|
||||
lz4_decoder.read_to_end(&mut current_block_mut);
|
||||
}
|
||||
|
||||
pub fn get(&self, doc_id: DocId) -> Document {
|
||||
pub fn get(&self, doc_id: &DocId) -> Document {
|
||||
let OffsetIndex(first_doc_id, block_offset) = self.block_offset(doc_id);
|
||||
self.read_block(block_offset as usize);
|
||||
let mut current_block_mut = self.current_block.borrow_mut();
|
||||
let mut cursor = Cursor::new(&mut current_block_mut[..]);
|
||||
for _ in first_doc_id..doc_id {
|
||||
println!("{} / {}", first_doc_id, doc_id);
|
||||
for _ in first_doc_id..*doc_id {
|
||||
let block_length = u32::deserialize(&mut cursor).unwrap();
|
||||
cursor.seek(SeekFrom::Current(block_length as i64));
|
||||
}
|
||||
@@ -209,5 +213,7 @@ fn test_store() {
|
||||
let store_mmap = MmapReadOnly::open(&store_file).unwrap();
|
||||
let store = StoreReader::new(store_mmap);
|
||||
assert_eq!(offsets, store.offsets);
|
||||
assert_eq!(store.get(4093).get_one(&field_title).unwrap(), "Doc 4093");
|
||||
for i in 0..10000 {
|
||||
assert_eq!(*store.get(&i).get_one(&field_title).unwrap(), format!("Doc {}", i));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -155,6 +155,7 @@ impl SegmentWriter {
|
||||
loop {
|
||||
match tokens.next() {
|
||||
Some(token) => {
|
||||
// println!("TOKEN :{}:", token);
|
||||
let term = Term::from_field_text(&field_value.field, token);
|
||||
self.suscribe(doc_id, term);
|
||||
self.num_tokens += 1;
|
||||
|
||||
Reference in New Issue
Block a user