mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-30 23:20:40 +00:00
fixed reading docfreq
This commit is contained in:
@@ -12,16 +12,13 @@ use datastruct::FstMap;
|
||||
use std::fmt;
|
||||
use rustc_serialize::json;
|
||||
use core::index::SegmentInfo;
|
||||
use common::OpenTimer;
|
||||
use schema::Field;
|
||||
use core::convert_to_ioerror;
|
||||
use postings::SegmentPostings;
|
||||
use postings::Postings;
|
||||
use fastfield::{U32FastFieldsReader, U32FastFieldReader};
|
||||
use postings::intersection;
|
||||
use schema::FieldEntry;
|
||||
use schema::Schema;
|
||||
use schema::FieldValue;
|
||||
use postings::FreqHandler;
|
||||
|
||||
pub struct SegmentReader {
|
||||
|
||||
@@ -172,7 +172,7 @@ impl SegmentWriter {
|
||||
match tokens.next() {
|
||||
Some(token) => {
|
||||
let term = Term::from_field_text(field, token);
|
||||
self.postings_writer.suscribe(doc_id, pos.clone(), term);
|
||||
self.postings_writer.suscribe(doc_id, pos, term);
|
||||
pos += 1;
|
||||
},
|
||||
None => { break; }
|
||||
|
||||
29
src/lib.rs
29
src/lib.rs
@@ -71,6 +71,7 @@ mod tests {
|
||||
use super::*;
|
||||
use collector::TestCollector;
|
||||
use query::MultiTermQuery;
|
||||
use postings::Postings;
|
||||
|
||||
#[test]
|
||||
fn test_indexing() {
|
||||
@@ -106,6 +107,32 @@ mod tests {
|
||||
}
|
||||
|
||||
|
||||
/// Indexes a single document whose text is "af af af bc bc" into an in-RAM
/// index, then reads the postings of the term "af" from the first segment
/// and checks that there is exactly one posting: doc 0, with `freq()` == 3.
///
/// NOTE(review): the test is named "docfreq", but the value asserted is
/// `postings.freq()`; 3 matches the number of "af" tokens in the document,
/// so this is presumably the per-document term frequency — confirm.
#[test]
|
||||
fn test_docfreq() {
|
||||
let mut schema = schema::Schema::new();
|
||||
let text_field = schema.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1).unwrap();
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.add_text(text_field, "af af af bc bc");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
index_writer.wait().unwrap();
|
||||
}
|
||||
{
|
||||
// reading the segment back; only the first segment is inspected
|
||||
let searcher = index.searcher().unwrap();
|
||||
let reader = &searcher.segments()[0];
|
||||
let mut postings = reader.read_postings(&Term::from_field_text(text_field, "af")).unwrap();
|
||||
assert!(postings.next());
|
||||
assert_eq!(postings.doc(), 0);
|
||||
assert_eq!(postings.freq(), 3);
|
||||
// only one document was added, so the posting list must end here
|
||||
assert!(!postings.next());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_searcher() {
|
||||
let mut schema = schema::Schema::new();
|
||||
@@ -117,7 +144,7 @@ mod tests {
|
||||
let mut index_writer = index.writer_with_num_threads(1).unwrap();
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.add_text(text_field, "af b");
|
||||
doc.add_text(text_field, "af af af b");
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
{
|
||||
|
||||
@@ -6,6 +6,8 @@ pub enum FreqHandler {
|
||||
NoFreq,
|
||||
}
|
||||
|
||||
const EMPTY: [u32; 0] = [];
|
||||
|
||||
impl FreqHandler {
|
||||
|
||||
pub fn new_freq_reader() -> FreqHandler {
|
||||
@@ -34,39 +36,15 @@ impl FreqHandler {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
//
|
||||
//
|
||||
// pub struct FreqReader {
|
||||
// block_decoder: SIMDBlockDecoder,
|
||||
// }
|
||||
//
|
||||
// impl FreqReader {
|
||||
// fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
|
||||
// self.block_decoder.uncompress_block_unsorted(data)
|
||||
// }
|
||||
//
|
||||
// fn term_freq_block(&self, doc: DocId) -> u32 {
|
||||
// self.block_decoder.output()[doc as usize]
|
||||
// }
|
||||
//
|
||||
// fn read_freq_vint(&mut self, data: &[u8], num_els: usize) {
|
||||
// self.block_decoder.uncompress_vint_unsorted(data, num_els)
|
||||
// }
|
||||
// }
|
||||
//
|
||||
//
|
||||
// pub struct NoFreqReader;
|
||||
//
|
||||
// impl FreqHandler for NoFreqReader {
|
||||
// fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
|
||||
// data
|
||||
// }
|
||||
//
|
||||
// fn term_freq_block(&self, _doc: DocId) -> u32 {
|
||||
// 0
|
||||
// }
|
||||
//
|
||||
// fn read_freq_vint(&mut self, _data: &[u8], _num_els: usize) {
|
||||
// }
|
||||
// }
|
||||
/// Returns the slice currently exposed by this handler.
///
/// For `FreqHandler::FreqReader` this is whatever the wrapped block
/// decoder's `output()` returns; for `FreqHandler::NoFreq` it is the
/// zero-length `EMPTY` array, so the returned slice has no elements.
pub fn output(&self,)-> &[u32] {
|
||||
match *self {
|
||||
FreqHandler::FreqReader(ref block_decoder) => {
|
||||
block_decoder.output()
|
||||
}
|
||||
FreqHandler::NoFreq => {
|
||||
// no frequencies available: hand back an empty slice
|
||||
&EMPTY
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -55,6 +55,10 @@ impl<'a> SegmentPostings<'a> {
|
||||
cur: Wrapping(usize::max_value()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the element of `self.freq_handler.output()` at the current
/// cursor position `self.cur.0`.
///
/// # Panics
///
/// Panics if `self.cur.0` is not a valid index into that slice.
/// NOTE(review): the constructor shown just above sets `cur` to
/// `Wrapping(usize::max_value())`, so this presumably must only be called
/// once the cursor has been advanced; and if `freq_handler` is a
/// `FreqHandler::NoFreq` the slice is presumably empty, making any call
/// panic — confirm both against the callers.
pub fn freq(&self,) -> u32 {
|
||||
self.freq_handler.output()[self.cur.0]
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Postings for SegmentPostings<'a> {
|
||||
@@ -99,4 +103,6 @@ impl<'a> Postings for SegmentPostings<'a> {
|
||||
/// Returns the value stored in this object's `doc_freq` field.
fn doc_freq(&self,) -> usize {
|
||||
self.doc_freq
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@@ -12,7 +12,6 @@ use core::index::Segment;
|
||||
use std::io;
|
||||
use core::index::SegmentComponent;
|
||||
use common::BinarySerializable;
|
||||
use common::VInt;
|
||||
|
||||
|
||||
pub struct PostingsSerializer {
|
||||
@@ -93,7 +92,6 @@ impl PostingsSerializer {
|
||||
if self.text_indexing_options.is_termfreq_enabled() {
|
||||
{
|
||||
let block_encoded = self.block_encoder.compress_vint_unsorted(&self.term_freqs[..]);
|
||||
self.written_bytes_postings += try!(VInt(block_encoded.len() as u64).serialize(&mut self.postings_write));
|
||||
for num in block_encoded {
|
||||
self.written_bytes_postings += try!(num.serialize(&mut self.postings_write));
|
||||
}
|
||||
|
||||
@@ -3,8 +3,8 @@ use std::collections::BTreeMap;
|
||||
use schema::Term;
|
||||
use postings::PostingsSerializer;
|
||||
use std::io;
|
||||
pub use postings::Recorder;
|
||||
pub use postings::NothingRecorder;
|
||||
use postings::Recorder;
|
||||
use postings::TermFrequencyRecorder;
|
||||
|
||||
|
||||
|
||||
@@ -25,24 +25,22 @@ impl<Rec: Recorder> TermPostingsWriter<Rec> {
|
||||
self.recorder.close_doc();
|
||||
}
|
||||
|
||||
/// Returns `true` when `doc` differs from the last id stored in
/// `self.doc_ids`, or when `self.doc_ids` is still empty.
fn is_new_doc(&self, doc: &DocId) -> bool {
|
||||
match self.doc_ids.last() {
|
||||
Some(&last_doc) => last_doc != *doc,
|
||||
// nothing recorded yet: any document counts as new
|
||||
None => true,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the number of entries in `self.doc_ids`, cast to `u32`.
///
/// The `as u32` cast truncates silently if the length exceeds `u32::MAX`.
pub fn doc_freq(&self) -> u32 {
|
||||
self.doc_ids.len() as u32
|
||||
}
|
||||
|
||||
/// Records one occurrence of the term at position `pos` in document `doc`.
///
/// Makes sure `doc` is the last entry of `self.doc_ids` (calling
/// `close_doc()` before pushing when the document id changes) and then
/// forwards `pos` to `self.recorder.record_position`.
///
/// NOTE(review): this listing is a diff rendered without +/- markers. The
/// `if self.is_new_doc(..)` block and the `match` below look like the
/// before/after forms of the same step rather than code meant to run back
/// to back — confirm against the repository. The two forms differ when
/// `doc_ids` is empty: the `if` form calls `close_doc()` before the first
/// push, the `match` form only pushes.
pub fn suscribe(&mut self, doc: DocId, pos: u32) {
|
||||
if self.is_new_doc(&doc) {
|
||||
// this is the first time we meet this term for this document
|
||||
// first close the previous document, and write its doc_freq.
|
||||
self.close_doc();
|
||||
self.doc_ids.push(doc);
|
||||
}
|
||||
match self.doc_ids.last() {
|
||||
Some(&last_doc) => {
|
||||
// a different document id: close the previous one, then register the new id
|
||||
if last_doc != doc {
|
||||
self.close_doc();
|
||||
self.doc_ids.push(doc);
|
||||
}
|
||||
},
|
||||
None => {
|
||||
// very first document for this term: nothing to close yet
|
||||
self.doc_ids.push(doc)
|
||||
},
|
||||
}
|
||||
self.recorder.record_position(pos);
|
||||
}
|
||||
|
||||
@@ -55,9 +53,10 @@ impl<Rec: Recorder> TermPostingsWriter<Rec> {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO use something faster than the TermFrequencyRecorder when possible.
|
||||
|
||||
/// Collection of per-term postings writers.
///
/// `postings` holds one `TermPostingsWriter` per term, and `term_index`
/// maps a `Term` to the index of its writer inside `postings`.
///
/// NOTE(review): this listing is a diff rendered without +/- markers; the
/// two `postings` lines are presumably the old (`NothingRecorder`) and new
/// (`TermFrequencyRecorder`) declarations of the same field — confirm.
pub struct PostingsWriter {
|
||||
postings: Vec<TermPostingsWriter<NothingRecorder>>,
|
||||
postings: Vec<TermPostingsWriter<TermFrequencyRecorder>>,
|
||||
term_index: BTreeMap<Term, usize>,
|
||||
}
|
||||
|
||||
@@ -77,11 +76,11 @@ impl PostingsWriter {
|
||||
}
|
||||
|
||||
/// Records one occurrence of `term` at position `pos` in document `doc`.
///
/// Obtains the per-term writer from `get_term_postings(term)` and calls
/// its `suscribe(doc, pos)`.
///
/// NOTE(review): this listing is a diff rendered without +/- markers; the
/// two `let doc_ids` lines are presumably the old (`NothingRecorder`) and
/// new (`TermFrequencyRecorder`) versions of the same statement — confirm.
pub fn suscribe(&mut self, doc: DocId, pos: u32, term: Term) {
|
||||
let doc_ids: &mut TermPostingsWriter<NothingRecorder> = self.get_term_postings(term);
|
||||
let doc_ids: &mut TermPostingsWriter<TermFrequencyRecorder> = self.get_term_postings(term);
|
||||
doc_ids.suscribe(doc, pos);
|
||||
}
|
||||
|
||||
fn get_term_postings(&mut self, term: Term) -> &mut TermPostingsWriter<NothingRecorder> {
|
||||
fn get_term_postings(&mut self, term: Term) -> &mut TermPostingsWriter<TermFrequencyRecorder> {
|
||||
match self.term_index.get(&term) {
|
||||
Some(unord_id) => {
|
||||
return &mut self.postings[*unord_id];
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use rustc_serialize::Encodable;
|
||||
use rustc_serialize::Decoder;
|
||||
use rustc_serialize::Encoder;
|
||||
use schema::TextOptions;
|
||||
|
||||
Reference in New Issue
Block a user