fixed reading docfreq

This commit is contained in:
Paul Masurel
2016-06-21 10:07:12 +09:00
parent 36684d76c5
commit 0c230c9ebf
8 changed files with 66 additions and 62 deletions

View File

@@ -12,16 +12,13 @@ use datastruct::FstMap;
use std::fmt;
use rustc_serialize::json;
use core::index::SegmentInfo;
use common::OpenTimer;
use schema::Field;
use core::convert_to_ioerror;
use postings::SegmentPostings;
use postings::Postings;
use fastfield::{U32FastFieldsReader, U32FastFieldReader};
use postings::intersection;
use schema::FieldEntry;
use schema::Schema;
use schema::FieldValue;
use postings::FreqHandler;
pub struct SegmentReader {

View File

@@ -172,7 +172,7 @@ impl SegmentWriter {
match tokens.next() {
Some(token) => {
let term = Term::from_field_text(field, token);
self.postings_writer.suscribe(doc_id, pos.clone(), term);
self.postings_writer.suscribe(doc_id, pos, term);
pos += 1;
},
None => { break; }

View File

@@ -71,6 +71,7 @@ mod tests {
use super::*;
use collector::TestCollector;
use query::MultiTermQuery;
use postings::Postings;
#[test]
fn test_indexing() {
@@ -106,6 +107,32 @@ mod tests {
}
#[test]
/// Indexes a single document containing "af" three times, then checks that
/// the postings read back for "af" hold exactly one entry: document 0 with
/// a frequency of 3.
fn test_docfreq() {
    let mut schema = schema::Schema::new();
    let text_field = schema.add_text_field("text", schema::TEXT);
    let index = Index::create_in_ram(schema);
    {
        // writing the segment; the writer does not outlive this scope
        let mut index_writer = index.writer_with_num_threads(1).unwrap();
        let mut single_doc = Document::new();
        single_doc.add_text(text_field, "af af af bc bc");
        index_writer.add_document(single_doc).unwrap();
        index_writer.wait().unwrap();
    }
    let searcher = index.searcher().unwrap();
    let segment_reader = &searcher.segments()[0];
    let term = Term::from_field_text(text_field, "af");
    let mut postings = segment_reader.read_postings(&term).unwrap();
    // one posting is expected, and nothing after it
    assert!(postings.next());
    assert_eq!(postings.doc(), 0);
    assert_eq!(postings.freq(), 3);
    assert!(!postings.next());
}
#[test]
fn test_searcher() {
let mut schema = schema::Schema::new();
@@ -117,7 +144,7 @@ mod tests {
let mut index_writer = index.writer_with_num_threads(1).unwrap();
{
let mut doc = Document::new();
doc.add_text(text_field, "af b");
doc.add_text(text_field, "af af af b");
index_writer.add_document(doc).unwrap();
}
{

View File

@@ -6,6 +6,8 @@ pub enum FreqHandler {
NoFreq,
}
const EMPTY: [u32; 0] = [];
impl FreqHandler {
pub fn new_freq_reader() -> FreqHandler {
@@ -34,39 +36,15 @@ impl FreqHandler {
}
}
//
//
// pub struct FreqReader {
// block_decoder: SIMDBlockDecoder,
// }
//
// impl FreqReader {
// fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
// self.block_decoder.uncompress_block_unsorted(data)
// }
//
// fn term_freq_block(&self, doc: DocId) -> u32 {
// self.block_decoder.output()[doc as usize]
// }
//
// fn read_freq_vint(&mut self, data: &[u8], num_els: usize) {
// self.block_decoder.uncompress_vint_unsorted(data, num_els)
// }
// }
//
//
// pub struct NoFreqReader;
//
// impl FreqHandler for NoFreqReader {
// fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
// data
// }
//
// fn term_freq_block(&self, _doc: DocId) -> u32 {
// 0
// }
//
// fn read_freq_vint(&mut self, _data: &[u8], _num_els: usize) {
// }
// }
/// Returns the block decoder's current output when this handler is a
/// `FreqReader`, or an empty slice when it is `NoFreq`.
pub fn output(&self) -> &[u32] {
    match *self {
        FreqHandler::FreqReader(ref decoder) => decoder.output(),
        FreqHandler::NoFreq => &EMPTY,
    }
}
}

View File

@@ -55,6 +55,10 @@ impl<'a> SegmentPostings<'a> {
cur: Wrapping(usize::max_value()),
}
}
/// Returns the entry of the frequency handler's output located at the
/// current cursor index (`self.cur`).
///
/// # Panics
///
/// Panics if the cursor index is out of bounds for the handler's output,
/// which is always the case when that output is an empty slice.
/// NOTE(review): `cur` appears to start at `usize::max_value()`, so this
/// presumably must not be called before the cursor is advanced - confirm.
pub fn freq(&self) -> u32 {
    let freqs = self.freq_handler.output();
    freqs[self.cur.0]
}
}
impl<'a> Postings for SegmentPostings<'a> {
@@ -99,4 +103,6 @@ impl<'a> Postings for SegmentPostings<'a> {
/// Returns the `doc_freq` value stored on this postings object.
fn doc_freq(&self) -> usize {
    self.doc_freq
}
}

View File

@@ -12,7 +12,6 @@ use core::index::Segment;
use std::io;
use core::index::SegmentComponent;
use common::BinarySerializable;
use common::VInt;
pub struct PostingsSerializer {
@@ -93,7 +92,6 @@ impl PostingsSerializer {
if self.text_indexing_options.is_termfreq_enabled() {
{
let block_encoded = self.block_encoder.compress_vint_unsorted(&self.term_freqs[..]);
self.written_bytes_postings += try!(VInt(block_encoded.len() as u64).serialize(&mut self.postings_write));
for num in block_encoded {
self.written_bytes_postings += try!(num.serialize(&mut self.postings_write));
}

View File

@@ -3,8 +3,8 @@ use std::collections::BTreeMap;
use schema::Term;
use postings::PostingsSerializer;
use std::io;
pub use postings::Recorder;
pub use postings::NothingRecorder;
use postings::Recorder;
use postings::TermFrequencyRecorder;
@@ -25,24 +25,22 @@ impl<Rec: Recorder> TermPostingsWriter<Rec> {
self.recorder.close_doc();
}
/// Tells whether `doc` differs from the last element of `doc_ids`.
///
/// Returns `true` when `doc_ids` is still empty.
fn is_new_doc(&self, doc: &DocId) -> bool {
    self.doc_ids
        .last()
        .map_or(true, |&previous| previous != *doc)
}
/// Returns the number of entries in `doc_ids`, cast to `u32`.
///
/// NOTE(review): the `as` cast would truncate a count that does not fit in
/// a `u32`; presumably the number of doc ids never gets that large - confirm.
pub fn doc_freq(&self) -> u32 {
    let num_docs = self.doc_ids.len();
    num_docs as u32
}
pub fn suscribe(&mut self, doc: DocId, pos: u32) {
if self.is_new_doc(&doc) {
// this is the first time we meet this term for this document
// first close the previous document, and write its doc_freq.
self.close_doc();
self.doc_ids.push(doc);
}
match self.doc_ids.last() {
Some(&last_doc) => {
if last_doc != doc {
self.close_doc();
self.doc_ids.push(doc);
}
},
None => {
self.doc_ids.push(doc)
},
}
self.recorder.record_position(pos);
}
@@ -55,9 +53,10 @@ impl<Rec: Recorder> TermPostingsWriter<Rec> {
}
}
// TODO use something faster than the TermFrequencyRecorder when possible.
pub struct PostingsWriter {
postings: Vec<TermPostingsWriter<NothingRecorder>>,
postings: Vec<TermPostingsWriter<TermFrequencyRecorder>>,
term_index: BTreeMap<Term, usize>,
}
@@ -77,11 +76,11 @@ impl PostingsWriter {
}
pub fn suscribe(&mut self, doc: DocId, pos: u32, term: Term) {
let doc_ids: &mut TermPostingsWriter<NothingRecorder> = self.get_term_postings(term);
let doc_ids: &mut TermPostingsWriter<TermFrequencyRecorder> = self.get_term_postings(term);
doc_ids.suscribe(doc, pos);
}
fn get_term_postings(&mut self, term: Term) -> &mut TermPostingsWriter<NothingRecorder> {
fn get_term_postings(&mut self, term: Term) -> &mut TermPostingsWriter<TermFrequencyRecorder> {
match self.term_index.get(&term) {
Some(unord_id) => {
return &mut self.postings[*unord_id];

View File

@@ -1,4 +1,3 @@
use rustc_serialize::Encodable;
use rustc_serialize::Decoder;
use rustc_serialize::Encoder;
use schema::TextOptions;