From 0c230c9ebf2c8c76754a281a6edbd5af102f48ee Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 21 Jun 2016 10:07:12 +0900 Subject: [PATCH] fixed reading docfreq --- src/core/segment_reader.rs | 3 -- src/core/writer.rs | 2 +- src/lib.rs | 29 +++++++++++++++++- src/postings/freq_handler.rs | 50 +++++++++----------------------- src/postings/segment_postings.rs | 6 ++++ src/postings/serializer.rs | 2 -- src/postings/writer.rs | 35 +++++++++++----------- src/schema/field_entry.rs | 1 - 8 files changed, 66 insertions(+), 62 deletions(-) diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index f0a95ba1b..3cd2a3a6b 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -12,16 +12,13 @@ use datastruct::FstMap; use std::fmt; use rustc_serialize::json; use core::index::SegmentInfo; -use common::OpenTimer; use schema::Field; use core::convert_to_ioerror; use postings::SegmentPostings; use postings::Postings; use fastfield::{U32FastFieldsReader, U32FastFieldReader}; -use postings::intersection; use schema::FieldEntry; use schema::Schema; -use schema::FieldValue; use postings::FreqHandler; pub struct SegmentReader { diff --git a/src/core/writer.rs b/src/core/writer.rs index db600654b..5a4df8f7e 100644 --- a/src/core/writer.rs +++ b/src/core/writer.rs @@ -172,7 +172,7 @@ impl SegmentWriter { match tokens.next() { Some(token) => { let term = Term::from_field_text(field, token); - self.postings_writer.suscribe(doc_id, pos.clone(), term); + self.postings_writer.suscribe(doc_id, pos, term); pos += 1; }, None => { break; } diff --git a/src/lib.rs b/src/lib.rs index f7bae468b..a54361765 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -71,6 +71,7 @@ mod tests { use super::*; use collector::TestCollector; use query::MultiTermQuery; + use postings::Postings; #[test] fn test_indexing() { @@ -106,6 +107,32 @@ mod tests { } + #[test] + fn test_docfreq() { + let mut schema = schema::Schema::new(); + let text_field = schema.add_text_field("text", schema::TEXT); + let index = Index::create_in_ram(schema); + { + // writing the segment + let mut index_writer = index.writer_with_num_threads(1).unwrap(); + { + let mut doc = Document::new(); + doc.add_text(text_field, "af af af bc bc"); + index_writer.add_document(doc).unwrap(); + } + index_writer.wait().unwrap(); + } + { + let searcher = index.searcher().unwrap(); + let reader = &searcher.segments()[0]; + let mut postings = reader.read_postings(&Term::from_field_text(text_field, "af")).unwrap(); + assert!(postings.next()); + assert_eq!(postings.doc(), 0); + assert_eq!(postings.freq(), 3); + assert!(!postings.next()); + } + } + #[test] fn test_searcher() { let mut schema = schema::Schema::new(); @@ -117,7 +144,7 @@ mod tests { let mut index_writer = index.writer_with_num_threads(1).unwrap(); { let mut doc = Document::new(); - doc.add_text(text_field, "af b"); + doc.add_text(text_field, "af af af b"); index_writer.add_document(doc).unwrap(); } { diff --git a/src/postings/freq_handler.rs b/src/postings/freq_handler.rs index 8cec21154..4cf804900 100644 --- a/src/postings/freq_handler.rs +++ b/src/postings/freq_handler.rs @@ -6,6 +6,8 @@ pub enum FreqHandler { NoFreq, } +const EMPTY: [u32; 0] = []; + impl FreqHandler { pub fn new_freq_reader() -> FreqHandler { @@ -34,39 +36,15 @@ impl FreqHandler { } -} -// -// -// pub struct FreqReader { -// block_decoder: SIMDBlockDecoder, -// } -// -// impl FreqReader { -// fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] { -// self.block_decoder.uncompress_block_unsorted(data) -// } -// -// fn term_freq_block(&self, doc: DocId) -> u32 { -// self.block_decoder.output()[doc as usize] -// } -// -// fn read_freq_vint(&mut self, data: &[u8], num_els: usize) { -// self.block_decoder.uncompress_vint_unsorted(data, num_els) -// } -// } -// -// -// pub struct NoFreqReader; -// -// impl FreqHandler for NoFreqReader { -// fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] { -// data -// } -// -// fn term_freq_block(&self, _doc: DocId) -> u32 { -// 0 -// } -// -// fn read_freq_vint(&mut self, _data: &[u8], _num_els: usize) { -// } -// } + pub fn output(&self,)-> &[u32] { + match *self { + FreqHandler::FreqReader(ref block_decoder) => { + block_decoder.output() + } + FreqHandler::NoFreq => { + &EMPTY + } + } + } + +} \ No newline at end of file diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index d49c573af..37e399e90 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -55,6 +55,10 @@ impl<'a> SegmentPostings<'a> { cur: Wrapping(usize::max_value()), } } + + pub fn freq(&self,) -> u32 { + self.freq_handler.output()[self.cur.0] + } } impl<'a> Postings for SegmentPostings<'a> { @@ -99,4 +103,6 @@ impl<'a> Postings for SegmentPostings<'a> { fn doc_freq(&self,) -> usize { self.doc_freq } + + } diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index ed14311ab..5cec69252 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -12,7 +12,6 @@ use core::index::Segment; use std::io; use core::index::SegmentComponent; use common::BinarySerializable; -use common::VInt; pub struct PostingsSerializer { @@ -93,7 +92,6 @@ impl PostingsSerializer { if self.text_indexing_options.is_termfreq_enabled() { { let block_encoded = self.block_encoder.compress_vint_unsorted(&self.term_freqs[..]); - self.written_bytes_postings += try!(VInt(block_encoded.len() as u64).serialize(&mut self.postings_write)); for num in block_encoded { self.written_bytes_postings += try!(num.serialize(&mut self.postings_write)); } diff --git a/src/postings/writer.rs b/src/postings/writer.rs index 1cb7701a2..e73ddfb12 100644 --- a/src/postings/writer.rs +++ b/src/postings/writer.rs @@ -3,8 +3,8 @@ use std::collections::BTreeMap; use schema::Term; use postings::PostingsSerializer; use std::io; -pub use postings::Recorder; -pub use postings::NothingRecorder; +use postings::Recorder; +use postings::TermFrequencyRecorder; @@ -25,24 +25,22 @@ impl TermPostingsWriter { self.recorder.close_doc(); } - fn is_new_doc(&self, doc: &DocId) -> bool { - match self.doc_ids.last() { - Some(&last_doc) => last_doc != *doc, - None => true, - } - } - pub fn doc_freq(&self) -> u32 { self.doc_ids.len() as u32 } pub fn suscribe(&mut self, doc: DocId, pos: u32) { - if self.is_new_doc(&doc) { - // this is the first time we meet this term for this document - // first close the previous document, and write its doc_freq. - self.close_doc(); - self.doc_ids.push(doc); - } + match self.doc_ids.last() { + Some(&last_doc) => { + if last_doc != doc { + self.close_doc(); + self.doc_ids.push(doc); + } + }, + None => { + self.doc_ids.push(doc) + }, + } self.recorder.record_position(pos); } @@ -55,9 +53,10 @@ impl TermPostingsWriter { } } +// TODO use something faster than the TermFrequencyRecorder when possible. pub struct PostingsWriter { - postings: Vec>, + postings: Vec>, term_index: BTreeMap, } @@ -77,11 +76,11 @@ impl PostingsWriter { } pub fn suscribe(&mut self, doc: DocId, pos: u32, term: Term) { - let doc_ids: &mut TermPostingsWriter = self.get_term_postings(term); + let doc_ids: &mut TermPostingsWriter = self.get_term_postings(term); doc_ids.suscribe(doc, pos); } - fn get_term_postings(&mut self, term: Term) -> &mut TermPostingsWriter { + fn get_term_postings(&mut self, term: Term) -> &mut TermPostingsWriter { match self.term_index.get(&term) { Some(unord_id) => { return &mut self.postings[*unord_id]; diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index 86bdb7e5b..4a03e34ab 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -1,4 +1,3 @@ -use rustc_serialize::Encodable; use rustc_serialize::Decoder; use rustc_serialize::Encoder; use schema::TextOptions;