Positions almost working.

This commit is contained in:
Paul Masurel
2017-08-05 23:17:35 +09:00
parent 63b35dd87b
commit 236fa74767
4 changed files with 103 additions and 22 deletions

View File

@@ -24,7 +24,6 @@ use postings::SegmentPostingsOption;
use postings::{SegmentPostings, BlockSegmentPostings};
use fastfield::{FastFieldsReader, FastFieldReader, U64FastFieldReader};
use schema::Schema;
use postings::FreqHandler;
@@ -198,10 +197,10 @@ impl SegmentReader {
/// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a
/// `TextIndexingOptions` that does not index position will return a `SegmentPostings`
/// with `DocId`s and frequencies.
pub fn read_postings(&self,
pub fn read_postings<'a>(&'a self,
term: &Term,
option: SegmentPostingsOption)
-> Option<SegmentPostings> {
-> Option<SegmentPostings<'a>> {
let field = term.field();
let field_entry = self.schema.get_field_entry(field);
let term_info = get!(self.get_term_info(term));

View File

@@ -12,7 +12,6 @@ mod term_info;
mod vec_postings;
mod segment_postings;
mod intersection;
mod freq_handler;
mod docset;
mod segment_postings_option;
@@ -28,7 +27,6 @@ pub use self::vec_postings::VecPostings;
pub use self::segment_postings::{SegmentPostings, BlockSegmentPostings};
pub use self::intersection::IntersectionDocSet;
pub use self::freq_handler::FreqHandler;
pub use self::segment_postings_option::SegmentPostingsOption;
pub use common::HasLen;
@@ -63,18 +61,18 @@ mod tests {
let mut posting_serializer = PostingsSerializer::open(&mut segment).unwrap();
posting_serializer.new_field(text_field);
posting_serializer.new_term("abc".as_bytes()).unwrap();
for doc_id in 0u32..3u32 {
let positions = vec![1, 2, 3, 2];
posting_serializer.write_doc(doc_id, 2, &positions).unwrap();
for doc_id in 0u32..120u32 {
let delta_positions = vec![1, 2, 3, 2];
posting_serializer.write_doc(doc_id, 2, &delta_positions).unwrap();
}
posting_serializer.close_term().unwrap();
posting_serializer.close().unwrap();
let read = segment.open_read(SegmentComponent::POSITIONS).unwrap();
assert!(read.len() <= 16);
assert!(read.len() <= 140);
}
#[test]
pub fn test_position_and_fieldnorm() {
pub fn test_position_and_fieldnorm1() {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
@@ -144,6 +142,7 @@ mod tests {
assert_eq!(postings_a.doc(), 0);
assert_eq!(postings_a.term_freq(), 6);
assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
assert!(postings_a.advance());
assert_eq!(postings_a.doc(), 1u32);
assert_eq!(postings_a.term_freq(), 1);

View File

@@ -4,9 +4,65 @@ use postings::{Postings, DocSet, HasLen, SkipResult};
use std::cmp;
use fst::Streamer;
use fastfield::DeleteBitSet;
use std::cell::UnsafeCell;
const EMPTY_DATA: [u8; 0] = [0u8; 0];
const EMPTY_POSITIONS: [u32; 0] = [0u32; 0];
/// Lazily decodes the term positions of the "current" document from a
/// compressed integer stream shared by the whole posting list.
struct PositionComputer<'a> {
// Number of position integers to skip in `positions_stream`
// before the current document's positions start.
//
// `None` means the current document's positions have already been
// decoded into `delta_positions` / `positions`, so no skip is pending.
position_to_skip: Option<usize>,
// Gap-encoded (delta) positions of the current document, as read
// from the stream.
delta_positions: Vec<u32>,
// Absolute positions: running prefix sums of `delta_positions`.
positions: Vec<u32>,
// Underlying stream of compressed position deltas for all docs.
positions_stream: CompressedIntStream<'a>,
}
impl<'a> PositionComputer<'a> {
/// Wraps `positions_stream`. Nothing is decoded until
/// `positions()` / `delta_positions()` is first called.
pub fn new(positions_stream: CompressedIntStream<'a>) -> PositionComputer<'a> {
PositionComputer {
position_to_skip: None,
positions: vec!(),
delta_positions: vec!(),
positions_stream: positions_stream,
}
}
/// Records that `num_skip` more position integers (belonging to
/// docs that were advanced over) must be skipped in the stream
/// before decoding the next document's positions.
//
// NOTE(review): when `position_to_skip` is `None`, `.unwrap_or(0)`
// discards `num_skip` entirely instead of starting a new pending
// skip. That is only correct if every int counted in `num_skip`
// was already consumed from the stream by a previous read; ints of
// deleted docs skipped in the same advance would be lost.
// TODO confirm against `SegmentPostings::advance` — commit message
// says "Positions almost working", so this may be the known gap.
pub fn add_skip(&mut self, num_skip: usize) {
self.position_to_skip = Some(
self.position_to_skip
.map(|prev_skip| prev_skip + num_skip)
.unwrap_or(0)
);
}
/// Returns the absolute positions of the current document
/// (`term_freq` entries). Triggers decoding on first call for the
/// doc via `delta_positions`.
pub fn positions(&mut self, term_freq: usize) -> &[u32] {
self.delta_positions(term_freq);
// NOTE(review): if `delta_positions` took the cached (`None`)
// path, `self.positions` was filled by an *earlier* call; this
// slice panics if that call used a smaller `term_freq`.
&self.positions[..term_freq]
}
/// Returns the gap-encoded positions of the current document
/// (`term_freq` entries).
///
/// On the first call for a doc (pending skip is `Some`): skips the
/// pending ints, reads `term_freq` deltas from the stream, and
/// materializes absolute positions as prefix sums. Later calls
/// return the cached buffers without touching the stream.
pub fn delta_positions(&mut self, term_freq: usize) -> &[u32] {
if let Some(num_skip) = self.position_to_skip {
self.delta_positions.resize(term_freq, 0u32);
self.positions_stream.skip(num_skip);
self.positions_stream.read(&mut self.delta_positions[..term_freq]);
self.positions.resize(term_freq, 0u32);
// Positions are stored as gaps; integrate to absolute values.
let mut cum = 0u32;
for i in 0..term_freq as usize {
cum += self.delta_positions[i];
self.positions[i] = cum;
}
// Mark the current doc's positions as decoded/consumed.
self.position_to_skip = None;
}
&self.delta_positions[..term_freq]
}
}
/// `SegmentPostings` represents the inverted list or postings associated to
@@ -18,9 +74,11 @@ pub struct SegmentPostings<'a> {
block_cursor: BlockSegmentPostings<'a>,
cur: usize,
delete_bitset: DeleteBitSet,
positions_stream: Option<CompressedIntStream<'a>>,
position_computer: Option<UnsafeCell<PositionComputer<'a>>>,
}
impl<'a> SegmentPostings<'a> {
/// Reads a Segment postings from an &[u8]
///
@@ -30,24 +88,27 @@ impl<'a> SegmentPostings<'a> {
/// frequencies and/or positions
pub fn from_block_postings(segment_block_postings: BlockSegmentPostings<'a>,
delete_bitset: DeleteBitSet,
positions_stream: Option<CompressedIntStream<'a>>)
positions_stream_opt: Option<CompressedIntStream<'a>>)
-> SegmentPostings<'a> {
let position_computer = positions_stream_opt.map(|stream| {
UnsafeCell::new(PositionComputer::new(stream))
});
SegmentPostings {
block_cursor: segment_block_postings,
cur: NUM_DOCS_PER_BLOCK, // cursor within the block
delete_bitset: delete_bitset,
positions_stream: positions_stream
position_computer: position_computer,
}
}
/// Returns an empty segment postings object
pub fn empty() -> SegmentPostings<'static> {
pub fn empty() -> SegmentPostings<'a> {
let empty_block_cursor = BlockSegmentPostings::empty();
SegmentPostings {
block_cursor: empty_block_cursor,
delete_bitset: DeleteBitSet::empty(),
cur: NUM_DOCS_PER_BLOCK,
positions_stream: None,
position_computer: None,
}
}
}
@@ -58,7 +119,9 @@ impl<'a> DocSet for SegmentPostings<'a> {
// next needs to be called a first time to point to the correct element.
#[inline]
fn advance(&mut self) -> bool {
let mut pos_to_skip = 0u32;
loop {
pos_to_skip += self.term_freq();
self.cur += 1;
if self.cur >= self.block_cursor.block_len() {
self.cur = 0;
@@ -68,6 +131,11 @@ impl<'a> DocSet for SegmentPostings<'a> {
}
}
if !self.delete_bitset.is_deleted(self.doc()) {
if let Some(ref mut position_computer) = self.position_computer.as_mut() {
unsafe {
(*position_computer.get()).add_skip(pos_to_skip as usize);
}
}
return true;
}
}
@@ -181,11 +249,26 @@ impl<'a> Postings for SegmentPostings<'a> {
}
fn positions(&self) -> &[u32] {
unimplemented!();
let term_freq = self.term_freq();
let position_computer_ptr: *mut PositionComputer = self.position_computer
.as_ref()
.expect("Segment reader does not have positions.")
.get();
unsafe {
(&mut *position_computer_ptr).positions(term_freq as usize)
}
}
fn delta_positions(&self) -> &[u32] {
unimplemented!();
let term_freq = self.term_freq();
self.position_computer
.as_ref()
.map(|position_computer| {
unsafe {
(&mut *position_computer.get()).delta_positions(term_freq as usize)
}
})
.unwrap_or(&EMPTY_POSITIONS[..])
}
}
@@ -333,7 +416,7 @@ impl<'a> BlockSegmentPostings<'a> {
num_vint_docs: 0,
doc_decoder: BlockDecoder::new(),
freq_decoder: BlockDecoder::new(),
freq_decoder: BlockDecoder::with_val(1),
has_freq: false,
remaining_data: &EMPTY_DATA,

View File

@@ -27,13 +27,13 @@ impl TermWeight {
1.0 + (self.num_docs as f32 / (self.doc_freq as f32 + 1.0)).ln()
}
pub fn specialized_scorer<'a>(&'a self,
pub fn specialized_scorer<'a>(&self,
reader: &'a SegmentReader)
-> Result<TermScorer<SegmentPostings<'a>>> {
let field = self.term.field();
let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field);
Ok(reader
.read_postings(&self.term, self.segment_postings_options)
let postings: Option<SegmentPostings<'a>> = reader.read_postings(&self.term, self.segment_postings_options);
Ok(postings
.map(|segment_postings| {
TermScorer {
idf: self.idf(),