mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-15 15:50:41 +00:00
Positions almost working.
This commit is contained in:
@@ -24,7 +24,6 @@ use postings::SegmentPostingsOption;
|
||||
use postings::{SegmentPostings, BlockSegmentPostings};
|
||||
use fastfield::{FastFieldsReader, FastFieldReader, U64FastFieldReader};
|
||||
use schema::Schema;
|
||||
use postings::FreqHandler;
|
||||
|
||||
|
||||
|
||||
@@ -198,10 +197,10 @@ impl SegmentReader {
|
||||
/// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a
|
||||
/// `TextIndexingOptions` that does not index position will return a `SegmentPostings`
|
||||
/// with `DocId`s and frequencies.
|
||||
pub fn read_postings(&self,
|
||||
pub fn read_postings<'a>(&'a self,
|
||||
term: &Term,
|
||||
option: SegmentPostingsOption)
|
||||
-> Option<SegmentPostings> {
|
||||
-> Option<SegmentPostings<'a>> {
|
||||
let field = term.field();
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let term_info = get!(self.get_term_info(term));
|
||||
|
||||
@@ -12,7 +12,6 @@ mod term_info;
|
||||
mod vec_postings;
|
||||
mod segment_postings;
|
||||
mod intersection;
|
||||
mod freq_handler;
|
||||
mod docset;
|
||||
mod segment_postings_option;
|
||||
|
||||
@@ -28,7 +27,6 @@ pub use self::vec_postings::VecPostings;
|
||||
|
||||
pub use self::segment_postings::{SegmentPostings, BlockSegmentPostings};
|
||||
pub use self::intersection::IntersectionDocSet;
|
||||
pub use self::freq_handler::FreqHandler;
|
||||
pub use self::segment_postings_option::SegmentPostingsOption;
|
||||
pub use common::HasLen;
|
||||
|
||||
@@ -63,18 +61,18 @@ mod tests {
|
||||
let mut posting_serializer = PostingsSerializer::open(&mut segment).unwrap();
|
||||
posting_serializer.new_field(text_field);
|
||||
posting_serializer.new_term("abc".as_bytes()).unwrap();
|
||||
for doc_id in 0u32..3u32 {
|
||||
let positions = vec![1, 2, 3, 2];
|
||||
posting_serializer.write_doc(doc_id, 2, &positions).unwrap();
|
||||
for doc_id in 0u32..120u32 {
|
||||
let delta_positions = vec![1, 2, 3, 2];
|
||||
posting_serializer.write_doc(doc_id, 2, &delta_positions).unwrap();
|
||||
}
|
||||
posting_serializer.close_term().unwrap();
|
||||
posting_serializer.close().unwrap();
|
||||
let read = segment.open_read(SegmentComponent::POSITIONS).unwrap();
|
||||
assert!(read.len() <= 16);
|
||||
assert!(read.len() <= 140);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_position_and_fieldnorm() {
|
||||
pub fn test_position_and_fieldnorm1() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
@@ -144,6 +142,7 @@ mod tests {
|
||||
assert_eq!(postings_a.doc(), 0);
|
||||
assert_eq!(postings_a.term_freq(), 6);
|
||||
assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
|
||||
assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
|
||||
assert!(postings_a.advance());
|
||||
assert_eq!(postings_a.doc(), 1u32);
|
||||
assert_eq!(postings_a.term_freq(), 1);
|
||||
|
||||
@@ -4,9 +4,65 @@ use postings::{Postings, DocSet, HasLen, SkipResult};
|
||||
use std::cmp;
|
||||
use fst::Streamer;
|
||||
use fastfield::DeleteBitSet;
|
||||
|
||||
use std::cell::UnsafeCell;
|
||||
|
||||
const EMPTY_DATA: [u8; 0] = [0u8; 0];
|
||||
const EMPTY_POSITIONS: [u32; 0] = [0u32; 0];
|
||||
|
||||
struct PositionComputer<'a> {
|
||||
// store the amount of position int
|
||||
// before reading positions.
|
||||
//
|
||||
// if none, position are already loaded in
|
||||
// the positions vec.
|
||||
position_to_skip: Option<usize>,
|
||||
|
||||
delta_positions: Vec<u32>,
|
||||
positions: Vec<u32>,
|
||||
positions_stream: CompressedIntStream<'a>,
|
||||
}
|
||||
|
||||
impl<'a> PositionComputer<'a> {
|
||||
|
||||
pub fn new(positions_stream: CompressedIntStream<'a>) -> PositionComputer<'a> {
|
||||
PositionComputer {
|
||||
position_to_skip: None,
|
||||
positions: vec!(),
|
||||
delta_positions: vec!(),
|
||||
positions_stream: positions_stream,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_skip(&mut self, num_skip: usize) {
|
||||
self.position_to_skip = Some(
|
||||
self.position_to_skip
|
||||
.map(|prev_skip| prev_skip + num_skip)
|
||||
.unwrap_or(0)
|
||||
);
|
||||
}
|
||||
|
||||
pub fn positions(&mut self, term_freq: usize) -> &[u32] {
|
||||
self.delta_positions(term_freq);
|
||||
&self.positions[..term_freq]
|
||||
}
|
||||
|
||||
pub fn delta_positions(&mut self, term_freq: usize) -> &[u32] {
|
||||
if let Some(num_skip) = self.position_to_skip {
|
||||
self.delta_positions.resize(term_freq, 0u32);
|
||||
self.positions_stream.skip(num_skip);
|
||||
self.positions_stream.read(&mut self.delta_positions[..term_freq]);
|
||||
self.positions.resize(term_freq, 0u32);
|
||||
let mut cum = 0u32;
|
||||
for i in 0..term_freq as usize {
|
||||
cum += self.delta_positions[i];
|
||||
self.positions[i] = cum;
|
||||
}
|
||||
self.position_to_skip = None;
|
||||
}
|
||||
&self.delta_positions[..term_freq]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// `SegmentPostings` represents the inverted list or postings associated to
|
||||
@@ -18,9 +74,11 @@ pub struct SegmentPostings<'a> {
|
||||
block_cursor: BlockSegmentPostings<'a>,
|
||||
cur: usize,
|
||||
delete_bitset: DeleteBitSet,
|
||||
positions_stream: Option<CompressedIntStream<'a>>,
|
||||
|
||||
position_computer: Option<UnsafeCell<PositionComputer<'a>>>,
|
||||
}
|
||||
|
||||
|
||||
impl<'a> SegmentPostings<'a> {
|
||||
/// Reads a Segment postings from an &[u8]
|
||||
///
|
||||
@@ -30,24 +88,27 @@ impl<'a> SegmentPostings<'a> {
|
||||
/// frequencies and/or positions
|
||||
pub fn from_block_postings(segment_block_postings: BlockSegmentPostings<'a>,
|
||||
delete_bitset: DeleteBitSet,
|
||||
positions_stream: Option<CompressedIntStream<'a>>)
|
||||
positions_stream_opt: Option<CompressedIntStream<'a>>)
|
||||
-> SegmentPostings<'a> {
|
||||
let position_computer = positions_stream_opt.map(|stream| {
|
||||
UnsafeCell::new(PositionComputer::new(stream))
|
||||
});
|
||||
SegmentPostings {
|
||||
block_cursor: segment_block_postings,
|
||||
cur: NUM_DOCS_PER_BLOCK, // cursor within the block
|
||||
delete_bitset: delete_bitset,
|
||||
positions_stream: positions_stream
|
||||
position_computer: position_computer,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an empty segment postings object
|
||||
pub fn empty() -> SegmentPostings<'static> {
|
||||
pub fn empty() -> SegmentPostings<'a> {
|
||||
let empty_block_cursor = BlockSegmentPostings::empty();
|
||||
SegmentPostings {
|
||||
block_cursor: empty_block_cursor,
|
||||
delete_bitset: DeleteBitSet::empty(),
|
||||
cur: NUM_DOCS_PER_BLOCK,
|
||||
positions_stream: None,
|
||||
position_computer: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -58,7 +119,9 @@ impl<'a> DocSet for SegmentPostings<'a> {
|
||||
// next needs to be called a first time to point to the correct element.
|
||||
#[inline]
|
||||
fn advance(&mut self) -> bool {
|
||||
let mut pos_to_skip = 0u32;
|
||||
loop {
|
||||
pos_to_skip += self.term_freq();
|
||||
self.cur += 1;
|
||||
if self.cur >= self.block_cursor.block_len() {
|
||||
self.cur = 0;
|
||||
@@ -68,6 +131,11 @@ impl<'a> DocSet for SegmentPostings<'a> {
|
||||
}
|
||||
}
|
||||
if !self.delete_bitset.is_deleted(self.doc()) {
|
||||
if let Some(ref mut position_computer) = self.position_computer.as_mut() {
|
||||
unsafe {
|
||||
(*position_computer.get()).add_skip(pos_to_skip as usize);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -181,11 +249,26 @@ impl<'a> Postings for SegmentPostings<'a> {
|
||||
}
|
||||
|
||||
fn positions(&self) -> &[u32] {
|
||||
unimplemented!();
|
||||
let term_freq = self.term_freq();
|
||||
let position_computer_ptr: *mut PositionComputer = self.position_computer
|
||||
.as_ref()
|
||||
.expect("Segment reader does not have positions.")
|
||||
.get();
|
||||
unsafe {
|
||||
(&mut *position_computer_ptr).positions(term_freq as usize)
|
||||
}
|
||||
}
|
||||
|
||||
fn delta_positions(&self) -> &[u32] {
|
||||
unimplemented!();
|
||||
let term_freq = self.term_freq();
|
||||
self.position_computer
|
||||
.as_ref()
|
||||
.map(|position_computer| {
|
||||
unsafe {
|
||||
(&mut *position_computer.get()).delta_positions(term_freq as usize)
|
||||
}
|
||||
})
|
||||
.unwrap_or(&EMPTY_POSITIONS[..])
|
||||
}
|
||||
|
||||
}
|
||||
@@ -333,7 +416,7 @@ impl<'a> BlockSegmentPostings<'a> {
|
||||
num_vint_docs: 0,
|
||||
|
||||
doc_decoder: BlockDecoder::new(),
|
||||
freq_decoder: BlockDecoder::new(),
|
||||
freq_decoder: BlockDecoder::with_val(1),
|
||||
has_freq: false,
|
||||
|
||||
remaining_data: &EMPTY_DATA,
|
||||
|
||||
@@ -27,13 +27,13 @@ impl TermWeight {
|
||||
1.0 + (self.num_docs as f32 / (self.doc_freq as f32 + 1.0)).ln()
|
||||
}
|
||||
|
||||
pub fn specialized_scorer<'a>(&'a self,
|
||||
pub fn specialized_scorer<'a>(&self,
|
||||
reader: &'a SegmentReader)
|
||||
-> Result<TermScorer<SegmentPostings<'a>>> {
|
||||
let field = self.term.field();
|
||||
let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field);
|
||||
Ok(reader
|
||||
.read_postings(&self.term, self.segment_postings_options)
|
||||
let postings: Option<SegmentPostings<'a>> = reader.read_postings(&self.term, self.segment_postings_options);
|
||||
Ok(postings
|
||||
.map(|segment_postings| {
|
||||
TermScorer {
|
||||
idf: self.idf(),
|
||||
|
||||
Reference in New Issue
Block a user