From bf59180337ffb86bbfef47bdc6084e3250266c54 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 23 Apr 2016 22:38:49 +0900 Subject: [PATCH] starting working on positions --- src/core/index.rs | 4 +- src/core/postings.rs | 256 ++++++++++++++++++++++++++++--------------- src/core/reader.rs | 1 - src/core/writer.rs | 6 +- 4 files changed, 174 insertions(+), 93 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index a3d7abe78..19c268fef 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -238,7 +238,7 @@ pub struct SegmentInfo { pub enum SegmentComponent { INFO, POSTINGS, - // POSITIONS, + POSITIONS, FASTFIELDS, TERMS, STORE, @@ -264,7 +264,7 @@ impl Segment { fn path_suffix(component: &SegmentComponent)-> &'static str { match *component { - // SegmentComponent::POSITIONS => ".pos", + SegmentComponent::POSITIONS => ".pos", SegmentComponent::INFO => ".info", SegmentComponent::POSTINGS => ".idx", SegmentComponent::TERMS => ".term", diff --git a/src/core/postings.rs b/src/core/postings.rs index 7401d5076..e11b68117 100644 --- a/src/core/postings.rs +++ b/src/core/postings.rs @@ -35,8 +35,85 @@ impl BinarySerializable for TermInfo { } +pub trait U32sRecorder { + fn new() -> Self; + fn record(&mut self, val: u32); +} + +pub struct VecRecorder(Vec); + +impl U32sRecorder for VecRecorder { + fn new() -> VecRecorder { + VecRecorder(Vec::new()) + } + fn record(&mut self, val: u32) { + self.0.push(val); + } +} + +pub struct ObliviousRecorder; + +impl U32sRecorder for ObliviousRecorder { + fn new() -> ObliviousRecorder { + ObliviousRecorder + } + fn record(&mut self, val: u32) { + } +} + +struct TermPostingsWriter { + doc_ids: Vec, + term_freqs: TermFreqsRec, + positions: PositionsRec, + current_freq: u32, +} + +impl TermPostingsWriter { + pub fn new() -> TermPostingsWriter { + TermPostingsWriter { + doc_ids: Vec::new(), + term_freqs: TermFreqsRec::new(), + positions: PositionsRec::new(), + current_freq: 0, + } + } + + fn close_doc(&mut self,) { + self.term_freqs.record(self.current_freq); + self.current_freq = 0; + } + + fn close(&mut self,) { + if self.current_freq > 0 { + self.close_doc(); + } + } + + fn is_new_doc(&self, doc: &DocId) -> bool { + match self.doc_ids.last() { + Some(&last_doc) => last_doc != *doc, + None => true, + } + } + + pub fn doc_freq(&self) -> u32 { + self.doc_ids.len() as u32 + } + + pub fn suscribe(&mut self, doc: DocId, pos: u32) { + if self.is_new_doc(&doc) { + // this is the first time we meet this term for this document + // first close the previous document, and write its doc_freq. + self.close_doc(); + self.doc_ids.push(doc); + } + self.current_freq += 1; + self.positions.record(pos); + } +} + pub struct PostingsWriter { - postings: Vec>, + postings: Vec>, term_index: BTreeMap, } @@ -49,14 +126,12 @@ impl PostingsWriter { } } - pub fn suscribe(&mut self, doc: DocId, term: Term) { - let doc_ids: &mut Vec = self.get_term_postings(term); - if doc_ids.len() == 0 || doc_ids[doc_ids.len() - 1] < doc { - doc_ids.push(doc); - } + pub fn suscribe(&mut self, doc: DocId, pos: u32, term: Term) { + let doc_ids: &mut TermPostingsWriter = self.get_term_postings(term); + doc_ids.suscribe(doc, pos); } - fn get_term_postings(&mut self, term: Term) -> &mut Vec { + fn get_term_postings(&mut self, term: Term) -> &mut TermPostingsWriter { match self.term_index.get(&term) { Some(unord_id) => { return &mut self.postings[*unord_id]; @@ -64,17 +139,17 @@ impl PostingsWriter { None => {} } let unord_id = self.term_index.len(); - self.postings.push(Vec::new()); + self.postings.push(TermPostingsWriter::new()); self.term_index.insert(term, unord_id.clone()); &mut self.postings[unord_id] } pub fn serialize(&self, serializer: &mut PostingsSerializer) -> io::Result<()> { for (term, postings_id) in self.term_index.iter() { - let doc_ids = &self.postings[postings_id.clone()]; - let term_docfreq = doc_ids.len() as u32; + let term_postings_writer = &self.postings[postings_id.clone()]; + let term_docfreq = term_postings_writer.doc_freq(); try!(serializer.new_term(&term, term_docfreq)); - try!(serializer.write_docs(&doc_ids)); + try!(serializer.write_docs(&term_postings_writer.doc_ids)); } Ok(()) } @@ -93,60 +168,62 @@ pub trait Postings: Iterator { fn skip_next(&mut self, target: DocId) -> Option; } -pub struct IntersectionPostings { - postings: Vec, -} - -impl IntersectionPostings { - pub fn from_postings(postings: Vec) -> IntersectionPostings { - IntersectionPostings { - postings: postings, - } - } -} - -impl Iterator for IntersectionPostings { - type Item = DocId; - fn next(&mut self,) -> Option { - let mut candidate; - match self.postings[0].next() { - Some(val) => { - candidate = val; - }, - None => { - return None; - } - } - 'outer: loop { - for i in 1..self.postings.len() { - let skip_result = self.postings[i].skip_next(candidate); - match skip_result { - None => { - return None; - }, - Some(x) if x == candidate => { - }, - Some(greater) => { - unsafe { - let pa: *mut T = &mut self.postings[i]; - let pb: *mut T = &mut self.postings[0]; - ptr::swap(pa, pb); - } - candidate = greater; - continue 'outer; - }, - } - } - return Some(candidate); - } - - } -} +// pub struct IntersectionPostings { +// postings: Vec, +// } +// +// impl IntersectionPostings { +// pub fn from_postings(postings: Vec) -> IntersectionPostings { +// IntersectionPostings { +// postings: postings, +// } +// } +// } +// +// impl Iterator for IntersectionPostings { +// type Item = DocId; +// fn next(&mut self,) -> Option { +// let mut candidate; +// match self.postings[0].next() { +// Some(val) => { +// candidate = val; +// }, +// None => { +// return None; +// } +// } +// 'outer: loop { +// for i in 1..self.postings.len() { +// let skip_result = self.postings[i].skip_next(candidate); +// match skip_result { +// None => { +// return None; +// }, +// Some(x) if x == candidate => { +// }, +// Some(greater) => { +// unsafe { +// let pa: *mut T = &mut self.postings[i]; +// let pb: *mut T = &mut self.postings[0]; +// ptr::swap(pa, pb); +// } +// candidate = greater; +// continue 'outer; +// }, +// } +// } +// return Some(candidate); +// } +// +// } +// } pub struct PostingsSerializer { terms_fst_builder: FstMapBuilder, // TODO find an alternative to work around the "move" postings_write: WritePtr, + positions_write: WritePtr, written_bytes_postings: usize, + written_bytes_positions: usize, encoder: simdcompression::Encoder, } @@ -156,10 +233,13 @@ impl PostingsSerializer { let terms_write = try!(segment.open_write(SegmentComponent::TERMS)); let terms_fst_builder = try!(FstMapBuilder::new(terms_write)); let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS)); + let positions_write = try!(segment.open_write(SegmentComponent::POSITIONS)); Ok(PostingsSerializer { terms_fst_builder: terms_fst_builder, postings_write: postings_write, + positions_write: positions_write, written_bytes_postings: 0, + written_bytes_positions: 0, encoder: simdcompression::Encoder::new(), }) } @@ -246,32 +326,32 @@ mod tests { } } } - - #[test] - fn test_intersection() { - { - let left = VecPostings::new(vec!(1, 3, 9)); - let right = VecPostings::new(vec!(3, 4, 9, 18)); - let inter = IntersectionPostings::from_postings(vec!(left, right)); - let vals: Vec = inter.collect(); - assert_eq!(vals, vec!(3, 9)); - } - { - let a = VecPostings::new(vec!(1, 3, 9)); - let b = VecPostings::new(vec!(3, 4, 9, 18)); - let c = VecPostings::new(vec!(1, 5, 9, 111)); - let inter = IntersectionPostings::from_postings(vec!(a, b, c)); - let vals: Vec = inter.collect(); - assert_eq!(vals, vec!(9)); - } - } - - #[bench] - fn bench_single_intersection(b: &mut Bencher) { - b.iter(|| { - let docs = VecPostings::new((0..1_000_000).collect()); - let intersection = IntersectionPostings::from_postings(vec!(docs)); - intersection.count() - }); - } + // + // #[test] + // fn test_intersection() { + // { + // let left = VecPostings::new(vec!(1, 3, 9)); + // let right = VecPostings::new(vec!(3, 4, 9, 18)); + // let inter = IntersectionPostings::from_postings(vec!(left, right)); + // let vals: Vec = inter.collect(); + // assert_eq!(vals, vec!(3, 9)); + // } + // { + // let a = VecPostings::new(vec!(1, 3, 9)); + // let b = VecPostings::new(vec!(3, 4, 9, 18)); + // let c = VecPostings::new(vec!(1, 5, 9, 111)); + // let inter = IntersectionPostings::from_postings(vec!(a, b, c)); + // let vals: Vec = inter.collect(); + // assert_eq!(vals, vec!(9)); + // } + // } + // + // #[bench] + // fn bench_single_intersection(b: &mut Bencher) { + // b.iter(|| { + // let docs = VecPostings::new((0..1_000_000).collect()); + // let intersection = IntersectionPostings::from_postings(vec!(docs)); + // intersection.count() + // }); + // } } diff --git a/src/core/reader.rs b/src/core/reader.rs index a9d7a15a5..57f1c405f 100644 --- a/src/core/reader.rs +++ b/src/core/reader.rs @@ -2,7 +2,6 @@ use core::index::{Segment, SegmentId}; use core::schema::Term; use core::store::StoreReader; use core::schema::Document; -use core::postings::IntersectionPostings; use core::directory::ReadOnlySource; use std::io::Cursor; use core::schema::DocId; diff --git a/src/core/writer.rs b/src/core/writer.rs index 446dabf65..6fd2333e6 100644 --- a/src/core/writer.rs +++ b/src/core/writer.rs @@ -159,11 +159,13 @@ impl SegmentWriter { let field_options = schema.text_field_options(&field_value.field); if field_options.is_tokenized_indexed() { let mut tokens = self.tokenizer.tokenize(&field_value.text); + let mut pos = 0u32; loop { match tokens.next() { Some(token) => { let term = Term::from_field_text(&field_value.field, token); - self.postings_writer.suscribe(doc_id, term); + self.postings_writer.suscribe(doc_id, pos.clone(), term); + pos += 1; }, None => { break; } } @@ -174,7 +176,7 @@ impl SegmentWriter { let field_options = schema.u32_field_options(&field_value.field); if field_options.is_indexed() { let term = Term::from_field_u32(&field_value.field, field_value.value); - self.postings_writer.suscribe(doc_id, term); + self.postings_writer.suscribe(doc_id, 0.clone(), term); } } self.fast_field_writers.add_document(&doc);