From d002ee2aafdde6754c5bfcdbbf9ce0fa5e71b501 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 20 Jan 2016 01:05:24 +0900 Subject: [PATCH] blip --- src/core/global.rs | 2 +- src/core/postings.rs | 2 +- src/core/reader.rs | 49 ++++++++++++++++++++++++++++++++++++-------- src/core/serial.rs | 2 +- src/core/writer.rs | 18 +++++++--------- 5 files changed, 50 insertions(+), 23 deletions(-) diff --git a/src/core/global.rs b/src/core/global.rs index 84bdc0f57..987812fe5 100644 --- a/src/core/global.rs +++ b/src/core/global.rs @@ -1,7 +1,7 @@ use std::io::{BufWriter, Write}; use std::io; -pub type DocId = usize; +pub type DocId = u32; pub type FieldId = u8; #[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)] diff --git a/src/core/postings.rs b/src/core/postings.rs index 72cb134cd..b44799ae7 100644 --- a/src/core/postings.rs +++ b/src/core/postings.rs @@ -57,7 +57,7 @@ impl Iterator for SimplePostingsIterator { Ok(num_bytes) => { if num_bytes == 8 { unsafe { - let val = *(*buf.as_ptr() as *const usize); + let val = *(*buf.as_ptr() as *const u32); return Some(val) } } diff --git a/src/core/reader.rs b/src/core/reader.rs index c1f6e9dee..6aa13ac64 100644 --- a/src/core/reader.rs +++ b/src/core/reader.rs @@ -4,7 +4,11 @@ use core::schema::Term; use core::directory::SharedMmapMemory; use fst::Streamer; use fst; +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use std::borrow::Borrow; +use std::io::Cursor; +use core::global::DocId; +use core::serial::DocCursor; pub struct SegmentIndexReader { segment: Segment, @@ -12,9 +16,7 @@ pub struct SegmentIndexReader { postings_data: SharedMmapMemory, } - impl SegmentIndexReader { - fn term_cursor<'a>(&'a self) -> SegmentTermCur<'a> { SegmentTermCur { segment: &self.segment, @@ -22,13 +24,37 @@ impl SegmentIndexReader { postings_data: self.postings_data.borrow(), } } - } - pub struct SegmentDocCursor<'a> { - postings_data: &'a [u8], - offset: usize, + postings_data: Cursor<&'a [u8]>, + num_docs: DocId, + current_doc: DocId, +} + +impl<'a> Iterator for SegmentDocCursor<'a> { + type Item = DocId; + + fn next(&mut self) -> Option { + if self.num_docs == 0 { + None + } + else { + self.num_docs -= 1; + self.current_doc = self.postings_data.read_u32::().unwrap(); + Some(self.current_doc) + } + } +} + +impl<'a> DocCursor for SegmentDocCursor<'a> { + fn doc(&self) -> DocId{ + self.current_doc + } + + fn len(&self) -> DocId { + self.num_docs + } } struct SegmentTermCur<'a> { @@ -41,11 +67,16 @@ impl<'a> SegmentTermCur<'a> { fn next(&mut self,) -> Option<(Term, SegmentDocCursor<'a>)> { match self.fst_streamer.next() { - Some((k, offset)) => { + Some((k, offset_u64)) => { let term = Term::from(k); + let offset = offset_u64 as usize; + let data = &self.postings_data[offset..]; + let mut cursor = Cursor::new(data); + let num_docs = cursor.read_u32::().unwrap(); let doc_cursor = SegmentDocCursor { - postings_data: self.postings_data, - offset: offset as usize, + postings_data: cursor, + num_docs: num_docs, + current_doc: 0, }; Some((term, doc_cursor)) }, diff --git a/src/core/serial.rs b/src/core/serial.rs index f40387768..fc4d7d4af 100644 --- a/src/core/serial.rs +++ b/src/core/serial.rs @@ -9,7 +9,7 @@ pub trait SerializableSegment<'a> { pub trait DocCursor: Iterator { fn doc(&self) -> DocId; - fn len(&self) -> usize; + fn len(&self) -> DocId; } // TODO make iteration over Fields somehow sorted diff --git a/src/core/writer.rs b/src/core/writer.rs index 40a8282ff..4ecff9352 100644 --- a/src/core/writer.rs +++ b/src/core/writer.rs @@ -70,7 +70,7 @@ impl FieldWriter { } pub struct IndexWriter { - max_doc: usize, + max_doc: DocId, term_writers: HashMap, directory: Directory, } @@ -169,11 +169,9 @@ impl<'a> CIWTermCursor<'a> { } fn get_term(&self) -> Term { - Term::from_field_text(self.field.clone(), self.current_form_postings.as_ref().unwrap().form) - // Term { - // field: self.field.clone(), - // text: self.current_form_postings.as_ref().unwrap().form, - // } + let field = self.field.clone(); + let value = self.current_form_postings.as_ref().unwrap().form; + Term::from_field_text(field, value) } fn doc_cursor(&self,) -> CIWDocCursor<'a> { @@ -181,7 +179,7 @@ impl<'a> CIWTermCursor<'a> { .as_ref() .unwrap() .postings; - let num_docs = postings.doc_ids.len(); + let num_docs = postings.doc_ids.len() as DocId; CIWDocCursor { num_docs: num_docs, docs_it: postings @@ -191,7 +189,6 @@ impl<'a> CIWTermCursor<'a> { } } - fn next_form(&mut self,) -> bool { match self.form_it.next() { Some(form_postings) => { @@ -220,7 +217,6 @@ impl<'a> CIWTermCursor<'a> { } } - impl<'a> TermCursor for CIWTermCursor<'a> { type DocCur = CIWDocCursor<'a>; @@ -263,7 +259,7 @@ impl<'a> SerializableSegment<'a> for IndexWriter { pub struct CIWDocCursor<'a> { docs_it: slice::Iter<'a, DocId>, current: Option, - num_docs: usize, + num_docs: DocId, } impl<'a> Iterator for CIWDocCursor<'a> { @@ -281,7 +277,7 @@ impl<'a> DocCursor for CIWDocCursor<'a> { self.current.unwrap() } - fn len(&self) -> usize { + fn len(&self) -> DocId { self.num_docs } }