commit 4152de6c0d
parent 5d4c3ba065
Author: Paul Masurel
Date:   2016-02-24 08:57:39 +09:00

6 changed files with 63 additions and 38 deletions

View File

@@ -10,6 +10,7 @@ use core::schema::Term;
 use core::schema::DocId;
 use core::store::StoreWriter;
 use std::fs::File;
+use core::serialize::BinarySerializable;
 use fst;
 use core::simdcompression;
 use std::convert::From;
@@ -49,18 +50,16 @@ impl SegmentSerializer<()> for SimpleSegmentSerializer {
         self.term_fst_builder.insert(term.as_slice(), self.written_bytes_postings as u64);
         self.cur_term_num_docs = doc_freq;
         // writing the size of the posting list
-        try!(self.postings_write.write_u32::<BigEndian>(doc_freq));
-        self.written_bytes_postings += 4;
+        self.written_bytes_postings += try!((doc_freq as u32).serialize(&mut self.postings_write));
         Ok(())
     }

     fn write_docs(&mut self, doc_ids: &[DocId]) -> Result<(), IOError> {
         // TODO write_all transmuted [u8]
         let docs_data = self.encoder.encode(doc_ids);
-        try!(self.postings_write.write_u32::<BigEndian>(docs_data.len() as u32));
-        self.written_bytes_postings += 4;
+        self.written_bytes_postings += try!((docs_data.len() as u32).serialize(&mut self.postings_write));
         for num in docs_data {
-            try!(self.postings_write.write_u32::<BigEndian>(num.clone() as u32));
+            self.written_bytes_postings += try!(num.serialize(&mut self.postings_write));
         }
         Ok(())
     }
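
The `BinarySerializable` trait that these call sites migrate to is not itself part of this diff. A minimal sketch of what it plausibly looks like, inferred from usage (`serialize` must return the number of bytes written, since its result is accumulated into `written_bytes_postings`); names and signatures are a hypothetical reconstruction in the crate's 2016-era Rust style:

    use std::io;
    use std::io::{Read, Write};
    use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};

    // Inferred trait: serialize returns the byte count so that callers
    // can track their offset into the postings file.
    pub trait BinarySerializable: Sized {
        fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<usize>;
        fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self>;
    }

    impl BinarySerializable for u32 {
        fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<usize> {
            try!(writer.write_u32::<BigEndian>(*self));
            Ok(4) // a u32 always occupies four bytes on the wire
        }
        fn deserialize<R: Read>(reader: &mut R) -> io::Result<u32> {
            reader.read_u32::<BigEndian>()
        }
    }

One effect of the refactoring is that the byte-order choice (`BigEndian`) lives in one place instead of at every read and write site.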

View File

@@ -6,7 +6,6 @@ pub mod analyzer;
 pub mod serial;
 pub mod reader;
 pub mod codec;
-pub mod error;
 pub mod searcher;
 pub mod collector;
 pub mod skip;

View File

@@ -5,6 +5,7 @@ use core::schema::Document;
 use fst;
 use core::postings::IntersectionPostings;
 use byteorder::{BigEndian, ReadBytesExt};
+use core::serialize::BinarySerializable;
 use std::io::Cursor;
 use core::schema::DocId;
 use core::directory::SegmentComponent;
@@ -39,14 +40,14 @@ impl SegmentPostings {
     pub fn from_data(data: &[u8]) -> SegmentPostings {
         let mut cursor = Cursor::new(data);
-        let doc_freq = cursor.read_u32::<BigEndian>().unwrap() as usize;
+        let doc_freq: u32 = u32::deserialize(&mut cursor).unwrap();
         let data_size = cursor.read_u32::<BigEndian>().unwrap() as usize;
         // TODO remove allocs
         let mut data = Vec::with_capacity(data_size);
         for _ in 0..data_size {
             data.push(cursor.read_u32::<BigEndian>().unwrap());
         }
-        let mut doc_ids: Vec<u32> = (0..doc_freq as u32 ).collect();
+        let mut doc_ids: Vec<u32> = (0..doc_freq as u32).collect();
         let decoder = Decoder::new();
         decoder.decode(&data, &mut doc_ids);
         SegmentPostings {
@@ -135,7 +136,6 @@ impl SegmentReader {
                 segment_postings.push(segment_posting);
             }
             None => {
-                println!("not found {:?}", term);
                 segment_postings.clear();
                 segment_postings.push(SegmentPostings::empty());
                 break;
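
Note that in the `from_data` hunk above only `doc_freq` switches to `u32::deserialize`; `data_size` and the compressed block are still read through `byteorder` directly. For illustration, a hypothetical helper (not part of this commit) that performs the whole decode purely through the trait, assuming the same `Cursor`, `Decoder`, and `BinarySerializable` imports as the surrounding module:

    // Hypothetical: read the posting header (doc_freq, data_size) and the
    // compressed u32 words via BinarySerializable, then decompress.
    fn decode_doc_ids(data: &[u8]) -> Vec<u32> {
        let mut cursor = Cursor::new(data);
        let doc_freq = u32::deserialize(&mut cursor).unwrap();
        let data_size = u32::deserialize(&mut cursor).unwrap() as usize;
        let mut compressed = Vec::with_capacity(data_size);
        for _ in 0..data_size {
            compressed.push(u32::deserialize(&mut cursor).unwrap());
        }
        let mut doc_ids: Vec<u32> = (0..doc_freq).collect();
        let decoder = Decoder::new();
        decoder.decode(&compressed, &mut doc_ids);
        doc_ids
    }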

View File

@@ -70,13 +70,15 @@ impl Decoder {
 #[test]
 fn test_encode_big() {
     let mut encoder = Encoder::new();
-    let input: Vec<u32> = (0..10000).into_iter().collect();
-    let data = encoder.encode(&input);
-    assert_eq!(data.len(), 962);
+    let num_ints = 10000 as usize;
+    let expected_length = 1274;
+    let input: Vec<u32> = (0..num_ints as u32)
+        .map(|i| i * 7 / 2)
+        .into_iter().collect();
+    let encoded_data = encoder.encode(&input);
+    assert_eq!(encoded_data.len(), expected_length);
     let decoder = Decoder::new();
-    let mut data_output: Vec<u32> = (0..10000).collect();
-    assert_eq!(10000, decoder.decode(&data[0..962], &mut data_output));
-    for i in 0..10000 {
-        assert_eq!(data_output[i], input[i]) ;
-    }
+    let mut decoded_data: Vec<u32> = (0..num_ints as u32).collect();
+    assert_eq!(num_ints, decoder.decode(&encoded_data[..], &mut decoded_data));
+    assert_eq!(decoded_data, input);
 }
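
The `simdcompression` internals are not shown in this diff, but the `Encoder`/`Decoder` surface can be inferred from this test and from `write_docs` above. A hypothetical signature sketch, with `unimplemented!()` standing in for the actual codec:

    // Inferred from usage: encode() returns the compressed stream as u32
    // words (each later serialized as 4 bytes); decode() fills `output`
    // and returns the number of integers restored.
    pub struct Encoder;
    pub struct Decoder;

    impl Encoder {
        pub fn new() -> Encoder { Encoder }
        pub fn encode(&mut self, _input: &[u32]) -> Vec<u32> {
            unimplemented!()
        }
    }

    impl Decoder {
        pub fn new() -> Decoder { Decoder }
        pub fn decode(&self, _compressed: &[u32], _output: &mut Vec<u32>) -> usize {
            unimplemented!()
        }
    }

The new test input `i * 7 / 2` is strictly increasing, consistent with a codec for sorted doc id lists; with the larger gaps between values, the expected compressed length grows from 962 to 1274.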

View File

@@ -4,7 +4,6 @@ use std::cell::RefCell;
 use core::schema::DocId;
 use core::schema::Document;
 use core::schema::FieldValue;
-use core::error;
 use core::serialize::BinarySerializable;
 use std::io::Write;
 use std::io::Read;
@@ -193,7 +192,7 @@ mod tests {
         let lorem = String::from("Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.");
         {
             let mut store_writer = StoreWriter::new(store_file.reopen().unwrap());
-            for i in 0..10000 {
+            for i in 0..1000 {
                 let mut fields: Vec<FieldValue> = Vec::new();
                 {
                     let field_value = FieldValue {
@@ -219,7 +218,7 @@
         let store_mmap = MmapReadOnly::open(&store_file).unwrap();
         let store = StoreReader::new(store_mmap);
         assert_eq!(offsets, store.offsets);
-        for i in 0..1000 {
+        for i in (0..10).map(|i| i * 3 / 2) {
             assert_eq!(*store.get(&i).get_one(&field_title).unwrap(), format!("Doc {}", i));
         }
     }

View File

@@ -12,36 +12,35 @@ use regex::Regex;
-pub struct TestCollector {
-    docs: Vec<DocAddress>,
-    current_segment: Option<SegmentId>,
+// only make sense for a single segment
+struct TestCollector {
+    docs: Vec<DocId>,
 }

 impl TestCollector {

     pub fn new() -> TestCollector {
         TestCollector {
             docs: Vec::new(),
-            current_segment: None,
         }
     }

-    pub fn docs(self,) -> Vec<DocAddress> {
+    pub fn docs(self,) -> Vec<DocId> {
         self.docs
     }
 }

 impl Collector for TestCollector {

-    fn set_segment(&mut self, segment: &SegmentReader) {
-        self.current_segment = Some(segment.id());
-    }
+    fn set_segment(&mut self, segment: &SegmentReader) {}

     fn collect(&mut self, doc_id: DocId) {
-        self.docs.push(DocAddress(self.current_segment.clone().unwrap(), doc_id));
+        self.docs.push(doc_id);
     }
 }

 #[test]
 fn test_indexing() {
     let mut schema = Schema::new();
@@ -111,16 +110,43 @@ fn test_searcher() {
         let segment = commit_result.unwrap();
     }
     {
         let searcher = Searcher::for_directory(directory);
-        let terms = vec!(Term::from_field_text(&text_field, "b"), Term::from_field_text(&text_field, "a"), );
-        let mut collector = TestCollector::new();
-        searcher.search(&terms, &mut collector);
-        let vals: Vec<DocId> = collector
-            .docs()
-            .iter()
-            .map(|doc| doc.1)
-            .collect::<Vec<DocId>>();
-        assert_eq!(vals, [1, 2]);
+        let get_doc_ids = |terms: Vec<Term>| {
+            let mut collector = TestCollector::new();
+            searcher.search(&terms, &mut collector);
+            collector.docs()
+        };
+        {
+            assert_eq!(
+                get_doc_ids(vec!(Term::from_field_text(&text_field, "a"))),
+                vec!(1, 2));
+        }
+        {
+            assert_eq!(
+                get_doc_ids(vec!(Term::from_field_text(&text_field, "af"))),
+                vec!(0));
+        }
+        {
+            assert_eq!(
+                get_doc_ids(vec!(Term::from_field_text(&text_field, "b"))),
+                vec!(0, 1, 2));
+        }
+        {
+            assert_eq!(
+                get_doc_ids(vec!(Term::from_field_text(&text_field, "c"))),
+                vec!(1, 2));
+        }
+        {
+            assert_eq!(
+                get_doc_ids(vec!(Term::from_field_text(&text_field, "d"))),
+                vec!(2));
+        }
+        {
+            assert_eq!(
+                get_doc_ids(vec!(Term::from_field_text(&text_field, "b"), Term::from_field_text(&text_field, "a"), )),
+                vec!(1, 2));
+        }
     }
 }
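
The `Collector` trait that `TestCollector` implements is not shown in this diff; judging from the two methods implemented above, its shape is presumably close to the following (hypothetical reconstruction):

    // Inferred from the impl above: a collector is notified when the
    // searcher enters a new segment, then called once per matching doc.
    pub trait Collector {
        fn set_segment(&mut self, segment: &SegmentReader);
        fn collect(&mut self, doc_id: DocId);
    }

With this commit, `TestCollector` records bare `DocId`s rather than `DocAddress(SegmentId, DocId)` pairs, which, as its new comment notes, only makes sense while the test index contains a single segment.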