This commit is contained in:
Paul Masurel
2016-02-24 08:57:39 +09:00
parent 5d4c3ba065
commit 4152de6c0d
6 changed files with 63 additions and 38 deletions

View File

@@ -10,6 +10,7 @@ use core::schema::Term;
use core::schema::DocId; use core::schema::DocId;
use core::store::StoreWriter; use core::store::StoreWriter;
use std::fs::File; use std::fs::File;
use core::serialize::BinarySerializable;
use fst; use fst;
use core::simdcompression; use core::simdcompression;
use std::convert::From; use std::convert::From;
@@ -49,18 +50,16 @@ impl SegmentSerializer<()> for SimpleSegmentSerializer {
self.term_fst_builder.insert(term.as_slice(), self.written_bytes_postings as u64); self.term_fst_builder.insert(term.as_slice(), self.written_bytes_postings as u64);
self.cur_term_num_docs = doc_freq; self.cur_term_num_docs = doc_freq;
// writing the size of the posting list // writing the size of the posting list
try!(self.postings_write.write_u32::<BigEndian>(doc_freq)); self.written_bytes_postings += try!((doc_freq as u32).serialize(&mut self.postings_write));
self.written_bytes_postings += 4;
Ok(()) Ok(())
} }
fn write_docs(&mut self, doc_ids: &[DocId]) -> Result<(), IOError> { fn write_docs(&mut self, doc_ids: &[DocId]) -> Result<(), IOError> {
// TODO write_all transmuted [u8] // TODO write_all transmuted [u8]
let docs_data = self.encoder.encode(doc_ids); let docs_data = self.encoder.encode(doc_ids);
try!(self.postings_write.write_u32::<BigEndian>(docs_data.len() as u32)); self.written_bytes_postings += try!((docs_data.len() as u32).serialize(&mut self.postings_write));
self.written_bytes_postings += 4;
for num in docs_data { for num in docs_data {
try!(self.postings_write.write_u32::<BigEndian>(num.clone() as u32)); self.written_bytes_postings += try!(num.serialize(&mut self.postings_write));
} }
Ok(()) Ok(())
} }

View File

@@ -6,7 +6,6 @@ pub mod analyzer;
pub mod serial; pub mod serial;
pub mod reader; pub mod reader;
pub mod codec; pub mod codec;
pub mod error;
pub mod searcher; pub mod searcher;
pub mod collector; pub mod collector;
pub mod skip; pub mod skip;

View File

@@ -5,6 +5,7 @@ use core::schema::Document;
use fst; use fst;
use core::postings::IntersectionPostings; use core::postings::IntersectionPostings;
use byteorder::{BigEndian, ReadBytesExt}; use byteorder::{BigEndian, ReadBytesExt};
use core::serialize::BinarySerializable;
use std::io::Cursor; use std::io::Cursor;
use core::schema::DocId; use core::schema::DocId;
use core::directory::SegmentComponent; use core::directory::SegmentComponent;
@@ -39,14 +40,14 @@ impl SegmentPostings {
pub fn from_data(data: &[u8]) -> SegmentPostings { pub fn from_data(data: &[u8]) -> SegmentPostings {
let mut cursor = Cursor::new(data); let mut cursor = Cursor::new(data);
let doc_freq = cursor.read_u32::<BigEndian>().unwrap() as usize; let doc_freq: u32 = u32::deserialize(&mut cursor).unwrap();
let data_size = cursor.read_u32::<BigEndian>().unwrap() as usize; let data_size = cursor.read_u32::<BigEndian>().unwrap() as usize;
// TODO remove allocs // TODO remove allocs
let mut data = Vec::with_capacity(data_size); let mut data = Vec::with_capacity(data_size);
for _ in 0..data_size { for _ in 0..data_size {
data.push(cursor.read_u32::<BigEndian>().unwrap()); data.push(cursor.read_u32::<BigEndian>().unwrap());
} }
let mut doc_ids: Vec<u32> = (0..doc_freq as u32 ).collect(); let mut doc_ids: Vec<u32> = (0..doc_freq as u32).collect();
let decoder = Decoder::new(); let decoder = Decoder::new();
decoder.decode(&data, &mut doc_ids); decoder.decode(&data, &mut doc_ids);
SegmentPostings { SegmentPostings {
@@ -135,7 +136,6 @@ impl SegmentReader {
segment_postings.push(segment_posting); segment_postings.push(segment_posting);
} }
None => { None => {
println!("not found {:?}", term);
segment_postings.clear(); segment_postings.clear();
segment_postings.push(SegmentPostings::empty()); segment_postings.push(SegmentPostings::empty());
break; break;

View File

@@ -70,13 +70,15 @@ impl Decoder {
#[test] #[test]
fn test_encode_big() { fn test_encode_big() {
let mut encoder = Encoder::new(); let mut encoder = Encoder::new();
let input: Vec<u32> = (0..10000).into_iter().collect(); let num_ints = 10000 as usize;
let data = encoder.encode(&input); let expected_length = 1274;
assert_eq!(data.len(), 962); let input: Vec<u32> = (0..num_ints as u32)
.map(|i| i * 7 / 2)
.into_iter().collect();
let encoded_data = encoder.encode(&input);
assert_eq!(encoded_data.len(), expected_length);
let decoder = Decoder::new(); let decoder = Decoder::new();
let mut data_output: Vec<u32> = (0..10000).collect(); let mut decoded_data: Vec<u32> = (0..num_ints as u32).collect();
assert_eq!(10000, decoder.decode(&data[0..962], &mut data_output)); assert_eq!(num_ints, decoder.decode(&encoded_data[..], &mut decoded_data));
for i in 0..10000 { assert_eq!(decoded_data, input);
assert_eq!(data_output[i], input[i]) ;
}
} }

View File

@@ -4,7 +4,6 @@ use std::cell::RefCell;
use core::schema::DocId; use core::schema::DocId;
use core::schema::Document; use core::schema::Document;
use core::schema::FieldValue; use core::schema::FieldValue;
use core::error;
use core::serialize::BinarySerializable; use core::serialize::BinarySerializable;
use std::io::Write; use std::io::Write;
use std::io::Read; use std::io::Read;
@@ -193,7 +192,7 @@ mod tests {
let lorem = String::from("Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."); let lorem = String::from("Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.");
{ {
let mut store_writer = StoreWriter::new(store_file.reopen().unwrap()); let mut store_writer = StoreWriter::new(store_file.reopen().unwrap());
for i in 0..10000 { for i in 0..1000 {
let mut fields: Vec<FieldValue> = Vec::new(); let mut fields: Vec<FieldValue> = Vec::new();
{ {
let field_value = FieldValue { let field_value = FieldValue {
@@ -219,7 +218,7 @@ mod tests {
let store_mmap = MmapReadOnly::open(&store_file).unwrap(); let store_mmap = MmapReadOnly::open(&store_file).unwrap();
let store = StoreReader::new(store_mmap); let store = StoreReader::new(store_mmap);
assert_eq!(offsets, store.offsets); assert_eq!(offsets, store.offsets);
for i in 0..1000 { for i in (0..10).map(|i| i * 3 / 2) {
assert_eq!(*store.get(&i).get_one(&field_title).unwrap(), format!("Doc {}", i)); assert_eq!(*store.get(&i).get_one(&field_title).unwrap(), format!("Doc {}", i));
} }
} }

View File

@@ -12,36 +12,35 @@ use regex::Regex;
pub struct TestCollector { // only make sense for a single segment
docs: Vec<DocAddress>, struct TestCollector {
current_segment: Option<SegmentId>, docs: Vec<DocId>,
} }
impl TestCollector { impl TestCollector {
pub fn new() -> TestCollector { pub fn new() -> TestCollector {
TestCollector { TestCollector {
docs: Vec::new(), docs: Vec::new(),
current_segment: None,
} }
} }
pub fn docs(self,) -> Vec<DocAddress> { pub fn docs(self,) -> Vec<DocId> {
self.docs self.docs
} }
} }
impl Collector for TestCollector { impl Collector for TestCollector {
fn set_segment(&mut self, segment: &SegmentReader) { fn set_segment(&mut self, segment: &SegmentReader) {}
self.current_segment = Some(segment.id());
}
fn collect(&mut self, doc_id: DocId) { fn collect(&mut self, doc_id: DocId) {
self.docs.push(DocAddress(self.current_segment.clone().unwrap(), doc_id)); self.docs.push(doc_id);
} }
} }
#[test] #[test]
fn test_indexing() { fn test_indexing() {
let mut schema = Schema::new(); let mut schema = Schema::new();
@@ -111,16 +110,43 @@ fn test_searcher() {
let segment = commit_result.unwrap(); let segment = commit_result.unwrap();
} }
{ {
let searcher = Searcher::for_directory(directory); let searcher = Searcher::for_directory(directory);
let terms = vec!(Term::from_field_text(&text_field, "b"), Term::from_field_text(&text_field, "a"), ); let get_doc_ids = |terms: Vec<Term>| {
let mut collector = TestCollector::new(); let mut collector = TestCollector::new();
searcher.search(&terms, &mut collector); searcher.search(&terms, &mut collector);
let vals: Vec<DocId> = collector collector.docs()
.docs() };
.iter() {
.map(|doc| doc.1) assert_eq!(
.collect::<Vec<DocId>>(); get_doc_ids(vec!(Term::from_field_text(&text_field, "a"))),
assert_eq!(vals, [1, 2]); vec!(1, 2));
}
{
assert_eq!(
get_doc_ids(vec!(Term::from_field_text(&text_field, "af"))),
vec!(0));
}
{
assert_eq!(
get_doc_ids(vec!(Term::from_field_text(&text_field, "b"))),
vec!(0, 1, 2));
}
{
assert_eq!(
get_doc_ids(vec!(Term::from_field_text(&text_field, "c"))),
vec!(1, 2));
}
{
assert_eq!(
get_doc_ids(vec!(Term::from_field_text(&text_field, "d"))),
vec!(2));
}
{
assert_eq!(
get_doc_ids(vec!(Term::from_field_text(&text_field, "b"), Term::from_field_text(&text_field, "a"), )),
vec!(1, 2));
}
} }
} }