This commit is contained in:
Paul Masurel
2016-03-18 08:59:31 +09:00
parent 49fae017ed
commit eb9fa42785
4 changed files with 86 additions and 39 deletions

View File

@@ -13,6 +13,7 @@ use core::serialize::BinarySerializable;
use core::simdcompression;
use core::schema::TextFieldValue;
use core::convert_to_ioerror;
use core::fastfield::FastFieldWriters;
#[derive(Debug)]
@@ -53,7 +54,6 @@ pub struct SegmentSerializer {
impl SegmentSerializer {
pub fn for_segment(segment: &Segment) -> io::Result<SegmentSerializer> {
let term_write = try!(segment.open_write(SegmentComponent::TERMS));
let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS));
@@ -89,6 +89,10 @@ impl SegmentSerializer {
.insert(term.as_slice(), &term_info)
}
pub fn write_fast_field(&mut self, vals: &Vec<u32>) {
}
pub fn write_docs(&mut self, doc_ids: &[DocId]) -> io::Result<()> {
// TODO write_all transmuted [u8]
let docs_data = self.encoder.encode_sorted(doc_ids);

View File

@@ -1,10 +1,12 @@
use std::io::Write;
use std::io;
use std::io::Cursor;
use core::directory::WritePtr;
use core::serialize::BinarySerializable;
use core::directory::ReadOnlySource;
use core::schema::DocId;
use core::schema::Schema;
use core::schema::Document;
use std::ops::Deref;
use core::fastdivide::count_leading_zeros;
use core::fastdivide::DividerU32;
@@ -14,12 +16,13 @@ pub fn compute_num_bits(amplitude: u32) -> u8 {
32 - count_leading_zeros(amplitude)
}
fn serialize_packed_ints<I: Iterator<Item=u32>>(vals_it: I, num_bits: u8, write: &mut Write) -> io::Result<()> {
fn serialize_packed_ints<I: Iterator<Item=u32>>(vals_it: I, num_bits: u8, write: &mut Write) -> io::Result<usize> {
let mut mini_buffer_written = 0;
let mut mini_buffer = 0u64;
let mut written_size = 0;
for val in vals_it {
if mini_buffer_written + num_bits > 64 {
try!(mini_buffer.serialize(write));
written_size += try!(mini_buffer.serialize(write));
mini_buffer = 0;
mini_buffer_written = 0;
}
@@ -27,33 +30,64 @@ fn serialize_packed_ints<I: Iterator<Item=u32>>(vals_it: I, num_bits: u8, write:
mini_buffer_written += num_bits;
}
if mini_buffer_written > 0 {
try!(mini_buffer.serialize(write));
written_size += try!(mini_buffer.serialize(write));
}
Ok(())
Ok(written_size)
}
pub struct FastFieldWriters {
u32_fast_fields: Vec<U32Field>,
u32_fast_field_writers: Vec<U32FastFieldWriter>,
write: WritePtr,
}
impl FastFieldWriters {
pub fn from_schema(schema: &Schema) -> FastFieldWriters {
let u32_fast_fields: Vec<U32Field> = schema
.get_u32_fields()
.iter()
.enumerate()
.filter(|&(i, u32_field_entry)| u32_field_entry.option.is_fast())
.map(|(i, u32_field_entry)| U32Field(i as u8))
.collect();
let num_32_fast_fields = u32_fast_fields.len();
pub fn with_num_fields(write: WritePtr,) -> FastFieldWriters {
FastFieldWriters {
u32_fast_fields: u32_fast_fields,
u32_fast_field_writers: (0..num_32_fast_fields)
.map(|_| U32FastFieldWriter::new())
.collect()
write: write
}
}
//
//
// pub fn write_fast_field(&mut self, vals: &Vec<u32>) -> io::Result<()> {
// let u32_fast_field_writer = U32FastFieldWriter::new();
// for val in vals {
// u32_fast_field_writer.add(*val);
// }
// self.u32_fast_field_writers.finish();
// }
// pub fn from_schema(schema: &Schema) -> FastFieldWriters {
// let u32_fast_fields: Vec<U32Field> = schema
// .get_u32_fields()
// .iter()
// .enumerate()
// .filter(|&(_, u32_field_entry)| u32_field_entry.option.is_fast())
// .map(|(i, _)| U32Field(i as u8))
// .collect();
// let num_32_fast_fields = u32_fast_fields.len();
// FastFieldWriters {
// u32_fast_fields: u32_fast_fields,
// u32_fast_field_writers: (0..num_32_fast_fields)
// .map(|_| U32FastFieldWriter::new())
// .collect()
// }
// }
// pub fn add_doc(&mut self, doc: &Document) -> io::Result<()> {
// for (field, field_writer) in self.u32_fast_fields.iter().zip(self.u32_fast_field_writers.iter_mut()) {
// let some_val = doc.get_u32(field);
// match some_val {
// Some(v) => {
// field_writer.add(v);
// }
// None => {
// return Err(io::Error::new(io::ErrorKind::InvalidData, "u32 fast field missing"));
// }
// }
// }
// Ok(())
// }
}
@@ -73,22 +107,24 @@ impl U32FastFieldWriter {
self.vals.push(val);
}
pub fn close(&self, write: &mut Write) -> io::Result<()> {
pub fn close(&self, write: &mut Write) -> io::Result<usize> {
if self.vals.is_empty() {
return Ok(())
return Ok((0))
}
let mut written_size = 0;
let min = self.vals.iter().min().unwrap();
let max = self.vals.iter().max().unwrap();
try!(min.serialize(write));
written_size += try!(min.serialize(write));
let amplitude: u32 = max - min;
let num_bits: u8 = compute_num_bits(amplitude);
try!(num_bits.serialize(write));
written_size += try!(num_bits.serialize(write));
let vals_it = self.vals.iter().map(|i| i-min);
serialize_packed_ints(vals_it, num_bits, write)
written_size += try!(serialize_packed_ints(vals_it, num_bits, write));
Ok(written_size)
}
}
pub struct IntFastFieldReader {
pub struct U32FastFieldReader {
_data: ReadOnlySource,
data_ptr: *const u64,
min_val: u32,
@@ -98,15 +134,15 @@ pub struct IntFastFieldReader {
divider: DividerU32,
}
impl IntFastFieldReader {
pub fn open(data: &ReadOnlySource) -> io::Result<IntFastFieldReader> {
impl U32FastFieldReader {
pub fn open(data: &ReadOnlySource) -> io::Result<U32FastFieldReader> {
let mut cursor: Cursor<&[u8]> = Cursor::new(&*data);
let min_val = try!(u32::deserialize(&mut cursor));
let num_bits = try!(u8::deserialize(&mut cursor));
let mask = (1 << num_bits) - 1;
let num_in_pack = 64u32 / (num_bits as u32);
let ptr: *const u8 = &(data.deref()[5]);
Ok(IntFastFieldReader {
Ok(U32FastFieldReader {
_data: data.slice(5, data.len()),
data_ptr: ptr as *const u64,
min_val: min_val,
@@ -132,7 +168,7 @@ mod tests {
use super::compute_num_bits;
use super::U32FastFieldWriter;
use super::IntFastFieldReader;
use super::U32FastFieldReader;
use core::directory::ReadOnlySource;
use test::Bencher;
use test;
@@ -165,7 +201,7 @@ mod tests {
}
{
let source = ReadOnlySource::Anonymous(buffer);
let fast_field_reader = IntFastFieldReader::open(&source).unwrap();
let fast_field_reader = U32FastFieldReader::open(&source).unwrap();
assert_eq!(fast_field_reader.get(0), 4u32);
assert_eq!(fast_field_reader.get(1), 14u32);
assert_eq!(fast_field_reader.get(2), 2u32);
@@ -186,7 +222,7 @@ mod tests {
}
{
let source = ReadOnlySource::Anonymous(buffer);
let fast_field_reader = IntFastFieldReader::open(&source).unwrap();
let fast_field_reader = U32FastFieldReader::open(&source).unwrap();
assert_eq!(fast_field_reader.get(0), 4u32);
assert_eq!(fast_field_reader.get(1), 14_082_001u32);
assert_eq!(fast_field_reader.get(2), 3_052u32);
@@ -213,7 +249,7 @@ mod tests {
int_fast_field_writer.close(&mut buffer).unwrap();
}
let source = ReadOnlySource::Anonymous(buffer);
let int_fast_field_reader = IntFastFieldReader::open(&source).unwrap();
let int_fast_field_reader = U32FastFieldReader::open(&source).unwrap();
let n = test::black_box(100);
let mut a = 0u32;
@@ -261,7 +297,7 @@ mod tests {
int_fast_field_writer.close(&mut buffer).unwrap();
}
let source = ReadOnlySource::Anonymous(buffer);
let int_fast_field_reader = IntFastFieldReader::open(&source).unwrap();
let int_fast_field_reader = U32FastFieldReader::open(&source).unwrap();
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u32;
@@ -284,7 +320,7 @@ mod tests {
int_fast_field_writer.close(&mut buffer).unwrap();
}
let source = ReadOnlySource::Anonymous(buffer);
let int_fast_field_reader = IntFastFieldReader::open(&source).unwrap();
let int_fast_field_reader = U32FastFieldReader::open(&source).unwrap();
b.iter(|| {
let n = test::black_box(1000);
let mut a = 0u32;

View File

@@ -4,7 +4,6 @@ use std::slice;
use std::fmt;
use std::io;
use std::io::Read;
use std::str;
use core::serialize::BinarySerializable;
use rustc_serialize::Decodable;
use rustc_serialize::Encodable;
@@ -370,7 +369,7 @@ impl Term {
let U32Field(field_idx) = *field;
buffer.clear();
buffer.push(128 | field_idx);
val.serialize(&mut buffer);
val.serialize(&mut buffer).unwrap();
Term {
data: buffer,
}
@@ -465,6 +464,15 @@ impl Document {
self.u32_field_values.iter()
}
pub fn get_u32(&self, field: &U32Field) -> Option<u32> {
self.u32_field_values
.iter()
.filter(|field_value| field_value.field == *field)
.map(|field_value| &field_value.value)
.cloned()
.next()
}
pub fn get_texts<'a>(&'a self, field: &TextField) -> Vec<&'a String> {
self.text_field_values
.iter()

View File

@@ -61,7 +61,6 @@ pub struct SegmentWriter {
max_doc: DocId,
tokenizer: SimpleTokenizer,
postings_writer: PostingsWriter,
fastfield_writers: FastFieldWriters,
segment_serializer: SegmentSerializer,
}
@@ -98,7 +97,6 @@ impl SegmentWriter {
postings_writer: PostingsWriter::new(),
segment_serializer: segment_serializer,
tokenizer: SimpleTokenizer::new(),
fastfield_writers: FastFieldWriters::from_schema(schema),
})
}
@@ -129,6 +127,7 @@ impl SegmentWriter {
let mut stored_fieldvalues_it = doc.text_fields().filter(|text_field_value| {
schema.text_field_options(&text_field_value.field).is_stored()
});
// try!(self.fastfield_writers.add_doc(&doc));
try!(self.segment_serializer.store_doc(&mut stored_fieldvalues_it));
self.max_doc += 1;
Ok(())