From eb9fa42785e2effaca93453afa7cddf43760e571 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 18 Mar 2016 08:59:31 +0900 Subject: [PATCH] blop --- src/core/codec.rs | 6 ++- src/core/fastfield.rs | 104 ++++++++++++++++++++++++++++-------------- src/core/schema.rs | 12 ++++- src/core/writer.rs | 3 +- 4 files changed, 86 insertions(+), 39 deletions(-) diff --git a/src/core/codec.rs b/src/core/codec.rs index 451b87ac7..c86e22055 100644 --- a/src/core/codec.rs +++ b/src/core/codec.rs @@ -13,6 +13,7 @@ use core::serialize::BinarySerializable; use core::simdcompression; use core::schema::TextFieldValue; use core::convert_to_ioerror; +use core::fastfield::FastFieldWriters; #[derive(Debug)] @@ -53,7 +54,6 @@ pub struct SegmentSerializer { impl SegmentSerializer { - pub fn for_segment(segment: &Segment) -> io::Result { let term_write = try!(segment.open_write(SegmentComponent::TERMS)); let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS)); @@ -89,6 +89,10 @@ impl SegmentSerializer { .insert(term.as_slice(), &term_info) } + pub fn write_fast_field(&mut self, vals: &Vec) { + + } + pub fn write_docs(&mut self, doc_ids: &[DocId]) -> io::Result<()> { // TODO write_all transmuted [u8] let docs_data = self.encoder.encode_sorted(doc_ids); diff --git a/src/core/fastfield.rs b/src/core/fastfield.rs index 82ac41c27..72f1dfc7e 100644 --- a/src/core/fastfield.rs +++ b/src/core/fastfield.rs @@ -1,10 +1,12 @@ use std::io::Write; use std::io; use std::io::Cursor; +use core::directory::WritePtr; use core::serialize::BinarySerializable; use core::directory::ReadOnlySource; use core::schema::DocId; use core::schema::Schema; +use core::schema::Document; use std::ops::Deref; use core::fastdivide::count_leading_zeros; use core::fastdivide::DividerU32; @@ -14,12 +16,13 @@ pub fn compute_num_bits(amplitude: u32) -> u8 { 32 - count_leading_zeros(amplitude) } -fn serialize_packed_ints>(vals_it: I, num_bits: u8, write: &mut Write) -> io::Result<()> { +fn serialize_packed_ints>(vals_it: I, num_bits: u8, write: &mut Write) -> io::Result { let mut mini_buffer_written = 0; let mut mini_buffer = 0u64; + let mut written_size = 0; for val in vals_it { if mini_buffer_written + num_bits > 64 { - try!(mini_buffer.serialize(write)); + written_size += try!(mini_buffer.serialize(write)); mini_buffer = 0; mini_buffer_written = 0; } @@ -27,33 +30,64 @@ fn serialize_packed_ints>(vals_it: I, num_bits: u8, write: mini_buffer_written += num_bits; } if mini_buffer_written > 0 { - try!(mini_buffer.serialize(write)); + written_size += try!(mini_buffer.serialize(write)); } - Ok(()) + Ok(written_size) } pub struct FastFieldWriters { - u32_fast_fields: Vec, - u32_fast_field_writers: Vec, + write: WritePtr, } impl FastFieldWriters { - pub fn from_schema(schema: &Schema) -> FastFieldWriters { - let u32_fast_fields: Vec = schema - .get_u32_fields() - .iter() - .enumerate() - .filter(|&(i, u32_field_entry)| u32_field_entry.option.is_fast()) - .map(|(i, u32_field_entry)| U32Field(i as u8)) - .collect(); - let num_32_fast_fields = u32_fast_fields.len(); + pub fn with_num_fields(write: WritePtr,) -> FastFieldWriters { FastFieldWriters { - u32_fast_fields: u32_fast_fields, - u32_fast_field_writers: (0..num_32_fast_fields) - .map(|_| U32FastFieldWriter::new()) - .collect() + write: write } } + // + // + // pub fn write_fast_field(&mut self, vals: &Vec) -> io::Result<()> { + // let u32_fast_field_writer = U32FastFieldWriter::new(); + // for val in vals { + // u32_fast_field_writer.add(*val); + // } + // self.u32_fast_field_writers.finish(); + // } + // pub fn from_schema(schema: &Schema) -> FastFieldWriters { + // let u32_fast_fields: Vec = schema + // .get_u32_fields() + // .iter() + // .enumerate() + // .filter(|&(_, u32_field_entry)| u32_field_entry.option.is_fast()) + // .map(|(i, _)| U32Field(i as u8)) + // .collect(); + // let num_32_fast_fields = u32_fast_fields.len(); + // FastFieldWriters { + // u32_fast_fields: u32_fast_fields, + // u32_fast_field_writers: (0..num_32_fast_fields) + // .map(|_| U32FastFieldWriter::new()) + // .collect() + // } + // } + + + + + // pub fn add_doc(&mut self, doc: &Document) -> io::Result<()> { + // for (field, field_writer) in self.u32_fast_fields.iter().zip(self.u32_fast_field_writers.iter_mut()) { + // let some_val = doc.get_u32(field); + // match some_val { + // Some(v) => { + // field_writer.add(v); + // } + // None => { + // return Err(io::Error::new(io::ErrorKind::InvalidData, "u32 fast field missing")); + // } + // } + // } + // Ok(()) + // } } @@ -73,22 +107,24 @@ impl U32FastFieldWriter { self.vals.push(val); } - pub fn close(&self, write: &mut Write) -> io::Result<()> { + pub fn close(&self, write: &mut Write) -> io::Result { if self.vals.is_empty() { - return Ok(()) + return Ok((0)) } + let mut written_size = 0; let min = self.vals.iter().min().unwrap(); let max = self.vals.iter().max().unwrap(); - try!(min.serialize(write)); + written_size += try!(min.serialize(write)); let amplitude: u32 = max - min; let num_bits: u8 = compute_num_bits(amplitude); - try!(num_bits.serialize(write)); + written_size += try!(num_bits.serialize(write)); let vals_it = self.vals.iter().map(|i| i-min); - serialize_packed_ints(vals_it, num_bits, write) + written_size += try!(serialize_packed_ints(vals_it, num_bits, write)); + Ok(written_size) } } -pub struct IntFastFieldReader { +pub struct U32FastFieldReader { _data: ReadOnlySource, data_ptr: *const u64, min_val: u32, @@ -98,15 +134,15 @@ pub struct IntFastFieldReader { divider: DividerU32, } -impl IntFastFieldReader { - pub fn open(data: &ReadOnlySource) -> io::Result { +impl U32FastFieldReader { + pub fn open(data: &ReadOnlySource) -> io::Result { let mut cursor: Cursor<&[u8]> = Cursor::new(&*data); let min_val = try!(u32::deserialize(&mut cursor)); let num_bits = try!(u8::deserialize(&mut cursor)); let mask = (1 << num_bits) - 1; let num_in_pack = 64u32 / (num_bits as u32); let ptr: *const u8 = &(data.deref()[5]); - Ok(IntFastFieldReader { + Ok(U32FastFieldReader { _data: data.slice(5, data.len()), data_ptr: ptr as *const u64, min_val: min_val, @@ -132,7 +168,7 @@ mod tests { use super::compute_num_bits; use super::U32FastFieldWriter; - use super::IntFastFieldReader; + use super::U32FastFieldReader; use core::directory::ReadOnlySource; use test::Bencher; use test; @@ -165,7 +201,7 @@ mod tests { } { let source = ReadOnlySource::Anonymous(buffer); - let fast_field_reader = IntFastFieldReader::open(&source).unwrap(); + let fast_field_reader = U32FastFieldReader::open(&source).unwrap(); assert_eq!(fast_field_reader.get(0), 4u32); assert_eq!(fast_field_reader.get(1), 14u32); assert_eq!(fast_field_reader.get(2), 2u32); @@ -186,7 +222,7 @@ mod tests { } { let source = ReadOnlySource::Anonymous(buffer); - let fast_field_reader = IntFastFieldReader::open(&source).unwrap(); + let fast_field_reader = U32FastFieldReader::open(&source).unwrap(); assert_eq!(fast_field_reader.get(0), 4u32); assert_eq!(fast_field_reader.get(1), 14_082_001u32); assert_eq!(fast_field_reader.get(2), 3_052u32); @@ -213,7 +249,7 @@ mod tests { int_fast_field_writer.close(&mut buffer).unwrap(); } let source = ReadOnlySource::Anonymous(buffer); - let int_fast_field_reader = IntFastFieldReader::open(&source).unwrap(); + let int_fast_field_reader = U32FastFieldReader::open(&source).unwrap(); let n = test::black_box(100); let mut a = 0u32; @@ -261,7 +297,7 @@ mod tests { int_fast_field_writer.close(&mut buffer).unwrap(); } let source = ReadOnlySource::Anonymous(buffer); - let int_fast_field_reader = IntFastFieldReader::open(&source).unwrap(); + let int_fast_field_reader = U32FastFieldReader::open(&source).unwrap(); b.iter(|| { let n = test::black_box(7000u32); let mut a = 0u32; @@ -284,7 +320,7 @@ mod tests { int_fast_field_writer.close(&mut buffer).unwrap(); } let source = ReadOnlySource::Anonymous(buffer); - let int_fast_field_reader = IntFastFieldReader::open(&source).unwrap(); + let int_fast_field_reader = U32FastFieldReader::open(&source).unwrap(); b.iter(|| { let n = test::black_box(1000); let mut a = 0u32; diff --git a/src/core/schema.rs b/src/core/schema.rs index f86b73d54..ea811c39f 100644 --- a/src/core/schema.rs +++ b/src/core/schema.rs @@ -4,7 +4,6 @@ use std::slice; use std::fmt; use std::io; use std::io::Read; -use std::str; use core::serialize::BinarySerializable; use rustc_serialize::Decodable; use rustc_serialize::Encodable; @@ -370,7 +369,7 @@ impl Term { let U32Field(field_idx) = *field; buffer.clear(); buffer.push(128 | field_idx); - val.serialize(&mut buffer); + val.serialize(&mut buffer).unwrap(); Term { data: buffer, } @@ -465,6 +464,15 @@ impl Document { self.u32_field_values.iter() } + pub fn get_u32(&self, field: &U32Field) -> Option { + self.u32_field_values + .iter() + .filter(|field_value| field_value.field == *field) + .map(|field_value| &field_value.value) + .cloned() + .next() + } + pub fn get_texts<'a>(&'a self, field: &TextField) -> Vec<&'a String> { self.text_field_values .iter() diff --git a/src/core/writer.rs b/src/core/writer.rs index e14a23711..79a655f07 100644 --- a/src/core/writer.rs +++ b/src/core/writer.rs @@ -61,7 +61,6 @@ pub struct SegmentWriter { max_doc: DocId, tokenizer: SimpleTokenizer, postings_writer: PostingsWriter, - fastfield_writers: FastFieldWriters, segment_serializer: SegmentSerializer, } @@ -98,7 +97,6 @@ impl SegmentWriter { postings_writer: PostingsWriter::new(), segment_serializer: segment_serializer, tokenizer: SimpleTokenizer::new(), - fastfield_writers: FastFieldWriters::from_schema(schema), }) } @@ -129,6 +127,7 @@ impl SegmentWriter { let mut stored_fieldvalues_it = doc.text_fields().filter(|text_field_value| { schema.text_field_options(&text_field_value.field).is_stored() }); + // try!(self.fastfield_writers.add_doc(&doc)); try!(self.segment_serializer.store_doc(&mut stored_fieldvalues_it)); self.max_doc += 1; Ok(())