This commit is contained in:
Paul Masurel
2016-02-19 10:27:39 +09:00
parent b78e5320c3
commit c858b74b97
6 changed files with 54 additions and 19 deletions

View File

@@ -29,13 +29,6 @@ extern "C" {
num_els,
output,
output_length);
{
size_t num_ints = output_length;
uint32_t* uncompressed = new uint32_t[100];
codec -> decodeArray(output, output_length, uncompressed, num_ints);
delete uncompressed;
}
return output_length;
}

View File

@@ -11,6 +11,7 @@ use core::schema::Term;
use core::DocId;
use std::fs::File;
use core::simdcompression;
use core::schema::FieldValue;
pub struct SimpleCodec;
@@ -20,12 +21,18 @@ pub struct SimpleCodec;
/// State carried while serializing one segment; built by `SimpleCodec::serializer`.
///
/// Bundles the writable segment files (postings and store), the FST builder
/// for the term dictionary, and the encoder from `core::simdcompression`.
pub struct SimpleSegmentSerializer {
// Running byte count; `new_term` inserts its current value into the term FST for each term.
written_bytes_postings: usize,
// File opened for `SegmentComponent::POSTINGS`.
postings_write: File,
// File opened for `SegmentComponent::STORE`.
store_write: File,
// FST map builder writing into the file opened for `SegmentComponent::TERMS`.
term_fst_builder: MapBuilder<File>, // TODO find an alternative to work around the "move"
// Set to the `doc_freq` passed to the latest `new_term` call.
cur_term_num_docs: DocId,
// NOTE(review): presumably used to compress blocks of doc ids before they are written — confirm in `write_docs`.
encoder: simdcompression::Encoder,
}
impl SegmentSerializer<()> for SimpleSegmentSerializer {
// Stored-document hook required by `SegmentSerializer`.
// Currently a no-op stub: the body is empty and `field` is never read.
// TODO(review): presumably meant to write the field values to `store_write` — confirm.
fn store_doc(&mut self, field: &mut Iterator<Item=&FieldValue>) {
}
fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<()> {
self.term_fst_builder.insert(term.as_slice(), self.written_bytes_postings as u64);
self.cur_term_num_docs = doc_freq;
@@ -80,11 +87,13 @@ impl SimpleCodec {
fn serializer(segment: &Segment) -> Result<SimpleSegmentSerializer> {
let term_write = try!(segment.open_writable(SegmentComponent::TERMS));
let postings_write = try!(segment.open_writable(SegmentComponent::POSTINGS));
let store_write = try!(segment.open_writable(SegmentComponent::STORE));
let term_fst_builder_result = MapBuilder::new(term_write);
let term_fst_builder = term_fst_builder_result.unwrap();
Ok(SimpleSegmentSerializer {
written_bytes_postings: 0,
postings_write: postings_write,
store_write: store_write,
term_fst_builder: term_fst_builder,
cur_term_num_docs: 0,
encoder: simdcompression::Encoder::new(),

View File

@@ -319,6 +319,7 @@ pub enum SegmentComponent {
POSTINGS,
// POSITIONS,
TERMS,
STORE,
}
#[derive(Debug, Clone)]
@@ -340,6 +341,7 @@ impl Segment {
SegmentComponent::POSTINGS => ".idx",
// SegmentComponent::POSITIONS => ".pos",
SegmentComponent::TERMS => ".term",
SegmentComponent::STORE => ".store",
}
}

View File

@@ -7,6 +7,7 @@ use std::fmt;
/// Sink that receives the contents of a segment while it is being serialized.
///
/// `Output` is the value handed back by `close` on success.
pub trait SegmentSerializer<Output> {
/// Announces a new term together with its `doc_freq`.
/// NOTE(review): presumably the doc ids for this term follow via `write_docs` — confirm with callers.
fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<()>;
/// Receives a slice of doc ids to serialize.
fn write_docs(&mut self, docs: &[DocId]) -> Result<()>; // TODO add size
/// Receives an iterator over field values to store.
/// The signature returns nothing, so implementations cannot report a failure from here.
fn store_doc(&mut self, field: &mut Iterator<Item=&FieldValue>);
/// Consumes the serializer and yields the `Output`.
fn close(self,) -> Result<Output>;
}
@@ -17,6 +18,7 @@ pub trait SerializableSegment {
/// Serializer that builds a human-readable text dump instead of segment files
/// (it implements `SegmentSerializer<String>`).
pub struct DebugSegmentSerializer {
// Text accumulated so far by the serializer methods.
text: String,
// Document counter read by `store_doc` to label stored documents; `new` starts it at 0.
num_docs: u32,
}
impl fmt::Debug for DebugSegmentSerializer {
@@ -35,6 +37,7 @@ impl DebugSegmentSerializer {
/// Creates a serializer with an empty text buffer and a document count of zero.
pub fn new() -> DebugSegmentSerializer {
    let empty_text = String::new();
    let initial_doc_count = 0;
    DebugSegmentSerializer {
        text: empty_text,
        num_docs: initial_doc_count,
    }
}
}
@@ -46,6 +49,17 @@ impl SegmentSerializer<String> for DebugSegmentSerializer {
Ok(())
}
/// Appends a debug dump of one stored document to the text buffer.
///
/// The `# STORED DOC` header is emitted only before the first document.
/// Each call then writes `doc <n>` followed by one `field … |value …` line
/// per field value yielded by `fields`, and finally advances the document
/// counter so the next call is labelled with the next number.
fn store_doc(&mut self, fields: &mut Iterator<Item=&FieldValue>) {
    if self.num_docs == 0 {
        // Plain literal: no formatting arguments, so no `format!` needed.
        self.text.push_str("# STORED DOC\n======\n");
    }
    self.text.push_str(&format!("doc {}", self.num_docs));
    for field_value in fields {
        self.text.push_str(&format!("field {:?} |", field_value.field));
        self.text.push_str(&format!("value {:?}\n", field_value.text));
    }
    // Without this increment every document would be labelled `doc 0`
    // and the header above would be repeated on each call.
    self.num_docs += 1;
}
fn write_docs(&mut self, docs: &[DocId]) -> Result<()> {
for doc in docs {
self.text.push_str(&format!(" - Doc {:?}\n", doc));

View File

@@ -24,14 +24,16 @@ impl Encoder {
}
pub fn encode(&mut self, input: &[u32]) -> &[u32] {
self.input_buffer.clear();
let input_len = input.len();
if input_len > self.input_buffer.len() {
println!("resising {}", input_len);
self.input_buffer = (0..input_len as u32 + 10 ).collect();
self.output_buffer = (0..input_len as u32 + 10).collect();
// TODO use resize when available
}
println!("self.input_buffer {}", self.input_buffer.len());
unsafe {
self.input_buffer.clear();
let input_len = input.len();
if input_len > self.input_buffer.len() {
self.input_buffer = (0..input_len as u32 + 10 ).collect();
self.output_buffer = (0..input_len as u32 + 10).collect();
// TODO use resize when available
}
ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len);
// TODO use clone_from when available
let written_size = encode_native(
@@ -76,9 +78,25 @@ fn test_encode_decode() {
let input: Vec<u32> = vec!(2,3,5,7,11,13,17,19,23);
let data = encoder.encode(&input);
assert_eq!(data.len(), 4);
// let decoder = Decoder::new();
// let mut data_output: Vec<u32> = (0..100).collect();
// assert_eq!(9, decoder.decode(&data[0..4], &mut data_output));
// for i in 0..9 {
// assert_eq!(data_output[i], input[i]) ;
// }
}
#[test]
fn test_encode_decode_big() {
let mut encoder = Encoder::new();
let input: Vec<u32> = (0..1_000_000).collect();
let data = encoder.encode(&input);
assert_eq!(data.len(), 95718);
let decoder = Decoder::new();
let mut data_output: Vec<u32> = (0..100).collect();
assert_eq!(9, decoder.decode(&data[0..4], &mut data_output));
let mut data_output: Vec<u32> = (0..1_000_000).collect();
assert_eq!(1_000_000, decoder.decode(&data[0..95718], &mut data_output));
for i in 0..9 {
assert_eq!(data_output[i], input[i]) ;
}

View File

@@ -8,7 +8,7 @@ use core::directory::Directory;
use core::analyzer::SimpleTokenizer;
use std::collections::{HashMap, BTreeMap};
use std::collections::{hash_map, btree_map};
use std::io::{BufWriter, Write};
use std::io::{Write};
use std::sync::Arc;
use std::mem;
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
@@ -121,6 +121,7 @@ impl SegmentWriter {
}
}
}
self.max_doc += 1;
}
@@ -139,9 +140,7 @@ impl SegmentWriter {
/// Registers `doc` with the postings writer returned by `get_postings_writer(term)`.
pub fn suscribe(&mut self, doc: DocId, term: Term) {
    let postings_writer = self.get_postings_writer(term);
    postings_writer.suscribe(doc);
}
}
impl SerializableSegment for SegmentWriter {