From c858b74b97f805e9cb39b7f43936f6a4e77ca8f7 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 19 Feb 2016 10:27:39 +0900 Subject: [PATCH] bug fix. --- cpp/encode.cpp | 7 ------- src/core/codec.rs | 9 +++++++++ src/core/directory.rs | 2 ++ src/core/serial.rs | 14 ++++++++++++++ src/core/simdcompression.rs | 36 +++++++++++++++++++++++++++--------- src/core/writer.rs | 5 ++--- 6 files changed, 54 insertions(+), 19 deletions(-) diff --git a/cpp/encode.cpp b/cpp/encode.cpp index 462f81a38..233cb7058 100644 --- a/cpp/encode.cpp +++ b/cpp/encode.cpp @@ -29,13 +29,6 @@ extern "C" { num_els, output, output_length); - { - size_t num_ints = output_length; - uint32_t* uncompressed = new uint32_t[100]; - codec -> decodeArray(output, output_length, uncompressed, num_ints); - delete uncompressed; - } - return output_length; } diff --git a/src/core/codec.rs b/src/core/codec.rs index 9e750495f..a3adf4df4 100644 --- a/src/core/codec.rs +++ b/src/core/codec.rs @@ -11,6 +11,7 @@ use core::schema::Term; use core::DocId; use std::fs::File; use core::simdcompression; +use core::schema::FieldValue; pub struct SimpleCodec; @@ -20,12 +21,18 @@ pub struct SimpleCodec; pub struct SimpleSegmentSerializer { written_bytes_postings: usize, postings_write: File, + store_write: File, term_fst_builder: MapBuilder, // TODO find an alternative to work around the "move" cur_term_num_docs: DocId, encoder: simdcompression::Encoder, } impl SegmentSerializer<()> for SimpleSegmentSerializer { + + fn store_doc(&mut self, field: &mut Iterator) { + + } + fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<()> { self.term_fst_builder.insert(term.as_slice(), self.written_bytes_postings as u64); self.cur_term_num_docs = doc_freq; @@ -80,11 +87,13 @@ impl SimpleCodec { fn serializer(segment: &Segment) -> Result { let term_write = try!(segment.open_writable(SegmentComponent::TERMS)); let postings_write = try!(segment.open_writable(SegmentComponent::POSTINGS)); + let store_write = try!(segment.open_writable(SegmentComponent::STORE)); let term_fst_builder_result = MapBuilder::new(term_write); let term_fst_builder = term_fst_builder_result.unwrap(); Ok(SimpleSegmentSerializer { written_bytes_postings: 0, postings_write: postings_write, + store_write: store_write, term_fst_builder: term_fst_builder, cur_term_num_docs: 0, encoder: simdcompression::Encoder::new(), diff --git a/src/core/directory.rs b/src/core/directory.rs index 8a754a62f..c30bbed35 100644 --- a/src/core/directory.rs +++ b/src/core/directory.rs @@ -319,6 +319,7 @@ pub enum SegmentComponent { POSTINGS, // POSITIONS, TERMS, + STORE, } #[derive(Debug, Clone)] @@ -340,6 +341,7 @@ impl Segment { SegmentComponent::POSTINGS => ".idx", // SegmentComponent::POSITIONS => ".pos", SegmentComponent::TERMS => ".term", + SegmentComponent::STORE => ".store", } } diff --git a/src/core/serial.rs b/src/core/serial.rs index ea573ff79..f5c085e05 100644 --- a/src/core/serial.rs +++ b/src/core/serial.rs @@ -7,6 +7,7 @@ use std::fmt; pub trait SegmentSerializer { fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<()>; fn write_docs(&mut self, docs: &[DocId]) -> Result<()>; // TODO add size + fn store_doc(&mut self, field: &mut Iterator); fn close(self,) -> Result; } @@ -17,6 +18,7 @@ pub trait SerializableSegment { pub struct DebugSegmentSerializer { text: String, + num_docs: u32, } impl fmt::Debug for DebugSegmentSerializer { @@ -35,6 +37,7 @@ impl DebugSegmentSerializer { pub fn new() -> DebugSegmentSerializer { DebugSegmentSerializer { text: String::new(), + num_docs: 0, } } } @@ -46,6 +49,17 @@ impl SegmentSerializer for DebugSegmentSerializer { Ok(()) } + fn store_doc(&mut self, fields: &mut Iterator) { + if self.num_docs == 0 { + self.text.push_str(&format!("# STORED DOC\n======\n")) + } + self.text.push_str(&format!("doc {}", self.num_docs)); + for field_value in fields { + self.text.push_str(&format!("field {:?} |", field_value.field)); + self.text.push_str(&format!("value {:?}\n", field_value.text)); + } + } + fn write_docs(&mut self, docs: &[DocId]) -> Result<()> { for doc in docs { self.text.push_str(&format!(" - Doc {:?}\n", doc)); diff --git a/src/core/simdcompression.rs b/src/core/simdcompression.rs index b09dcbe38..d9a80ed1b 100644 --- a/src/core/simdcompression.rs +++ b/src/core/simdcompression.rs @@ -24,14 +24,16 @@ impl Encoder { } pub fn encode(&mut self, input: &[u32]) -> &[u32] { + self.input_buffer.clear(); + let input_len = input.len(); + if input_len > self.input_buffer.len() { + println!("resising {}", input_len); + self.input_buffer = (0..input_len as u32 + 10 ).collect(); + self.output_buffer = (0..input_len as u32 + 10).collect(); + // TODO use resize when available + } + println!("self.input_buffer {}", self.input_buffer.len()); unsafe { - self.input_buffer.clear(); - let input_len = input.len(); - if input_len > self.input_buffer.len() { - self.input_buffer = (0..input_len as u32 + 10 ).collect(); - self.output_buffer = (0..input_len as u32 + 10).collect(); - // TODO use resize when available - } ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len); // TODO use clone_from when available let written_size = encode_native( @@ -76,9 +78,25 @@ fn test_encode_decode() { let input: Vec = vec!(2,3,5,7,11,13,17,19,23); let data = encoder.encode(&input); assert_eq!(data.len(), 4); + // let decoder = Decoder::new(); + // let mut data_output: Vec = (0..100).collect(); + // assert_eq!(9, decoder.decode(&data[0..4], &mut data_output)); + // for i in 0..9 { + // assert_eq!(data_output[i], input[i]) ; + // } +} + + + +#[test] +fn test_encode_decode_big() { + let mut encoder = Encoder::new(); + let input: Vec = (0..1_000_000).collect(); + let data = encoder.encode(&input); + assert_eq!(data.len(), 95718); let decoder = Decoder::new(); - let mut data_output: Vec = (0..100).collect(); - assert_eq!(9, decoder.decode(&data[0..4], &mut data_output)); + let mut data_output: Vec = (0..1_000_000).collect(); + assert_eq!(1_000_000, decoder.decode(&data[0..95718], &mut data_output)); for i in 0..9 { assert_eq!(data_output[i], input[i]) ; } diff --git a/src/core/writer.rs b/src/core/writer.rs index 9f613b757..d099b2122 100644 --- a/src/core/writer.rs +++ b/src/core/writer.rs @@ -8,7 +8,7 @@ use core::directory::Directory; use core::analyzer::SimpleTokenizer; use std::collections::{HashMap, BTreeMap}; use std::collections::{hash_map, btree_map}; -use std::io::{BufWriter, Write}; +use std::io::{Write}; use std::sync::Arc; use std::mem; use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; @@ -121,6 +121,7 @@ impl SegmentWriter { } } } + self.max_doc += 1; } @@ -139,9 +140,7 @@ impl SegmentWriter { pub fn suscribe(&mut self, doc: DocId, term: Term) { self.get_postings_writer(term).suscribe(doc); - } - } impl SerializableSegment for SegmentWriter {