mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-21 02:30:43 +00:00
bug fix.
This commit is contained in:
@@ -29,13 +29,6 @@ extern "C" {
|
||||
num_els,
|
||||
output,
|
||||
output_length);
|
||||
{
|
||||
size_t num_ints = output_length;
|
||||
uint32_t* uncompressed = new uint32_t[100];
|
||||
codec -> decodeArray(output, output_length, uncompressed, num_ints);
|
||||
delete uncompressed;
|
||||
}
|
||||
|
||||
return output_length;
|
||||
}
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ use core::schema::Term;
|
||||
use core::DocId;
|
||||
use std::fs::File;
|
||||
use core::simdcompression;
|
||||
use core::schema::FieldValue;
|
||||
|
||||
/// Unit struct carrying no data.
///
/// Its `impl` block (only partially visible in this view) provides the
/// `serializer` constructor that builds a `SimpleSegmentSerializer`.
pub struct SimpleCodec;
|
||||
|
||||
@@ -20,12 +21,18 @@ pub struct SimpleCodec;
|
||||
pub struct SimpleSegmentSerializer {
|
||||
written_bytes_postings: usize,
|
||||
postings_write: File,
|
||||
store_write: File,
|
||||
term_fst_builder: MapBuilder<File>, // TODO find an alternative to work around the "move"
|
||||
cur_term_num_docs: DocId,
|
||||
encoder: simdcompression::Encoder,
|
||||
}
|
||||
|
||||
impl SegmentSerializer<()> for SimpleSegmentSerializer {
|
||||
|
||||
/// Stored-document hook required by `SegmentSerializer`.
///
/// Currently a no-op: the field values are ignored and nothing is written.
fn store_doc(&mut self, field: &mut Iterator<Item=&FieldValue>) {
    // TODO: not implemented yet; explicitly discard the argument so the
    // stub does not trigger an unused-variable warning.
    let _ = field;
}
|
||||
|
||||
fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<()> {
|
||||
self.term_fst_builder.insert(term.as_slice(), self.written_bytes_postings as u64);
|
||||
self.cur_term_num_docs = doc_freq;
|
||||
@@ -80,11 +87,13 @@ impl SimpleCodec {
|
||||
fn serializer(segment: &Segment) -> Result<SimpleSegmentSerializer> {
|
||||
let term_write = try!(segment.open_writable(SegmentComponent::TERMS));
|
||||
let postings_write = try!(segment.open_writable(SegmentComponent::POSTINGS));
|
||||
let store_write = try!(segment.open_writable(SegmentComponent::STORE));
|
||||
let term_fst_builder_result = MapBuilder::new(term_write);
|
||||
let term_fst_builder = term_fst_builder_result.unwrap();
|
||||
Ok(SimpleSegmentSerializer {
|
||||
written_bytes_postings: 0,
|
||||
postings_write: postings_write,
|
||||
store_write: store_write,
|
||||
term_fst_builder: term_fst_builder,
|
||||
cur_term_num_docs: 0,
|
||||
encoder: simdcompression::Encoder::new(),
|
||||
|
||||
@@ -319,6 +319,7 @@ pub enum SegmentComponent {
|
||||
POSTINGS,
|
||||
// POSITIONS,
|
||||
TERMS,
|
||||
STORE,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -340,6 +341,7 @@ impl Segment {
|
||||
SegmentComponent::POSTINGS => ".idx",
|
||||
// SegmentComponent::POSITIONS => ".pos",
|
||||
SegmentComponent::TERMS => ".term",
|
||||
SegmentComponent::STORE => ".store",
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@ use std::fmt;
|
||||
pub trait SegmentSerializer<Output> {
|
||||
fn new_term(&mut self, term: &Term, doc_freq: DocId) -> Result<()>;
|
||||
fn write_docs(&mut self, docs: &[DocId]) -> Result<()>; // TODO add size
|
||||
fn store_doc(&mut self, field: &mut Iterator<Item=&FieldValue>);
|
||||
fn close(self,) -> Result<Output>;
|
||||
}
|
||||
|
||||
@@ -17,6 +18,7 @@ pub trait SerializableSegment {
|
||||
|
||||
/// Serializer that accumulates a human-readable dump in a `String`
/// (it implements `SegmentSerializer<String>`).
pub struct DebugSegmentSerializer {
    /// Text accumulated so far.
    text: String,
    /// Document counter read by `store_doc` for its header guard and for
    /// the `doc <n>` label.
    num_docs: u32,
}
|
||||
|
||||
impl fmt::Debug for DebugSegmentSerializer {
|
||||
@@ -35,6 +37,7 @@ impl DebugSegmentSerializer {
|
||||
pub fn new() -> DebugSegmentSerializer {
|
||||
DebugSegmentSerializer {
|
||||
text: String::new(),
|
||||
num_docs: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -46,6 +49,17 @@ impl SegmentSerializer<String> for DebugSegmentSerializer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Appends a textual description of one stored document to `self.text`.
///
/// A `# STORED DOC` header is emitted before the first document only,
/// followed for every document by `doc <n>` and one
/// `field <field> |value <text>` line per field value.
fn store_doc(&mut self, fields: &mut Iterator<Item=&FieldValue>) {
    if self.num_docs == 0 {
        // Section header, written once before the first document.
        self.text.push_str("# STORED DOC\n======\n");
    }
    self.text.push_str(&format!("doc {}", self.num_docs));
    for field_value in fields {
        self.text.push_str(&format!("field {:?} |", field_value.field));
        self.text.push_str(&format!("value {:?}\n", field_value.text));
    }
    // Advance the counter; without this the header guard above would fire
    // for every document and each one would be labelled `doc 0`.
    self.num_docs += 1;
}
|
||||
|
||||
fn write_docs(&mut self, docs: &[DocId]) -> Result<()> {
|
||||
for doc in docs {
|
||||
self.text.push_str(&format!(" - Doc {:?}\n", doc));
|
||||
|
||||
@@ -24,14 +24,16 @@ impl Encoder {
|
||||
}
|
||||
|
||||
pub fn encode(&mut self, input: &[u32]) -> &[u32] {
|
||||
self.input_buffer.clear();
|
||||
let input_len = input.len();
|
||||
if input_len > self.input_buffer.len() {
|
||||
println!("resising {}", input_len);
|
||||
self.input_buffer = (0..input_len as u32 + 10 ).collect();
|
||||
self.output_buffer = (0..input_len as u32 + 10).collect();
|
||||
// TODO use resize when available
|
||||
}
|
||||
println!("self.input_buffer {}", self.input_buffer.len());
|
||||
unsafe {
|
||||
self.input_buffer.clear();
|
||||
let input_len = input.len();
|
||||
if input_len > self.input_buffer.len() {
|
||||
self.input_buffer = (0..input_len as u32 + 10 ).collect();
|
||||
self.output_buffer = (0..input_len as u32 + 10).collect();
|
||||
// TODO use resize when available
|
||||
}
|
||||
ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len);
|
||||
// TODO use clone_from when available
|
||||
let written_size = encode_native(
|
||||
@@ -76,9 +78,25 @@ fn test_encode_decode() {
|
||||
let input: Vec<u32> = vec!(2,3,5,7,11,13,17,19,23);
|
||||
let data = encoder.encode(&input);
|
||||
assert_eq!(data.len(), 4);
|
||||
// let decoder = Decoder::new();
|
||||
// let mut data_output: Vec<u32> = (0..100).collect();
|
||||
// assert_eq!(9, decoder.decode(&data[0..4], &mut data_output));
|
||||
// for i in 0..9 {
|
||||
// assert_eq!(data_output[i], input[i]) ;
|
||||
// }
|
||||
}
|
||||
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_encode_decode_big() {
|
||||
let mut encoder = Encoder::new();
|
||||
let input: Vec<u32> = (0..1_000_000).collect();
|
||||
let data = encoder.encode(&input);
|
||||
assert_eq!(data.len(), 95718);
|
||||
let decoder = Decoder::new();
|
||||
let mut data_output: Vec<u32> = (0..100).collect();
|
||||
assert_eq!(9, decoder.decode(&data[0..4], &mut data_output));
|
||||
let mut data_output: Vec<u32> = (0..1_000_000).collect();
|
||||
assert_eq!(1_000_000, decoder.decode(&data[0..95718], &mut data_output));
|
||||
for i in 0..9 {
|
||||
assert_eq!(data_output[i], input[i]) ;
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@ use core::directory::Directory;
|
||||
use core::analyzer::SimpleTokenizer;
|
||||
use std::collections::{HashMap, BTreeMap};
|
||||
use std::collections::{hash_map, btree_map};
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::io::{Write};
|
||||
use std::sync::Arc;
|
||||
use std::mem;
|
||||
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
|
||||
@@ -121,6 +121,7 @@ impl SegmentWriter {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.max_doc += 1;
|
||||
}
|
||||
|
||||
@@ -139,9 +140,7 @@ impl SegmentWriter {
|
||||
|
||||
/// Registers `doc` with the postings writer obtained for `term`.
pub fn suscribe(&mut self, doc: DocId, term: Term) {
    let postings_writer = self.get_postings_writer(term);
    postings_writer.suscribe(doc);
}
|
||||
|
||||
}
|
||||
|
||||
impl SerializableSegment for SegmentWriter {
|
||||
|
||||
Reference in New Issue
Block a user