Paul Masurel
2018-02-12 10:31:29 +09:00
parent 9370427ae2
commit a7ffc0e610
19 changed files with 173 additions and 181 deletions

View File

@@ -4,21 +4,25 @@ use common::serialize::BinarySerializable;
use std::mem;
use std::ops::Deref;
pub(crate) struct BitPacker {
mini_buffer: u64,
-    mini_buffer_written: usize
+    mini_buffer_written: usize,
}
impl BitPacker {
pub fn new() -> BitPacker {
BitPacker {
mini_buffer: 0u64,
-            mini_buffer_written: 0
+            mini_buffer_written: 0,
}
}
-    pub fn write<TWrite: Write>(&mut self, val: u64, num_bits: u8, output: &mut TWrite) -> io::Result<()> {
+    pub fn write<TWrite: Write>(
+        &mut self,
+        val: u64,
+        num_bits: u8,
+        output: &mut TWrite,
+    ) -> io::Result<()> {
let val_u64 = val as u64;
let num_bits = num_bits as usize;
if self.mini_buffer_written + num_bits > 64 {
@@ -58,8 +62,8 @@ impl BitPacker {
#[derive(Clone)]
pub struct BitUnpacker<Data>
-    where
-        Data: Deref<Target=[u8]>,
+where
+    Data: Deref<Target = [u8]>,
{
num_bits: usize,
mask: u64,
@@ -67,16 +71,15 @@ pub struct BitUnpacker<Data>
}
impl<Data> BitUnpacker<Data>
-    where
-        Data: Deref<Target=[u8]>,
+where
+    Data: Deref<Target = [u8]>,
{
pub fn new(data: Data, num_bits: u8) -> BitUnpacker<Data> {
-        let mask: u64 =
-            if num_bits == 64 {
-                !0u64
-            } else {
-                (1u64 << num_bits) - 1u64
-            };
+        let mask: u64 = if num_bits == 64 {
+            !0u64
+        } else {
+            (1u64 << num_bits) - 1u64
+        };
BitUnpacker {
num_bits: num_bits as usize,
mask,
@@ -102,8 +105,7 @@ impl<Data> BitUnpacker<Data>
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
-            let val_unshifted_unmasked: u64 =
-                unsafe { *(data[addr..].as_ptr() as *const u64) };
+            let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
(val_shifted & mask)
} else {
@@ -134,8 +136,7 @@ impl<Data> BitUnpacker<Data>
for output_val in output.iter_mut() {
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
-            let val_unshifted_unmasked: u64 =
-                unsafe { *(data[addr..].as_ptr() as *const u64) };
+            let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
*output_val = val_shifted & mask;
addr_in_bits += num_bits;
@@ -148,7 +149,6 @@ impl<Data> BitUnpacker<Data>
mod test {
use super::{BitPacker, BitUnpacker};
fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker<Vec<u8>>, Vec<u64>) {
let mut data = Vec::new();
let mut bitpacker = BitPacker::new();
@@ -157,10 +157,10 @@ mod test {
.map(|i| if max_val == 0 { 0 } else { i % max_val })
.collect();
for &val in &vals {
-            bitpacker.write(val, num_bits,&mut data).unwrap();
+            bitpacker.write(val, num_bits, &mut data).unwrap();
}
bitpacker.close(&mut data).unwrap();
-        assert_eq!(data.len(), ((num_bits as usize)* len + 7) / 8 + 7);
+        assert_eq!(data.len(), ((num_bits as usize) * len + 7) / 8 + 7);
let bitunpacker = BitUnpacker::new(data, num_bits);
(bitunpacker, vals)
}
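Aside: the hunks above are pure rustfmt reformatting; the round-trip behaviour is unchanged. A minimal usage sketch, assuming the crate-internal API visible in this diff (`BitPacker`/`BitUnpacker` are `pub(crate)`, and `get(idx)` is inferred from the read path above):

    let mut data: Vec<u8> = Vec::new();
    let mut bitpacker = BitPacker::new();
    for &val in &[11u64, 27u64, 63u64] {
        bitpacker.write(val, 6, &mut data).unwrap(); // pack each value on 6 bits
    }
    // close() flushes the mini buffer and pads with 7 bytes so the unsafe
    // unaligned u64 reads in BitUnpacker never run past the end of the slice.
    bitpacker.close(&mut data).unwrap();
    assert_eq!(data.len(), (6 * 3 + 7) / 8 + 7); // same formula as the test above
    let bitunpacker = BitUnpacker::new(data, 6);
    assert_eq!(bitunpacker.get(1), 27u64); // `get` assumed from the read path above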

View File

@@ -27,7 +27,6 @@ impl IntoIterator for TinySet {
}
impl TinySet {
/// Returns an empty `TinySet`.
pub fn empty() -> TinySet {
TinySet(0u64)
@@ -38,7 +37,6 @@ impl TinySet {
TinySet(!self.0)
}
/// Returns true iff the `TinySet` contains the element `el`.
pub fn contains(&self, el: u32) -> bool {
!self.intersect(TinySet::singleton(el)).is_empty()
@@ -137,7 +135,6 @@ fn num_buckets(max_val: u32) -> u32 {
}
impl BitSet {
/// Create a new `BitSet` that may contain elements
/// within `[0, max_val[`.
pub fn with_max_value(max_value: u32) -> BitSet {
@@ -146,7 +143,7 @@ impl BitSet {
BitSet {
tinysets: tinybisets,
len: 0,
-            max_value
+            max_value,
}
}
@@ -167,18 +164,16 @@ impl BitSet {
// we do not check saturated els.
let higher = el / 64u32;
let lower = el % 64u32;
-        self.len +=
-            if self.tinysets[higher as usize].insert_mut(lower) {
-                1
-            } else {
-                0
-            };
+        self.len += if self.tinysets[higher as usize].insert_mut(lower) {
+            1
+        } else {
+            0
+        };
}
/// Returns true iff the element is in the `BitSet`.
pub fn contains(&self, el: u32) -> bool {
-        self.tinyset(el / 64u32)
-            .contains(el % 64)
+        self.tinyset(el / 64u32).contains(el % 64)
}
/// Returns the first non-empty `TinySet` associated to a bucket lower
@@ -206,7 +201,6 @@ impl BitSet {
}
}
#[cfg(test)]
mod tests {
@@ -229,9 +223,7 @@ mod tests {
assert!(u.pop_lowest().is_none())
}
{
-            let mut u = TinySet::empty()
-                .insert(1u32)
-                .insert(1u32);
+            let mut u = TinySet::empty().insert(1u32).insert(1u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert!(u.pop_lowest().is_none())
}
@@ -275,7 +267,6 @@ mod tests {
test_against_hashset(&[62u32, 63u32], 64);
}
#[test]
fn test_bitset_large() {
let arr = generate_nonunique_unsorted(1_000_000, 50_000);
@@ -310,16 +301,27 @@ mod tests {
#[test]
fn test_tinyset_range() {
-        assert_eq!(TinySet::range_lower(3).into_iter().collect::<Vec<u32>>(), [0, 1, 2]);
+        assert_eq!(
+            TinySet::range_lower(3).into_iter().collect::<Vec<u32>>(),
+            [0, 1, 2]
+        );
assert!(TinySet::range_lower(0).is_empty());
assert_eq!(
TinySet::range_lower(63).into_iter().collect::<Vec<u32>>(),
(0u32..63u32).collect::<Vec<_>>()
);
-        assert_eq!(TinySet::range_lower(1).into_iter().collect::<Vec<u32>>(), [0]);
-        assert_eq!(TinySet::range_lower(2).into_iter().collect::<Vec<u32>>(), [0, 1]);
        assert_eq!(
-            TinySet::range_greater_or_equal(3).into_iter().collect::<Vec<u32>>(),
+            TinySet::range_lower(1).into_iter().collect::<Vec<u32>>(),
+            [0]
+        );
+        assert_eq!(
+            TinySet::range_lower(2).into_iter().collect::<Vec<u32>>(),
+            [0, 1]
+        );
+        assert_eq!(
+            TinySet::range_greater_or_equal(3)
+                .into_iter()
+                .collect::<Vec<u32>>(),
(3u32..64u32).collect::<Vec<_>>()
);
}
@@ -350,47 +352,31 @@ mod tests {
assert!(els.iter().all(|el| bitset.contains(*el)));
bitset.clear();
for el in 0u32..1000u32 {
-                assert!(!bitset.contains(el));
+            assert!(!bitset.contains(el));
}
}
#[bench]
fn bench_tinyset_pop(b: &mut test::Bencher) {
-        b.iter(|| {
-            test::black_box(TinySet::singleton(31u32))
-                .pop_lowest()
-        });
+        b.iter(|| test::black_box(TinySet::singleton(31u32)).pop_lowest());
}
#[bench]
fn bench_tinyset_sum(b: &mut test::Bencher) {
-        let tiny_set = TinySet::empty()
-            .insert(10u32)
-            .insert(14u32)
-            .insert(21u32);
+        let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
b.iter(|| {
-            assert_eq!(
-                test::black_box(tiny_set).into_iter().sum::<u32>(),
-                45u32);
+            assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
});
}
#[bench]
fn bench_tinyarr_sum(b: &mut test::Bencher) {
-        let v = [10u32, 14u32, 21u32] ;
-        b.iter(|| {
-            test::black_box(v)
-                .iter()
-                .cloned()
-                .sum::<u32>()
-        });
+        let v = [10u32, 14u32, 21u32];
+        b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
}
#[bench]
fn bench_bitset_initialize(b: &mut test::Bencher) {
-        b.iter(|| {
-            BitSet::with_max_value(1_000_000)
-        });
+        b.iter(|| BitSet::with_max_value(1_000_000));
}
}
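Aside: a short sketch of the TinySet / BitSet operations these tests and benches exercise (crate-internal types, shown for illustration only):

    let mut tiny = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
    assert!(tiny.contains(14u32));
    assert_eq!(tiny.pop_lowest(), Some(10u32)); // pops elements in ascending order
    assert_eq!(tiny.into_iter().sum::<u32>(), 35u32); // 14 + 21 remain

    let mut bitset = BitSet::with_max_value(1_000); // may contain values in [0, 1000[
    bitset.insert(63u32); // stored as bit 63 % 64 of the TinySet in bucket 63 / 64
    assert!(bitset.contains(63u32));
    bitset.clear();
    assert!(!bitset.contains(63u32));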

View File

@@ -52,7 +52,6 @@ pub(crate) fn compute_num_bits(n: u64) -> u8 {
}
}
pub(crate) fn is_power_of_2(n: usize) -> bool {
(n > 0) && (n & (n - 1) == 0)
}
@@ -128,7 +127,6 @@ pub(crate) mod test {
}
}
#[test]
fn test_compute_num_bits() {
assert_eq!(compute_num_bits(1), 1u8);
@@ -141,4 +139,3 @@ pub(crate) mod test {
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
}
}
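Aside: `compute_num_bits(n)` is the bit width chosen before bitpacking a column whose maximum value is `n`. A few sample values, the last one taken from the test above:

    assert_eq!(compute_num_bits(1), 1u8);
    assert_eq!(compute_num_bits(255), 8u8); // 2^8 - 1 still fits on 8 bits
    assert_eq!(compute_num_bits(256), 9u8); // one more bit needed
    assert_eq!(compute_num_bits(5_000_000_000), 33u8); // > 2^32, so 33 bits
    assert!(is_power_of_2(64) && !is_power_of_2(63));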

View File

@@ -14,7 +14,6 @@ pub trait BinarySerializable: fmt::Debug + Sized {
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self>;
}
/// `FixedSize` marks a `BinarySerializable` as
/// always serializing to the same size.
pub trait FixedSize: BinarySerializable {
@@ -103,7 +102,6 @@ impl FixedSize for i64 {
const SIZE_IN_BYTES: usize = 8;
}
impl BinarySerializable for u8 {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
writer.write_u8(*self)
@@ -134,21 +132,18 @@ impl BinarySerializable for String {
}
}
#[cfg(test)]
pub mod test {
use common::VInt;
use super::*;
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
O::default().serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
}
fn serialize_test<T: BinarySerializable + Eq>(v: T) -> usize {
let mut buffer: Vec<u8> = Vec::new();
v.serialize(&mut buffer).unwrap();
@@ -186,7 +181,10 @@ pub mod test {
fn test_serialize_string() {
assert_eq!(serialize_test(String::from("")), 1);
assert_eq!(serialize_test(String::from("ぽよぽよ")), 1 + 3 * 4);
assert_eq!(serialize_test(String::from("富士さん見える。")), 1 + 3 * 8);
assert_eq!(
serialize_test(String::from("富士さん見える。")),
1 + 3 * 8
);
}
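Aside: the String cases asserted above decompose as a VInt byte length followed by the raw UTF-8 bytes, so the four 3-byte characters of "ぽよぽよ" serialize to 1 + 3 * 4 = 13 bytes:

    let mut buffer: Vec<u8> = Vec::new();
    String::from("ぽよぽよ").serialize(&mut buffer).unwrap();
    assert_eq!(buffer.len(), 1 + 3 * 4); // 1 length byte + 12 payload bytes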
#[test]

View File

@@ -25,7 +25,9 @@ pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usiz
let mut bit_packer = BitPacker::new();
for val in vals {
-        bit_packer.write(*val as u64, num_bits,&mut counting_writer).unwrap();
+        bit_packer
+            .write(*val as u64, num_bits, &mut counting_writer)
+            .unwrap();
}
counting_writer.written_bytes()
}
@@ -63,7 +65,9 @@ impl BlockEncoder {
counting_writer.write_all(&[num_bits]).unwrap();
let mut bit_packer = BitPacker::new();
for val in vals {
-            bit_packer.write(*val as u64, num_bits, &mut counting_writer).unwrap();
+            bit_packer
+                .write(*val as u64, num_bits, &mut counting_writer)
+                .unwrap();
}
for _ in vals.len()..COMPRESSION_BLOCK_SIZE {
bit_packer

View File

@@ -25,9 +25,7 @@ fn compress_sorted(vals: &[u32], output: &mut [u8], offset: u32) -> usize {
}
fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32) -> usize {
-    unsafe {
-        simdcomp::uncompress_sorted(compressed_data.as_ptr(), output.as_mut_ptr(), offset)
-    }
+    unsafe { simdcomp::uncompress_sorted(compressed_data.as_ptr(), output.as_mut_ptr(), offset) }
}
fn compress_unsorted(vals: &[u32], output: &mut [u8]) -> usize {

View File

@@ -14,7 +14,8 @@ pub struct IndexMeta {
pub segments: Vec<SegmentMeta>,
pub schema: Schema,
pub opstamp: u64,
#[serde(skip_serializing_if = "Option::is_none")] pub payload: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub payload: Option<String>,
}
impl IndexMeta {
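Aside: the effect of `skip_serializing_if = "Option::is_none"` is that a `None` payload produces no `"payload"` key at all in the serialized JSON, roughly `{"segments": [...], "schema": [...], "opstamp": 0}` rather than an explicit `"payload": null`; only a `Some(...)` value is written out.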

View File

@@ -132,7 +132,7 @@ mod tests {
fn test_skiplist9() {
let mut output: Vec<u8> = Vec::new();
let mut skip_list_builder: SkipListBuilder<u64> = SkipListBuilder::new(4);
-        for i in 0..4*4*4 {
+        for i in 0..4 * 4 * 4 {
skip_list_builder.insert(i, &i).unwrap();
}
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
@@ -145,7 +145,7 @@ mod tests {
// checking that void gets serialized to nothing.
let mut output: Vec<u8> = Vec::new();
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4);
-        for i in 0..((4*4*4) - 1) {
+        for i in 0..((4 * 4 * 4) - 1) {
skip_list_builder.insert(i, &()).unwrap();
}
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
@@ -158,7 +158,7 @@ mod tests {
// checking that void gets serialized to nothing.
let mut output: Vec<u8> = Vec::new();
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4);
-        for i in 0..(4*4) {
+        for i in 0..(4 * 4) {
skip_list_builder.insert(i, &()).unwrap();
}
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
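Aside: a minimal sketch of the SkipListBuilder API these tests drive (crate-internal; 4 is the skip period/branching factor used throughout the tests above, and the payload only needs to be BinarySerializable):

    let mut output: Vec<u8> = Vec::new();
    let mut skip_list_builder: SkipListBuilder<u64> = SkipListBuilder::new(4);
    for i in 0..64u64 {
        skip_list_builder.insert(i, &i).unwrap(); // key plus serializable payload
    }
    skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();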

View File

@@ -1,9 +1,8 @@
use std::io::Write;
-use common::{is_power_of_2, VInt, BinarySerializable};
+use common::{BinarySerializable, VInt, is_power_of_2};
use std::marker::PhantomData;
use std::io;
struct LayerBuilder<T: BinarySerializable> {
period_mask: usize,
buffer: Vec<u8>,

View File

@@ -93,14 +93,15 @@ impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
write,
bit_packer,
min_value,
-            num_bits
+            num_bits,
})
}
/// Pushes a new value to the currently open u64 fast field.
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
let val_to_write: u64 = val - self.min_value;
-        self.bit_packer.write(val_to_write, self.num_bits,&mut self.write)?;
+        self.bit_packer
+            .write(val_to_write, self.num_bits, &mut self.write)?;
Ok(())
}

View File

@@ -115,9 +115,6 @@
//! [literate programming](http://fulmicoton.com/tantivy-examples/simple_search.html) /
//! [source code](https://github.com/fulmicoton/tantivy/blob/master/examples/simple_search.rs))
#[macro_use]
extern crate lazy_static;
@@ -286,7 +283,7 @@ mod tests {
use fastfield::{FastFieldReader, I64FastFieldReader, U64FastFieldReader};
use Postings;
use rand::{Rng, SeedableRng, XorShiftRng};
-    use rand::distributions::{Range, IndependentSample};
+    use rand::distributions::{IndependentSample, Range};
fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
@@ -306,7 +303,6 @@ mod tests {
.collect::<Vec<u32>>()
}
pub fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
generate_array_with_seed(n, ratio, 4)
}

View File

@@ -29,7 +29,7 @@ impl FixedSize for TermInfo {
/// of the block are bitpacked.
///
/// See `TermInfoStore`.
-    const SIZE_IN_BYTES: usize = u32::SIZE_IN_BYTES + 2*u64::SIZE_IN_BYTES + u8::SIZE_IN_BYTES;
+    const SIZE_IN_BYTES: usize = u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES + u8::SIZE_IN_BYTES;
}
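(Aside: with `u32` = 4 bytes, `u64` = 8 bytes and `u8` = 1 byte, this works out to 4 + 2 * 8 + 1 = 21 bytes per serialized `TermInfo`.)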
impl BinarySerializable for TermInfo {

View File

@@ -50,14 +50,14 @@ impl DocSet for BitSetDocSet {
return true;
}
if let Some(cursor_bucket) = self.docs.first_non_empty_bucket(self.cursor_bucket + 1) {
-                self.go_to_bucket(cursor_bucket);
-                let lower = self.cursor_tinybitset.pop_lowest().unwrap();
-                self.doc = (cursor_bucket * 64u32) | lower;
-                true
+            self.go_to_bucket(cursor_bucket);
+            let lower = self.cursor_tinybitset.pop_lowest().unwrap();
+            self.doc = (cursor_bucket * 64u32) | lower;
+            true
} else {
false
}
}
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
// skip is required to advance.
@@ -232,14 +232,15 @@ mod tests {
}
}
#[bench]
fn bench_bitset_1pct_insert(b: &mut test::Bencher) {
use tests;
let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000);
b.iter(|| {
let mut bitset = BitSet::with_max_value(1_000_000);
-            for el in els.iter().cloned() { bitset.insert(el); }
+            for el in els.iter().cloned() {
+                bitset.insert(el);
+            }
});
}
@@ -248,8 +249,10 @@ mod tests {
use tests;
let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000);
let mut bitset = BitSet::with_max_value(1_000_000);
-        for el in els { bitset.insert(el); }
-        b.iter(|| { bitset.clone() });
+        for el in els {
+            bitset.insert(el);
+        }
+        b.iter(|| bitset.clone());
}
#[bench]
@@ -258,11 +261,12 @@ mod tests {
use DocSet;
let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000);
let mut bitset = BitSet::with_max_value(1_000_000);
-        for el in els { bitset.insert(el); }
+        for el in els {
+            bitset.insert(el);
+        }
b.iter(|| {
let mut docset = BitSetDocSet::from(bitset.clone());
while docset.advance() {}
});
}
}
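Aside: a sketch of how these benches drive the cursor. BitSetDocSet wraps a BitSet and replays it as an ascending DocId stream, one TinySet bucket at a time:

    let mut bitset = BitSet::with_max_value(1_000_000);
    for &el in &[1u32, 64u32, 65u32] {
        bitset.insert(el);
    }
    let mut docset = BitSetDocSet::from(bitset);
    while docset.advance() {
        // docset.doc() yields 1, 64, 65: bucket * 64 | lowest set bit, as above
    }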

View File

@@ -11,18 +11,18 @@ use query::ConstScorer;
use std::collections::Bound;
use std::collections::range::RangeArgument;
-fn map_bound<TFrom, Transform: Fn(TFrom)->Vec<u8> >(bound: Bound<TFrom>, transform: &Transform) -> Bound<Vec<u8>> {
+fn map_bound<TFrom, Transform: Fn(TFrom) -> Vec<u8>>(
+    bound: Bound<TFrom>,
+    transform: &Transform,
+) -> Bound<Vec<u8>> {
use self::Bound::*;
match bound {
Excluded(from_val) => Excluded(transform(from_val)),
Included(from_val) => Included(transform(from_val)),
-        Unbounded => Unbounded
+        Unbounded => Unbounded,
}
}
/// `RangeQuery` matches all documents that have at least one term within a defined range.
///
/// Matched documents will all get a constant `Score` of one.
@@ -88,40 +88,42 @@ pub struct RangeQuery {
}
impl RangeQuery {
/// Create a new `RangeQuery` over a `i64` field.
-    pub fn new_i64<TRangeArgument: RangeArgument<i64>>(field: Field, range: TRangeArgument) -> RangeQuery {
-        let make_term_val = |val: &i64| {
-            Term::from_field_i64(field, *val).value_bytes().to_owned()
-        };
+    pub fn new_i64<TRangeArgument: RangeArgument<i64>>(
+        field: Field,
+        range: TRangeArgument,
+    ) -> RangeQuery {
+        let make_term_val = |val: &i64| Term::from_field_i64(field, *val).value_bytes().to_owned();
RangeQuery {
field,
left_bound: map_bound(range.start(), &make_term_val),
-            right_bound: map_bound(range.end(), &make_term_val)
+            right_bound: map_bound(range.end(), &make_term_val),
}
}
/// Create a new `RangeQuery` over a `u64` field.
-    pub fn new_u64<TRangeArgument: RangeArgument<u64>>(field: Field, range: TRangeArgument) -> RangeQuery {
-        let make_term_val = |val: &u64| {
-            Term::from_field_u64(field, *val).value_bytes().to_owned()
-        };
+    pub fn new_u64<TRangeArgument: RangeArgument<u64>>(
+        field: Field,
+        range: TRangeArgument,
+    ) -> RangeQuery {
+        let make_term_val = |val: &u64| Term::from_field_u64(field, *val).value_bytes().to_owned();
RangeQuery {
field,
left_bound: map_bound(range.start(), &make_term_val),
-            right_bound: map_bound(range.end(), &make_term_val)
+            right_bound: map_bound(range.end(), &make_term_val),
}
}
/// Create a new `RangeQuery` over a `Str` field.
-    pub fn new_str<'b, TRangeArgument: RangeArgument<&'b str>>(field: Field, range: TRangeArgument) -> RangeQuery {
-        let make_term_val = |val: &&str| {
-            val.as_bytes().to_vec()
-        };
+    pub fn new_str<'b, TRangeArgument: RangeArgument<&'b str>>(
+        field: Field,
+        range: TRangeArgument,
+    ) -> RangeQuery {
+        let make_term_val = |val: &&str| val.as_bytes().to_vec();
RangeQuery {
field,
left_bound: map_bound(range.start(), &make_term_val),
-            right_bound: map_bound(range.end(), &make_term_val)
+            right_bound: map_bound(range.end(), &make_term_val),
}
}
}
@@ -135,7 +137,7 @@ impl Query for RangeQuery {
Ok(box RangeWeight {
field: self.field,
left_bound: self.left_bound.clone(),
-            right_bound: self.right_bound.clone()
+            right_bound: self.right_bound.clone(),
})
}
}
@@ -148,8 +150,8 @@ pub struct RangeWeight {
impl RangeWeight {
fn term_range<'a, T>(&self, term_dict: &'a T) -> T::Streamer
-        where
-            T: TermDictionary<'a> + 'a,
+    where
+        T: TermDictionary<'a> + 'a,
{
use std::collections::Bound::*;
let mut term_stream_builder = term_dict.range();
@@ -203,10 +205,9 @@ mod tests {
#[test]
fn test_range_query_simple() {
fn run() -> Result<()> {
let mut schema_builder = SchemaBuilder::new();
-            let year_field= schema_builder.add_u64_field("year", INT_INDEXED);
+            let year_field = schema_builder.add_u64_field("year", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -233,7 +234,6 @@ mod tests {
}
run().unwrap();
}
#[test]
@@ -271,22 +271,22 @@ mod tests {
count_collector.count()
};
-        assert_eq!(count_multiples(RangeQuery::new_i64(int_field, 10..11)), 9);
+        assert_eq!(
+            count_multiples(RangeQuery::new_i64(int_field, 10..11)),
+            9
+        );
        assert_eq!(
-            count_multiples(RangeQuery::new_i64(int_field, (Bound::Included(10), Bound::Included(11)) )),
+            count_multiples(RangeQuery::new_i64(
+                int_field,
+                (Bound::Included(10), Bound::Included(11))
+            )),
            18
        );
        assert_eq!(
-            count_multiples(RangeQuery::new_i64(int_field, (Bound::Excluded(9), Bound::Included(10)))),
+            count_multiples(RangeQuery::new_i64(
+                int_field,
+                (Bound::Excluded(9), Bound::Included(10))
+            )),
            9
        );
-        assert_eq!(
-            count_multiples(RangeQuery::new_i64(int_field, 9..)),
-            91
-        );
+        assert_eq!(count_multiples(RangeQuery::new_i64(int_field, 9..)), 91);
}
}
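Aside: a usage sketch of the reformatted constructors; `year_field`, `int_field` and `title_field` stand in for fields of the matching types:

    // Anything implementing RangeArgument works: plain ranges or explicit Bounds.
    let q1 = RangeQuery::new_u64(year_field, 1960..1970);
    let q2 = RangeQuery::new_i64(int_field, (Bound::Excluded(9), Bound::Included(10)));
    let q3 = RangeQuery::new_str(title_field, "aardvark".."albatross");
    // Every matching document gets a constant Score of one (see ConstScorer below).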

View File

@@ -62,7 +62,6 @@ impl Scorer for EmptyScorer {
}
}
/// Wraps a `DocSet` and simply returns a constant `Scorer`.
/// The `ConstScorer` is useful if you have a `DocSet` where
/// you need a scorer.
@@ -75,7 +74,6 @@ pub struct ConstScorer<TDocSet: DocSet> {
}
impl<TDocSet: DocSet> ConstScorer<TDocSet> {
/// Creates a new `ConstScorer`.
pub fn new(docset: TDocSet) -> ConstScorer<TDocSet> {
ConstScorer {

View File

@@ -16,7 +16,8 @@ pub enum Cardinality {
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct IntOptions {
indexed: bool,
#[serde(skip_serializing_if = "Option::is_none")] fast: Option<Cardinality>,
#[serde(skip_serializing_if = "Option::is_none")]
fast: Option<Cardinality>,
stored: bool,
}

View File

@@ -10,10 +10,8 @@ use directory::ReadOnlySource;
use termdict::TermOrdinal;
use byteorder::ByteOrder;
const BLOCK_LEN: usize = 256;
#[derive(Debug, Eq, PartialEq, Default)]
struct TermInfoBlockMeta {
offset: u64,
@@ -27,9 +25,11 @@ impl BinarySerializable for TermInfoBlockMeta {
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
self.offset.serialize(write)?;
self.ref_term_info.serialize(write)?;
-        write.write_all(&[self.doc_freq_nbits,
-                          self.postings_offset_nbits,
-                          self.positions_offset_nbits])?;
+        write.write_all(&[
+            self.doc_freq_nbits,
+            self.postings_offset_nbits,
+            self.positions_offset_nbits,
+        ])?;
Ok(())
}
@@ -43,17 +43,17 @@ impl BinarySerializable for TermInfoBlockMeta {
ref_term_info,
doc_freq_nbits: buffer[0],
postings_offset_nbits: buffer[1],
-            positions_offset_nbits: buffer[2]
+            positions_offset_nbits: buffer[2],
})
}
}
impl FixedSize for TermInfoBlockMeta {
-    const SIZE_IN_BYTES: usize = u64::SIZE_IN_BYTES + TermInfo::SIZE_IN_BYTES + 3 * u8::SIZE_IN_BYTES;
+    const SIZE_IN_BYTES: usize =
+        u64::SIZE_IN_BYTES + TermInfo::SIZE_IN_BYTES + 3 * u8::SIZE_IN_BYTES;
}
impl TermInfoBlockMeta {
fn num_bits(&self) -> u8 {
self.doc_freq_nbits + self.postings_offset_nbits + self.positions_offset_nbits + 7
}
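(Aside: the trailing `+ 7` in `num_bits()` accounts for the 7 bits that `bitpack_serialize` below always spends on `positions_inner_offset`; and with `u64` = 8 bytes, `TermInfo` = 21 bytes and 3 × `u8` = 3 bytes, `TermInfoBlockMeta::SIZE_IN_BYTES` comes to 32 bytes.)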
@@ -82,11 +82,10 @@ impl TermInfoBlockMeta {
}
}
pub struct TermInfoStore {
num_terms: usize,
block_meta_source: ReadOnlySource,
-    term_info_source: ReadOnlySource
+    term_info_source: ReadOnlySource,
}
fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 {
@@ -109,7 +108,7 @@ impl TermInfoStore {
TermInfoStore {
num_terms,
block_meta_source,
-            term_info_source
+            term_info_source,
}
}
@@ -117,13 +116,17 @@ impl TermInfoStore {
let block_id = (term_ord as usize) / BLOCK_LEN;
let buffer = self.block_meta_source.as_slice();
let mut block_data: &[u8] = &buffer[block_id * TermInfoBlockMeta::SIZE_IN_BYTES..];
-        let term_info_block_data = TermInfoBlockMeta::deserialize(&mut block_data).expect("Failed to deserialize terminfoblockmeta");
+        let term_info_block_data = TermInfoBlockMeta::deserialize(&mut block_data)
+            .expect("Failed to deserialize terminfoblockmeta");
let inner_offset = (term_ord as usize) % BLOCK_LEN;
if inner_offset == 0 {
term_info_block_data.ref_term_info
} else {
let term_info_data = self.term_info_source.as_slice();
-            term_info_block_data.deserialize_term_info(&term_info_data[term_info_block_data.offset as usize..], inner_offset - 1)
+            term_info_block_data.deserialize_term_info(
+                &term_info_data[term_info_block_data.offset as usize..],
+                inner_offset - 1,
+            )
}
}
@@ -140,13 +143,26 @@ pub struct TermInfoStoreWriter {
}
fn bitpack_serialize<W: Write>(
-        write: &mut W,
-        bit_packer: &mut BitPacker,
-        term_info_block_meta: &TermInfoBlockMeta,
-        term_info: &TermInfo) -> io::Result<()> {
-    bit_packer.write(term_info.doc_freq as u64, term_info_block_meta.doc_freq_nbits, write)?;
-    bit_packer.write(term_info.postings_offset, term_info_block_meta.postings_offset_nbits, write)?;
-    bit_packer.write(term_info.positions_offset, term_info_block_meta.positions_offset_nbits, write)?;
+    write: &mut W,
+    bit_packer: &mut BitPacker,
+    term_info_block_meta: &TermInfoBlockMeta,
+    term_info: &TermInfo,
+) -> io::Result<()> {
+    bit_packer.write(
+        term_info.doc_freq as u64,
+        term_info_block_meta.doc_freq_nbits,
+        write,
+    )?;
+    bit_packer.write(
+        term_info.postings_offset,
+        term_info_block_meta.postings_offset_nbits,
+        write,
+    )?;
+    bit_packer.write(
+        term_info.positions_offset,
+        term_info_block_meta.positions_offset_nbits,
+        write,
+    )?;
bit_packer.write(term_info.positions_inner_offset as u64, 7, write)?;
Ok(())
}
@@ -157,7 +173,7 @@ impl TermInfoStoreWriter {
buffer_block_metas: Vec::new(),
buffer_term_infos: Vec::new(),
term_infos: Vec::with_capacity(BLOCK_LEN),
-            num_terms: 0u64
+            num_terms: 0u64,
}
}
@@ -199,7 +215,7 @@ impl TermInfoStoreWriter {
&mut self.buffer_term_infos,
&mut bit_packer,
&term_info_block_meta,
-                &term_info
+                &term_info,
)?;
}
@@ -276,11 +292,11 @@ mod tests {
doc_freq: 512,
postings_offset: 51,
positions_offset: 3584,
-                positions_inner_offset: 0
+                positions_inner_offset: 0,
},
doc_freq_nbits: 10,
postings_offset_nbits: 5,
-            positions_offset_nbits: 11
+            positions_offset_nbits: 11,
};
let mut buffer: Vec<u8> = Vec::new();
term_info_block_meta.serialize(&mut buffer).unwrap();
@@ -292,7 +308,7 @@ mod tests {
#[test]
fn test_pack() {
let mut store_writer = TermInfoStoreWriter::new();
-        let mut term_infos = vec!();
+        let mut term_infos = vec![];
for i in 0..1000 {
let term_info = TermInfo {
doc_freq: i as u32,
@@ -304,9 +320,7 @@ mod tests {
term_infos.push(term_info);
}
let mut buffer = Vec::new();
-        store_writer
-            .serialize(&mut buffer)
-            .unwrap();
+        store_writer.serialize(&mut buffer).unwrap();
let term_info_store = TermInfoStore::open(ReadOnlySource::from(buffer));
for i in 0..1000 {
assert_eq!(term_info_store.get(i as u64), term_infos[i]);
@@ -314,5 +328,3 @@ mod tests {
}
}
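Aside: the round trip exercised by test_pack, sketched (crate-internal API; the per-term writer method name is an assumption, since the loop body is truncated above):

    let mut store_writer = TermInfoStoreWriter::new();
    for term_info in &term_infos {
        store_writer.write_term_info(term_info).unwrap(); // assumed entry point
    }
    let mut buffer = Vec::new();
    store_writer.serialize(&mut buffer).unwrap();
    let store = TermInfoStore::open(ReadOnlySource::from(buffer));
    assert_eq!(store.get(0u64), term_infos[0]); // block meta + bitpacked delta lookup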

View File

@@ -7,7 +7,7 @@ use common::CountingWriter;
use schema::FieldType;
use postings::TermInfo;
use termdict::{TermDictionary, TermDictionaryBuilder, TermOrdinal};
-use super::{TermStreamerBuilderImpl, TermStreamerImpl, TermInfoStoreWriter, TermInfoStore};
+use super::{TermInfoStore, TermInfoStoreWriter, TermStreamerBuilderImpl, TermStreamerImpl};
fn convert_fst_error(e: fst::Error) -> io::Error {
io::Error::new(io::ErrorKind::Other, e)

View File

@@ -87,6 +87,7 @@ mod tests {
use tokenizer::{Token, TokenStream, Tokenizer};
use super::FacetTokenizer;
use schema::Facet;
use std::str;
#[test]
fn test_facet_tokenizer() {
@@ -98,9 +99,7 @@ mod tests {
tokens.push(format!("{}", facet));
};
FacetTokenizer
-            .token_stream(unsafe {
-                ::std::str::from_utf8_unchecked(facet.encoded_bytes())
-            })
+            .token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) })
.process(&mut add_token);
}
assert_eq!(tokens.len(), 4);
@@ -120,9 +119,7 @@ mod tests {
tokens.push(format!("{}", facet));
};
FacetTokenizer
-            .token_stream(unsafe {
-                ::std::str::from_utf8_unchecked(facet.encoded_bytes())
-            })
+            .token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) })
.process(&mut add_token);
}
assert_eq!(tokens.len(), 1);
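Aside: a sketch of the driving loop in these tests. FacetTokenizer emits one token per path prefix, so a four-level facet yields the four tokens asserted above (the facet value here is hypothetical):

    let facet = Facet::from("/a/b/c/d"); // hypothetical 4-level facet
    let mut count = 0;
    FacetTokenizer
        .token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) })
        .process(&mut |_token: &Token| count += 1);
    assert_eq!(count, 4); // /a, /a/b, /a/b/c, /a/b/c/d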